mirror of
https://github.com/1Panel-dev/1Panel.git
synced 2025-03-01 11:34:13 +08:00
255 lines
6.8 KiB
Go
255 lines
6.8 KiB
Go
![]() |
package xpu
|
||
|
|
||
|
import (
|
||
|
"encoding/json"
|
||
|
"fmt"
|
||
|
"sort"
|
||
|
"strconv"
|
||
|
"sync"
|
||
|
"time"
|
||
|
|
||
|
baseGlobal "github.com/1Panel-dev/1Panel/agent/global"
|
||
|
"github.com/1Panel-dev/1Panel/agent/utils/cmd"
|
||
|
)
|
||
|
|
||
|
type XpuSMI struct{}
|
||
|
|
||
|
func New() (bool, XpuSMI) {
|
||
|
return cmd.Which("xpu-smi"), XpuSMI{}
|
||
|
}
|
||
|
|
||
|
func (x XpuSMI) loadDeviceData(device Device, wg *sync.WaitGroup, res *[]XPUSimpleInfo, mu *sync.Mutex) {
|
||
|
defer wg.Done()
|
||
|
|
||
|
var xpu XPUSimpleInfo
|
||
|
xpu.DeviceID = device.DeviceID
|
||
|
xpu.DeviceName = device.DeviceName
|
||
|
|
||
|
var xpuData, statsData string
|
||
|
var xpuErr, statsErr error
|
||
|
|
||
|
var wgCmd sync.WaitGroup
|
||
|
wgCmd.Add(2)
|
||
|
|
||
|
go func() {
|
||
|
defer wgCmd.Done()
|
||
|
xpuData, xpuErr = cmd.ExecWithTimeOut(fmt.Sprintf("xpu-smi discovery -d %d -j", device.DeviceID), 5*time.Second)
|
||
|
}()
|
||
|
|
||
|
go func() {
|
||
|
defer wgCmd.Done()
|
||
|
statsData, statsErr = cmd.ExecWithTimeOut(fmt.Sprintf("xpu-smi stats -d %d -j", device.DeviceID), 5*time.Second)
|
||
|
}()
|
||
|
|
||
|
wgCmd.Wait()
|
||
|
|
||
|
if xpuErr != nil {
|
||
|
baseGlobal.LOG.Errorf("calling xpu-smi discovery failed for device %d, err: %v\n", device.DeviceID, xpuErr)
|
||
|
return
|
||
|
}
|
||
|
|
||
|
var info Device
|
||
|
if err := json.Unmarshal([]byte(xpuData), &info); err != nil {
|
||
|
baseGlobal.LOG.Errorf("xpuData json unmarshal failed for device %d, err: %v\n", device.DeviceID, err)
|
||
|
return
|
||
|
}
|
||
|
|
||
|
bytes, err := strconv.ParseInt(info.MemoryPhysicalSizeByte, 10, 64)
|
||
|
if err != nil {
|
||
|
baseGlobal.LOG.Errorf("Error parsing memory size for device %d, err: %v\n", device.DeviceID, err)
|
||
|
return
|
||
|
}
|
||
|
xpu.Memory = fmt.Sprintf("%.1f MB", float64(bytes)/(1024*1024))
|
||
|
|
||
|
if statsErr != nil {
|
||
|
baseGlobal.LOG.Errorf("calling xpu-smi stats failed for device %d, err: %v\n", device.DeviceID, statsErr)
|
||
|
return
|
||
|
}
|
||
|
|
||
|
var stats DeviceStats
|
||
|
if err := json.Unmarshal([]byte(statsData), &stats); err != nil {
|
||
|
baseGlobal.LOG.Errorf("statsData json unmarshal failed for device %d, err: %v\n", device.DeviceID, err)
|
||
|
return
|
||
|
}
|
||
|
|
||
|
for _, stat := range stats.DeviceLevel {
|
||
|
switch stat.MetricsType {
|
||
|
case "XPUM_STATS_POWER":
|
||
|
xpu.Power = fmt.Sprintf("%.1fW", stat.Value)
|
||
|
case "XPUM_STATS_GPU_CORE_TEMPERATURE":
|
||
|
xpu.Temperature = fmt.Sprintf("%.1f°C", stat.Value)
|
||
|
case "XPUM_STATS_MEMORY_USED":
|
||
|
xpu.MemoryUsed = fmt.Sprintf("%.1fMB", stat.Value)
|
||
|
case "XPUM_STATS_MEMORY_UTILIZATION":
|
||
|
xpu.MemoryUtil = fmt.Sprintf("%.1f%%", stat.Value)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
mu.Lock()
|
||
|
*res = append(*res, xpu)
|
||
|
mu.Unlock()
|
||
|
}
|
||
|
|
||
|
func (x XpuSMI) LoadDashData() ([]XPUSimpleInfo, error) {
|
||
|
data, err := cmd.ExecWithTimeOut("xpu-smi discovery -j", 5*time.Second)
|
||
|
if err != nil {
|
||
|
return nil, fmt.Errorf("calling xpu-smi failed, err: %w", err)
|
||
|
}
|
||
|
|
||
|
var deviceInfo DeviceInfo
|
||
|
if err := json.Unmarshal([]byte(data), &deviceInfo); err != nil {
|
||
|
return nil, fmt.Errorf("deviceInfo json unmarshal failed, err: %w", err)
|
||
|
}
|
||
|
|
||
|
var res []XPUSimpleInfo
|
||
|
var wg sync.WaitGroup
|
||
|
var mu sync.Mutex
|
||
|
|
||
|
for _, device := range deviceInfo.DeviceList {
|
||
|
wg.Add(1)
|
||
|
go x.loadDeviceData(device, &wg, &res, &mu)
|
||
|
}
|
||
|
|
||
|
wg.Wait()
|
||
|
|
||
|
sort.Slice(res, func(i, j int) bool {
|
||
|
return res[i].DeviceID < res[j].DeviceID
|
||
|
})
|
||
|
return res, nil
|
||
|
}
|
||
|
|
||
|
func (x XpuSMI) LoadGpuInfo() (*XpuInfo, error) {
|
||
|
data, err := cmd.ExecWithTimeOut("xpu-smi discovery -j", 5*time.Second)
|
||
|
if err != nil {
|
||
|
return nil, fmt.Errorf("calling xpu-smi failed, err: %w", err)
|
||
|
}
|
||
|
var deviceInfo DeviceInfo
|
||
|
if err := json.Unmarshal([]byte(data), &deviceInfo); err != nil {
|
||
|
return nil, fmt.Errorf("deviceInfo json unmarshal failed, err: %w", err)
|
||
|
}
|
||
|
res := &XpuInfo{
|
||
|
Type: "xpu",
|
||
|
}
|
||
|
|
||
|
var wg sync.WaitGroup
|
||
|
var mu sync.Mutex
|
||
|
|
||
|
for _, device := range deviceInfo.DeviceList {
|
||
|
wg.Add(1)
|
||
|
go x.loadDeviceInfo(device, &wg, res, &mu)
|
||
|
}
|
||
|
|
||
|
wg.Wait()
|
||
|
|
||
|
processData, err := cmd.ExecWithTimeOut(fmt.Sprintf("xpu-smi ps -j"), 5*time.Second)
|
||
|
if err != nil {
|
||
|
return nil, fmt.Errorf("calling xpu-smi ps failed, err: %w", err)
|
||
|
}
|
||
|
var psList DeviceUtilByProcList
|
||
|
if err := json.Unmarshal([]byte(processData), &psList); err != nil {
|
||
|
return nil, fmt.Errorf("processData json unmarshal failed, err: %w", err)
|
||
|
}
|
||
|
for _, ps := range psList.DeviceUtilByProcList {
|
||
|
process := Process{
|
||
|
PID: ps.ProcessID,
|
||
|
Command: ps.ProcessName,
|
||
|
}
|
||
|
if ps.SharedMemSize > 0 {
|
||
|
process.SHR = fmt.Sprintf("%.1f MB", ps.SharedMemSize/1024)
|
||
|
}
|
||
|
if ps.MemSize > 0 {
|
||
|
process.Memory = fmt.Sprintf("%.1f MB", ps.MemSize/1024)
|
||
|
}
|
||
|
for index, xpu := range res.Xpu {
|
||
|
if xpu.Basic.DeviceID == ps.DeviceID {
|
||
|
res.Xpu[index].Processes = append(res.Xpu[index].Processes, process)
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return res, nil
|
||
|
}
|
||
|
|
||
|
func (x XpuSMI) loadDeviceInfo(device Device, wg *sync.WaitGroup, res *XpuInfo, mu *sync.Mutex) {
|
||
|
defer wg.Done()
|
||
|
|
||
|
xpu := Xpu{
|
||
|
Basic: Basic{
|
||
|
DeviceID: device.DeviceID,
|
||
|
DeviceName: device.DeviceName,
|
||
|
VendorName: device.VendorName,
|
||
|
PciBdfAddress: device.PciBdfAddress,
|
||
|
},
|
||
|
}
|
||
|
|
||
|
var xpuData, statsData string
|
||
|
var xpuErr, statsErr error
|
||
|
|
||
|
var wgCmd sync.WaitGroup
|
||
|
wgCmd.Add(2)
|
||
|
|
||
|
go func() {
|
||
|
defer wgCmd.Done()
|
||
|
xpuData, xpuErr = cmd.ExecWithTimeOut(fmt.Sprintf("xpu-smi discovery -d %d -j", device.DeviceID), 5*time.Second)
|
||
|
}()
|
||
|
|
||
|
go func() {
|
||
|
defer wgCmd.Done()
|
||
|
statsData, statsErr = cmd.ExecWithTimeOut(fmt.Sprintf("xpu-smi stats -d %d -j", device.DeviceID), 5*time.Second)
|
||
|
}()
|
||
|
|
||
|
wgCmd.Wait()
|
||
|
|
||
|
if xpuErr != nil {
|
||
|
baseGlobal.LOG.Errorf("calling xpu-smi discovery failed for device %d, err: %v\n", device.DeviceID, xpuErr)
|
||
|
return
|
||
|
}
|
||
|
|
||
|
var info Device
|
||
|
if err := json.Unmarshal([]byte(xpuData), &info); err != nil {
|
||
|
baseGlobal.LOG.Errorf("xpuData json unmarshal failed for device %d, err: %v\n", device.DeviceID, err)
|
||
|
return
|
||
|
}
|
||
|
|
||
|
res.DriverVersion = info.DriverVersion
|
||
|
xpu.Basic.DriverVersion = info.DriverVersion
|
||
|
|
||
|
bytes, err := strconv.ParseInt(info.MemoryPhysicalSizeByte, 10, 64)
|
||
|
if err != nil {
|
||
|
baseGlobal.LOG.Errorf("Error parsing memory size for device %d, err: %v\n", device.DeviceID, err)
|
||
|
return
|
||
|
}
|
||
|
xpu.Basic.Memory = fmt.Sprintf("%.1f MB", float64(bytes)/(1024*1024))
|
||
|
xpu.Basic.FreeMemory = info.MemoryFreeSizeByte
|
||
|
|
||
|
if statsErr != nil {
|
||
|
baseGlobal.LOG.Errorf("calling xpu-smi stats failed for device %d, err: %v\n", device.DeviceID, statsErr)
|
||
|
return
|
||
|
}
|
||
|
|
||
|
var stats DeviceStats
|
||
|
if err := json.Unmarshal([]byte(statsData), &stats); err != nil {
|
||
|
baseGlobal.LOG.Errorf("statsData json unmarshal failed for device %d, err: %v\n", device.DeviceID, err)
|
||
|
return
|
||
|
}
|
||
|
|
||
|
for _, stat := range stats.DeviceLevel {
|
||
|
switch stat.MetricsType {
|
||
|
case "XPUM_STATS_POWER":
|
||
|
xpu.Stats.Power = fmt.Sprintf("%.1fW", stat.Value)
|
||
|
case "XPUM_STATS_GPU_FREQUENCY":
|
||
|
xpu.Stats.Frequency = fmt.Sprintf("%.1fMHz", stat.Value)
|
||
|
case "XPUM_STATS_GPU_CORE_TEMPERATURE":
|
||
|
xpu.Stats.Temperature = fmt.Sprintf("%.1f°C", stat.Value)
|
||
|
case "XPUM_STATS_MEMORY_USED":
|
||
|
xpu.Stats.MemoryUsed = fmt.Sprintf("%.1fMB", stat.Value)
|
||
|
case "XPUM_STATS_MEMORY_UTILIZATION":
|
||
|
xpu.Stats.MemoryUtil = fmt.Sprintf("%.1f%%", stat.Value)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
mu.Lock()
|
||
|
res.Xpu = append(res.Xpu, xpu)
|
||
|
mu.Unlock()
|
||
|
}
|