Skip to content

Commit 57c5dc4

Browse files
authored
Merge pull request #2087 from FedML-AI/alaydshah/gpu_utils/qualcomm
Refactor Util + Add support for Qualcomm Hardware
2 parents 5e41f65 + 2d458a8 commit 57c5dc4

11 files changed

Lines changed: 371 additions & 68 deletions

File tree

python/fedml/computing/scheduler/comm_utils/container_utils.py

Lines changed: 19 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,18 @@
22
import os
33
import traceback
44
import datetime
5+
from typing import List
6+
57
from dateutil.parser import isoparse
68

79
import docker
810
from docker import errors
911

1012
from fedml.computing.scheduler.comm_utils import sys_utils
13+
from fedml.computing.scheduler.comm_utils.hardware_utils import HardwareUtil
1114
from fedml.core.common.singleton import Singleton
1215
from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants
1316
import time
14-
from GPUtil import getGPUs
1517

1618

1719
class ContainerUtils(Singleton):
@@ -252,7 +254,7 @@ def get_container_perf(self, c_name) -> ContainerMetrics:
252254
CPU % MEM USAGE / LIMIT MEM % NET I/O BLOCK I/O
253255
0.26% 8.703GiB / 503.5GiB 1.73% 17.4GB / 176MB 545kB / 20.9GB
254256
255-
GPU: We currently use GPUtil to get the GPU stats on host machine since one GPU is not
257+
GPU: We currently use HardwareUtil to get the GPU stats on host machine since one GPU is not
256258
shared by multiple containers
257259
(TODO: get the GPU stats inside the container)
258260
"""
@@ -320,47 +322,35 @@ def get_container_perf(self, c_name) -> ContainerMetrics:
320322
round(blk_read_bytes / (1024 * 1024), 1), round(blk_write_bytes / (1024 * 1024), 1))
321323

322324
# Calculate the gpu usage
323-
gpus_stat = self.generate_container_gpu_stats(c_name)
325+
gpus_stat = self.generate_container_gpu_stats(container_name=c_name)
324326

325327
# Record timestamp
326328
timestamp = stats["read"]
327329

328330
return ContainerUtils.ContainerMetrics(cpu_percent, mem_gb_used, mem_gb_avail, recv_megabytes, sent_megabytes,
329331
blk_read_bytes, blk_write_bytes, timestamp, gpus_stat)
330332

def generate_container_gpu_stats(self, container_name):
    """Return per-GPU stats for the GPUs attached to *container_name*.

    Resolves the GPU ids recorded in the container's docker HostConfig via
    HardwareUtil (vendor-agnostic), then gathers utilization/memory/temperature
    for exactly those cards.
    """
    docker_client = self.get_docker_client()
    attached_ids = HardwareUtil.get_docker_gpu_ids_by_container_name(
        container_name=container_name, docker_client=docker_client)
    return self.gpu_stats(attached_ids)
348339
@staticmethod
def gpu_stats(gpu_ids: List[int]):
    """Collect utilization/memory/temperature stats for the given GPU ids.

    Returns a dict mapping gpu_id -> {"gpu_utilization", "gpu_memory_allocated",
    "gpu_temp"}. Ids not reported by HardwareUtil are simply absent; any failure
    is logged and a partial (possibly empty) map is returned.
    """
    # Fix: dropped the dead leftover locals `utilz, memory, temp = None, None, None`
    # from the removed GPUtil-based implementation.
    gpu_stats_map = {}  # gpu_id: int -> {"gpu_utilization", "gpu_memory_allocated", "gpu_temp"}
    wanted_ids = set(gpu_ids)  # O(1) membership tests while scanning all cards
    try:
        for gpu in HardwareUtil.get_gpus():
            if gpu.id in wanted_ids:
                gpu_stats_map[gpu.id] = {
                    "gpu_utilization": gpu.load * 100,
                    "gpu_memory_allocated": gpu.memoryUsed / gpu.memoryTotal * 100,
                    "gpu_temp": gpu.temperature,
                    # "gpu_power_usage": pynvml.nvmlDeviceGetPowerUsage(handle) / 1000, # in watts
                    # "gpu_time_spent_accessing_memory": utilz.memory # in ms
                }
    except Exception as e:
        logging.error(f"Failed to get GPU stats: {e}")
    # NOTE(review): the diff view truncates right after the except clause; the
    # method presumably returns the accumulated map — confirm against the repo.
    return gpu_stats_map
366356

python/fedml/computing/scheduler/comm_utils/gpu_utils/__init__.py

Whitespace-only changes.
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
from abc import ABC, abstractmethod
2+
from dataclasses import dataclass
3+
from enum import Enum, auto
4+
from typing import Optional, List, Dict
5+
6+
from docker import DockerClient
7+
8+
9+
class GPUCardType(Enum):
    """Vendor family of an accelerator card detected on the host."""

    NVIDIA = auto()
    QUALCOMM = auto()
    UNKNOWN = auto()

    def __str__(self):
        # Render as the bare member name (e.g. "NVIDIA") for logs and labels.
        return self.name
16+
17+
18+
@dataclass
class GPUCard:
    """Vendor-neutral snapshot of one accelerator card."""

    # -- identity ---------------------------------------------------------
    id: int
    name: str
    driver: str
    serial: str
    vendor: str  # a GPUCardType member name, e.g. "NVIDIA" or "QUALCOMM"
    # -- memory stats ------------------------------------------------------
    # units follow whatever the producing vendor util reports — TODO confirm
    memoryTotal: float
    memoryFree: float
    memoryUsed: float
    memoryUtil: float
    # -- optional fields some vendors cannot report -----------------------
    load: Optional[float] = 0.0
    uuid: Optional[str] = ""
    display_mode: Optional[str] = ""
    display_active: Optional[str] = ""
    temperature: Optional[float] = 0.0
34+
35+
36+
class GPUCardUtil(ABC):
    """Abstract interface each vendor-specific accelerator util implements."""

    @classmethod
    @abstractmethod
    def detect_gpu_card_type(cls) -> Optional[GPUCardType]:
        """Return this util's GPUCardType when its hardware is present, else None."""
        raise NotImplementedError

    @staticmethod
    @abstractmethod
    def get_available_gpu_card_ids(order: str, limit: int, max_load: float, max_memory: float) -> List[int]:
        """Return ids of up to *limit* cards satisfying the load/memory thresholds."""
        raise NotImplementedError

    @staticmethod
    @abstractmethod
    def get_gpu_cards() -> List[GPUCard]:
        """Return a snapshot of every card on the host as GPUCard records."""
        raise NotImplementedError

    @staticmethod
    @abstractmethod
    def get_docker_gpu_device_mapping(gpu_ids: List[int]) -> Optional[Dict]:
        """Return docker-run kwargs exposing *gpu_ids* to a container, or None."""
        raise NotImplementedError

    @staticmethod
    @abstractmethod
    def get_docker_gpu_ids_by_container_name(container_name: str, docker_client: DockerClient) -> List[int]:
        """Return the card ids attached to the named container ([] on failure)."""
        raise NotImplementedError
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import logging
2+
import subprocess
3+
from typing import List, Optional, Dict
4+
5+
import docker
6+
from docker import types, DockerClient
7+
from GPUtil import GPUtil, GPU
8+
9+
from fedml.computing.scheduler.comm_utils.gpu_utils.gpu_utils import GPUCard, GPUCardUtil, GPUCardType
10+
11+
12+
class NvidiaGPUtil(GPUCardUtil):
    """GPUCardUtil implementation backed by GPUtil / nvidia-smi."""

    @classmethod
    def detect_gpu_card_type(cls) -> Optional[GPUCardType]:
        """Return NVIDIA when `nvidia-smi` runs successfully, else None."""
        try:
            subprocess.check_output(["nvidia-smi"], universal_newlines=True)
            return GPUCardType.NVIDIA
        except Exception:
            # nvidia-smi missing or failing means no usable NVIDIA card.
            return None

    @staticmethod
    def get_gpu_cards() -> List[GPUCard]:
        """Snapshot all NVIDIA cards as vendor-neutral GPUCard records."""
        return [NvidiaGPUtil.__convert(gpu) for gpu in GPUtil.getGPUs()]

    @staticmethod
    def get_available_gpu_card_ids(order: str, limit: int, max_load: float, max_memory: float) -> List[int]:
        """Delegate availability selection to GPUtil.getAvailable."""
        return GPUtil.getAvailable(order=order, limit=limit, maxLoad=max_load, maxMemory=max_memory)

    @staticmethod
    def get_docker_gpu_device_mapping(gpu_ids: List[int]) -> Optional[Dict]:
        """Return docker-run kwargs exposing *gpu_ids*; None when no ids given.

        Fix: `if gpu_ids and len(gpu_ids)` collapsed to plain truthiness, and the
        lambda-based str map replaced with a comprehension.
        """
        if not gpu_ids:
            return None
        device_ids = [str(gpu_id) for gpu_id in gpu_ids]
        return {"device_requests": [docker.types.DeviceRequest(device_ids=device_ids, capabilities=[["gpu"]])]}

    @staticmethod
    def get_docker_gpu_ids_by_container_name(container_name: str, docker_client: DockerClient) -> List[int]:
        """Read GPU ids from the container's HostConfig; [] on any failure.

        Fix: removed the dead `pass` after `logging.error`.
        """
        try:
            device_ids = docker_client.api.inspect_container(container_name)["HostConfig"]["DeviceRequests"][0]["DeviceIDs"]
            return list(map(int, device_ids))
        except Exception as e:
            logging.error(f"Failed to get GPU IDs: {e}")
        return []

    @staticmethod
    def __convert(gpu: GPU) -> GPUCard:
        """Map a GPUtil.GPU object onto the shared GPUCard dataclass."""
        return GPUCard(
            id=gpu.id,
            name=gpu.name,
            driver=gpu.driver,
            serial=gpu.serial,
            vendor=GPUCardType.NVIDIA.name,
            memoryTotal=gpu.memoryTotal,
            memoryFree=gpu.memoryFree,
            memoryUsed=gpu.memoryUsed,
            memoryUtil=gpu.memoryUtil,
            load=gpu.load,
            uuid=gpu.uuid,
            display_mode=gpu.display_mode,
            display_active=gpu.display_active,
            temperature=gpu.temperature,
        )
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
import logging
2+
import math
3+
import re
4+
import subprocess
5+
import sys
6+
from typing import List, Optional, Dict
7+
8+
from docker import DockerClient
9+
10+
from fedml.computing.scheduler.comm_utils.gpu_utils.gpu_utils import GPUCard, GPUCardUtil, GPUCardType
11+
12+
13+
class QualcommNPUtil(GPUCardUtil):
    """GPUCardUtil implementation for Qualcomm Cloud AI accelerator cards."""

    # Device nodes are exposed as /dev/accel/accel<N>; <N> is the card id.
    NPU_CARD_PATH = "/dev/accel/accel"

    def __init__(self):
        # Make the qaicrt python bindings (shipped with the Qualcomm SDK) importable.
        sys.path.append("/opt/qti-aic/dev/lib/x86_64/")

    @classmethod
    def detect_gpu_card_type(cls) -> Optional[GPUCardType]:
        """Return QUALCOMM when the qaic-util tool runs successfully, else None."""
        try:
            subprocess.check_output(["/opt/qti-aic/tools/qaic-util"], universal_newlines=True)
            return GPUCardType.QUALCOMM
        except Exception:
            return None

    @staticmethod
    def get_gpu_cards() -> List[GPUCard]:
        """Snapshot all healthy Qualcomm cards as vendor-neutral GPUCard records."""
        from qaicrt import Util, QIDList, QDevInfo, QStatus

        cards = []
        util = Util()
        status, card_list = util.getDeviceIds()
        if status.value == 0:
            for card in card_list:
                status, card_info = util.getDeviceInfo(card)
                # devStatus == 1 appears to mean the card is up/usable; others
                # are skipped silently — TODO confirm against the qaicrt docs.
                if status.value == 0 and card_info.devStatus.value == 1:
                    cards.append(QualcommNPUtil.__convert(card_info))
        else:
            logging.error("Qualcomm Card Status not Healthy")
        return cards

    @staticmethod
    def get_available_gpu_card_ids(order: str, limit: int, max_load: float, max_memory: float) -> List[int]:
        """Return up to *limit* card ids sorted by ascending memory utilization.

        Only order == "memory" is supported; load-based selection is not
        available for Qualcomm cards yet. Fix: the slice
        `gpu_cards[0:min(limit, len(gpu_cards))]` simplified to `[:limit]`
        (Python slicing already clamps to the list length).
        """
        if order != "memory":
            raise NotImplementedError(f"Qualcomm utils doesn't have support to compute availability based on {order}. "
                                      f"Supported criteria: [memory]")

        gpu_cards: List[GPUCard] = QualcommNPUtil.get_gpu_cards()
        gpu_cards = [card for card in gpu_cards if card.memoryUtil < max_memory]
        # NaN utilization sorts last so unreadable cards are chosen only as a last resort.
        gpu_cards.sort(key=lambda card: float('inf') if math.isnan(card.memoryUtil) else card.memoryUtil)
        return [card.id for card in gpu_cards[:limit]]

    @staticmethod
    def get_docker_gpu_device_mapping(gpu_ids: List[int]) -> Optional[Dict]:
        """Return docker-run kwargs mapping the card device nodes; None when no ids.

        Fix: `if gpu_ids and len(gpu_ids)` collapsed to plain truthiness.
        """
        if not gpu_ids:
            return None
        return {
            "devices": [f"{QualcommNPUtil.NPU_CARD_PATH}{gpu_id}:{QualcommNPUtil.NPU_CARD_PATH}{gpu_id}"
                        for gpu_id in gpu_ids]}

    @staticmethod
    def get_docker_gpu_ids_by_container_name(container_name: str, docker_client: DockerClient) -> List[int]:
        """Parse card ids out of the container's HostConfig devices; [] on failure.

        Fix: removed the dead `pass` after `logging.error`.
        """
        gpu_ids = []
        try:
            docker_inspect_info = docker_client.api.inspect_container(container_name)
            gpu_ids = QualcommNPUtil.__parse_gpu_ids(docker_inspect_info.get("HostConfig", {}))
        except Exception as e:
            logging.error(f"Failed to get GPU IDs: {e}")
        return gpu_ids

    @staticmethod
    def __convert(npu) -> GPUCard:
        """Map a qaicrt device-info object onto the shared GPUCard dataclass."""
        # TODO (alaydshah): Add support for load, memoryUtil, temperature
        # dram figures look like KB converted to MB here — TODO confirm units.
        memory_total = npu.devData.resourceInfo.dramTotal / 1024
        memory_free = npu.devData.resourceInfo.dramFree / 1024
        memory_used = memory_total - memory_free
        # Fix: guard a zero-size DRAM report instead of raising ZeroDivisionError.
        memory_utilized = float(memory_used) / float(memory_total) if memory_total else 0.0

        return GPUCard(
            id=npu.qid,
            name=npu.pciInfo.devicename,
            driver=npu.devData.fwQCImageVersionString,
            serial=npu.devData.serial,
            vendor=GPUCardType.QUALCOMM.name,
            memoryTotal=memory_total,
            memoryFree=memory_free,
            memoryUsed=memory_used,
            memoryUtil=memory_utilized,
        )

    @staticmethod
    def __parse_gpu_ids(host_config: dict) -> List[int]:
        """Extract card ids from every device mapping in *host_config*."""
        gpu_ids = []
        for device in host_config.get('Devices', []):
            gpu_id = QualcommNPUtil.__extract_integer_from_host_path(device.get('PathOnHost', None))
            # Check explicitly if gpu_id is not None, as gpu_id can be 0, which is a valid value to include.
            if gpu_id is not None:
                gpu_ids.append(gpu_id)
        return gpu_ids

    @staticmethod
    def __extract_integer_from_host_path(host_path: str) -> Optional[int]:
        """Return the integer card id from a /dev/accel/accel<N> path, else None."""
        if not host_path:
            logging.error("Host Path is None; GPU Id extraction Failed")
            return None

        npu_card_path = QualcommNPUtil.NPU_CARD_PATH
        if host_path.startswith(npu_card_path):
            # Extract the numeric suffix that immediately follows the card path.
            suffix = host_path[len(npu_card_path):]
            match = re.match(r'^(\d+)', suffix)
            if match:
                return int(match.group(1))
            logging.error(f"Failed to extract GPU id from Host Path {host_path}")
        else:
            logging.error(f"Host Path {host_path} doesn't start with NPU Card Path {npu_card_path}")

        # Return None if extraction fails
        return None

0 commit comments

Comments
 (0)