Skip to content

Commit 57c5dc4

Browse files
authored
Merge pull request #2087 from FedML-AI/alaydshah/gpu_utils/qualcomm
Refactor Util + Add support for Qualcomm Hardware
2 parents 5e41f65 + 2d458a8 commit 57c5dc4

11 files changed

Lines changed: 371 additions & 68 deletions

File tree

python/fedml/computing/scheduler/comm_utils/container_utils.py

Lines changed: 19 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,18 @@
22
import os
33
import traceback
44
import datetime
5+
from typing import List
6+
57
from dateutil.parser import isoparse
68

79
import docker
810
from docker import errors
911

1012
from fedml.computing.scheduler.comm_utils import sys_utils
13+
from fedml.computing.scheduler.comm_utils.hardware_utils import HardwareUtil
1114
from fedml.core.common.singleton import Singleton
1215
from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants
1316
import time
14-
from GPUtil import getGPUs
1517

1618

1719
class ContainerUtils(Singleton):
@@ -252,7 +254,7 @@ def get_container_perf(self, c_name) -> ContainerMetrics:
252254
CPU % MEM USAGE / LIMIT MEM % NET I/O BLOCK I/O
253255
0.26% 8.703GiB / 503.5GiB 1.73% 17.4GB / 176MB 545kB / 20.9GB
254256
255-
GPU: We currently use GPUtil to get the GPU stats on host machine since one GPU is not
257+
GPU: We currently use HardwareUtil to get the GPU stats on host machine since one GPU is not
256258
shared by multiple containers
257259
(TODO: get the GPU stats inside the container)
258260
"""
@@ -320,47 +322,35 @@ def get_container_perf(self, c_name) -> ContainerMetrics:
320322
round(blk_read_bytes / (1024 * 1024), 1), round(blk_write_bytes / (1024 * 1024), 1))
321323

322324
# Calculate the gpu usage
323-
gpus_stat = self.generate_container_gpu_stats(c_name)
325+
gpus_stat = self.generate_container_gpu_stats(container_name=c_name)
324326

325327
# Record timestamp
326328
timestamp = stats["read"]
327329

328330
return ContainerUtils.ContainerMetrics(cpu_percent, mem_gb_used, mem_gb_avail, recv_megabytes, sent_megabytes,
329331
blk_read_bytes, blk_write_bytes, timestamp, gpus_stat)
330332

def generate_container_gpu_stats(self, container_name):
    """Return per-GPU stats for the GPUs attached to *container_name*.

    Resolves the GPU ids recorded in the container's docker HostConfig via
    HardwareUtil (vendor-agnostic), then gathers utilization/memory/temperature
    for exactly those cards.
    """
    docker_client = self.get_docker_client()
    attached_ids = HardwareUtil.get_docker_gpu_ids_by_container_name(
        container_name=container_name, docker_client=docker_client)
    return self.gpu_stats(attached_ids)
348339
@staticmethod
def gpu_stats(gpu_ids: List[int]):
    """Collect utilization/memory/temperature stats for the given GPU ids.

    Returns a dict mapping gpu_id -> {"gpu_utilization", "gpu_memory_allocated",
    "gpu_temp"}. Ids not reported by HardwareUtil are simply absent; any failure
    is logged and a partial (possibly empty) map is returned.
    """
    # Fix: dropped the dead leftover locals `utilz, memory, temp = None, None, None`
    # from the removed GPUtil-based implementation.
    gpu_stats_map = {}  # gpu_id: int -> {"gpu_utilization", "gpu_memory_allocated", "gpu_temp"}
    wanted_ids = set(gpu_ids)  # O(1) membership tests while scanning all cards
    try:
        for gpu in HardwareUtil.get_gpus():
            if gpu.id in wanted_ids:
                gpu_stats_map[gpu.id] = {
                    "gpu_utilization": gpu.load * 100,
                    "gpu_memory_allocated": gpu.memoryUsed / gpu.memoryTotal * 100,
                    "gpu_temp": gpu.temperature,
                    # "gpu_power_usage": pynvml.nvmlDeviceGetPowerUsage(handle) / 1000, # in watts
                    # "gpu_time_spent_accessing_memory": utilz.memory # in ms
                }
    except Exception as e:
        logging.error(f"Failed to get GPU stats: {e}")
    # NOTE(review): the diff view truncates right after the except clause; the
    # method presumably returns the accumulated map — confirm against the repo.
    return gpu_stats_map
366356

python/fedml/computing/scheduler/comm_utils/gpu_utils/__init__.py

Whitespace-only changes.
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
from abc import ABC, abstractmethod
2+
from dataclasses import dataclass
3+
from enum import Enum, auto
4+
from typing import Optional, List, Dict
5+
6+
from docker import DockerClient
7+
8+
9+
class GPUCardType(Enum):
    """Vendor family of an accelerator card detected on the host."""

    NVIDIA = auto()
    QUALCOMM = auto()
    UNKNOWN = auto()

    def __str__(self):
        # Render as the bare member name (e.g. "NVIDIA") for logs and labels.
        return self.name
16+
17+
18+
@dataclass
class GPUCard:
    """Vendor-neutral snapshot of one accelerator card."""

    # -- identity ---------------------------------------------------------
    id: int
    name: str
    driver: str
    serial: str
    vendor: str  # a GPUCardType member name, e.g. "NVIDIA" or "QUALCOMM"
    # -- memory stats ------------------------------------------------------
    # units follow whatever the producing vendor util reports — TODO confirm
    memoryTotal: float
    memoryFree: float
    memoryUsed: float
    memoryUtil: float
    # -- optional fields some vendors cannot report -----------------------
    load: Optional[float] = 0.0
    uuid: Optional[str] = ""
    display_mode: Optional[str] = ""
    display_active: Optional[str] = ""
    temperature: Optional[float] = 0.0
34+
35+
36+
class GPUCardUtil(ABC):
    """Abstract interface each vendor-specific accelerator util implements."""

    @classmethod
    @abstractmethod
    def detect_gpu_card_type(cls) -> Optional[GPUCardType]:
        """Return this util's GPUCardType when its hardware is present, else None."""
        raise NotImplementedError

    @staticmethod
    @abstractmethod
    def get_available_gpu_card_ids(order: str, limit: int, max_load: float, max_memory: float) -> List[int]:
        """Return ids of up to *limit* cards satisfying the load/memory thresholds."""
        raise NotImplementedError

    @staticmethod
    @abstractmethod
    def get_gpu_cards() -> List[GPUCard]:
        """Return a snapshot of every card on the host as GPUCard records."""
        raise NotImplementedError

    @staticmethod
    @abstractmethod
    def get_docker_gpu_device_mapping(gpu_ids: List[int]) -> Optional[Dict]:
        """Return docker-run kwargs exposing *gpu_ids* to a container, or None."""
        raise NotImplementedError

    @staticmethod
    @abstractmethod
    def get_docker_gpu_ids_by_container_name(container_name: str, docker_client: DockerClient) -> List[int]:
        """Return the card ids attached to the named container ([] on failure)."""
        raise NotImplementedError
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import logging
2+
import subprocess
3+
from typing import List, Optional, Dict
4+
5+
import docker
6+
from docker import types, DockerClient
7+
from GPUtil import GPUtil, GPU
8+
9+
from fedml.computing.scheduler.comm_utils.gpu_utils.gpu_utils import GPUCard, GPUCardUtil, GPUCardType
10+
11+
12+
class NvidiaGPUtil(GPUCardUtil):
    """GPUCardUtil implementation backed by GPUtil / nvidia-smi."""

    @classmethod
    def detect_gpu_card_type(cls) -> Optional[GPUCardType]:
        """Return NVIDIA when `nvidia-smi` runs successfully, else None."""
        try:
            subprocess.check_output(["nvidia-smi"], universal_newlines=True)
            return GPUCardType.NVIDIA
        except Exception:
            # nvidia-smi missing or failing means no usable NVIDIA card.
            return None

    @staticmethod
    def get_gpu_cards() -> List[GPUCard]:
        """Snapshot all NVIDIA cards as vendor-neutral GPUCard records."""
        return [NvidiaGPUtil.__convert(gpu) for gpu in GPUtil.getGPUs()]

    @staticmethod
    def get_available_gpu_card_ids(order: str, limit: int, max_load: float, max_memory: float) -> List[int]:
        """Delegate availability selection to GPUtil.getAvailable."""
        return GPUtil.getAvailable(order=order, limit=limit, maxLoad=max_load, maxMemory=max_memory)

    @staticmethod
    def get_docker_gpu_device_mapping(gpu_ids: List[int]) -> Optional[Dict]:
        """Return docker-run kwargs exposing *gpu_ids*; None when no ids given.

        Fix: `if gpu_ids and len(gpu_ids)` collapsed to plain truthiness, and the
        lambda-based str map replaced with a comprehension.
        """
        if not gpu_ids:
            return None
        device_ids = [str(gpu_id) for gpu_id in gpu_ids]
        return {"device_requests": [docker.types.DeviceRequest(device_ids=device_ids, capabilities=[["gpu"]])]}

    @staticmethod
    def get_docker_gpu_ids_by_container_name(container_name: str, docker_client: DockerClient) -> List[int]:
        """Read GPU ids from the container's HostConfig; [] on any failure.

        Fix: removed the dead `pass` after `logging.error`.
        """
        try:
            device_ids = docker_client.api.inspect_container(container_name)["HostConfig"]["DeviceRequests"][0]["DeviceIDs"]
            return list(map(int, device_ids))
        except Exception as e:
            logging.error(f"Failed to get GPU IDs: {e}")
        return []

    @staticmethod
    def __convert(gpu: GPU) -> GPUCard:
        """Map a GPUtil.GPU object onto the shared GPUCard dataclass."""
        return GPUCard(
            id=gpu.id,
            name=gpu.name,
            driver=gpu.driver,
            serial=gpu.serial,
            vendor=GPUCardType.NVIDIA.name,
            memoryTotal=gpu.memoryTotal,
            memoryFree=gpu.memoryFree,
            memoryUsed=gpu.memoryUsed,
            memoryUtil=gpu.memoryUtil,
            load=gpu.load,
            uuid=gpu.uuid,
            display_mode=gpu.display_mode,
            display_active=gpu.display_active,
            temperature=gpu.temperature,
        )
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
import logging
2+
import math
3+
import re
4+
import subprocess
5+
import sys
6+
from typing import List, Optional, Dict
7+
8+
from docker import DockerClient
9+
10+
from fedml.computing.scheduler.comm_utils.gpu_utils.gpu_utils import GPUCard, GPUCardUtil, GPUCardType
11+
12+
13+
class QualcommNPUtil(GPUCardUtil):
    """GPUCardUtil implementation for Qualcomm Cloud AI accelerator cards."""

    # Device nodes are exposed as /dev/accel/accel<N>; <N> is the card id.
    NPU_CARD_PATH = "/dev/accel/accel"

    def __init__(self):
        # Make the qaicrt python bindings (shipped with the Qualcomm SDK) importable.
        sys.path.append("/opt/qti-aic/dev/lib/x86_64/")

    @classmethod
    def detect_gpu_card_type(cls) -> Optional[GPUCardType]:
        """Return QUALCOMM when the qaic-util tool runs successfully, else None."""
        try:
            subprocess.check_output(["/opt/qti-aic/tools/qaic-util"], universal_newlines=True)
            return GPUCardType.QUALCOMM
        except Exception:
            return None

    @staticmethod
    def get_gpu_cards() -> List[GPUCard]:
        """Snapshot all healthy Qualcomm cards as vendor-neutral GPUCard records."""
        from qaicrt import Util, QIDList, QDevInfo, QStatus

        cards = []
        util = Util()
        status, card_list = util.getDeviceIds()
        if status.value == 0:
            for card in card_list:
                status, card_info = util.getDeviceInfo(card)
                # devStatus == 1 appears to mean the card is up/usable; others
                # are skipped silently — TODO confirm against the qaicrt docs.
                if status.value == 0 and card_info.devStatus.value == 1:
                    cards.append(QualcommNPUtil.__convert(card_info))
        else:
            logging.error("Qualcomm Card Status not Healthy")
        return cards

    @staticmethod
    def get_available_gpu_card_ids(order: str, limit: int, max_load: float, max_memory: float) -> List[int]:
        """Return up to *limit* card ids sorted by ascending memory utilization.

        Only order == "memory" is supported; load-based selection is not
        available for Qualcomm cards yet. Fix: the slice
        `gpu_cards[0:min(limit, len(gpu_cards))]` simplified to `[:limit]`
        (Python slicing already clamps to the list length).
        """
        if order != "memory":
            raise NotImplementedError(f"Qualcomm utils doesn't have support to compute availability based on {order}. "
                                      f"Supported criteria: [memory]")

        gpu_cards: List[GPUCard] = QualcommNPUtil.get_gpu_cards()
        gpu_cards = [card for card in gpu_cards if card.memoryUtil < max_memory]
        # NaN utilization sorts last so unreadable cards are chosen only as a last resort.
        gpu_cards.sort(key=lambda card: float('inf') if math.isnan(card.memoryUtil) else card.memoryUtil)
        return [card.id for card in gpu_cards[:limit]]

    @staticmethod
    def get_docker_gpu_device_mapping(gpu_ids: List[int]) -> Optional[Dict]:
        """Return docker-run kwargs mapping the card device nodes; None when no ids.

        Fix: `if gpu_ids and len(gpu_ids)` collapsed to plain truthiness.
        """
        if not gpu_ids:
            return None
        return {
            "devices": [f"{QualcommNPUtil.NPU_CARD_PATH}{gpu_id}:{QualcommNPUtil.NPU_CARD_PATH}{gpu_id}"
                        for gpu_id in gpu_ids]}

    @staticmethod
    def get_docker_gpu_ids_by_container_name(container_name: str, docker_client: DockerClient) -> List[int]:
        """Parse card ids out of the container's HostConfig devices; [] on failure.

        Fix: removed the dead `pass` after `logging.error`.
        """
        gpu_ids = []
        try:
            docker_inspect_info = docker_client.api.inspect_container(container_name)
            gpu_ids = QualcommNPUtil.__parse_gpu_ids(docker_inspect_info.get("HostConfig", {}))
        except Exception as e:
            logging.error(f"Failed to get GPU IDs: {e}")
        return gpu_ids

    @staticmethod
    def __convert(npu) -> GPUCard:
        """Map a qaicrt device-info object onto the shared GPUCard dataclass."""
        # TODO (alaydshah): Add support for load, memoryUtil, temperature
        # dram figures look like KB converted to MB here — TODO confirm units.
        memory_total = npu.devData.resourceInfo.dramTotal / 1024
        memory_free = npu.devData.resourceInfo.dramFree / 1024
        memory_used = memory_total - memory_free
        # Fix: guard a zero-size DRAM report instead of raising ZeroDivisionError.
        memory_utilized = float(memory_used) / float(memory_total) if memory_total else 0.0

        return GPUCard(
            id=npu.qid,
            name=npu.pciInfo.devicename,
            driver=npu.devData.fwQCImageVersionString,
            serial=npu.devData.serial,
            vendor=GPUCardType.QUALCOMM.name,
            memoryTotal=memory_total,
            memoryFree=memory_free,
            memoryUsed=memory_used,
            memoryUtil=memory_utilized,
        )

    @staticmethod
    def __parse_gpu_ids(host_config: dict) -> List[int]:
        """Extract card ids from every device mapping in *host_config*."""
        gpu_ids = []
        for device in host_config.get('Devices', []):
            gpu_id = QualcommNPUtil.__extract_integer_from_host_path(device.get('PathOnHost', None))
            # Check explicitly if gpu_id is not None, as gpu_id can be 0, which is a valid value to include.
            if gpu_id is not None:
                gpu_ids.append(gpu_id)
        return gpu_ids

    @staticmethod
    def __extract_integer_from_host_path(host_path: str) -> Optional[int]:
        """Return the integer card id from a /dev/accel/accel<N> path, else None."""
        if not host_path:
            logging.error("Host Path is None; GPU Id extraction Failed")
            return None

        npu_card_path = QualcommNPUtil.NPU_CARD_PATH
        if host_path.startswith(npu_card_path):
            # Extract the numeric suffix that immediately follows the card path.
            suffix = host_path[len(npu_card_path):]
            match = re.match(r'^(\d+)', suffix)
            if match:
                return int(match.group(1))
            logging.error(f"Failed to extract GPU id from Host Path {host_path}")
        else:
            logging.error(f"Host Path {host_path} doesn't start with NPU Card Path {npu_card_path}")

        # Return None if extraction fails
        return None

0 commit comments

Comments
 (0)