Skip to content

Commit 4ceba31

Browse files
authored
Merge pull request #2179 from FedML-AI/alaydshah/qualcomm/workaround/device_path_inconsistency
Workaround device mapping inconsistency
2 parents: 87e11f7 + 89219fb; commit: 4ceba31

4 files changed

Lines changed: 29 additions & 12 deletions

File tree

python/fedml/computing/scheduler/comm_utils/gpu_utils/gpu_utils.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@ class GPUCard:
2727
memoryUsed: float
2828
memoryUtil: float
2929
load: Optional[float] = 0.0
30+
device_path: Optional[str] = ""
3031
uuid: Optional[str] = ""
3132
display_mode: Optional[str] = ""
3233
display_active: Optional[str] = ""

python/fedml/computing/scheduler/comm_utils/gpu_utils/qualcomm_utils.py

Lines changed: 25 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -26,19 +26,22 @@ def detect_gpu_card_type(cls) -> Optional[GPUCardType]:
2626

2727
@staticmethod
2828
def get_gpu_cards() -> List[GPUCard]:
29-
from qaicrt import Util, QIDList, QDevInfo, QStatus
29+
return list(QualcommNPUtil.__get_gpu_cards().values())
3030

31-
cards = []
31+
@staticmethod
32+
def __get_gpu_cards() -> Dict[int, GPUCard]:
33+
from qaicrt import Util, QIDList, QDevInfo, QStatus
34+
cards = dict()
3235
util = Util()
3336
status, card_list = util.getDeviceIds()
3437
if status.value == 0:
3538
for card in card_list:
3639
status, card_info = util.getDeviceInfo(card)
3740
if status.value == 0 and card_info.devStatus.value == 1:
38-
cards.append(QualcommNPUtil.__convert(card_info))
39-
41+
gpu_card = QualcommNPUtil.__convert(card_info)
42+
cards[gpu_card.id] = gpu_card
4043
else:
41-
logging.error("Qualcomm Card Status not Healthy")
44+
logging.error("Qualcomm Cards Status not Healthy")
4245
return cards
4346

4447
@staticmethod
@@ -58,11 +61,21 @@ def get_available_gpu_card_ids(order: str, limit: int, max_load: float, max_memo
5861

5962
@staticmethod
6063
def get_docker_gpu_device_mapping(gpu_ids: Optional[List[int]], num_gpus: int = 0) -> Optional[Dict]:
61-
if gpu_ids is not None and len(gpu_ids):
62-
return {
63-
"devices": [f"{QualcommNPUtil.NPU_CARD_PATH}{gpu_id}:{QualcommNPUtil.NPU_CARD_PATH}{gpu_id}" for gpu_id
64-
in gpu_ids]}
65-
return None
64+
if gpu_ids is None or not len(gpu_ids):
65+
return None
66+
67+
devices = []
68+
gpu_cards = QualcommNPUtil.__get_gpu_cards()
69+
70+
for gpu_id in gpu_ids:
71+
if not (gpu_id in gpu_cards and gpu_cards[gpu_id].device_path):
72+
logging.error("Failed to get gpu device mapping for docker")
73+
break
74+
else:
75+
device_path = gpu_cards[gpu_id].device_path
76+
devices.append(f"{device_path}:{device_path}")
77+
78+
return {"devices": devices} if len(devices) == len(gpu_ids) else None
6679

6780
@staticmethod
6881
def get_docker_gpu_ids_by_container_name(container_name: str, docker_client: DockerClient) -> List[int]:
@@ -87,7 +100,8 @@ def __convert(npu) -> GPUCard:
87100
load = (nsp_total - nsp_free) / nsp_total
88101

89102
return GPUCard(
90-
id=npu.qid,
103+
id=npu.mhiId,
104+
device_path=npu.name,
91105
name=npu.pciInfo.devicename,
92106
driver=npu.devData.fwQCImageVersionString,
93107
serial=npu.devData.serial,

python/fedml/computing/scheduler/comm_utils/hardware_utils.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -60,5 +60,7 @@ def get_docker_gpu_ids_by_container_name(container_name: str, docker_client: Doc
6060
if __name__ == "__main__":
6161
gpus = HardwareUtil.get_gpus()
6262
get_available_gpu_cards = HardwareUtil.get_available_gpu_ids(limit=len(gpus))
63+
device_mapping = HardwareUtil.get_docker_gpu_device_mapping(get_available_gpu_cards, len(get_available_gpu_cards))
6364
print(gpus)
6465
print(get_available_gpu_cards)
66+
print(device_mapping)

python/fedml/computing/scheduler/scheduler_core/account_manager.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -266,7 +266,7 @@ def get_uuid():
266266
if not use_machine_id:
267267
device_id = hex(uuid.getnode())
268268
else:
269-
device_id = device_id = FedMLAccountManager.get_gpu_machine_id()
269+
device_id = FedMLAccountManager.get_gpu_machine_id()
270270
else:
271271
device_id = sys_utils.run_subprocess_open(
272272
"hal-get-property --udi /org/freedesktop/Hal/devices/computer --key system.hardware.uuid".split()

0 commit comments

Comments
 (0)