Skip to content

Commit cc7b9c3

Browse files
authored
Merge pull request #2120 from FedML-AI/alaydshah/qualcomm/add/gpu_utilization
Qualcomm: Add support for GPU Utilization
2 parents 9a16aa4 + a0664d8 commit cc7b9c3

1 file changed

Lines changed: 13 additions & 7 deletions

File tree

python/fedml/computing/scheduler/comm_utils/gpu_utils/qualcomm_utils.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -43,14 +43,16 @@ def get_gpu_cards() -> List[GPUCard]:
4343

4444
@staticmethod
def get_available_gpu_card_ids(order: str, limit: int, max_load: float, max_memory: float) -> List[int]:
    """Return the ids of NPU cards whose utilization is below the given thresholds.

    Args:
        order: Sort criterion for the returned cards; supported values are
            "memory" (ascending memory utilization) and "load" (ascending
            compute load).
        limit: Maximum number of card ids to return.
        max_load: Exclusive upper bound on a card's compute load for it to
            count as available.
        max_memory: Exclusive upper bound on a card's memory utilization for
            it to count as available.

    Returns:
        Up to ``limit`` card ids, sorted ascending by the chosen criterion.

    Raises:
        NotImplementedError: If ``order`` is neither "memory" nor "load".
    """
    gpu_cards: List[GPUCard] = QualcommNPUtil.get_gpu_cards()
    # Keep only cards under both thresholds. NaN utilization values compare
    # False against any threshold and are therefore excluded here.
    gpu_cards = list(filter(lambda card: (card.memoryUtil < max_memory and card.load < max_load), gpu_cards))
    if order == 'memory':
        # Ascending sort; push NaN readings to the end via +inf sentinel.
        gpu_cards.sort(key=lambda card: float('inf') if math.isnan(card.memoryUtil) else card.memoryUtil, reverse=False)
    elif order == 'load':
        # BUG FIX: the NaN guard must inspect card.load — the committed code
        # checked math.isnan(card.memoryUtil) while sorting by card.load,
        # so a NaN load would propagate into the sort key.
        gpu_cards.sort(key=lambda card: float('inf') if math.isnan(card.load) else card.load, reverse=False)
    else:
        raise NotImplementedError(f"Qualcomm utils doesn't have support to compute availability based on {order}. "
                                  f"Supported criteria: [memory, load]")

    gpu_cards = gpu_cards[0:min(limit, len(gpu_cards))]
    return list(map(lambda card: card.id, gpu_cards))
5658

@@ -75,11 +77,14 @@ def get_docker_gpu_ids_by_container_name(container_name: str, docker_client: Doc
7577

7678
@staticmethod
7779
def __convert(npu) -> GPUCard:
78-
# TODO (alaydshah): Add support for load, memoryUtil, temperature
80+
# TODO (alaydshah): Add support for temperature
7981
memory_total = npu.devData.resourceInfo.dramTotal / 1024
8082
memory_free = npu.devData.resourceInfo.dramFree / 1024
8183
memory_used = memory_total - memory_free
8284
memory_utilized = float(memory_used) / float(memory_total)
85+
nsp_free = npu.devData.resourceInfo.nspFree
86+
nsp_total = npu.devData.resourceInfo.nspTotal
87+
load = (nsp_total - nsp_free) / nsp_total
8388

8489
return GPUCard(
8590
id=npu.qid,
@@ -91,6 +96,7 @@ def __convert(npu) -> GPUCard:
9196
memoryFree=memory_free,
9297
memoryUsed=memory_used,
9398
memoryUtil=memory_utilized,
99+
load=load,
94100
)
95101

96102
@staticmethod

0 commit comments

Comments (0)