Skip to content

Commit d057a30

Browse files
committed
In case of equal memory use, prefer GPUs that are reserved less often
This round-robins over reserved but unused GPUs when the config option NV_EXCLUSIVE_CONTAINER_GPU_RESERVATION is not set. Previously, many users could be assigned GPU 0 before any of them had started to use it.
1 parent 1fffb3c commit d057a30

1 file changed

Lines changed: 10 additions & 6 deletions

File tree

userdocker/helpers/nvidia.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,9 @@ def nvidia_get_gpus_used_by_containers(docker):
1919
return_status=False,
2020
loglvl=logging.DEBUG,
2121
).split()
22+
gpu_used_by_containers = defaultdict(list)
2223
if not running_containers:
23-
return {}
24+
return gpu_used_by_containers
2425
gpu_used_by_containers_str = exec_cmd(
2526
[
2627
docker, 'inspect', '--format',
@@ -32,7 +33,6 @@ def nvidia_get_gpus_used_by_containers(docker):
3233
)
3334
logger.debug('gpu_used_by_containers_str: %s', gpu_used_by_containers_str)
3435
gpu_dev_id_re = re.compile('^/dev/nvidia([0-9]+)$')
35-
gpu_used_by_containers = defaultdict(list)
3636
for line in gpu_used_by_containers_str.splitlines():
3737
container_name, container, env, devs = json.loads(line)
3838
for dev in devs:
@@ -74,11 +74,16 @@ def nvidia_get_available_gpus(docker, nvidia_smi=NVIDIA_SMI):
7474
mem_used = int(mem_used.split(' MiB')[0])
7575
gpu_mem_used[gpu] = mem_used
7676

77-
# get available gpus asc by mem used
77+
gpus_used_by_containers = nvidia_get_gpus_used_by_containers(docker)
78+
79+
# get available gpus asc by mem used and reservation counts
7880
mem_limit = NV_GPU_UNAVAILABLE_ABOVE_MEMORY_USED
81+
mem_res_gpu = [
82+
(m, len(gpus_used_by_containers.get(gpu, [])), gpu)
83+
for gpu, m in gpu_mem_used.items()
84+
]
7985
available_gpus = [
80-
g for g, m in sorted(gpu_mem_used.items(), key=itemgetter(1, 0))
81-
if mem_limit < 0 or m <= mem_limit
86+
g for m, r, g in sorted(mem_res_gpu) if mem_limit < 0 or m <= mem_limit
8287
]
8388
if NV_ALLOWED_GPUS != 'ALL':
8489
available_gpus = [g for g in available_gpus if g in NV_ALLOWED_GPUS]
@@ -88,5 +93,4 @@ def nvidia_get_available_gpus(docker, nvidia_smi=NVIDIA_SMI):
8893
if not NV_EXCLUSIVE_CONTAINER_GPU_RESERVATION:
8994
return available_gpus
9095

91-
gpus_used_by_containers = nvidia_get_gpus_used_by_containers(docker)
9296
return [gpu for gpu in available_gpus if gpu not in gpus_used_by_containers]

0 commit comments

Comments
 (0)