Merge pull request #219 from henryleberre/master
sbryngelson authored Oct 29, 2023
2 parents c312cd7 + e2abac2 commit 92fac09
Showing 5 changed files with 30 additions and 22 deletions.
toolchain/mfc/args.py (2 changes: 1 addition & 1 deletion)
@@ -66,7 +66,7 @@ def add_common_arguments(p, mask = None):
         p.add_argument(f"--no-{target.name}", action="store_true", help=f"Do not build the {target.name} dependency. Use the system's instead.")
 
     if "g" not in mask:
-        p.add_argument("-g", "--gpus", nargs="+", type=int, default=[0], help="(GPU) List of GPU #s to use.")
+        p.add_argument("-g", "--gpus", nargs="+", type=int, default=None, help="(Optional GPU override) List of GPU #s to use (environment default if unspecified).")
 
 # === BUILD ===
 add_common_arguments(build, "g")
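The behavioral core of this file's change is the new `default=None`. A minimal, self-contained sketch of why that matters (plain `argparse`, not the MFC toolchain's wrapper) is below: a `None` default lets downstream code tell "flag omitted" apart from an explicit GPU list.

```python
# Standalone sketch, not toolchain code: mirror the flag definition from the
# diff and show the two cases the None default distinguishes.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-g", "--gpus", nargs="+", type=int, default=None,
                    help="(Optional GPU override) List of GPU #s to use.")

print(parser.parse_args([]).gpus)                # None -> defer to the environment
print(parser.parse_args(["-g", "0", "1"]).gpus)  # [0, 1] -> explicit override
```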
toolchain/mfc/run/engines.py (16 changes: 9 additions & 7 deletions)
@@ -147,14 +147,16 @@ def run(self, targets: typing.List[MFCTarget]) -> None:
 
         if not ARG("dry_run"):
             start_time = time.monotonic()
+
+            env = os.environ.copy()
+            if ARG('gpus') is not None:
+                env['CUDA_VISIBLE_DEVICES'] = ','.join([str(_) for _ in ARG('gpus')])
+
             system(
                 self.get_exec_cmd(target), cwd=self.input.case_dirpath,
-                env={
-                    **os.environ.copy(),
-                    'CUDA_VISIBLE_DEVICES': ','.join([str(_) for _ in ARG('gpus')])
-                }
+                env=env
             )
-            end_time = time.monotonic()
+            end_time = time.monotonic()
             cons.print(no_indent=True)
 
             cons.print(f"[bold green]Done[/bold green] (in {datetime.timedelta(seconds=end_time - start_time)})")
@@ -307,11 +309,11 @@ def __create_batch_file(self, system: queues.QueueSystem, targets: typing.List[M
         cons.print("> Writing batch file...")
         file_write(filepath, content)
 
-    def __execute_batch_file(self, system: queues.QueueSystem):
+    def __execute_batch_file(self, queue: queues.QueueSystem):
         # We CD to the case directory before executing the batch file so that
         # any files the queue system generates (like .err and .out) are created
         # in the correct directory.
-        cmd = system.gen_submit_cmd(self.__get_batch_filename())
+        cmd = queue.gen_submit_cmd(self.__get_batch_filename())
 
         if system(cmd, cwd=self.__get_batch_dirpath()) != 0:
             raise MFCException(f"Submitting batch file for {system.name} failed. It can be found here: {self.__get_batch_filepath()}. Please check the file for errors.")
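The interactive engine now only overrides GPU visibility when `--gpus` was actually passed. A small sketch of the same pattern follows, using a hypothetical `run_case()` helper (the helper name and argument list are illustrative, not MFC's API):

```python
import os
import subprocess
import typing

def run_case(cmd: typing.List[str],
             gpus: typing.Optional[typing.List[int]] = None) -> int:
    # Copy the parent environment; only pin CUDA_VISIBLE_DEVICES when the
    # caller explicitly asked for specific GPUs, otherwise inherit whatever
    # the shell or job scheduler already exported.
    env = os.environ.copy()
    if gpus is not None:
        env["CUDA_VISIBLE_DEVICES"] = ",".join(str(g) for g in gpus)
    return subprocess.run(cmd, env=env, check=False).returncode

# run_case(["./mfc.sh", "run", "case.py"])               # environment default
# run_case(["./mfc.sh", "run", "case.py"], gpus=[0, 1])  # explicit override
```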
toolchain/mfc/sched.py (21 changes: 12 additions & 9 deletions)
@@ -35,11 +35,11 @@ class Task:
     load: float
 
 
-def sched(tasks: typing.List[Task], nThreads: int, devices: typing.Set[int]) -> None:
+def sched(tasks: typing.List[Task], nThreads: int, devices: typing.Set[int] = None) -> None:
     nAvailable: int = nThreads
     threads: typing.List[WorkerThreadHolder] = []
 
-    sched.LOAD = { id: 0.0 for id in devices }
+    sched.LOAD = { id: 0.0 for id in devices or [] }
 
     def join_first_dead_thread(progress, complete_tracker) -> None:
         nonlocal threads, nAvailable
@@ -50,7 +50,7 @@ def join_first_dead_thread(progress, complete_tracker) -> None:
                 raise threadHolder.thread.exc
 
             nAvailable += threadHolder.ppn
-            for device in threadHolder.devices:
+            for device in threadHolder.devices or set():
                 sched.LOAD[device] -= threadHolder.load / threadHolder.ppn
 
             progress.advance(complete_tracker)
@@ -82,18 +82,21 @@ def join_first_dead_thread(progress, complete_tracker) -> None:
             # Launch Thread
             progress.advance(queue_tracker)
 
+            use_devices = None
             # Use the least loaded devices
-            devices = set()
-            for _ in range(task.ppn):
-                device = min(sched.LOAD.items(), key=lambda x: x[1])[0]
-                sched.LOAD[device] += task.load / task.ppn
+            if devices is not None:
+                use_devices = set()
+                for _ in range(task.ppn):
+                    device = min(sched.LOAD.items(), key=lambda x: x[1])[0]
+                    sched.LOAD[device] += task.load / task.ppn
+                    use_devices.add(device)
 
             nAvailable -= task.ppn
 
-            thread = WorkerThread(target=task.func, args=tuple(task.args) + (devices,))
+            thread = WorkerThread(target=task.func, args=tuple(task.args) + (use_devices,))
             thread.start()
 
-            threads.append(WorkerThreadHolder(thread, task.ppn, task.load, devices))
+            threads.append(WorkerThreadHolder(thread, task.ppn, task.load, use_devices))
 
 
     # Wait for the lasts tests to complete
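The scheduler change above threads an optional device set through the worker bookkeeping. Below is a self-contained sketch of the least-loaded assignment it implements; the function and variable names here are illustrative, not `sched.py`'s.

```python
import typing

LOAD: typing.Dict[int, float] = {}

def init_load(devices: typing.Optional[typing.Set[int]]) -> None:
    # With no device set the load table is simply empty; tasks then get None.
    global LOAD
    LOAD = {d: 0.0 for d in devices or []}

def acquire(ppn: int, load: float,
            devices: typing.Optional[typing.Set[int]]) -> typing.Optional[typing.Set[int]]:
    # Pick the ppn currently least-loaded devices and charge each pick a
    # share of the task's load; return None when no devices were configured.
    if devices is None:
        return None
    picked: typing.Set[int] = set()
    for _ in range(ppn):
        least = min(LOAD.items(), key=lambda kv: kv[1])[0]
        LOAD[least] += load / ppn
        picked.add(least)
    return picked

def release(ppn: int, load: float, used: typing.Optional[typing.Set[int]]) -> None:
    # Mirror of acquire(): refund the load share when a worker finishes.
    for device in used or set():
        LOAD[device] -= load / ppn
```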
toolchain/mfc/test/case.py (11 changes: 8 additions & 3 deletions)
@@ -100,7 +100,11 @@ def __init__(self, trace: str, mods: dict, ppn: int = None) -> None:
         super().__init__({**BASE_CFG.copy(), **mods})
 
     def run(self, targets: typing.List[str], gpus: typing.Set[int]) -> subprocess.CompletedProcess:
-        gpu_select = f"CUDA_VISIBLE_DEVICES={','.join([str(_) for _ in gpus])}"
+        if gpus is not None and len(gpus) != 0:
+            gpus_select = f"--gpus {' '.join([str(_) for _ in gpus])}"
+        else:
+            gpus_select = ""
+
         filepath = f'"{self.get_dirpath()}/case.py"'
         tasks = f"-n {self.ppn}"
         jobs = f"-j {ARG('jobs')}" if ARG("case_optimization") else ""
@@ -110,8 +114,9 @@ def run(self, targets: typing.List[str], gpus: typing.Set[int]) -> subprocess.CompletedProcess:
         mfc_script = ".\mfc.bat" if os.name == 'nt' else "./mfc.sh"
 
         command: str = f'''\
-{gpu_select} {mfc_script} run {filepath} {tasks} {binary_option} \
-{case_optimization} {jobs} -t {' '.join(targets)} 2>&1\
+{mfc_script} run {filepath} {tasks} {binary_option} \
+{case_optimization} {jobs} -t {' '.join(targets)} \
+{gpus_select} 2>&1\
 '''
 
         return subprocess.run(command, stdout=subprocess.PIPE,
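The test harness now forwards GPU selection through the `--gpus` flag instead of prefixing the command with `CUDA_VISIBLE_DEVICES`. A rough sketch of the command assembly under that assumption (the `build_command` helper below is illustrative, not the real `TestCase.run`):

```python
import typing

def build_command(mfc_script: str, filepath: str, ppn: int,
                  targets: typing.List[str],
                  gpus: typing.Optional[typing.Set[int]] = None) -> str:
    # Assemble the shell command piece by piece; --gpus is appended only
    # when a non-empty set was requested.
    parts = [mfc_script, "run", filepath, f"-n {ppn}", "-t", " ".join(targets)]
    if gpus:  # None or an empty set means: let the environment decide
        parts.append("--gpus " + " ".join(str(g) for g in sorted(gpus)))
    parts.append("2>&1")
    return " ".join(parts)

# build_command("./mfc.sh", '"tests/UUID/case.py"', 2, ["pre_process"], {0, 1})
# -> './mfc.sh run "tests/UUID/case.py" -n 2 -t pre_process --gpus 0 1 2>&1'
```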
toolchain/mfc/test/test.py (2 changes: 0 additions & 2 deletions)
@@ -105,8 +105,6 @@ def test():
     cons.print(f" tests/[bold magenta]UUID[/bold magenta] Summary")
     cons.print()
 
-    _handle_case.GPU_LOAD = { id: 0 for id in ARG("gpus") }
-
     # Select the correct number of threads to use to launch test CASES
     # We can't use ARG("jobs") when the --case-optimization option is set
     # because running a test case may cause it to rebuild, and thus
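For context on this removal: once `--gpus` defaults to `None`, iterating `ARG("gpus")` unconditionally would fail, so the eager `GPU_LOAD` initialization had to go. A short reproduction of the failure mode (plain Python, no MFC imports):

```python
gpus = None  # what ARG("gpus") now returns when --gpus is not passed
try:
    gpu_load = {gpu_id: 0 for gpu_id in gpus}
except TypeError as exc:
    print(exc)  # 'NoneType' object is not iterable
```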
