Manager Quality of Life statistics #313
@@ -35,6 +35,10 @@ def _submit_task(self, task_spec: Dict[str, Any]) -> Tuple[Hashable, Any]:
        task = self.client.submit(func, *task_spec["spec"]["args"], **task_spec["spec"]["kwargs"])
        return task_spec["id"], task

    def count_running_workers(self) -> int:
        # This is always "running", even if there are no tasks, since it's running locally
        return 1

    def acquire_complete(self) -> Dict[str, Any]:
        ret = {}
        del_keys = []
@@ -78,6 +82,10 @@ def _submit_task(self, task_spec: Dict[str, Any]) -> Tuple[Hashable, Any]:
            func, *task_spec["spec"]["args"], **task_spec["spec"]["kwargs"], resources={"process": 1})
        return task_spec["id"], task

    def count_running_workers(self) -> int:
        # Note: This number may not be quite right if it treats a "worker" as a "job"
        # rather than a Dask-Distributed "worker"
        return self.client._count_active_workers()

    def await_results(self) -> bool:
        from dask.distributed import wait
        wait(list(self.queue.values()))
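As an aside (editor's note, not part of the diff): `_count_active_workers()` is a private Dask-Distributed helper, so it may change upstream. A sketch of an equivalent count using only the public API, assuming the scheduler's worker list matches this adapter's notion of a worker, could look like:

    def count_running_workers(self) -> int:
        # Hypothetical alternative using public dask.distributed API:
        # Client.scheduler_info() returns a dict whose "workers" entry maps address -> worker metadata
        return len(self.client.scheduler_info().get("workers", {}))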
@@ -10,6 +10,7 @@
 import uuid
 from typing import Any, Callable, Dict, List, Optional, Union

 from pydantic import BaseModel, validator
 import qcengine as qcng
 from qcfractal.extras import get_information


@@ -19,6 +20,43 @@
 __all__ = ["QueueManager"]


class QueueStatistics(BaseModel):
    """
    Queue Manager Job statistics
    """
    # Dynamic quantities
    total_successful_tasks: int = 0
    total_failed_tasks: int = 0
    total_core_hours: float = 0
    total_core_hours_consumed: float = 0
    total_core_hours_possible: float = 0
    # Static Quantities
    max_concurrent_tasks: int = 0
    cores_per_task: int = 0
    last_update_time: float = None
    # sum_task(task_wall_time * nthread/task) / sum_time(number_running_worker * nthread/worker * interval)

    def __init__(self, **kwargs):
        if kwargs.get('last_update_time', None) is None:
            kwargs['last_update_time'] = time.time()
        super().__init__(**kwargs)

    @property
    def total_completed_tasks(self):
        return self.total_successful_tasks + self.total_failed_tasks

    @property
    def theoretical_max_consumption(self):
        """In Core Hours"""
        return self.max_concurrent_tasks * self.cores_per_task * (time.time() - self.last_update_time) / 3600

    @validator('cores_per_task', pre=True)
    def cores_per_tasks_none(cls, v):
        if v is None:
            v = 1
        return v
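As a quick orientation (editor's illustration, not part of the PR), the statistics model can be exercised on its own; the numbers below are made up:

# Hypothetical usage of the QueueStatistics model defined above
stats = QueueStatistics(max_concurrent_tasks=10, cores_per_task=4)
stats.total_successful_tasks += 8
stats.total_failed_tasks += 2
print(stats.total_completed_tasks)       # 10
# Upper bound on core hours since last_update_time (10 tasks * 4 cores * elapsed hours)
print(stats.theoretical_max_consumption)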
class QueueManager:
    """
    This object maintains a computational queue and watches for finished tasks for different

@@ -42,7 +80,7 @@ def __init__(self,
                 logger: Optional[logging.Logger] = None,
                 max_tasks: int = 200,
                 queue_tag: str = None,
-                manager_name: str = "unlabled",
+                manager_name: str = "unlabeled",
                 update_frequency: Union[int, float] = 2,
                 verbose: bool = True,
                 server_error_retries: Optional[int] = 1,
@@ -112,6 +150,10 @@ def __init__(self,
        self.max_tasks = max_tasks
        self.queue_tag = queue_tag
        self.verbose = verbose
        self.statistics = QueueStatistics(max_concurrent_tasks=self.max_tasks,
                                          cores_per_task=cores_per_task,
                                          update_frequency=update_frequency)

        self.scheduler = None
        self.update_frequency = update_frequency
@@ -280,7 +322,6 @@ def close_adapter(self) -> bool:

        return self.queue_adapter.close()

    ## Queue Manager functions

    def heartbeat(self) -> None:
@@ -428,9 +469,28 @@ def update(self, new_tasks: bool = True, allow_shutdown=True) -> bool:
        self._update_stale_jobs(allow_shutdown=allow_shutdown)

        results = self.queue_adapter.acquire_complete()

        # Stats fetching for running tasks, as close to the time we got the jobs as we can
        last_time = self.statistics.last_update_time
        now = self.statistics.last_update_time = time.time()
        time_delta_seconds = now - last_time
        try:
            running_workers = self.queue_adapter.count_running_workers()
            log_efficiency = True
        except NotImplementedError:
            running_workers = 0
            log_efficiency = False
        max_core_hours_running = time_delta_seconds * running_workers * self.statistics.cores_per_task / 3600
        max_core_hours_possible = (time_delta_seconds * self.statistics.max_concurrent_tasks
                                   * self.statistics.cores_per_task / 3600)
        self.statistics.total_core_hours_consumed += max_core_hours_running
        self.statistics.total_core_hours_possible += max_core_hours_possible

Review discussion on the max_core_hours_possible calculation:

Reviewer: I'm not sure.

Author: I'd like to leave this one in because it gives an upfront estimate of how well we are utilizing the resources you requested versus the resources you were allocated. If, say, someone fires this up and only gets half the nodes they requested, they can look into that. Or, for instance with Dask, users sometimes request a bunch of resources they know are available and could be allocated, but the adapter only spins up a fraction of them; this would be indicative of that. I could also mask this behind a verbosity level since it could get distracting, especially in the case of drained queues, or remove it outright, but I think there are valid reasons for tracking it.

Reviewer: Fair enough, let's leave it in for now. I think it might be too much standard info, but it is easier to remove or move to debug logging than to add back later.

        # Process jobs
        n_success = 0
        n_fail = 0
        n_result = len(results)
        task_cpu_hours = 0
        error_payload = []
        jobs_pushed = f"Pushed {n_result} complete tasks to the server "
        if n_result:
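To put the interval bookkeeping in concrete terms (editor's illustration with made-up numbers):

# One 60-second update interval with 10 running workers, 4 cores per task,
# and room for 16 concurrent tasks
time_delta_seconds, running_workers, cores_per_task, max_concurrent_tasks = 60, 10, 4, 16
max_core_hours_running = time_delta_seconds * running_workers * cores_per_task / 3600        # ~0.67
max_core_hours_possible = time_delta_seconds * max_concurrent_tasks * cores_per_task / 3600  # ~1.07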
@@ -448,11 +508,29 @@ def update(self, new_tasks: bool = True, allow_shutdown=True) -> bool:

        self.active -= n_result
        for key, result in results.items():
            wall_time_seconds = 0
            if result.success:
                n_success += 1
                if hasattr(result.provenance, 'wall_time'):
                    wall_time_seconds = float(result.provenance.wall_time)
            else:
                error_payload.append(f"Job {key} failed: {result.error.error_type} - "
                                     f"Msg: {result.error.error_message}")
                # Try to get the wall time in the most fault-tolerant way
                try:
                    wall_time_seconds = float(result.input_data.get('provenance', {}).get('wall_time', 0))
                except AttributeError:
                    # Trap the case where result.input_data is None, but let other attribute errors go
                    if result.input_data is None:
                        wall_time_seconds = 0
                    else:
                        raise
                except TypeError:
                    # Trap wall-time corruption, e.g. float(None)
                    # Other Result corruptions will raise an error correctly
                    wall_time_seconds = 0

            task_cpu_hours += wall_time_seconds * self.statistics.cores_per_task / 3600
        n_fail = n_result - n_success

        self.logger.info(jobs_pushed + f"({n_success} success / {n_fail} fail).")
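For readers skimming the fallback logic above, here is a standalone sketch of the same fault-tolerant wall-time extraction (editor's illustration; the helper name and the SimpleNamespace stand-in are hypothetical, not QCFractal API):

from types import SimpleNamespace

def extract_wall_time_seconds(result) -> float:
    # Mirrors the try/except fallback used for failed results above
    try:
        return float(result.input_data.get('provenance', {}).get('wall_time', 0))
    except AttributeError:
        # input_data may be None for badly failed tasks; let other attribute errors propagate
        if result.input_data is None:
            return 0.0
        raise
    except TypeError:
        # e.g. wall_time was None or otherwise corrupted
        return 0.0

# Made-up failed result carrying a provenance wall time
failed = SimpleNamespace(input_data={'provenance': {'wall_time': '12.5'}})
print(extract_wall_time_seconds(failed))  # 12.5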
@@ -469,6 +547,41 @@ def update(self, new_tasks: bool = True, allow_shutdown=True) -> bool:
        # Get new tasks
        payload = self._payload_template()
        payload["data"]["limit"] = open_slots

        # Crunch Statistics
        self.statistics.total_failed_tasks += n_fail
        self.statistics.total_successful_tasks += n_success
        self.statistics.total_core_hours += task_cpu_hours
        na_format = ''
        percent_format = '.2f'
        if self.statistics.total_completed_tasks == 0:
            success_rate = "(N/A yet)"
            success_format = na_format
        else:
            success_rate = self.statistics.total_successful_tasks / self.statistics.total_completed_tasks * 100
            success_format = percent_format
        stats_str = (f"Stats (Tasks): Processed={self.statistics.total_completed_tasks}, "
                     f"Failed={self.statistics.total_failed_tasks}, "
                     f"Success={success_rate:{success_format}}%, "
                     f"Core Hours (est.)={self.statistics.total_core_hours}")

        # Handle efficiency calculations
        if log_efficiency:
            # Efficiency calculated as:
            # sum_task(task_wall_time * nthread / task) / sum_time(number_running_worker * nthread / worker * interval)
            if self.statistics.total_core_hours_consumed == 0 or self.statistics.total_core_hours_possible == 0:
                efficiency_of_running = "(N/A yet)"
                efficiency_of_potential = "(N/A yet)"
                efficiency_format = na_format
            else:
                # Convert to percent here so the "(N/A yet)" placeholder is never multiplied by 100
                efficiency_of_running = self.statistics.total_core_hours / self.statistics.total_core_hours_consumed * 100
                efficiency_of_potential = self.statistics.total_core_hours / self.statistics.total_core_hours_possible * 100
                efficiency_format = percent_format
            stats_str += (f", Worker Core Efficiency (est.): {efficiency_of_running:{efficiency_format}}%, "
                          f"Max Resource Core Efficiency (est.): {efficiency_of_potential:{efficiency_format}}%")

        self.logger.info(stats_str)

        try:
            new_tasks = self.client._automodel_request("queue_manager", "get", payload)
        except IOError as exc:
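To make the two efficiency figures concrete (editor's illustration with made-up totals, following the formula in the comment above):

total_core_hours = 30.0             # sum of task wall time * cores per task
total_core_hours_consumed = 40.0    # core hours of workers actually connected over the intervals
total_core_hours_possible = 64.0    # core hours if the full requested allocation had been running
worker_core_efficiency = total_core_hours / total_core_hours_consumed * 100        # 75.00%
max_resource_core_efficiency = total_core_hours / total_core_hours_possible * 100  # 46.88%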
@@ -33,6 +33,7 @@ def __init__(self, client: Any, logger: Optional[logging.Logger] = None, **kwarg

        import parsl
        self.client = parsl.dataflow.dflow.DataFlowKernel(self.client)
        self._parsl_states = parsl.dataflow.states.States
        self.app_map = {}

    def __repr__(self):
@@ -77,6 +78,29 @@ def _submit_task(self, task_spec: Dict[str, Any]) -> Tuple[Hashable, Any]:
        task = func(*task_spec["spec"]["args"], **task_spec["spec"]["kwargs"])
        return task_spec["id"], task

    def count_running_workers(self) -> int:

        running = 0
        executor_running_task_map = {key: False for key in self.client.executors.keys()}

        for task in self.queue.values():
            task_record = self.client.tasks.get(task.tid, {})
            status = task_record.get('status', None)
            if status == self._parsl_states.running:
                executor_running_task_map[task_record['executor']] = True
            if all(executor_running_task_map.values()):
                # Efficiency loop break
                break

        for executor_key, executor in self.client.executors.items():
            if hasattr(executor, 'connected_workers'):
                # Should return an int
                running += executor.connected_workers
            elif hasattr(executor, 'max_threads') and executor_running_task_map[executor_key]:
                running += 1
            else:
                raise NotImplementedError("Cannot accurately estimate consumption from executors")

        return running

    def acquire_complete(self) -> Dict[str, Any]:
        ret = {}
        del_keys = []

Review discussion on the executor introspection in count_running_workers:

Reviewer: This sounds like something we could ask of Parsl.

Author: Possibly; it would really need to be a way to loop over every executor in the DataFlow client and count the workers in a way which makes sense to the executor. Since we only really allow ThreadPool and HighThroughput executors, I just dealt with those for now. I could probably make this faster and simpler by simply assuming "Is there a ThreadPoolWorker present? If so, it might as well be working since it's effectively running locally." Thoughts?

Reviewer: Right, this is something the Parsl team can easily integrate and test without us being in the loop. I think it's fine for now, but this level of introspection is often easily breakable.
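The simplification floated in the thread above, sketched by the editor (hypothetical, not code from the PR): treat any thread-pool style executor as a single always-running local worker and skip the per-task status scan entirely:

    def count_running_workers(self) -> int:
        # Hypothetical simplification: HighThroughput-style executors expose connected_workers;
        # anything thread-pool-like (has max_threads) counts as one always-running local worker
        running = 0
        for executor in self.client.executors.values():
            if hasattr(executor, 'connected_workers'):
                running += executor.connected_workers
            elif hasattr(executor, 'max_threads'):
                running += 1
            else:
                raise NotImplementedError("Cannot accurately estimate consumption from executors")
        return running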
A final review comment on this file:

Reviewer: We should be able to call this function.