-
Notifications
You must be signed in to change notification settings - Fork 4
/
my_task.py
88 lines (72 loc) · 3.73 KB
/
my_task.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# This file contains the main code for your task.
# You can delete these comments, and any empty optional methods, before submitting.
from metr_task_standard.types import VMSpec
from typing import TypedDict
# If you try to import third-party libraries here, the task will crash because `install` hasn't run yet.
# You can import libraries in `start` and `score`, which run after `install`.
# The Task class contains data specific to each task in the family.
class Task(TypedDict):
    """Per-task data for one entry in the mapping returned by `TaskFamily.get_tasks`."""

    # Text of the math problem; interpolated into the agent's instructions.
    problem: str
    # Expected integer result; compared against the parsed submission when scoring.
    answer: int
class TaskFamily:
    # Which version of the Task Standard this TaskFamily adheres to.
    standard_version = "0.2.2"

    # Optional: a list of environment variable names that must be set in TaskFamily#start
    # and TaskFamily#score (where they can be accessed with e.g. `os.environ.get("MY_ENV_VAR")`).
    required_environment_variables = []

    # `install` is called while building a task-family-specific Docker image.
    # See the Dockerfile for more information about how `install` is called.
    # `install` may install packages, clone Git repos, copy assets into `/home/agent`,
    # or make other such changes. It runs at build time and can access the internet,
    # even if the task will not allow internet access at run time.
    # This family needs no extra setup, so it is a no-op.
    @staticmethod
    def install() -> None:
        pass

    # `get_tasks` returns a mapping from task names to the data specific to each task.
    # It is first called after `install` but before `start`, and should return the
    # same value every time it is called.
    @staticmethod
    def get_tasks() -> dict[str, Task]:
        return {
            "addition": {
                "problem": "1234567 + 2345678",
                # Computed from the same operands as `problem` so the two cannot drift apart.
                "answer": 1234567 + 2345678,
            },
        }

    # `get_instructions` returns a string containing initial task instructions for the agent.
    @staticmethod
    def get_instructions(t: Task) -> str:
        problem = t["problem"]
        return f"""Solve the following math problem: {problem}"""

    # `get_permissions` is called after `install` but before `start`.
    # It returns either the empty list or a list containing only the string "full_internet".
    # Refer to the Task Standard for more information.
    @staticmethod
    def get_permissions(t: Task) -> list[str]:
        return ["full_internet"]

    # Optional: `get_aux_vm_spec` specifies an auxiliary virtual machine that will be set up
    # along with the task. This is useful if you want to let the agent use a GPU.
    # Refer to the Task Standard for more information.
    # This family requests no auxiliary VM.
    @staticmethod
    def get_aux_vm_spec(t: Task) -> VMSpec | None:
        return None

    # `start` is called after creating a Docker container for a run, but before the agent
    # process starts. `start` may copy task-specific assets into /home/agent, start
    # long-running processes like web servers, and set up external services like
    # additional cloud servers. Nothing is needed for this family.
    @staticmethod
    def start(t: Task) -> None:
        pass

    # `score` is called after the agent submits a solution. It should return a float
    # between 0 and 1, or `None` to indicate that manual scoring is required.
    #
    # Here the submission is parsed with `int()` and compared to `t["answer"]`:
    #   - returns 1.0 when the parsed integer equals the expected answer;
    #   - returns 0.0 when it differs, or when the submission cannot be parsed as an
    #     integer (a message is printed in that case).
    # This implementation never returns `None`.
    @staticmethod
    def score(t: Task, submission: str) -> float | None:
        try:
            answer_int = int(submission)
        except (TypeError, ValueError):
            # ValueError: text that is not an integer literal.
            # TypeError: a non-string, non-numeric submission such as None; treat it as
            # an invalid answer instead of letting the scorer crash.
            print("Answer must be an integer")
            return 0.0
        # Return real floats so the value matches the annotated return type.
        return 1.0 if answer_int == t["answer"] else 0.0

    # Optional: `teardown` cleans up any external resources created during setup.
    # This family creates none, so it is a no-op.
    @staticmethod
    def teardown(t: Task) -> None:
        pass