From b3e97dfd2d19bedb2769e7cdf54fbefd1b4a2f5b Mon Sep 17 00:00:00 2001 From: Ben Denham Date: Mon, 6 May 2024 21:16:57 +1200 Subject: [PATCH 1/3] Prevent .gitignore from being detected as a key --- labtech/storage.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/labtech/storage.py b/labtech/storage.py index 6d50f7c..ba7ac7e 100644 --- a/labtech/storage.py +++ b/labtech/storage.py @@ -59,7 +59,10 @@ def _key_path(self, key: str) -> Path: return key_path def find_keys(self) -> Sequence[str]: - return sorted([key_path.name for key_path in self._storage_path.iterdir()]) + return sorted([ + key_path.name for key_path in self._storage_path.iterdir() + if key_path.is_dir() + ]) def exists(self, key: str) -> bool: key_path = self._key_path(key) From d367c2f934d93d41bd13b8a5a1d5a4800ce2dbe5 Mon Sep 17 00:00:00 2001 From: Ben Denham Date: Sat, 18 May 2024 19:49:42 +1200 Subject: [PATCH 2/3] Automatically detect when running in a notebook --- README.md | 3 +- docs/cookbook.md | 62 +++------------- docs/tutorial.md | 17 +---- examples/basic.ipynb | 1 - examples/cookbook.ipynb | 160 +++++++++++++++------------------------- examples/tutorial.ipynb | 103 ++++++++++++-------------- labtech/diagram.py | 13 +--- labtech/lab.py | 11 +-- labtech/utils.py | 6 ++ 9 files changed, 137 insertions(+), 239 deletions(-) diff --git a/README.md b/README.md index 3b8df44..6fc1730 100644 --- a/README.md +++ b/README.md @@ -91,8 +91,7 @@ if __name__ == '__main__': ![Animated GIF of labtech demo on the command-line](https://ben-denham.github.io/labtech/images/labtech-demo.gif) Labtech can also produce graphical progress bars in -[Jupyter](https://jupyter.org/) when the `Lab` is initialized with -`notebook=True`: +[Jupyter](https://jupyter.org/) notebooks: ![Animated GIF of labtech demo in Jupyter](https://ben-denham.github.io/labtech/images/labtech-demo-jupyter.gif) diff --git a/docs/cookbook.md b/docs/cookbook.md index beba1b7..ae45f74 100644 --- a/docs/cookbook.md +++ 
b/docs/cookbook.md @@ -50,10 +50,7 @@ experiments = [ ) for seed in range(5) ] -lab = labtech.Lab( - storage=None, - notebook=True, -) +lab = labtech.Lab(storage=None) results = lab.run_tasks(experiments) ``` @@ -120,10 +117,7 @@ class ClassifierExperiment: experiment = ClassifierExperiment( classifier_task=LRClassifierTask(random_state=42), ) -lab = labtech.Lab( - storage=None, - notebook=True, -) +lab = labtech.Lab(storage=None) results = lab.run_tasks([experiment]) ``` @@ -182,10 +176,7 @@ experiments = [ ClassifierExperiment(classifier_task=classifier_task) for classifier_task in classifier_tasks ] -lab = labtech.Lab( - storage=None, - notebook=True, -) +lab = labtech.Lab(storage=None) results = lab.run_tasks(experiments) ``` @@ -242,10 +233,7 @@ experiments = [ ) for dataset in DatasetOption ] -lab = labtech.Lab( - storage=None, - notebook=True, -) +lab = labtech.Lab(storage=None) results = lab.run_tasks(experiments) ``` @@ -291,7 +279,6 @@ experiments = [ ] lab = labtech.Lab( storage=None, - notebook=True, context={ 'DATASETS': DATASETS, }, @@ -340,7 +327,6 @@ experiments = [ ] lab = labtech.Lab( storage=None, - notebook=True, context={ 'within_task_workers': 4, }, @@ -521,10 +507,7 @@ class TabularTask: }) -lab = labtech.Lab( - storage='storage/parquet_example', - notebook=True, -) +lab = labtech.Lab(storage='storage/parquet_example') lab.run_tasks([TabularTask()]) ``` @@ -602,10 +585,7 @@ experiments = [ ) for seed in range(100) ] -lab = labtech.Lab( - storage=LocalFsspecStorage('storage/fsspec_example'), - notebook=True, -) +lab = labtech.Lab(storage=LocalFsspecStorage('storage/fsspec_example')) results = lab.run_tasks(experiments) ``` @@ -676,10 +656,7 @@ experiments = [ aggregation_task = AggregationTask( sub_tasks=experiments, ) -lab = labtech.Lab( - storage='storage/aggregation_lab', - notebook=True, -) +lab = labtech.Lab(storage='storage/aggregation_lab') result = lab.run_task(aggregation_task) ``` @@ -718,10 +695,7 @@ experiments = [ aggregation_task 
= AggregationTask( sub_tasks=experiments, ) -lab = labtech.Lab( - storage=None, - notebook=True, -) +lab = labtech.Lab(storage=None) results = lab.run_tasks([ aggregation_task, # Include intermediate tasks to access their results @@ -751,10 +725,7 @@ experiments = [ aggregation_task = AggregationTask( sub_tasks=experiments, ) -lab = labtech.Lab( - storage=None, - notebook=True, -) +lab = labtech.Lab(storage=None) result = lab.run_task( aggregation_task, keep_nested_results=True, @@ -817,10 +788,7 @@ task_c = StepC( task_b=task_b, ) -lab = labtech.Lab( - storage=None, - notebook=True, -) +lab = labtech.Lab(storage=None) result = lab.run_task(task_c) print(result) ``` @@ -897,10 +865,7 @@ runs = [ ] mlflow.set_experiment('example_labtech_experiment') -lab = labtech.Lab( - storage=None, - notebook=True, -) +lab = labtech.Lab(storage=None) results = lab.run_tasks(runs) ``` @@ -956,10 +921,7 @@ def main(): ) for seed in range(1000) ] - lab = labtech.Lab( - storage='storage/guarded_lab', - notebook=True, - ) + lab = labtech.Lab(storage='storage/guarded_lab') result = lab.run_tasks(experiments) print(result) diff --git a/docs/tutorial.md b/docs/tutorial.md index 293aa6f..30613c9 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -100,10 +100,7 @@ called `storage/tutorial/classification_lab_1` and to display notebook-friendly progress bars: ``` {.python .code} -lab = labtech.Lab( - storage='storage/tutorial/classification_lab_1', - notebook=True, -) +lab = labtech.Lab(storage='storage/tutorial/classification_lab_1') ``` Finally, we create a task instance of `ClassifierExperiment` and call @@ -211,10 +208,7 @@ we'll keep caches for the new definition separate by constructing a new lab that uses a different storage directory: ``` {.python .code} -lab = labtech.Lab( - storage='storage/tutorial/classification_lab_2', - notebook=True, -) +lab = labtech.Lab(storage='storage/tutorial/classification_lab_2') results = lab.run_tasks(classifier_experiments) ``` @@ -392,10 +386,7 
@@ classifier_experiments = [ ] ] -lab = labtech.Lab( - storage='storage/tutorial/classification_lab_3', - notebook=True, -) +lab = labtech.Lab(storage='storage/tutorial/classification_lab_3') results = lab.run_tasks(classifier_experiments) for experiment, prob_y in results.items(): @@ -460,7 +451,6 @@ classifier_experiments = [ lab = labtech.Lab( storage='storage/tutorial/classification_lab_4', - notebook=True, context={ 'DATASETS': DATASETS, }, @@ -668,7 +658,6 @@ import mlflow mlflow.set_experiment('example_labtech_experiment') lab = labtech.Lab( storage='storage/tutorial/classification_lab_final', - notebook=True, context={ 'DATASETS': DATASETS, }, diff --git a/examples/basic.ipynb b/examples/basic.ipynb index e66b6fe..fb4af6d 100644 --- a/examples/basic.ipynb +++ b/examples/basic.ipynb @@ -128,7 +128,6 @@ "\n", "lab = labtech.Lab(\n", " storage='storage/ipy_basic_lab',\n", - " notebook=True,\n", ")\n", "cached_experiments = lab.cached_tasks([Experiment])\n", "print(f'Clearing {len(cached_experiments)} cached experiments.')\n", diff --git a/examples/cookbook.ipynb b/examples/cookbook.ipynb index 3e98a3f..8edebb1 100644 --- a/examples/cookbook.ipynb +++ b/examples/cookbook.ipynb @@ -11,7 +11,7 @@ "You can also run this cookbook as an [interactive\n", "notebook](https://mybinder.org/v2/gh/ben-denham/labtech/main?filepath=examples/cookbook.ipynb)." 
], - "id": "43d16460-da02-45eb-b759-4bb67bc29edf" + "id": "533a4d14-255e-41bd-ac0a-9d4ca373bec8" }, { "cell_type": "code", @@ -21,7 +21,7 @@ "source": [ "%pip install labtech fsspec mlflow pandas scikit-learn setuptools" ], - "id": "962b0fe1-043b-4b78-9c3b-a92692fb99ae" + "id": "72852738-4536-4562-aa9e-4ace8b8105d4" }, { "cell_type": "code", @@ -31,7 +31,7 @@ "source": [ "!mkdir storage" ], - "id": "e2bd0537-2692-4026-8789-a82c44b39050" + "id": "74643859-3f8b-4261-85db-27184097742f" }, { "cell_type": "code", @@ -50,7 +50,7 @@ "digits_X, digits_y = datasets.load_digits(return_X_y=True)\n", "digits_X = StandardScaler().fit_transform(digits_X)" ], - "id": "f578f6b4-dce3-47f3-9404-6d9e667f0984" + "id": "3e6e5fc8-aeaa-4c88-97c3-ec9482fe931b" }, { "cell_type": "markdown", @@ -64,7 +64,7 @@ "is sent to `STDOUT` (e.g. calls to `print()`) or `STDERR` (e.g. uncaught\n", "exceptions) will also be captured and logged:" ], - "id": "642c0a98-98b0-440b-866f-51d6e134719e" + "id": "620c0683-35e3-4062-9de8-075e8eefe990" }, { "cell_type": "code", @@ -88,13 +88,10 @@ " )\n", " for seed in range(5)\n", "]\n", - "lab = labtech.Lab(\n", - " storage=None,\n", - " notebook=True,\n", - ")\n", + "lab = labtech.Lab(storage=None)\n", "results = lab.run_tasks(experiments)" ], - "id": "cf6dd957-143b-4a70-b7c5-44352d44852d" + "id": "957db178-860f-48e5-a041-d6cc9b869fb1" }, { "cell_type": "markdown", @@ -133,7 +130,7 @@ "learning model (like `LRClassifierTask` below), and then make a task of\n", "that type a parameter for your primary experiment task:" ], - "id": "2cb6f616-447b-4918-8eb4-8f6520c8b309" + "id": "5a67a451-89de-4536-9485-96a98639091e" }, { "cell_type": "code", @@ -171,13 +168,10 @@ "experiment = ClassifierExperiment(\n", " classifier_task=LRClassifierTask(random_state=42),\n", ")\n", - "lab = labtech.Lab(\n", - " storage=None,\n", - " notebook=True,\n", - ")\n", + "lab = labtech.Lab(storage=None)\n", "results = lab.run_tasks([experiment])" ], - "id": 
"738ca3bf-4e73-450b-8af9-5d359e2b662e" + "id": "e9a78439-f968-42af-a1d2-576d84e0f121" }, { "cell_type": "markdown", @@ -188,7 +182,7 @@ "[Protocol](https://docs.python.org/3/library/typing.html#typing.Protocol)\n", "that defines their common result type:" ], - "id": "d0c51bec-c2a1-4bab-aa9d-6d58688e9db2" + "id": "41fd8871-ca96-440f-825d-c73abf37ec7e" }, { "cell_type": "code", @@ -245,13 +239,10 @@ " ClassifierExperiment(classifier_task=classifier_task)\n", " for classifier_task in classifier_tasks\n", "]\n", - "lab = labtech.Lab(\n", - " storage=None,\n", - " notebook=True,\n", - ")\n", + "lab = labtech.Lab(storage=None)\n", "results = lab.run_tasks(experiments)" ], - "id": "a338461b-e315-444b-a27d-dfef5089cf86" + "id": "52b9db69-f978-4f70-b90e-a877a840cf4d" }, { "cell_type": "markdown", @@ -271,7 +262,7 @@ "> `Enum` must support equality between identical (but distinct) object\n", "> instances." ], - "id": "46c0ebef-d6c2-442f-a475-d23e5aedec2c" + "id": "06676a9c-f55c-4663-ada4-01f38aa60017" }, { "cell_type": "code", @@ -317,13 +308,10 @@ " )\n", " for dataset in DatasetOption\n", "]\n", - "lab = labtech.Lab(\n", - " storage=None,\n", - " notebook=True,\n", - ")\n", + "lab = labtech.Lab(storage=None)\n", "results = lab.run_tasks(experiments)" ], - "id": "4c80d044-8dba-424d-a38e-da567055859f" + "id": "6e27b34d-dfcd-4cf7-a1bb-773f90c67c3d" }, { "cell_type": "markdown", @@ -346,7 +334,7 @@ "The following example demonstrates specifying a `dataset_key` parameter\n", "to a task that is used to look up a dataset from the lab context:" ], - "id": "9c14cf9a-6286-4fc0-a0d0-fbe207b82eab" + "id": "629f7dfb-5bdb-4be1-811d-3a662355a38d" }, { "cell_type": "code", @@ -377,14 +365,13 @@ "]\n", "lab = labtech.Lab(\n", " storage=None,\n", - " notebook=True,\n", " context={\n", " 'DATASETS': DATASETS,\n", " },\n", ")\n", "results = lab.run_tasks(experiments)" ], - "id": "ff849443-b45e-4ff3-8c4f-fa9e90430fc3" + "id": "348471cd-8479-4ea2-90b2-05a8f26e4e16" }, { "cell_type": "markdown", 
@@ -402,7 +389,7 @@ "cross-validation within the task using a number of workers specified in\n", "the lab context as `within_task_workers`:" ], - "id": "fd8db656-746b-40d8-a3f1-d7fff81b634e" + "id": "28af319a-3bbd-4177-9ac0-8c1c61567f4e" }, { "cell_type": "code", @@ -437,14 +424,13 @@ "]\n", "lab = labtech.Lab(\n", " storage=None,\n", - " notebook=True,\n", " context={\n", " 'within_task_workers': 4,\n", " },\n", ")\n", "results = lab.run_tasks(experiments)" ], - "id": "9f759e33-9c9d-4618-97eb-badd7ac9dd87" + "id": "53ddc5e6-6548-46ae-be9d-613a9085fe1e" }, { "cell_type": "markdown", @@ -471,7 +457,7 @@ "raised during the execution of a task will be logged, but the execution\n", "of other tasks will continue:" ], - "id": "4662a7aa-6220-45eb-a665-cae172537545" + "id": "e1bb2f06-e5c3-487a-9def-a6c88dfffde2" }, { "cell_type": "code", @@ -484,7 +470,7 @@ " continue_on_failure=True,\n", ")" ], - "id": "3a7433fd-2786-4c41-a6f5-78abff05f75c" + "id": "1f65a098-9d89-4c3e-810e-09553b8ed499" }, { "cell_type": "markdown", @@ -503,7 +489,7 @@ "sub-class for that extension so that you can continue using caches for\n", "the base class:" ], - "id": "47fd98c7-13b9-4b47-a318-29691dd2372e" + "id": "9d6fd7c3-71a7-464b-bd70-43c475f2522f" }, { "cell_type": "code", @@ -527,7 +513,7 @@ " base_result = super().run()\n", " return base_result * self.multiplier" ], - "id": "bad34fbc-e481-431d-b196-e28161d76981" + "id": "c378f99e-4b6a-479b-9996-e8de6a4d9b86" }, { "cell_type": "markdown", @@ -539,7 +525,7 @@ "all cached task instances for a list of task types. 
You can then “run”\n", "the tasks to load their cached results:" ], - "id": "701058e0-ac9e-47bd-91c0-329514a1c6ad" + "id": "01704aab-ba2c-4d3c-ba4e-6fbbe97529ba" }, { "cell_type": "code", @@ -550,7 +536,7 @@ "cached_cvexperiment_tasks = lab.cached_tasks([CVExperiment])\n", "results = lab.run_tasks(cached_cvexperiment_tasks)" ], - "id": "1c781b4c-a074-477a-abb4-2c2254918550" + "id": "95cbd22b-0c23-42e8-9ae1-2d24a6520cd8" }, { "cell_type": "markdown", @@ -561,7 +547,7 @@ "You can clear the cache for a list of tasks using the `uncache_tasks()`\n", "method of a `Lab` instance:" ], - "id": "e5f4d8a2-537f-449b-8026-445f1ab8ab9e" + "id": "2e03f60a-5db4-4123-a271-3ee855840e52" }, { "cell_type": "code", @@ -571,7 +557,7 @@ "source": [ "lab.uncache_tasks(cached_cvexperiment_tasks)" ], - "id": "becf76db-5145-4074-8b8c-bd8fa0b99b9b" + "id": "4a7897ff-023d-48c5-a939-44ecc3de9a66" }, { "cell_type": "markdown", @@ -580,7 +566,7 @@ "You can also ignore all previously cached results when running a list of\n", "tasks by passing the `bust_cache` option to `run_tasks()`:" ], - "id": "e247afb2-dfc8-44ba-a474-048fded868f3" + "id": "6d8ed06e-3036-4148-9039-ebbca567baf9" }, { "cell_type": "code", @@ -590,7 +576,7 @@ "source": [ "lab.run_tasks(cached_cvexperiment_tasks, bust_cache=True)" ], - "id": "bc312e01-7e22-459f-87ea-e2231436de97" + "id": "9e4b2bb3-4648-4188-9649-11e7180fe79e" }, { "cell_type": "markdown", @@ -614,7 +600,7 @@ "consider using a\n", "[`TypeDict`](https://docs.python.org/3/library/typing.html#typing.TypedDict):" ], - "id": "5c8e5742-b63f-45e3-aa43-e221834a8de3" + "id": "578d003b-55c9-49be-ba25-977fe4bfcaca" }, { "cell_type": "code", @@ -641,7 +627,7 @@ " model_weights=np.array([self.seed, self.seed ** 2]),\n", " )" ], - "id": "2ae1d586-9532-4614-95d7-79eff00e3c50" + "id": "d1e5c3d7-29a5-4291-8a53-65f57a341d0c" }, { "cell_type": "markdown", @@ -659,7 +645,7 @@ "The following example demonstrates defining and using a custom cache\n", "type to store Pandas DataFrames as 
parquet files:" ], - "id": "78d1d0e9-61fc-496a-808e-4cf83eed5647" + "id": "480d065b-9ee0-4868-aec1-f29f51c93404" }, { "cell_type": "code", @@ -699,13 +685,10 @@ " })\n", "\n", "\n", - "lab = labtech.Lab(\n", - " storage='storage/parquet_example',\n", - " notebook=True,\n", - ")\n", + "lab = labtech.Lab(storage='storage/parquet_example')\n", "lab.run_tasks([TabularTask()])" ], - "id": "d5f62efc-dbf8-4454-b7a2-b26dc1d320e6" + "id": "58abbd03-cef4-484e-a87a-67ed35a30730" }, { "cell_type": "markdown", @@ -727,7 +710,7 @@ "S3](https://s3fs.readthedocs.io/en/latest/) and [Azure Blob\n", "Storage](https://github.com/fsspec/adlfs):" ], - "id": "bbb66e16-5c64-4fb0-a03b-0f319075785c" + "id": "7ac1545d-3b29-48db-a19f-0d51e87b4d54" }, { "cell_type": "code", @@ -793,13 +776,10 @@ " )\n", " for seed in range(100)\n", "]\n", - "lab = labtech.Lab(\n", - " storage=LocalFsspecStorage('storage/fsspec_example'),\n", - " notebook=True,\n", - ")\n", + "lab = labtech.Lab(storage=LocalFsspecStorage('storage/fsspec_example'))\n", "results = lab.run_tasks(experiments)" ], - "id": "f34c9445-4197-484f-965a-5c92cd6150d8" + "id": "2afecf43-838e-47ee-bef0-f31723bab7c1" }, { "cell_type": "markdown", @@ -841,7 +821,7 @@ "`AggregationTask` to aggregate the results from many individual tasks to\n", "create an aggregated cache that can be loaded more efficiently:" ], - "id": "0cb5ec3b-7bca-4b92-8a62-f87535cb4ca8" + "id": "a81f9e13-5667-4360-a8c0-2b426351b44a" }, { "cell_type": "code", @@ -879,13 +859,10 @@ "aggregation_task = AggregationTask(\n", " sub_tasks=experiments,\n", ")\n", - "lab = labtech.Lab(\n", - " storage='storage/aggregation_lab',\n", - " notebook=True,\n", - ")\n", + "lab = labtech.Lab(storage='storage/aggregation_lab')\n", "result = lab.run_task(aggregation_task)" ], - "id": "059c6b28-ee22-4ce1-92f2-f5013c002fd7" + "id": "a0769c50-4817-446b-8561-238754e0dae6" }, { "cell_type": "markdown", @@ -897,7 +874,7 @@ "it was originally executed and how long it took to execute from the\n", 
"task’s `.result_meta` attribute:" ], - "id": "cb61ef93-ddb3-4490-9927-bc96b1285004" + "id": "ae7df3c6-13e1-4479-afbc-ec342ea34ecb" }, { "cell_type": "code", @@ -908,7 +885,7 @@ "print(f'The task was executed at: {aggregation_task.result_meta.start}')\n", "print(f'The task execution took: {aggregation_task.result_meta.duration}')" ], - "id": "e941ea78-f2ea-420d-9880-d84a3c6a12f3" + "id": "fcac43bf-c32d-4c2b-a515-c1bc1c8425bc" }, { "cell_type": "markdown", @@ -928,7 +905,7 @@ "Another approach is to include all of the intermediate tasks for which\n", "you wish to access the results for in the call to `run_tasks()`:" ], - "id": "e29419a5-c04c-48de-8b7b-c88b683fb01c" + "id": "f7099745-1186-494f-89c0-a922667a212f" }, { "cell_type": "code", @@ -945,10 +922,7 @@ "aggregation_task = AggregationTask(\n", " sub_tasks=experiments,\n", ")\n", - "lab = labtech.Lab(\n", - " storage=None,\n", - " notebook=True,\n", - ")\n", + "lab = labtech.Lab(storage=None)\n", "results = lab.run_tasks([\n", " aggregation_task,\n", " # Include intermediate tasks to access their results\n", @@ -959,7 +933,7 @@ " for experiment in experiments\n", "])" ], - "id": "0f000771-9cf0-4b94-bf70-ce8d9572412a" + "id": "6db5accb-62df-4cf5-b0ba-5fbad4947aa2" }, { "cell_type": "markdown", @@ -973,7 +947,7 @@ "available, so you may need to set `bust_cache=True` to ensure all\n", "intermediate tasks are executed:" ], - "id": "3f0e83c5-31a4-4c11-bf63-2d4c33335075" + "id": "77eb5c34-b6a0-429a-adae-f455c95ed0dd" }, { "cell_type": "code", @@ -990,10 +964,7 @@ "aggregation_task = AggregationTask(\n", " sub_tasks=experiments,\n", ")\n", - "lab = labtech.Lab(\n", - " storage=None,\n", - " notebook=True,\n", - ")\n", + "lab = labtech.Lab(storage=None)\n", "result = lab.run_task(\n", " aggregation_task,\n", " keep_nested_results=True,\n", @@ -1004,7 +975,7 @@ " for experiment in experiments\n", "])" ], - "id": "da301072-a4a4-420f-a39b-a0e205ad6118" + "id": "b78d8e65-6e8a-4281-80da-92ad2586e9c2" }, { "cell_type": 
"markdown", @@ -1020,7 +991,7 @@ "This is modeled in labtech by defining a task type for each step, and\n", "having each step depend on the result from the previous step:" ], - "id": "8ce783d7-126f-4b24-80fd-426f39267156" + "id": "137beb83-bffd-4b9b-8bbd-c95ec48d5bc2" }, { "cell_type": "code", @@ -1066,14 +1037,11 @@ " task_b=task_b,\n", ")\n", "\n", - "lab = labtech.Lab(\n", - " storage=None,\n", - " notebook=True,\n", - ")\n", + "lab = labtech.Lab(storage=None)\n", "result = lab.run_task(task_c)\n", "print(result)" ], - "id": "c5ba5ee3-5646-4f64-8e06-75e5a8977fa6" + "id": "adf8eaf6-0de3-40cd-aed2-9191624b41b2" }, { "cell_type": "markdown", @@ -1085,7 +1053,7 @@ "[Mermaid diagram](https://mermaid.js.org/syntax/classDiagram.html) of\n", "task types for a given list of tasks:" ], - "id": "5d03b8e4-2847-419c-bf19-428495dcd91c" + "id": "aa3acaca-51af-4a64-b258-d6c3fa5accc1" }, { "cell_type": "code", @@ -1100,7 +1068,7 @@ " direction='RL',\n", ")" ], - "id": "31bbc433-9007-4904-adc8-8a45aedacb18" + "id": "685e3d1c-5cdb-4c22-aa54-94e738ae7fac" }, { "cell_type": "markdown", @@ -1124,7 +1092,7 @@ "additional tracking calls (such as `mlflow.log_metric()` or\n", "`mlflow.log_model()`) in the body of your task’s `run()` method:" ], - "id": "64accdae-6c77-42f9-aff5-d9b4891f5fad" + "id": "89bc9771-b24c-4341-988f-3231c1b5d44d" }, { "cell_type": "code", @@ -1168,13 +1136,10 @@ "]\n", "\n", "mlflow.set_experiment('example_labtech_experiment')\n", - "lab = labtech.Lab(\n", - " storage=None,\n", - " notebook=True,\n", - ")\n", + "lab = labtech.Lab(storage=None)\n", "results = lab.run_tasks(runs)" ], - "id": "92d10243-581c-47d0-87b6-a31617162846" + "id": "95b195cc-b163-4521-ae6b-5ba55a4156af" }, { "cell_type": "markdown", @@ -1212,7 +1177,7 @@ "non-definition code for a Python script in a `main()` function, and then\n", "guard the call to `main()` with `__name__ == '__main__'`:" ], - "id": "b7821d54-5b42-4f20-b2ad-465cb245aaee" + "id": "63b3155e-7f9d-4520-b211-84b6b42fce2e" }, { 
"cell_type": "code", @@ -1236,17 +1201,14 @@ " )\n", " for seed in range(1000)\n", " ]\n", - " lab = labtech.Lab(\n", - " storage='storage/guarded_lab',\n", - " notebook=True,\n", - " )\n", + " lab = labtech.Lab(storage='storage/guarded_lab')\n", " result = lab.run_tasks(experiments)\n", " print(result)\n", "\n", "if __name__ == '__main__':\n", " main()" ], - "id": "e0af5e9b-e2da-4041-ab94-99ba51285025" + "id": "489e01df-f0e8-43bc-97c6-af1010009009" }, { "cell_type": "markdown", @@ -1255,7 +1217,7 @@ "For details, see [Safe importing of main\n", "module](https://docs.python.org/3/library/multiprocessing.html#multiprocessing-safe-main-import)." ], - "id": "1343b7e5-1e43-496b-9fe6-f13482814342" + "id": "4e0cfbd8-980f-4429-a221-643fb7d85a95" } ], "nbformat": 4, diff --git a/examples/tutorial.ipynb b/examples/tutorial.ipynb index 1dd2b75..0ab7169 100644 --- a/examples/tutorial.ipynb +++ b/examples/tutorial.ipynb @@ -15,7 +15,7 @@ "Before we begin, let’s install `labtech` along with some other\n", "dependencies we will use in this tutorial:" ], - "id": "cc043fb0-0600-4b79-9068-cbb633a552a2" + "id": "30229f2f-074a-4e5b-85b4-daa551239dc0" }, { "cell_type": "code", @@ -25,7 +25,7 @@ "source": [ "%pip install labtech mlflow scikit-learn" ], - "id": "85e973cc-0ee6-4223-b042-e300c6936519" + "id": "4f019847-1b30-43cc-a54a-6f077306678c" }, { "cell_type": "markdown", @@ -34,7 +34,7 @@ "Let’s also clear any caches that were created by previous runs of this\n", "tutorial:" ], - "id": "1af42208-5906-4b3f-88a1-6897a671114c" + "id": "4439eb04-e2b2-45f4-a262-5dc0f2eecf94" }, { "cell_type": "code", @@ -45,7 +45,7 @@ "!rm -rf storage/tutorial/\n", "!mkdir -p storage/tutorial/" ], - "id": "949ad54d-e88e-408a-aaeb-e4a5bdb9469e" + "id": "e3363ed5-5c36-4505-8da0-c35c9d6d2bd6" }, { "cell_type": "markdown", @@ -56,7 +56,7 @@ "To get started, we’ll take the following simple machine learning\n", "experiment code and convert it to be run with labtech." 
], - "id": "69ea7dd4-eee4-4867-8678-80563c1908b1" + "id": "6779ec4c-6c88-4b05-8403-c0a36bbb38a6" }, { "cell_type": "code", @@ -81,7 +81,7 @@ "\n", "print(f'{log_loss(digits_y, prob_y) = :.3}')" ], - "id": "5f79c533-d50b-432c-912c-c3a7caad8994" + "id": "8c6c354c-a098-4d6c-ba45-c3bf57c4174f" }, { "cell_type": "markdown", @@ -119,7 +119,7 @@ "method that performs the experiment and returns its result (the\n", "predicted probabilities):" ], - "id": "1b4d8fb5-8ceb-4259-9858-9cc108922b83" + "id": "f2d14d3c-d49f-4c33-ab0c-2ab24a5d48cb" }, { "cell_type": "code", @@ -144,7 +144,7 @@ " prob_y = clf.predict_proba(digits_X)\n", " return prob_y" ], - "id": "943d2918-d707-44e2-8dc7-ba5fd193fa6d" + "id": "21b3c6c9-d661-4a54-bc0e-7c962af4a006" }, { "cell_type": "markdown", @@ -155,7 +155,7 @@ "`storage/tutorial/classification_lab_1` and to display notebook-friendly\n", "progress bars:" ], - "id": "e36f7423-d448-421e-93d9-e8c68c08bcb7" + "id": "70cb381b-ffdd-423e-a9e3-d7355e6970b8" }, { "cell_type": "code", @@ -163,12 +163,9 @@ "metadata": {}, "outputs": [], "source": [ - "lab = labtech.Lab(\n", - " storage='storage/tutorial/classification_lab_1',\n", - " notebook=True,\n", - ")" + "lab = labtech.Lab(storage='storage/tutorial/classification_lab_1')" ], - "id": "92c19a4d-1727-4dc7-bb67-6a44a6280a50" + "id": "11bc4d63-924b-47c9-ac64-b316ca7513a8" }, { "cell_type": "markdown", @@ -179,7 +176,7 @@ "probabilities returned by the task’s `run()` method, so we can calculate\n", "the loss from them as before:" ], - "id": "f9f7c8a0-c365-4108-b756-9928a0fb4332" + "id": "c0f86d13-3902-4df7-a5e9-b51316221b08" }, { "cell_type": "code", @@ -191,7 +188,7 @@ "prob_y = lab.run_task(classifier_experiment)\n", "print(f'{log_loss(digits_y, prob_y) = :.3}')" ], - "id": "f1414381-b2f8-4dbd-ba17-fcedd1196cf6" + "id": "9758afff-74ea-40a0-bf11-a9dc37038f3d" }, { "cell_type": "markdown", @@ -202,7 +199,7 @@ "calls to run the same experiment (even after restarting Python) will\n", "load the result from the 
cache:" ], - "id": "25ac55ca-a67a-4d98-8619-e552fce5546c" + "id": "1beca795-3d52-4923-9555-ab9b1dc966db" }, { "cell_type": "code", @@ -213,7 +210,7 @@ "prob_y = lab.run_task(classifier_experiment)\n", "print(f'{log_loss(digits_y, prob_y) = :.3}')" ], - "id": "d221cfe2-94c6-4f5a-aef4-a5ac246dcf10" + "id": "aefc115a-4972-42dd-9a37-4d0cff5c383c" }, { "cell_type": "markdown", @@ -230,7 +227,7 @@ "(or we could pass a list of tasks to `lab.run_tasks()`, as we will see\n", "in the next section of this tutorial)." ], - "id": "6156639b-affb-4567-8839-f7282d5fa2c1" + "id": "0c99b2c8-8212-4a92-b348-74a9fa4fd10f" }, { "cell_type": "code", @@ -242,7 +239,7 @@ " ClassifierExperiment,\n", "])" ], - "id": "192ed536-4f62-41ed-b930-5905ec695c54" + "id": "2c336e65-5592-4d7e-a02b-bf1ff2100fc5" }, { "cell_type": "markdown", @@ -256,7 +253,7 @@ "You can clear the cached results for a list of tasks with\n", "`lab.uncache_tasks()`:" ], - "id": "d5e74483-fb9e-4a40-8047-68f66e543fa4" + "id": "9e46602e-e8c1-458d-8f34-b61b009c704d" }, { "cell_type": "code", @@ -268,7 +265,7 @@ " classifier_experiment,\n", "])" ], - "id": "b72e2f48-3c45-4291-8603-a001769002b0" + "id": "370d8fe3-dc1e-4582-bd24-0c9d677310be" }, { "cell_type": "markdown", @@ -285,7 +282,7 @@ "the same way as\n", "[dataclass](https://docs.python.org/3/library/dataclasses.html) fields:" ], - "id": "59912642-d58f-43f5-8d15-06b007838f5b" + "id": "b6161831-ef2a-4fc5-b5aa-3fa6aac082cb" }, { "cell_type": "code", @@ -311,7 +308,7 @@ " prob_y = clf.predict_proba(digits_X)\n", " return prob_y" ], - "id": "c35771b3-2eb1-4558-a7d8-aeb0e1ed43b6" + "id": "9f6a8720-d26b-45d8-95c3-b3e71d1f6fc5" }, { "cell_type": "markdown", @@ -320,7 +317,7 @@ "Now we’ll use a list comprehension to construct a list of\n", "`ClassifierExperiment` tasks with different `n_estimators` values:" ], - "id": "52f4aebf-d48c-4412-9fc1-4bb347643303" + "id": "7fc9f47d-8c0f-47a8-8ac7-7691cf823bba" }, { "cell_type": "code", @@ -335,7 +332,7 @@ " for n_estimators in range(1, 
11)\n", "]" ], - "id": "ea0e483c-2406-4b9b-b299-49e9afc209f7" + "id": "348d9114-9ec2-49ec-9610-b4bd0b6bede7" }, { "cell_type": "markdown", @@ -349,7 +346,7 @@ "caches for the new definition separate by constructing a new lab that\n", "uses a different storage directory:" ], - "id": "8bb37691-c96b-4537-b820-5be19fb55f16" + "id": "0ba219d6-b9c1-4e4d-8c57-07d07d09c2bf" }, { "cell_type": "code", @@ -357,13 +354,10 @@ "metadata": {}, "outputs": [], "source": [ - "lab = labtech.Lab(\n", - " storage='storage/tutorial/classification_lab_2',\n", - " notebook=True,\n", - ")\n", + "lab = labtech.Lab(storage='storage/tutorial/classification_lab_2')\n", "results = lab.run_tasks(classifier_experiments)" ], - "id": "960ccbfe-627c-4619-b0cd-0c8521bb9f01" + "id": "1ff74f14-3c8d-4269-b0b0-f9d1cdb9d24c" }, { "cell_type": "markdown", @@ -373,7 +367,7 @@ "result it returned, which we can loop over to print loss metrics for\n", "each experiment:" ], - "id": "6cc52712-d0e0-4b27-8307-0806f1c9512e" + "id": "2f2d1a54-6748-4394-9d1f-80a8d0519e1b" }, { "cell_type": "code", @@ -384,7 +378,7 @@ "for experiment, prob_y in results.items():\n", " print(f'{experiment}: {log_loss(digits_y, prob_y) = :.3}')" ], - "id": "7b4df29a-f6b5-4ba7-9c72-f7772bf858df" + "id": "44262603-c4d0-40e1-9313-81bb9f2f1758" }, { "cell_type": "markdown", @@ -410,7 +404,7 @@ "allowing us to access the result from the `.result` attribute of the\n", "task parameter (i.e. 
`self.classifier_experiment.result`):" ], - "id": "296e0edb-5c34-49f3-b469-dc676a67dba0" + "id": "76f01b15-89b3-48ba-9842-608b0756524d" }, { "cell_type": "code", @@ -433,7 +427,7 @@ " min_max_prob_y[np.arange(len(prob_y)), prob_y.argmax(axis=1)] = 1\n", " return min_max_prob_y" ], - "id": "c6153e33-6e8d-4d4f-9e98-b2a105078b42" + "id": "6094070e-c8e2-49aa-be64-2a6d4ec30d57" }, { "cell_type": "markdown", @@ -446,7 +440,7 @@ "`MinMaxProbabilityExperiment` is run, re-using results depended on by\n", "multiple tasks and loading previously cached results wherever possible:" ], - "id": "2aa1b598-45f2-4e23-acad-d3486c2d1548" + "id": "74d447ba-b628-4f3a-b843-d15c5024a66d" }, { "cell_type": "code", @@ -465,7 +459,7 @@ "for experiment, prob_y in results.items():\n", " print(f'{experiment}: {log_loss(digits_y, prob_y) = :.3}')" ], - "id": "d285311a-5cfb-4d33-925b-a8a1eb46570b" + "id": "c505f0c7-b69c-4030-a6df-70d37e7859b2" }, { "cell_type": "markdown", @@ -508,7 +502,7 @@ " `ClassifierExperiment` tasks, the `run()` method first creates\n", " its own copy of the classifier with `clone()`." 
], - "id": "bae6e714-eda1-48f8-acf0-540d8a28f4e9" + "id": "c8b95730-3a8b-4d00-8e60-232ee16f2c9a" }, { "cell_type": "code", @@ -562,7 +556,7 @@ " prob_y = clf.predict_proba(digits_X)\n", " return prob_y" ], - "id": "cc6d6105-aa41-4e75-9016-b4312c9be1a5" + "id": "9cac268e-f8ea-4e4f-b177-5cef911364c3" }, { "cell_type": "markdown", @@ -573,7 +567,7 @@ "for each of these `RFClassifierTask` tasks as well as an\n", "`LRClassifierTask` task:" ], - "id": "fbee6c6d-94cf-4261-928b-dcc1a7b9961c" + "id": "bc65c246-e194-4d59-ab2b-93231c9a3efa" }, { "cell_type": "code", @@ -597,16 +591,13 @@ " ]\n", "]\n", "\n", - "lab = labtech.Lab(\n", - " storage='storage/tutorial/classification_lab_3',\n", - " notebook=True,\n", - ")\n", + "lab = labtech.Lab(storage='storage/tutorial/classification_lab_3')\n", "\n", "results = lab.run_tasks(classifier_experiments)\n", "for experiment, prob_y in results.items():\n", " print(f'{experiment}: {log_loss(digits_y, prob_y) = :.3}')" ], - "id": "4d353ff2-e6a8-42ff-a48d-32897caddea7" + "id": "0d9c8e06-f95f-4067-b1fd-6c62adbc7fdd" }, { "cell_type": "markdown", @@ -620,7 +611,7 @@ "our experiments) outside of any task, allowing us to inspect these\n", "datasets before and after the tasks have been run:" ], - "id": "4b7ac6ed-d984-4750-9bb3-aec254fc980e" + "id": "1f151abc-0bc3-48c0-8cfe-a33e18767626" }, { "cell_type": "code", @@ -636,7 +627,7 @@ " 'iris': {'X': iris_X, 'y': iris_y},\n", "}" ], - "id": "f88efc2d-3b8c-44b2-9b3e-582d10fa8ad9" + "id": "f241b60d-5a14-4268-8d5a-fc283246deb9" }, { "cell_type": "markdown", @@ -653,7 +644,7 @@ "3. Alter the task generation and evaluation code to handle multiple\n", " datasets." 
], - "id": "6d0b141c-060d-42ff-8185-83fd074f83ae" + "id": "41219ee2-fd14-474b-bd42-e66d2869e89a" }, { "cell_type": "code", @@ -689,7 +680,6 @@ "\n", "lab = labtech.Lab(\n", " storage='storage/tutorial/classification_lab_4',\n", - " notebook=True,\n", " context={\n", " 'DATASETS': DATASETS,\n", " },\n", @@ -700,7 +690,7 @@ " dataset_y = DATASETS[experiment.dataset_key][\"y\"]\n", " print(f'{experiment}: {log_loss(dataset_y, prob_y) = :.3}')" ], - "id": "e7eb8691-c5da-43a3-8001-aa2a3aaa9487" + "id": "1f0a1281-12c2-4cbc-8842-9443bc234da7" }, { "cell_type": "markdown", @@ -746,7 +736,7 @@ " `mlflow.set_experiment('example_labtech_experiment')` before the\n", " tasks are run." ], - "id": "b442afe9-d817-4d51-8161-e8262bce0c8a" + "id": "56112323-c79b-41ce-b4e0-d3300ceaa47f" }, { "cell_type": "code", @@ -909,7 +899,6 @@ "mlflow.set_experiment('example_labtech_experiment')\n", "lab = labtech.Lab(\n", " storage='storage/tutorial/classification_lab_final',\n", - " notebook=True,\n", " context={\n", " 'DATASETS': DATASETS,\n", " },\n", @@ -919,7 +908,7 @@ "for experiment, result in evaluation_result.items():\n", " print(f'{experiment}: log_loss = {result[\"log_loss\"]:.3}')" ], - "id": "cc322b63-7114-4b4d-9267-1846411a6f26" + "id": "7a9a46a0-c134-4c13-9832-bedf0d8f6c59" }, { "cell_type": "markdown", @@ -930,7 +919,7 @@ "Finally, we can use Labtech to generate a diagram of a list of tasks\n", "that shows all of the task types, parameters, and dependencies:" ], - "id": "ee7b2e8d-a2ea-4d7b-b4bd-a84b4d2d4e10" + "id": "1bb573af-da49-4f76-844e-3ec5b26c0d1a" }, { "cell_type": "code", @@ -944,7 +933,7 @@ " evaluation_task,\n", "], direction='BT')" ], - "id": "6019a742-519d-4601-9976-9880db58ec74" + "id": "c111dcb3-2e9e-4e79-a6e1-69aac9ee7c60" }, { "cell_type": "markdown", @@ -1006,7 +995,7 @@ "- [More\n", " examples](https://github.com/ben-denham/labtech/tree/main/examples)" ], - "id": "83879f91-be81-4a2e-a551-0cc33c1457a7" + "id": "721a3edd-5cf8-4556-8fab-20a90a6f60c1" } ], 
"nbformat": 4, diff --git a/labtech/diagram.py b/labtech/diagram.py index 71bd503..ff306c4 100644 --- a/labtech/diagram.py +++ b/labtech/diagram.py @@ -1,10 +1,10 @@ -import builtins from dataclasses import dataclass, fields from textwrap import indent from typing import Dict, Sequence, Type, get_args, get_origin, get_type_hints from .tasks import find_tasks_in_param from .types import Task, is_task +from .utils import is_ipython @dataclass(frozen=True) @@ -187,17 +187,8 @@ def display_task_diagram(tasks: Sequence[Task], **kwargs) -> None: """ diagram = build_task_diagram(tasks, **kwargs) - - ipython = False - try: + if is_ipython(): from IPython.display import Markdown, display - except ImportError: - pass - else: - if hasattr(builtins, '__IPYTHON__'): - ipython = True - - if ipython: display(Markdown(f'```mermaid\n{diagram}\n```')) else: print(diagram) diff --git a/labtech/lab.py b/labtech/lab.py index 4491e8b..7d6ed69 100644 --- a/labtech/lab.py +++ b/labtech/lab.py @@ -27,7 +27,7 @@ from .storage import LocalStorage, NullStorage from .tasks import find_tasks_in_param from .types import ResultMeta, ResultsMap, ResultT, Storage, Task, TaskResult, TaskT, is_task, is_task_type -from .utils import LoggerFileProxy, OrderedSet, logger +from .utils import LoggerFileProxy, OrderedSet, is_ipython, logger _IN_TASK_SUBPROCESS = False @@ -410,7 +410,7 @@ def __init__(self, *, storage: Union[str, Path, None, Storage], continue_on_failure: bool = True, max_workers: Optional[int] = None, - notebook: bool = False, + notebook: Optional[bool] = None, context: Optional[dict[str, Any]] = None): """ Args: @@ -426,8 +426,9 @@ def __init__(self, *, `concurrent.futures.ProcessPoolExecutor`: the number of processors on the machine. When `max_workers=1`, all tasks will be run in the main process, without multi-processing. - notebook: Should be set to `True` if run from a Jupyter notebook - for graphical progress bars. 
+ notebook: Determines whether to use notebook-friendly graphical + progress bars. When set to `None` (the default), labtech will + detect whether the code is being run from an IPython notebook. context: A dictionary of additional variables to make available to tasks. The context will not be cached, so the values should not affect results (e.g. parallelism factors) or should be kept @@ -440,7 +441,7 @@ def __init__(self, *, self._storage = storage self.continue_on_failure = continue_on_failure self.max_workers = max_workers - self.notebook = notebook + self.notebook = is_ipython() if notebook is None else notebook if context is None: context = {} self.context = context diff --git a/labtech/utils.py b/labtech/utils.py index 6ad1554..d256753 100644 --- a/labtech/utils.py +++ b/labtech/utils.py @@ -1,5 +1,6 @@ """General labtech utilities.""" +import builtins import logging import re from typing import Dict, Generic, Optional, Sequence, Type, TypeVar, cast @@ -109,9 +110,14 @@ def ensure_dict_key_str(value, *, exception_type: Type[Exception]) -> str: return cast(str, value) +def is_ipython() -> bool: + return hasattr(builtins, '__IPYTHON__') + + __all__ = [ 'logger', 'OrderedSet', 'LoggerFileProxy', 'ensure_dict_key_str', + 'is_ipython', ] From 16d4656b92bf42c90231727b09a4247c21cc226c Mon Sep 17 00:00:00 2001 From: Ben Denham Date: Sat, 18 May 2024 20:05:43 +1200 Subject: [PATCH 3/3] Add documentation about using mlflow with SQLite --- docs/cookbook.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/cookbook.md b/docs/cookbook.md index ae45f74..a8538d3 100644 --- a/docs/cookbook.md +++ b/docs/cookbook.md @@ -874,6 +874,16 @@ results = lab.run_tasks(runs) > `mlflow.start_run()`, labtech wraps the entire call to the `run()` > method of your task in order to track execution times in mlflow. 
+> Note: Because mlflow logging will be performed from a separate
+> process for each task, you must use an mlflow tracking backend that
+> supports multiple simultaneous connections. Specifically, using an
+> SQLite backend directly from multiple processes may result in
+> database locking errors. Instead, consider using local files (the
+> default used by mlflow), an SQL database that runs as a server (e.g.
+> PostgreSQL, MySQL, or MSSQL), or running a local mlflow tracking
+> server (which may itself connect to an SQLite database). For more
+> details, see the [mlflow backend
+> documentation](https://mlflow.org/docs/latest/tracking/backend-stores.html).
 
 ### Why do I see the following error: `An attempt has been made to start a new process before the current process has finished`?