Commit efe0ed0

add additional step to pipeline to generate a metrics report
Signed-off-by: Michael Clifford <mcliffor@redhat.com>
1 parent 723fb6c commit efe0ed0

4 files changed: +199, -68 lines

eval/final/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
-from .components import run_final_eval_op
+from .components import generate_metrics_report_op, run_final_eval_op
 
 # from . import faked
 
-__all__ = ["run_final_eval_op"]
+__all__ = ["run_final_eval_op", "generate_metrics_report_op"]

eval/final/components.py

Lines changed: 47 additions & 6 deletions
@@ -1,15 +1,13 @@
 # type: ignore
 # pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error
 
-from kfp.dsl import Artifact, Output, component
+from kfp.dsl import Artifact, Input, Metrics, Output, component
 
-from utils.consts import RHELAI_IMAGE
+from utils.consts import PYTHON_IMAGE, RHELAI_IMAGE
 
 
 @component(base_image=RHELAI_IMAGE, install_kfp_package=False)
 def run_final_eval_op(
-    mmlu_branch_output: Output[Artifact],
-    mt_bench_branch_output: Output[Artifact],
     base_model_dir: str,
     base_branch: str,
     candidate_branch: str,
@@ -20,6 +18,8 @@ def run_final_eval_op(
     candidate_model: str = None,
     taxonomy_path: str = "/input/taxonomy",
     sdg_path: str = "/input/sdg",
+    mmlu_branch_output_path: str = "/output/mmlu_branch",
+    mt_bench_branch_output_path: str = "/output/mt_bench_branch",
 ):
     import json
     import os
@@ -326,8 +326,13 @@ def find_node_dataset_directories(base_dir: str):
             "summary": summary,
         }
 
-        with open(mmlu_branch_output.path, "w", encoding="utf-8") as f:
+        if not os.path.exists(mmlu_branch_output_path):
+            os.makedirs(mmlu_branch_output_path)
+        with open(
+            f"{mmlu_branch_output_path}/mmlu_branch_data.json", "w", encoding="utf-8"
+        ) as f:
             json.dump(mmlu_branch_data, f, indent=4)
+
     else:
         print("No MMLU tasks directories found, skipping MMLU_branch evaluation.")
 
@@ -470,5 +475,41 @@ def find_node_dataset_directories(base_dir: str):
         "summary": summary,
     }
 
-    with open(mt_bench_branch_output.path, "w", encoding="utf-8") as f:
+    if not os.path.exists(mt_bench_branch_output_path):
+        os.makedirs(mt_bench_branch_output_path)
+    with open(
+        f"{mt_bench_branch_output_path}/mt_bench_branch_data.json",
+        "w",
+        encoding="utf-8",
+    ) as f:
         json.dump(mt_bench_branch_data, f, indent=4)
+
+
+@component(base_image=PYTHON_IMAGE, install_kfp_package=False)
+def generate_metrics_report_op(
+    metrics: Output[Metrics],
+):
+    import ast
+    import json
+
+    with open("/output/mt_bench_data.json", "r") as f:
+        mt_bench_data = f.read()
+    mt_bench_data = ast.literal_eval(mt_bench_data)[0]
+
+    metrics.log_metric("mt_bench_best_model", mt_bench_data["model"])
+    metrics.log_metric("mt_bench_best_score", mt_bench_data["overall_score"])
+    metrics.log_metric("mt_bench_best_model_error_rate", mt_bench_data["error_rate"])
+
+    with open("/output/mt_bench_branch/mt_bench_branch_data.json", "r") as f:
+        mt_bench_branch_data = json.loads(f.read())
+
+    metrics.log_metric("mt_bench_branch_score", mt_bench_branch_data["overall_score"])
+    metrics.log_metric(
+        "mt_bench_branch_base_score", mt_bench_branch_data["base_overall_score"]
+    )
+
+    with open("/output/mmlu_branch/mmlu_branch_data.json", "r") as f:
+        mmlu_branch_data = json.loads(f.read())
+
+    metrics.log_metric("mmlu_branch_score", mmlu_branch_data["model_score"])
+    metrics.log_metric("mmlu_branch_base_score", mmlu_branch_data["base_model_score"])

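With this change, run_final_eval_op writes its MMLU_branch and MT-Bench_branch results as plain JSON files on the shared /output PVC instead of KFP-managed Output[Artifact] paths, and the new generate_metrics_report_op reads those files (plus /output/mt_bench_data.json) and surfaces the headline numbers as a KFP Metrics artifact. The snippet below is a minimal local sketch of that read-and-log logic, assuming the same file layout and field names as in the diff; the FakeMetrics class and the sample scores are illustrative stand-ins, not part of this commit.

import ast
import json
import os
import tempfile


class FakeMetrics:
    """Stand-in for the kfp.dsl.Metrics artifact: just records log_metric calls."""

    def __init__(self):
        self.logged = {}

    def log_metric(self, name, value):
        self.logged[name] = value


with tempfile.TemporaryDirectory() as output:
    # mt_bench_data.json is assumed to hold a Python-literal list whose first
    # entry describes the best MT-Bench checkpoint (hence ast.literal_eval).
    with open(os.path.join(output, "mt_bench_data.json"), "w") as f:
        f.write(str([{"model": "sample-ckpt", "overall_score": 7.1, "error_rate": 0.02}]))

    os.makedirs(os.path.join(output, "mt_bench_branch"))
    with open(os.path.join(output, "mt_bench_branch", "mt_bench_branch_data.json"), "w") as f:
        json.dump({"overall_score": 6.9, "base_overall_score": 6.5}, f)

    os.makedirs(os.path.join(output, "mmlu_branch"))
    with open(os.path.join(output, "mmlu_branch", "mmlu_branch_data.json"), "w") as f:
        json.dump({"model_score": 0.61, "base_model_score": 0.58}, f)

    metrics = FakeMetrics()

    # Same parsing and logging steps as generate_metrics_report_op, with the
    # hard-coded /output prefix replaced by the temporary directory.
    with open(os.path.join(output, "mt_bench_data.json")) as f:
        mt_bench_data = ast.literal_eval(f.read())[0]
    metrics.log_metric("mt_bench_best_model", mt_bench_data["model"])
    metrics.log_metric("mt_bench_best_score", mt_bench_data["overall_score"])
    metrics.log_metric("mt_bench_best_model_error_rate", mt_bench_data["error_rate"])

    with open(os.path.join(output, "mt_bench_branch", "mt_bench_branch_data.json")) as f:
        mt_bench_branch_data = json.load(f)
    metrics.log_metric("mt_bench_branch_score", mt_bench_branch_data["overall_score"])
    metrics.log_metric("mt_bench_branch_base_score", mt_bench_branch_data["base_overall_score"])

    with open(os.path.join(output, "mmlu_branch", "mmlu_branch_data.json")) as f:
        mmlu_branch_data = json.load(f)
    metrics.log_metric("mmlu_branch_score", mmlu_branch_data["model_score"])
    metrics.log_metric("mmlu_branch_base_score", mmlu_branch_data["base_model_score"])

    print(metrics.logged)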
pipeline.py

Lines changed: 18 additions & 6 deletions
@@ -83,7 +83,7 @@ def ilab_pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]):
     )
 
     # Imports for evaluation
-    from eval.final import run_final_eval_op
+    from eval.final import generate_metrics_report_op, run_final_eval_op
     from eval.mt_bench import run_mt_bench_op
 
     @dsl.pipeline(
@@ -452,17 +452,29 @@ def pipeline(
             mount_path="/output",
         )
 
-        output_pvc_delete_task = DeletePVC(pvc_name=output_pvc_task.output)
-        output_pvc_delete_task.after(
-            output_model_task, output_mt_bench_task, final_eval_task
-        )
-
         sdg_pvc_delete_task = DeletePVC(pvc_name=sdg_input_pvc_task.output)
         sdg_pvc_delete_task.after(final_eval_task)
 
         model_pvc_delete_task = DeletePVC(pvc_name=model_pvc_task.output)
         model_pvc_delete_task.after(final_eval_task)
 
+        generate_metrics_report_task = generate_metrics_report_op()
+        generate_metrics_report_task.after(output_mt_bench_task, final_eval_task)
+        generate_metrics_report_task.set_caching_options(False)
+        mount_pvc(
+            task=generate_metrics_report_task,
+            pvc_name=output_pvc_task.output,
+            mount_path="/output",
+        )
+
+        output_pvc_delete_task = DeletePVC(pvc_name=output_pvc_task.output)
+        output_pvc_delete_task.after(
+            output_model_task,
+            output_mt_bench_task,
+            final_eval_task,
+            generate_metrics_report_task,
+        )
+
         return
 
     return pipeline

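The new task exchanges data with the evaluation tasks only through the shared output PVC, so .after() is what enforces the ordering, caching is disabled so the report always reflects the current run, and the output PVC deletion now also waits on the report task. Below is a minimal sketch of that wiring pattern using the kfp-kubernetes helpers directly; the component, PVC, and metric names are illustrative and not taken from this repository.

from kfp import dsl, kubernetes


@dsl.component(base_image="python:3.11")
def produce_files():
    # Stand-in for the evaluation tasks: writes a file onto the shared PVC.
    with open("/output/report_input.txt", "w") as f:
        f.write("example")


@dsl.component(base_image="python:3.11")
def report_metrics(metrics: dsl.Output[dsl.Metrics]):
    # Stand-in for generate_metrics_report_op: reads from the shared PVC and
    # logs a KFP Metrics artifact.
    with open("/output/report_input.txt") as f:
        metrics.log_metric("chars", float(len(f.read())))


@dsl.pipeline()
def example_pipeline():
    pvc = kubernetes.CreatePVC(
        pvc_name_suffix="-output",
        access_modes=["ReadWriteMany"],
        size="1Gi",
        storage_class_name="standard",
    )

    producer = produce_files()
    kubernetes.mount_pvc(producer, pvc_name=pvc.outputs["name"], mount_path="/output")

    # Data flows only through the PVC, so .after() enforces the ordering;
    # caching is off so the report is rebuilt on every run.
    reporter = report_metrics()
    reporter.after(producer)
    reporter.set_caching_options(False)
    kubernetes.mount_pvc(reporter, pvc_name=pvc.outputs["name"], mount_path="/output")

    # The PVC may only be deleted once every task that mounts it has finished.
    kubernetes.DeletePVC(pvc_name=pvc.outputs["name"]).after(producer, reporter)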