From 6c2f4fb9803a4dfd12a1c14dec6bb568c3980206 Mon Sep 17 00:00:00 2001 From: Roni Friedman-Melamed Date: Thu, 10 Oct 2024 13:34:01 +0300 Subject: [PATCH] ilab evaluation --- ilab/__init__.py | 0 ilab/adjust_digit_output_format.py | 37 +++ ilab/create_ilab_skill_yaml.py | 197 +++++++++++++++ ilab/evaluate_from_file.py | 196 +++++++++++++++ ilab/ilab_evaluate.py | 371 +++++++++++++++++++++++++++++ ilab/run_base_bam_model.py | 122 ++++++++++ src/unitxt/inference.py | 81 +++++++ 7 files changed, 1004 insertions(+) create mode 100644 ilab/__init__.py create mode 100644 ilab/adjust_digit_output_format.py create mode 100644 ilab/create_ilab_skill_yaml.py create mode 100644 ilab/evaluate_from_file.py create mode 100644 ilab/ilab_evaluate.py create mode 100644 ilab/run_base_bam_model.py diff --git a/ilab/__init__.py b/ilab/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ilab/adjust_digit_output_format.py b/ilab/adjust_digit_output_format.py new file mode 100644 index 000000000..67dcdc21f --- /dev/null +++ b/ilab/adjust_digit_output_format.py @@ -0,0 +1,37 @@ +import json,yaml,argparse + +def create_train_file(digit_file,train_output_file): + + + with open(digit_file, 'r') as infile, open(train_output_file, 'w') as outfile: + for line in infile: + record = json.loads(line.strip()) + record['assistant'] = record.pop('response') + record['system'] = '' + record['user'] = record.pop('instruction') + outfile.write(json.dumps(record) + '\n') + +def create_test_file(yaml_file,test_output_file): + with open(yaml_file, 'r') as f: + yaml_content = yaml.safe_load(f) + yaml_content = yaml_content.get("seed_examples", {}) + with open(test_output_file, 'w') as f: + if isinstance(yaml_content, list): + for entry in yaml_content: + entry['user'] = entry.pop('question') + entry['system'] = '' + entry['assistant'] = entry.pop('answer') + f.write(json.dumps(entry) + '\n') + +if __name__=="__main__": + + parser = argparse.ArgumentParser(description='evaluate dataset against ilab model and save results') + + parser.add_argument('--sdg_file', type=str, required=True, help='path of file created by sdg') + parser.add_argument('--yaml_file', type=str, required=True, help='path of yaml file') + parser.add_argument('--train_file',required=True, type=str,help='path of train output file') + parser.add_argument('--test_file',required=True, type=str,help='path of test output file') + args = parser.parse_args() + + create_train_file(digit_file=args.sdg_file, train_output_file=args.train_file) + create_test_file(yaml_file=args.yaml_file, test_output_file=args.test_file) \ No newline at end of file diff --git a/ilab/create_ilab_skill_yaml.py b/ilab/create_ilab_skill_yaml.py new file mode 100644 index 000000000..5026cd3fc --- /dev/null +++ b/ilab/create_ilab_skill_yaml.py @@ -0,0 +1,197 @@ +from lh_eval_api import load_lh_dataset +from unitxt.api import load_dataset +from unitxt import register_local_catalog +from typing import List, Optional +import yaml +from collections import Counter +import random +from dataclasses import dataclass +from unitxt.settings_utils import get_settings + +settings = get_settings() +settings.allow_unverified_code = True + +@dataclass +class SeedExample: + """ + Represents an example seed item with question, answer, and optionally context. 
+ + Attributes: + question (str): A question for the model + answer (str): The desired response from the model + context (Optional[str]): For grounded skills - context containing information that the model is expected to take into account during processing + """ + question: str + answer: str + context: Optional[str] = None + max_length: int = 2300 + + def get_length(self): + q_len = len(self.question.split()) + a_len = len(self.answer.split()) + return a_len+q_len + + def __post_init__(self): + length = self.get_length() + if length > self.max_length: + raise ValueError(f"Question + Answer must not exceed {self.max_length} words. Currently there are ~{length} words") + + def _to_dict(self)->dict: + data = { + 'question': self.question, + 'answer': self.answer + } + if self.context is not None: + data['context'] = self.context + + return data + +@dataclass +class IlabSkillAdder: + """ + Represents the task description including the sdgbuilder, creator, and a list of seed examples. + + Attributes: + task_description (str): A description of the skill. + created_by (str): The GitHub username of the contributor. + seed_examples (List[SeedExample]): A list of seed examples related to the skill. The file must contain 5 examples. + """ + task_description: str + created_by: str + yaml_file_path: str + seed_examples: List[SeedExample] + data_builder:str = "skills_sdg" + num_required_examples:int = 5 + + def __post_init__(self): + num_examples = len(self.seed_examples) + if num_examples!= self.num_required_examples: + raise ValueError(f"Skill Adder must contain exactly {self.num_required_examples} examples. Currently there are {num_examples}") + self._save_to_yaml() + + + + def _save_to_yaml(self) -> None: + def quoted_presenter(dumper, data): + return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='"') + + yaml.add_representer(str, quoted_presenter) + yaml.add_representer(int, yaml.representer.SafeRepresenter.represent_int) + + data = { + 'data_builder': self.data_builder, + 'task_description': self.task_description, + 'created_by': self.created_by, + 'seed_examples': [example._to_dict() for example in self.seed_examples] + } + + with open(self.yaml_file_path, 'w') as file: + yaml.dump(data, file, default_flow_style=False, sort_keys=False) + + print(f"Data saved to {self.yaml_file_path}") + +@dataclass +class IlabParameters: + task_description:str + yaml_file:str + card:str + creator:str = None + template:str = None + template_index:int = None + question_field:str = 'source' + answer_field:str = 'target' + context_field:str = None + loader_limit:int = 100 + local_catalog:str = None + use_question_field_as_text:bool=False + + +def select_indices_by_classes(dataset, num_samples): + def get_target_indices(target,dataset, num_indices): + target_indices = [i for i,x in enumerate(dataset) if target in x['target']] + return random.sample(target_indices,num_indices) + + indices = [] + freq_classes = Counter(dataset['target']).most_common(num_samples) + n = len(freq_classes) + base, remainder = divmod(num_samples,n) + distribution = [base] * n + for i in range(remainder): + distribution[i]+=1 + + for i,cls in enumerate(freq_classes): + target = cls[0] + target_num_samples = distribution[i] + print(f"Fetching {target_num_samples} samples for target {target}") + indices.extend(get_target_indices(target,dataset,target_num_samples)) + + return indices + +def select_random_indices(dataset, num_samples): + return random.sample(range(len(dataset)), num_samples) + +def 
create_yaml(parameters:IlabParameters,distribute=True): + if parameters.local_catalog: + register_local_catalog(parameters.local_catalog) + if parameters.template is not None: + loaded_dataset = load_dataset(card=parameters.card, template =parameters.template, loader_limit = parameters.loader_limit) + elif parameters.template_index is not None: + loaded_dataset = load_dataset(card=parameters.card, template_card_index=parameters.template_index, loader_limit = parameters.loader_limit) + else: + raise ValueError("must have either template or template card index") #TODO error if both are not none + dataset = loaded_dataset['train'] + examples = [] + if distribute: + indices = select_indices_by_classes(dataset,5) + else: + indices = select_random_indices(dataset,5) + parameters.task_description = parameters.task_description + f" (indices: {indices})" + for idx in indices: + example_data = dataset[idx] + # if 'task_data' in example_data: + # example_data = json.loads(example_data['task_data']) + + if parameters.use_question_field_as_text: + question = parameters.question_field + else: + question = example_data[parameters.question_field] + answer = example_data[parameters.answer_field] + context = example_data[parameters.context_field] if parameters.context_field else None + examples.append(SeedExample( + question=question, answer=answer, context=context + )) + print(f"Using the following indices: {indices}") + + IlabSkillAdder( + task_description=parameters.task_description, + created_by=parameters.creator, + seed_examples=examples, + yaml_file_path=parameters.yaml_file + ) + + + +cnn_example = IlabParameters( + task_description="dailymail summarization with context simple", + card="cards.cnn_dailymail", + yaml_file="dailymail_summarization_w_context.yaml", + template="templates.classification.multi_label.text_before_instruction_with_type_of_classes_and_none", + question_field = 'Summarize the following article.\n', + use_question_field_as_text=True, + answer_field = 'summary', + context_field = 'document', +) + + +clapnq = IlabParameters( + card='cards.rag.response_generation.clapnq', + task_description='rag', + yaml_file='ilab/sdg/clapnq.yaml', + template="templates.rag.response_generation.please_respond" +) + + +if __name__ == "__main__": + create_yaml(clapnq) + + diff --git a/ilab/evaluate_from_file.py b/ilab/evaluate_from_file.py new file mode 100644 index 000000000..1bd0da00a --- /dev/null +++ b/ilab/evaluate_from_file.py @@ -0,0 +1,196 @@ +import pandas as pd +from unitxt import evaluate, load_dataset, register_local_catalog +from unitxt.logging_utils import get_logger,get_settings +from unitxt.blocks import ( + TaskCard, +) +from unitxt.loaders import LoadCSV +from unitxt.operators import Rename,Cast,ExecuteExpression +from unitxt.inference import IbmGenAiInferenceEngine + +from lh_eval_api import LakeHouseLoader +from typing import List,Tuple +import os, ast +from dataclasses import dataclass +from ilab_evaluate import save_results + +logger = get_logger() +settings = get_settings() +settings.allow_unverified_code = True + +LOCAL_CATALOG = "../fm-eval/fm_eval/catalogs/private" +force_import = type(LakeHouseLoader) # lakehouseloader import is needed here + +@dataclass +class JudgeModelParams: + name:str + model_id:str + template:str + format:str + norm_to_range:int + +STRICT_MODELS = [ + JudgeModelParams(name='prometheus_3_classify', model_id= 'kaist-ai/prometheus-8x7b-v2', format="formats.models.mistral.instruction", norm_to_range=3, +
template="templates.response_assessment.rating.prometheus_single_turn_inst_classify_3"), + JudgeModelParams(name='mixtral_strict', model_id="mistralai/mixtral-8x7b-instruct-v01", format="formats.models.mistral.instruction", norm_to_range=10, template="templates.response_assessment.rating.mt_bench_single_turn_strict"), + JudgeModelParams(name='llama_strict', model_id="meta-llama/llama-3-70b-instruct", template= "templates.response_assessment.rating.generic_single_turn_strict", format="formats.llama3_instruct", norm_to_range=10), + +] + +JUDGE_MODELS = [ + JudgeModelParams(name='prometheus_5', model_id= 'kaist-ai/prometheus-8x7b-v2', format="formats.models.mistral.instruction", norm_to_range=5, + template="templates.response_assessment.rating.prometheus_single_turn_inst_comply_5"), + JudgeModelParams(name='mixtral', model_id="mistralai/mixtral-8x7b-instruct-v01", format="formats.models.mistral.instruction", norm_to_range=10, template="templates.response_assessment.rating.mt_bench_single_turn"), + JudgeModelParams(name='llama', model_id="meta-llama/llama-3-70b-instruct", template= "templates.response_assessment.rating.generic_single_turn", format="formats.llama3_instruct", norm_to_range=10), + ] + +def get_strict_val(template): + for substr in ['strict','format','classify']: + if substr in template: + return 'True' + return 'False' + +class PostEvaluate: + def __init__( + self, + preds_file:str, + pred_column:str = 'processed_model_prediction', + index_column:str = 'record_index', + score_column:str = 'score', + judging_models:List[JudgeModelParams]= JUDGE_MODELS, + local_catalog:str = LOCAL_CATALOG, + dataset_split:str = 'test' + ) -> None: + if not os.path.exists(preds_file): + raise ValueError(f"File doesn't exist: {preds_file}") + runs_file = preds_file.replace('predictions','run') + if not os.path.exists(runs_file): + raise ValueError( + f"Run file not found. 
Expecting a matching file with suffix 'run': {runs_file}" + ) + self.preds_file = preds_file + self.run_file = runs_file + self.pred_column = pred_column + self.index_column = index_column + self.score_column = score_column + self.judging_models = judging_models + self.local_catalog = local_catalog + self.dataset_split = dataset_split + logger.info("evaluator initiated") + + def get_params_from_file(self)->Tuple[str,str,str,str,str, dict]: + df = pd.read_csv(self.run_file) + data = df.iloc[0].to_dict() + model = data['model_name'] + card = f"cards.{data['dataset']}" + print(data['run_params']) + run_params = ast.literal_eval(data['run_params']) + try: + template = run_params['template'] + except KeyError: + raise ValueError('template data missing in file') + owner = data['owner'] + task = data['task'] + logger.info("params collected") + return card,template, model, owner, task, run_params + + + def run(self, overwrite = False, loader_limit = 100): + if self.local_catalog: + register_local_catalog(self.local_catalog) + card,template, model, owner,task, run_params = self.get_params_from_file() + run_params['template']=f"'{template}'" + loader_limit = min(loader_limit, int(run_params['loader_limit'])) + run_params['loader_limit'] = loader_limit + for modelparams in self.judging_models: + model = modelparams.name + model_csv_path = self.preds_file.replace('predictions',f"{model}") + if not overwrite: + if os.path.exists(model_csv_path.replace('.csv','_predictions.csv')): + logger.info(f"**** file already exists, skipping: {model_csv_path}") + continue + model_id = modelparams.model_id + logger.info(f"Judging model: {model_id}") + model_run_params = run_params.copy() + model_run_params['file'] = model_csv_path + model_run_params['meta_eval']='True' + model_run_params['with_reference'] = 'False' + model_run_params['judge_template'] = modelparams.template + model_run_params['judge_format'] = modelparams.format + model_run_params['strict'] = get_strict_val(modelparams.template) + try: + evaluated_dataset = self.evaluate_meta_task(modelparams,loader_limit=loader_limit) + except Exception as e: + logger.error(f"**** Error while inferring for: {model_csv_path}") + logger.error(e) + return + print(evaluated_dataset[0]['score']['global']) + save_results( + csv_path=model_csv_path, + evaluated_dataset=evaluated_dataset, + model_name=model_id, + owner=owner, + card=card, + task_name=task, + run_params_dict=model_run_params, + append_model_name=False, + pred_n_ref_are_scores=True, + ) + + + + def evaluate_meta_task(self, modelparams:JudgeModelParams, loader_limit:int): + task = "tasks.response_assessment.rating.single_turn" + template = modelparams.template + meta_card = TaskCard( + LoadCSV(files={'test':self.preds_file}), + preprocess_steps=[ + Rename( + field_to_field={ + "unformatted_input": "question", + # "score": "rating", + "processed_model_prediction": "answer", + "references": "reference_answer", + } + ), + ExecuteExpression(expression=f"1+(float(score)*({modelparams.norm_to_range}-1))", to_field="rating"), + Cast(to="str", failure_default='None', field_to_field={"answer":"answer"}) + ], + task = task, + templates = [template], + ) + + logger.info('loading evaluation dataset...') + dataset = load_dataset( + card = meta_card, + template = template, + format = modelparams.format, + loader_limit = loader_limit + # metrics = [ensemble_metric] + ) + model_id = modelparams.model_id + logger.info(f'Inferring with {model_id}') + multiseed_predictions = [] + for seed in [4]:#,100, 213, 706, 900]: + 
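# NOTE: as written, only the predictions from the last seed are passed to evaluate() below; aggregating multiseed_predictions across seeds is still a TODO (see the comments that follow). +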
inference_model = IbmGenAiInferenceEngine(model_name=model_id, random_seed = seed) + predictions = inference_model.infer(dataset['test']) + multiseed_predictions.append(predictions) + # chain all predictions in a single instance + # change processor to extract 5 numbers and avg them + # change evaluated dataset input to be new predictions set + logger.info('Evaluating model judgments') + evaluated_dataset = evaluate(predictions=predictions, data=dataset['test']) + return evaluated_dataset + + +if __name__ == '__main__': + + from glob import glob + files_to_post_evaluate = glob('ilab/ilab_results/granite_ilab/base_*_shots_predictions.csv') + for file in files_to_post_evaluate: + ev = PostEvaluate( + file, dataset_split='test') + ev.run(overwrite=False, loader_limit=1000) + + + \ No newline at end of file diff --git a/ilab/ilab_evaluate.py b/ilab/ilab_evaluate.py new file mode 100644 index 000000000..7ea40bff7 --- /dev/null +++ b/ilab/ilab_evaluate.py @@ -0,0 +1,371 @@ +import ast +import re + +import yaml +#from lh_eval_api import load_lh_dataset +from unitxt.api import load_dataset +from unitxt import evaluate, load_dataset, register_local_catalog +from unitxt.inference import OpenAiInferenceEngineParams, \ + NonBatchedInstructLabInferenceEngine +import pandas as pd +from datetime import datetime +from typing import List,Dict,Any, Tuple +from datasets import DatasetDict +import argparse +import importlib +from dataclasses import dataclass,asdict +from unitxt.artifact import fetch_artifact +from unitxt.operator import SequentialOperator + +@dataclass +class IlabRunParams: + file:str + yaml_indices:List[int] + template:str + loader_limit:int + num_shots:int + base_model:bool + is_yaml:bool + + def to_dict(self): + return asdict(self) + + +class EvaluateIlab: + host_machine:str + card:str + task_name:str + yaml_file:str + is_trained:bool + template:str + template_index: int + local_catalog:str + num_test_samples:int + owner:str + llmaaj_metric:str + eval_yaml_only:bool + lh_predictions_namespace:bool + + + def __init__( + self, + host_machine:str, + card:str, + task_name:str, + yaml_file:str, + is_trained:bool = False, + template: str = None, + template_index: int = None, + local_catalog:str = None, + num_test_samples:int = 100, + owner:str = 'ilab', + llmaaj_metric:List[str] = ['metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_generic_single_turn'], + eval_yaml_only:bool = False, + lh_predictions_namespace:bool = False + ): + self.card = card + self.host_machine = host_machine + self.template = template + self.template_index = template_index + self.task_name = task_name + self.yaml_file = yaml_file + self.local_catalog = local_catalog + self.num_test_samples= num_test_samples + self.is_trained = is_trained + self.owner = owner + if self.local_catalog: + register_local_catalog(self.local_catalog) + self.llmaaj_metric = llmaaj_metric + self.eval_yaml_only = eval_yaml_only + self.folder = 'ilab/ilab_results' + self.lh_predictions_namespace = lh_predictions_namespace + self.yaml_indices = self.get_yaml_indices() #reported in run details + used for evaluation + + def infer_from_model(self,dataset:DatasetDict) -> Tuple[List[Dict[str, Any]],str]: + test_dataset = dataset['test'] + inference_model = NonBatchedInstructLabInferenceEngine( + parameters=OpenAiInferenceEngineParams(max_tokens=1000), + base_url = f'http://{self.host_machine}.pok.ibm.com:9000/v1' + ) + predictions = inference_model.infer(test_dataset) + model_name = inference_model.model + evaluated_dataset = 
evaluate(predictions=predictions, data=test_dataset) + print(evaluated_dataset[0]['score']['global']) + return evaluated_dataset, model_name + + def load_test_data(self, num_shots:int): + dataset = load_dataset( + card=self.card, + template=self.template, + template_card_index = self.template_index, + loader_limit=self.num_test_samples, + num_demos = num_shots, + demos_pool_size = num_shots*4 + ) + return dataset + + def run(self, csv_path = None): + if not csv_path: + trained = 'trained' if self.is_trained else 'base' + csv_path = f'{self.folder}/{self.yaml_file.split("/")[-1].replace(".yaml","")}_{trained}.csv' + if not self.eval_yaml_only: + for numshot in [0,5]: + if numshot == 5 and self.num_test_samples < 50: + continue + self.test_load_infer_and_save(num_shots=numshot,file=csv_path) + self.yaml_load_infer_and_save(csv_path) + if self.lh_predictions_namespace: + upload_to_lh(self.folder, self.lh_predictions_namespace) + + def yaml_load_infer_and_save(self, file): + yaml_dataset = self.create_dataset_from_yaml() + csv_path = file.replace('.csv','_yaml_eval.csv') + evaluated_yaml_datset, model_name = self.infer_from_model(yaml_dataset) + save_results( + csv_path = csv_path, + evaluated_dataset = evaluated_yaml_datset, + model_name = model_name, + card = self.card, + task_name = self.task_name, + run_params_dict = IlabRunParams( + file=csv_path,yaml_indices=self.yaml_indices, + template=self.template if self.template else self.template_index, + base_model=is_base_model(model_name), + is_yaml='True', loader_limit=None, num_shots=None + ).to_dict() + ) + + def test_load_infer_and_save(self,num_shots:int, file:str): + csv_path = file.replace('.csv',f'_{num_shots}_shots_{self.num_test_samples}_samples.csv') + dataset = self.load_test_data(num_shots) + evaluated_dataset, model_name = self.infer_from_model(dataset=dataset) + base_run_params = IlabRunParams( + file=csv_path, yaml_indices=self.yaml_indices, + template=self.template if self.template else self.template_index, + loader_limit=self.num_test_samples, + num_shots=num_shots, + base_model=is_base_model(model_name), + is_yaml='False', + ).to_dict() + + save_results( + csv_path=csv_path, + evaluated_dataset=evaluated_dataset, + model_name= model_name, + owner = self.owner, + card=self.card, + task_name=self.task_name, + run_params_dict=base_run_params + ) + + def get_yaml_indices(self): + with open(self.yaml_file, 'r') as f: + yaml_content = yaml.safe_load(f) + pattern = r"\(indices: (\[.*?\])\)" + match = re.search(pattern, yaml_content['task_description']) + assert match, f"yaml description should contain the chosen indices. 
" \ + f"Description: {yaml_content['task_description']}" + + yaml_indices = match.group(1) + yaml_indices = ast.literal_eval(yaml_indices) + return yaml_indices + + def create_dataset_from_yaml(self)-> DatasetDict: + if self.local_catalog: + register_local_catalog(self.local_catalog) + if self.template is not None: + loaded_dataset = load_dataset(card=self.card, template=self.template, + loader_limit=self.num_test_samples) + elif self.template_index is not None: + loaded_dataset = load_dataset(card=self.card, template_card_index=self.template_index, + loader_limit=self.num_test_samples) + else: + raise ValueError("must have either template or template card index") # TODO error if both are not none + #llmaaj_metric = 'metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_generic_single_turn' + dataset = {'test': [loaded_dataset['train'][i] for i in self.yaml_indices]} + #for instance in dataset['test']: + # instance['metrics'].append(llmaaj_metric) + return dataset + + +def upload_to_lh(folder, namespace): + + + import glob, pandas as pd, datetime,os + from lh_eval_api import EvaluationResultsUploader, PredictionRecord,RunRecord + from lh_eval_api.evaluation_data_services.evaluation_data_handlers.eval_uploader.evaluation_results_uploader import HandleExistingRuns + + def get_time(time_str): + try: + time = datetime.datetime.strptime(time_str,'%Y-%m-%d %H:%M:%S.%f') + except: + time = datetime.datetime.now() + return time + runs_files = glob.glob(os.path.join(folder,'*_run.csv')) + if len(runs_files) == 0: + raise ValueError("no files found") + print(f"Uploading {len(runs_files)} runs") + runs = [] + all_predictions = [] + for file in runs_files: + run_df = pd.read_csv(file) + prediction_file = file.replace('_run.csv','_predictions.csv') + run_df['inference_platform'] = 'ilab' + run_df['execution_env'] = 'ilab' + run_df['started_at'] = run_df['started_at'].apply(get_time) + for dict_str in ['all_scores', 'run_params']: + run_df[dict_str] = run_df[dict_str].apply(lambda x: eval(x.replace("np.float64", "float").replace("nan", "float('nan')"))) + row = run_df.iloc[0] + run_record = RunRecord( + **{col_name: row[col_name] for col_name in RunRecord.__dataclass_fields__ if col_name in run_df.columns} + ) + runs.append(run_record) + predictions_df = pd.read_csv(prediction_file) + predictions_df['run_id'] = run_record.run_id + if 'model_prediction' not in list(predictions_df): + predictions_df['model_prediction'] = predictions_df['processed_model_prediction'] + predictions_df['score'] = predictions_df['score'].apply(float) + predictions = predictions_df.apply( + lambda row: PredictionRecord( + **{col_name: row[col_name] for col_name in PredictionRecord.__dataclass_fields__ if col_name in predictions_df.columns} + ), + axis=1, + ).tolist() + all_predictions.extend(predictions) + + uploader = EvaluationResultsUploader( + runs=runs, + predictions=all_predictions, + predictions_namespace=namespace, + handle_existing=HandleExistingRuns.IGNORE + ) + uploader.upload() + +def is_base_model(model_name:str)->bool: + return 'merlinite-7b-lab-Q4_K_M.gguf' in model_name + +def save_results( + csv_path, + evaluated_dataset, + model_name,owner, + card, + task_name, + run_params_dict = {}, + append_model_name:bool = True, + pred_n_ref_are_scores:bool = False, + ): + global_scores = evaluated_dataset[0]['score']['global'] + num_instances = len(evaluated_dataset) + if pred_n_ref_are_scores: + global_scores['invalid_refs_%'] = len([inst for inst in evaluated_dataset if 
inst['processed_references'][0]<0])/num_instances + global_scores['invalid_preds_%'] = len([inst for inst in evaluated_dataset if inst['processed_prediction']<0])/num_instances + global_scores['refs_score'] = sum([inst["processed_references"][0] for inst in evaluated_dataset])/num_instances + global_scores['preds_score'] = sum([inst["processed_prediction"] for inst in evaluated_dataset])/num_instances + main_score_name = global_scores.pop('score_name') + global_main_score = global_scores[main_score_name] + if not csv_path.endswith('.csv'): + csv_path = csv_path+'.csv' + if append_model_name: + csv_path = csv_path.replace('.csv',f'_{model_name.split("/")[-1]}.csv') + print(f"saving to {csv_path}...") + run_data = { + 'owner':owner, + 'started_at':datetime.now(), + 'framework':'Unitxt', + 'benchmark':'ilab', + 'dataset':card.replace('cards.',''), + 'task': task_name, + 'model_name': model_name, + 'score': global_main_score, + 'score_name':main_score_name, + 'all_scores':global_scores, + 'run_params':run_params_dict + } + pd.DataFrame([run_data]).to_csv(csv_path.replace('.csv','_run.csv'),index=False) + predictions_data = [] + for i,item in enumerate(evaluated_dataset): + predictions_data.append({ + 'record_index':i, + 'model_input':item["task_data"]["source"], + 'references':str(item["references"]), + 'model_prediction':item['prediction'], + 'processed_model_prediction': item["processed_prediction"], + 'processed_references':str(item["processed_references"]), + 'score':item["score"]["instance"]["score"], + 'score_name':item["score"]["instance"]["score_name"], + 'data_split':"test", + 'unformatted_input':get_unformatted_input(item) + }) + pd.DataFrame(predictions_data).to_csv(csv_path.replace('.csv','_predictions.csv'),index=False) + +def get_unformatted_input(evaluated_dataset_instance): + task_data_instance = evaluated_dataset_instance['task_data'] + template = task_data_instance["metadata"]["template"] + template, _ = fetch_artifact(template) + cleaned_instance = SequentialOperator( + steps=[template, "formats.empty"] #add nshot=0? + ).process_instance( + { + "input_fields": task_data_instance, + "reference_fields": task_data_instance, + } + ) + return cleaned_instance['source'] + + + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='evaluate dataset against ilab model and save results') + + parser.add_argument('--card', type=str, help='Card name') + parser.add_argument('--template', type=str, help='Template name') + parser.add_argument('--task_name', type=str,help='Task name, e.g. classification, translation etc.') + parser.add_argument('--host_machine', type=str, required=True, help='Name of the host machine serving the model (e.g. cccxc450)') + parser.add_argument('--yaml_file', type=str, help='Path of yaml file containing examples') + + parser.add_argument('--trained_model_flag',action="store_true", help='Optional: Mark if evaluation is on trained model') + parser.add_argument('--local_catalog', type=str, default=None, help='Optional: If using a non unitxt card, local Catalog path, None by default') + parser.add_argument('--num_test_samples', type=int, default=100, help='Optional: Num of assessed records, 100 by default') + parser.add_argument('--owner',type=str,default='ilab',help='Optional: Name of run owner, to be saved in result files') + parser.add_argument('--card_config', type=str, + help='Optional: card_config name. 
It should be defined at create_ilab_skill_yaml.py') + parser.add_argument('--only_yaml_flag', action='store_true', help='Optional: ran only yaml evaluation' ) + parser.add_argument('--lh_upload_namespace',type=str, help='Optional: specify predictions namespace in order to upload to lakehouse') + args = parser.parse_args() + + if args.card_config is not None: + module = importlib.import_module('create_ilab_skill_yaml') + config = getattr(module, args.card_config) + card = config.card + template = config.template + template_index = config.template_index + task_name = config.task_description + yaml_file = config.yaml_file + else: + card = args.card + template = args.template + task_name = args.task_name + yaml_file = args.yaml_file + + + evaluator = EvaluateIlab( + card = card, + template = template, + task_name=task_name, + host_machine=args.host_machine, + yaml_file=yaml_file, + is_trained=args.trained_model_flag, + num_test_samples=args.num_test_samples, + local_catalog=args.local_catalog, + owner = args.owner, + eval_yaml_only = args.only_yaml_flag, + template_index=template_index, + lh_predictions_namespace = args.lh_upload_namespace + ) + evaluator.run() + + # Example: + # python ilab/ilab_evaluate.py --card_config watson_emotion_classes_first_example --host_machine cccxc408 --local_catalog ../fm-eval/fm_eval/catalogs/private --only_yaml_flag + \ No newline at end of file diff --git a/ilab/run_base_bam_model.py b/ilab/run_base_bam_model.py new file mode 100644 index 000000000..8d62853c2 --- /dev/null +++ b/ilab/run_base_bam_model.py @@ -0,0 +1,122 @@ +from unitxt.inference import IbmGenAiInferenceEngine +from unitxt.api import evaluate, load_dataset +from ilab.ilab_evaluate import save_results,IlabRunParams +from ilab.create_ilab_skill_yaml import cat,clapnq,fin_qa,watson_emotion,ner, IlabParameters +from unitxt import register_local_catalog +from unitxt.system_prompts import TextualSystemPrompt +from unitxt.formats import SystemFormat +import os + +def get_base_model_predictions(test_dataset, model_name): + inference_model = IbmGenAiInferenceEngine( + model_name=model_name, max_new_tokens=1000 + ) + predictions = inference_model.infer(test_dataset) + evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) + return evaluated_dataset + +def get_prompt(): + return TextualSystemPrompt( + "You are an AI language model developed by IBM Research. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior." 
+) +def get_format(num_shots=0): + if num_shots == 0: + return SystemFormat( + model_input_format='<|system|>\n{system_prompt}\n<|user|>\n{source}\n<|assistant|>\n' +) + if num_shots > 0: + return SystemFormat( + demo_format="Question:\n{source}\nAnswer:\n{target_prefix}{target}\n\n", + model_input_format="<|system|>\n{system_prompt}\n<|user|>\n\n{demos}\nQuestion:\n{source}\nAnswer:\n<|assistant|>{target_prefix}", + ) + + +def run(csv_path_to_save, config:IlabParameters,num_shots,loader_limit=100, overwrite=False): + model_name = "ibm/granite-7b-lab" + template = config.template if config.template else config.template_index + card = config.card + csv_path_to_save = f"{csv_path_to_save.replace('.csv','')}_{config.card.replace('cards.','')}_{num_shots}_shots.csv" + if not overwrite: + if os.path.exists(csv_path_to_save.replace('.csv','_predictions.csv')): + return + if config.local_catalog: + register_local_catalog(config.local_catalog) + load_params = { + 'card':card, + 'loader_limit':loader_limit, + 'system_prompt':get_prompt(), + 'format':get_format(num_shots=num_shots), + 'num_demos':num_shots, + 'demos_pool_size':num_shots*4 + } + if isinstance(template,int): + load_params['template_card_index']=template + else: + load_params['template']=template + dataset = load_dataset(**load_params) + test_dataset = dataset['test'] + evaluated_dataset = get_base_model_predictions(test_dataset,model_name=model_name) + save_results( + csv_path=csv_path_to_save, + evaluated_dataset=evaluated_dataset, + model_name=model_name, + owner='Roni', + card=card, + task_name=config.task_description, + run_params_dict = IlabRunParams( + file=csv_path_to_save, + yaml_indices=[], + template=template, + loader_limit=loader_limit, + num_shots=num_shots, + base_model=True, + is_yaml=False).to_dict(), + append_model_name=False + ) + + +universal_NER_pud = IlabParameters( + task_description='span_labeling', + card='cards.universal_ner.en.pud', + creator='', + yaml_file='', + template_index=0 +) + +BillSum = IlabParameters( + card='cards.billsum_document_filtered_to_6000_chars', + task_description='summarization', + creator='', + yaml_file='', + template_index=0 + +) +DoQA_travel = IlabParameters( + card = 'cards.rag.response_generation.chat_rag_bench.user_assistant_format.doqa_travel', + task_description='rag.response_generation', + creator='', + yaml_file='', + template="templates.rag.response_generation.please_respond_chat" + # template_index='default' +) +flores_101_spa_eng = IlabParameters( + card='cards.mt.flores_101.spa_eng', + task_description='translation', + creator='', + yaml_file='', + template_index=0 +) + +if __name__ == "__main__": + configs = [ DoQA_travel] #flores_101_spa_eng, universal_NER_pud, BillSum, + for config in configs: + loader_limit = 1000 + if 'clapnq' in config.card or 'fin_qa' in config.card: + numshots = [0] + if 'clapnq' in config.card: + loader_limit = 250 + for numshots in [0]: + print(f"running {config.card}") + run(f'ilab/ilab_results/granite_ilab/base',config,numshots,loader_limit=loader_limit) + + \ No newline at end of file diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index e482adbbe..d87c76289 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -1,8 +1,10 @@ import abc import dataclasses import os +import requests import re from typing import Any, Dict, List, Literal, Optional, Union +from dataclasses import field from datasets import DatasetDict from tqdm import tqdm @@ -498,6 +500,9 @@ class OpenAiInferenceEngine( ): label: str = "openai" 
model_name: str + parameters: OpenAiInferenceEngineParams = field( + default_factory=OpenAiInferenceEngineParams + ) _requirements_list = { "openai": "Install openai package using 'pip install --upgrade openai" } @@ -606,7 +611,83 @@ def get_return_object(self, predict_result, response, return_meta_data): inference_type=self.label, ) return predict_result + +class InstructLabInferenceEngine( + InferenceEngine, LogProbInferenceEngine, PackageRequirementsMixin +): + base_url: str = 'http://127.0.0.1:8000/v1' #URL for local serving of the model + parameters: OpenAiInferenceEngineParams = field( + default_factory=OpenAiInferenceEngineParams + ) + batch_size: int = 10 + + def prepare(self): + try: + response = requests.get(f"{self.base_url}/models") + response.raise_for_status() + self.model = response.json()['data'][0]['id'] + except requests.exceptions.RequestException as e: + raise ValueError(f"Error fetching model ID: {e}") + except (KeyError, IndexError): + raise ValueError("Error: Failed to parse model ID from response JSON.") + + def extract_inferences(self,http_response): + data = http_response.json() + return [choice['text'] for choice in data['choices']] + + def get_dataset_batch(self, dataset): + return [dataset[i:i + self.batch_size]['source'] for i in range(0, len(dataset), self.batch_size)] + + def add_parametes_to_payload(self, payload): + params = {attr: getattr(self.parameters, attr) for attr in OpenAiInferenceEngineParams.__dict__.keys() if not attr.startswith("_") and getattr(self.parameters, attr) is not None} + payload.update(params) + + def _infer(self,dataset): + outputs = [] + for batch in tqdm(self.get_dataset_batch(dataset), desc=f"Inferring with InstructLab - model: {self.model}"): + payload = { + "prompt": batch, + 'model': self.model + } + self.add_parametes_to_payload(payload) + try: + response = requests.post(self.base_url+"/completions", json=payload) + response.raise_for_status() + batch_outputs = self.extract_inferences(response) + except requests.exceptions.RequestException as e: + raise ValueError(f"Error sending request: {e}") + except (KeyError, IndexError): + raise ValueError("Error: Failed to parse response JSON or access data.") + + outputs.extend(batch_outputs) + return outputs + + def _infer_log_probs(self, dataset): + return None +class NonBatchedInstructLabInferenceEngine(InstructLabInferenceEngine): + def extract_inference(self, http_response): + data = http_response.json() + return data['choices'][0]['text'] + + def _infer(self,dataset): + outputs = [] + for instance in tqdm(dataset, desc=f"Inferring with InstructLab - model: {self.model}"): + payload = { + "prompt": instance["source"], + 'model': self.model + } + self.add_parametes_to_payload(payload) + try: + response = requests.post(self.base_url + "/completions", json=payload) + response.raise_for_status() + output = self.extract_inference(response) + except requests.exceptions.RequestException as e: + raise ValueError(f"Error sending request: {e}") + except (KeyError, IndexError): + raise ValueError("Error: Failed to parse response JSON or access data.") + outputs.append(output) + return outputs class TogetherAiInferenceEngineParamsMixin(Artifact): max_tokens: Optional[int] = None
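
Usage note (not part of the patch): the snippet below is a minimal sketch of how the new NonBatchedInstructLabInferenceEngine added in src/unitxt/inference.py is meant to be driven, mirroring the call pattern used in ilab/ilab_evaluate.py. The card, template index and loader limit are illustrative placeholders; the base_url shown is simply the engine's default for a locally served InstructLab model.

from unitxt import evaluate, load_dataset
from unitxt.inference import (
    NonBatchedInstructLabInferenceEngine,
    OpenAiInferenceEngineParams,
)

# Load a small unitxt dataset (placeholder card / template index).
dataset = load_dataset(card="cards.wnli", template_card_index=0, loader_limit=10)

# Point the engine at the locally served model; prepare() discovers the model id
# from <base_url>/models and infer() posts each instance's source to /completions.
engine = NonBatchedInstructLabInferenceEngine(
    parameters=OpenAiInferenceEngineParams(max_tokens=1000),
    base_url="http://127.0.0.1:8000/v1",
)
predictions = engine.infer(dataset["test"])
evaluated = evaluate(predictions=predictions, data=dataset["test"])
print(evaluated[0]["score"]["global"])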