pubmedqa_long_binary_case1d.py

#!pip3 install --upgrade transformers optimum
#!pip3 install --upgrade auto-gptq

import sys
sys.path.append('/home/add_your_path/lm-evaluation-harness/')
from lm_eval.api.task import Task
from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_task
from lm_eval.api.metrics import mean

import sacrebleu
import json


def bleu(refs, preds):
    """
    Returns `t5` style BLEU scores. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41
    :param refs:
        A `list` of `list` of reference `str`s.
    :param preds:
        A `list` of predicted `str`s.
    """
    score = sacrebleu.corpus_bleu(preds, refs, smooth_method="exp", smooth_value=0.0, force=False, lowercase=False, tokenize="intl", use_effective_order=False).score
    return score
    
def reverse_answer(ans):
    if ans == 'yes':
        return 'no'
    else:
        return 'yes'

@register_task("pubmedqa_long_binary_case1d")
class PubmedqaLongBinaryCase1d(Task):
    VERSION = 0
    DATASET_PATH = "datasets/pubmed_qa_labeled_fold0_source_binary_physician_acc"
    DATASET_NAME = "default"

    def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
        super().__init__(data_dir=data_dir, cache_dir=cache_dir, download_mode=download_mode, config=config)
        self.bert_score = None
        self.completions = []

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        return self.dataset["train"]

    def validation_docs(self):
        return self.dataset["validation"]
    
    def test_docs(self):
        return self.dataset["test"]
    
    def prompt(self):
        """
        Task explanation.
        Here we define the task instructions and few shot examples. We have 4 examples, each containing a context, question, answer, and long answer.
        """
        
        context_1 = "To evaluate the degree to which histologic chorioamnionitis, a frequent finding in placentas submitted for histopathologic evaluation, correlates with clinical indicators of infection in the mother.\nA retrospective review was performed on 52 cases with a histologic diagnosis of acute chorioamnionitis from 2,051 deliveries at University Hospital, Newark, from January 2003 to July 2003. Third-trimester placentas without histologic chorioamnionitis (n = 52) served as controls. Cases and controls were selected sequentially. Maternal medical records were reviewed for indicators of maternal infection.\nHistologic chorioamnionitis was significantly associated with the usage of antibiotics (p = 0.0095) and a higher mean white blood cell count (p = 0.018). The presence of 1 or more clinical indicators was significantly associated with the presence of histologic chorioamnionitis (p = 0.019)."
        question_1 = "Does histologic chorioamnionitis correspond to clinical chorioamnionitis?"
        answer_1 = "yes"
        long_answer_1 = "Histologic chorioamnionitis is a reliable indicator of infection whether or not it is clinically apparent."
        context_2 = "Cancer of the buccal mucosa is an uncommon and aggressive neoplasm of the oral cavity. Less than 2% of patients treated for cancer of the oral cavity at Roswell Park Cancer Institute (RPCI) from 1971 to 1997 had primary buccal cancers. Because the majority of these patients did not undergo any adjuvant treatment, this group provided us with the opportunity to assess the relationship between margin status and local recurrence for both small (T1-T2) and large (T3-T4) tumors treated with surgery alone.\nThe RPCI tumor registry database reported 104 patients who were treated for buccal carcinoma. A retrospective chart review identified 27 patients who met our criteria for a buccal mucosal primary tumor (epicenter of the mass in the buccal mucosa). There were 13 men and 14 women, ranging in age from 34 to 94 years (mean, 75). Data were collected regarding patient demographics, presenting symptoms, stage, treatment received, and outcome.\nAll patients underwent surgical resection of their primary lesion; 21 (75%) had T1 or T2 tumors. The rate of local recurrence was 56% for the group as a whole. Patients with close or positive margins had a 66% local failure rate as compared with 52% when surgical margins were negative (greater than or equal to 5 mm from the resection margin after tissue fixation; P = ns). Among those in whom negative margins were achieved, patients with T1-T2 disease had a 40% local failure rate with surgical resection alone."
        question_2 = "Cancer of the buccal mucosa: are margins and T-stage accurate predictors of local control?"
        answer_2 = "no"
        long_answer_2 = "Local excision of T1 and T2 buccal mucosa cancers with pathologically negative margins had a high rate of local recurrence in our series. Low T-stage and negative margins are not adequate predictors of local control. Even early buccal tumors may benefit from adjuvant therapy to enhance local control."
        context_3 = "To be able to adhere to discharge instructions after a visit to the emergency department (ED), patients should understand both the care that they received and their discharge instructions. The objective of this study is to assess, at discharge, patients' comprehension of their ED care and instructions and their awareness of deficiencies in their comprehension.\nWe conducted structured interviews of 140 adult English-speaking patients or their primary caregivers after ED discharge in 2 health systems. Participants rated their subjective understanding of 4 domains: (1) diagnosis and cause; (2) ED care; (3) post-ED care, and (4) return instructions. We assessed patient comprehension as the degree of agreement (concordance) between patients' recall of each of these domains and information obtained from chart review. Two authors scored each case independently and discussed discrepancies before providing a final concordance rating (no concordance, minimal concordance, partial concordance, near concordance, complete concordance).\nSeventy-eight percent of patients demonstrated deficient comprehension (less than complete concordance) in at least 1 domain; 51% of patients, in 2 or more domains. Greater than a third of these deficiencies (34%) involved patients' understanding of post-ED care, whereas only 15% were for diagnosis and cause. The majority of patients with comprehension deficits failed to perceive them. Patients perceived difficulty with comprehension only 20% of the time when they demonstrated deficient comprehension."
        question_3 = "Patient comprehension of emergency department care and instructions: are patients aware of when they do not understand?"
        answer_3 = "no"
        long_answer_3 = "Many patients do not understand their ED care or their discharge instructions. Moreover, most patients appear to be unaware of their lack of understanding and report inappropriate confidence in their comprehension and recall."
        context_4 = "Complex regional pain syndrome type I is treated symptomatically. A protective effect of vitamin C (ascorbic acid) has been reported previously. A dose-response study was designed to evaluate its effect in patients with wrist fractures.\nIn a double-blind, prospective, multicenter trial, 416 patients with 427 wrist fractures were randomly allocated to treatment with placebo or treatment with 200, 500, or 1500 mg of vitamin C daily for fifty days. The effect of gender, age, fracture type, and cast-related complaints on the occurrence of complex regional pain syndrome was analyzed.\nThree hundred and seventeen patients with 328 fractures were randomized to receive vitamin C, and ninety-nine patients with ninety-nine fractures were randomized to receive a placebo. The prevalence of complex regional pain syndrome was 2.4% (eight of 328) in the vitamin C group and 10.1% (ten of ninety-nine) in the placebo group (p=0.002); all of the affected patients were elderly women. Analysis of the different doses of vitamin C showed that the prevalence of complex regional pain syndrome was 4.2% (four of ninety-six) in the 200-mg group (relative risk, 0.41; 95% confidence interval, 0.13 to 1.27), 1.8% (two of 114) in the 500-mg group (relative risk, 0.17; 95% confidence interval, 0.04 to 0.77), and 1.7% (two of 118) in the 1500-mg group (relative risk, 0.17; 95% confidence interval, 0.04 to 0.75). Early cast-related complaints predicted the development of complex regional pain syndrome (relative risk, 5.35; 95% confidence interval, 2.13 to 13.42)."
        question_4 = "Can vitamin C prevent complex regional pain syndrome in patients with wrist fractures?"
        answer_4 = "yes"
        long_answer_4 = "Vitamin C reduces the prevalence of complex regional pain syndrome after wrist fractures. A daily dose of 500 mg for fifty days is recommended."       
        
        explanation = "You are a supportive, respectful, and truthful assistant, dedicated to providing assistance in a clinical context. Your responses must adhere to the highest standards of safety, ethics, and professional integrity. They should be free from any form of bias (e.g., racial, gender-based, socio-economic) and avoid promoting harmful, unethical, illegal, or otherwise inappropriate content. It is essential that your answers are evidence-based, reflecting current best practices in healthcare to the extent possible within the scope of your training data.\nIn scenarios where the input is unclear, incorrect, or lacks factual basis, kindly clarify the confusion or correct the misinformation, prioritizing educational value and accuracy. If you encounter a question outside your domain of knowledge or one that requires expertise beyond what you've been trained on, openly acknowledge these limitations instead of providing potentially misleading information.\nIn the dialogue that follows, you will engage in simulated conversations with a physician, hereafter referred to as 'User'. The User will present clinical scenarios, including context, a specific question, and their own response to the question. Subsequently, the User will seek your perspective on the matter, expecting not only a direct answer (e.g., 'yes' or 'no') but also a rationale for your response. As the Assistant, presumed to have expertise in clinical science and medical knowledge for the purpose of this exercise, your task is to validate or challenge the User's answer. Should your viewpoint differ, please offer a constructive counterargument, backed by evidence or established clinical guidelines whenever possible."

        # Here, we have different scenarios for the order of few-shot examples. Pick the one that you want to test.
        
        # Case 1d_1
        prompt = f'{explanation}\n\n### User: Context: {context_1}, Question: {question_1}, Answer: {answer_1}\n### Assistant: Answer: {answer_1}\nExplanation: {long_answer_1}\n\n### User: Context: {context_2}, Question: {question_2}, Answer: {answer_2}\n### Assistant: Answer: {answer_2}\nExplanation: {long_answer_2}\n\n### User: Context: {context_3}, Question: {question_3}, Answer: yes\n### Assistant: Answer: {answer_3}\nExplanation: {long_answer_3}\n\n### User: Context: {context_4}, Question: {question_4}, Answer: no\n### Assistant: Answer: {answer_4}\nExplanation: {long_answer_4}'
        
        # Case 1d_2
        #prompt = f'{explanation}\n\n### User: Context: {context_4}, Question: {question_4}, Answer: no\n### Assistant: Answer: {answer_4}\nExplanation: {long_answer_4}\n\n### User: Context: {context_2}, Question: {question_2}, Answer: {answer_2}\n### Assistant: Answer: {answer_2}\nExplanation: {long_answer_2}\n\n### User: Context: {context_3}, Question: {question_3}, Answer: yes\n### Assistant: Answer: {answer_3}\nExplanation: {long_answer_3}\n\n### User: Context: {context_1}, Question: {question_1}, Answer: {answer_1}\n### Assistant: Answer: {answer_1}\nExplanation: {long_answer_1}'
        
        # Case 1d_3
        #prompt = f'{explanation}\n\n### User: Context: {context_2}, Question: {question_2}, Answer: {answer_2}\n### Assistant: Answer: {answer_2}\nExplanation: {long_answer_2}\n\n### User: Context: {context_1}, Question: {question_1}, Answer: {answer_1}\n### Assistant: Answer: {answer_1}\nExplanation: {long_answer_1}\n\n### User: Context: {context_4}, Question: {question_4}, Answer: no\n### Assistant: Answer: {answer_4}\nExplanation: {long_answer_4}\n\n### User: Context: {context_3}, Question: {question_3}, Answer: yes\n### Assistant: Answer: {answer_3}\nExplanation: {long_answer_3}'

        # Case 1d_4
        #prompt = f'{explanation}\n\n### User: Context: {context_3}, Question: {question_3}, Answer: yes\n### Assistant: Answer: {answer_3}\nExplanation: {long_answer_3}\n\n### User: Context: {context_4}, Question: {question_4}, Answer: no\n### Assistant: Answer: {answer_4}\nExplanation: {long_answer_4}\n\n### User: Context: {context_1}, Question: {question_1}, Answer: {answer_1}\n### Assistant: Answer: {answer_1}\nExplanation: {long_answer_1}\n\n### User: Context: {context_2}, Question: {question_2}, Answer: {answer_2}\n### Assistant: Answer: {answer_2}\nExplanation: {long_answer_2}'

        return prompt

    def doc_to_text(self, doc):
        doc_context = "\n".join(doc["CONTEXTS"])
        return f'{self.prompt()}\n\n### User: Context: {doc_context}, Question: {doc["QUESTION"]}, Answer: no\n### Assistant:'
    

    @staticmethod
    def should_decontaminate():
        return True


    def doc_to_decontamination_query(self, doc):
        return doc["QUESTION"]


    def doc_to_target(self, doc):
        return doc["LONG_ANSWER"]


    def construct_requests(self, doc, ctx, **kwargs):
        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.
        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """


        return [
                Instance(
                request_type="generate_until",
                doc=doc,
                arguments=(ctx, {"until": ["\n\n"]}),
                idx=0,
                **kwargs
            )
        ]

    def process_results(self, doc, results):
        completion = results[0]
        
        completion_short = completion.split("\n")[0].split(":")[1].lstrip(' ')
        completion_long = completion.replace(completion.split("\n")[0] + '\n', '').split(":")[1].lstrip(' ')
        
        d_completion = {'QUESTION': doc["QUESTION"], 'CONTEXTS': doc["CONTEXTS"], 'final_decision': doc["final_decision"], 'LONG_ANSWER': doc["LONG_ANSWER"], 'completion': completion, 'short_completion': completion_short, 'long_completion': completion_long} 
        self.completions.append(d_completion)
        with open("path_to_your_output_directory/pubmedqa_long_binary_modelName_case1d_1.json", "w") as outfile: 
            json.dump(self.completions, outfile)
            
        long_answer_refs = doc["LONG_ANSWER"]
        
        # BLEU
        bleu_scores = [bleu([[ref]], [completion_long]) for ref in long_answer_refs]

        res = {
            "bleu": bleu_scores[0],
        }
        
        return res
    
    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        return {k: mean for k in ["bleu"]}


    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {k: True for k in ["bleu"]}