From fb9847cd1036cae039ed1590e854ed0b029ee777 Mon Sep 17 00:00:00 2001 From: Hillary Kipkemoi Date: Mon, 29 Jul 2024 06:46:03 +0300 Subject: [PATCH] run baseline benchmark using ragas against the 20 q&a pairs --- .../rag-system/advanced_RAG_system.ipynb | 425 +++++++++++++++++- src/ragas/ragas_utils.py | 4 +- 2 files changed, 423 insertions(+), 6 deletions(-) diff --git a/notebooks/rag-system/advanced_RAG_system.ipynb b/notebooks/rag-system/advanced_RAG_system.ipynb index 1a203f4..87cb7d6 100644 --- a/notebooks/rag-system/advanced_RAG_system.ipynb +++ b/notebooks/rag-system/advanced_RAG_system.ipynb @@ -129,10 +129,12 @@ "data": { "text/plain": [ "{'question': 'What event is Rory McIlroy preparing for after the WGC-Cadillac Championship?',\n", - " 'answer': 'Rory McIlroy is preparing for the Masters Tournament after the WGC-Cadillac Championship.',\n", - " 'contexts': ['gift was data processing of genetic profiles from donor-recipient pairs. It works on a simple swapping principle but takes it to a much higher level, according to California Pacific Medical Center in San Francisco. So high, that it is taking five surgeons, a covey of physician assistants, nurses and anesthesiologists, and more than 40 support staff to perform surgeries on 12 people. They are extracting six kidneys from donors and implanting them into six recipients. \"The ages of the donors and recipients range from 26 to 70 and include three parent and child pairs, one sibling pair and one brother and sister-in-law pair,\" the medical center said in a statement. The chain of surgeries is to be wrapped up Friday. In late March, the medical center is planning to hold a reception for all 12 patients. Here\\'s how the super swap works, according to California Pacific Medical Center. Say, your brother needs a kidney to save his life, or at least get off of dialysis, and you\\'re willing to give',\n", - " '(CNN)Share, and your gift will be multiplied. That may sound like an esoteric adage, but when Zully Broussard selflessly decided to give one of her kidneys to a stranger, her generosity paired up with big data. It resulted in six patients receiving transplants. That surprised and wowed her. \"I thought I was going to help this one person who I don\\'t know, but the fact that so many people can have a life extension, that\\'s pretty big,\" Broussard told CNN affiliate KGO. She may feel guided in her generosity by a higher power. \"Thanks for all the support and prayers,\" a comment on a Facebook page in her name read. \"I know this entire journey is much bigger than all of us. I also know I\\'m just the messenger.\" CNN cannot verify the authenticity of the page. But the power that multiplied Broussard\\'s gift was data processing of genetic profiles from donor-recipient pairs. It works on a simple swapping principle but takes it to a much higher level, according to California Pacific Medical Center',\n", - " \"Here's how the super swap works, according to California Pacific Medical Center. Say, your brother needs a kidney to save his life, or at least get off of dialysis, and you're willing to give him one of yours. But then it turns out that your kidney is not a match for him, and it's certain his body would reject it. Your brother can then get on a years-long waiting list for a kidney coming from an organ donor who died. Maybe that will work out -- or not, and time could run out for him. Alternatively, you and your brother could look for another recipient-living donor couple like yourselves -- say, two more siblings, where the donor's kidney isn't suited for his sister, the recipient. But maybe your kidney is a match for his sister, and his kidney is a match for your brother. So, you'd do a swap. That's called a paired donation. It's a bit of a surgical square dance, where four people cross over partners temporarily and everybody goes home smiling. But instead of a square dance,\"]}" + " 'answer': \"Rory McIlroy is preparing for Arnold Palmer's tournament, which takes place from March 19-22.\",\n", + " 'contexts': ['(CNN)Jordan Spieth has Rory McIlroy and the world No.1 spot firmly in his sights after winning the Valspar Championship on Sunday. Spieth won a three-way play-off with a 28-foot birdie on the third extra hole to become only the fourth player since 1940 to win twice on the PGA Tour before turning 22. It is a feat that not even McIlroy mastered with Tiger Woods, Sergio Garcia and Robert Gamez the only players to have achieved that particular accolade in the past 75 years. But it is the Northern Irishman that is within Spieth\\'s focus heading towards Augusta. \"I like studying the game, being a historian of the game,\" Spieth told the PGA Tour website. \"It\\'s really cool to have my name go alongside those. \"But right now currently what I\\'m really focused on is Rory McIlroy and the No.1 in the world. That\\'s who everyone is trying to chase. \"That\\'s our ultimate goal to eventually be the best in the world and this is a great, great stepping stone. But going into the four majors of the year, to',\n", + " 'was good fun.\" Not that opportunity knocked for McIlroy when he chose the 3-iron to play his third shot to the 18th and final hole of the tournament and promptly found the water again. The Northern Irishman feigned to repeat his earlier antics, before placing it back in the bag. His mistake led to a double bogey six and left him tied for ninth at one-under-par, eight shots behind winner Dustin Johnson. McIlroy had promised to return the club to Trump after the round and was as good as his word. \"We\\'re thinking about auctioning it for charity or doing a trophy case for Doral, putting it on a beautiful mount,\" Trump said. Johnson is looking set to be one of McIlroy\\'s main rivals in the first major of the season, the U.S. Masters at Augusta, next month and his victory completed a triumphant comeback to the PGA Tour. The 30-year-old American took a six-month break from the Tour last July to cope with \"personal problems\" and returned earlier this year. Johnson finished with a three-under',\n", + " 'It raises money for two Florida hospitals named for the seven-time major winner and his late wife Winnie. \"I am so proud of what has been accomplished at the hospitals over the past 25 years. It is always a privilege to know that we are making a difference in the lives of families throughout the community,\" said Palmer after his medical center was named one of the best for children in the U.S. for 2014-15. He hurt his shoulder in December after tripping on carpet when he was about to make a speech at a PGA Tour father/son event. World No. 1 Rory McIlroy will make his first appearance at Palmer\\'s March 19-22 tournament, which features a restricted field, while top-five players Bubba Watson, Henrik Stenson, Adam Scott and Jason Day will also take part. Like us on Facebook .',\n", + " '(CNN)It was an act of frustration perhaps more commonly associated with golf\\'s fictional anti-hero Happy Gilmore than the world\\'s reigning No 1. player. But when Rory McIlroy pulled his second shot on the eighth hole of the WGC Cadillac Championship into a lake Friday, he might as well have been channeling the much loved Adam Sandler character. Before continuing his round with a dropped ball, the four-time major winner launched the 3-iron used to play the offending shot into the water as well. \"(It) felt good at the time,\" a rueful McIlroy later said of the incident in comments carried by the PGA Tour website. \"I just let frustration get the better of me. It was heat of the moment, and I mean, if it had of been any other club I probably wouldn\\'t have but I didn\\'t need a 3‑iron for the rest of the round so I thought, why not.\" The club \"must have went a good 60, 70 yards,\" he joked. McIlroy composed himself to finish with a second round of 70, leaving him one-under for the tournament',\n", + " 'That\\'s who everyone is trying to chase. \"That\\'s our ultimate goal to eventually be the best in the world and this is a great, great stepping stone. But going into the four majors of the year, to have closed one out in this kind of fashion is going to give me a lot of confidence.\" Spieth has long been heralded as the next big thing in golf -- shining in the amateur ranks with two U.S. junior titles and leading last year\\'s Masters after the third round before eventually finishing tied for second, three shots behind Bubba Watson. On Sunday, he held his nerve at the Innisbrook Resort in Palm Harbor, Florida, to hold on for the victory. He first rescued par on the 16th with a 60-foot bunker shot and needed a 32-foot putt to ensure his place in the playoff, which he duly holed. It was a putt of similar length that ensured a second Tour win to add to his John Deere Classic success in July 2013, which he also won courtesy of a playoff making him the youngest PGA Tour winner for 82 years at']}" ] }, "execution_count": 8, @@ -598,6 +600,421 @@ "rag_ensemble_results" ] }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--LOADING EVALUATION DATA--\n", + "--GETTING CONTEXT AND ANSWERS--\n", + "--EVALUATING LOCALLY--\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8e8318f0eded4656b94500052d2dea26", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Evaluating: 0%| | 0/76 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
questioncontextsanswerground_truthanswer_correctnessfaithfulnessanswer_relevancycontext_precision
0What is the significance of cherry trees in Wa...[Washington (CNN)There should be plenty of pen...The cherry trees in Washington, D.C., are sign...Cherry trees hold great significance in Washin...0.2439160.8333330.9837141.000000
1What is one of the events that Hillary Clinton...[(CNN)After a handful events in two months, Hi...Hillary Clinton will be participating in a pai...Hillary Clinton will be participating in the A...0.9985231.0000000.9445430.950000
2What role did the United States-led coalition ...[(CNN)In his first interview since Islamic Sta...The United States-led coalition conducted airs...The United States-led coalition played a role ...0.8660160.8000000.8878861.000000
3What can viewers expect from the season finale...[(CNN)Have Rick and his fellow survivors final...Viewers can expect a 90-minute season finale f...The season finale of \"The Walking Dead\" is exp...0.2234311.0000000.9735450.916667
4What is the significance of the Tunisian jihad...[Museum. Protesters held banners that said \"We...The Tunisian jihadist's significance lies in h...The significance of the Tunisian jihadist in r...0.6732661.0000000.9018391.000000
5How did the undercover FBI informant play a ro...[(CNN)An Army National Guard member and his co...The undercover FBI informant played a crucial ...The undercover FBI informant played a role in ...0.8292410.7142860.8529891.000000
6How does the new Red brand aim to leverage Rad...[Carlson Rezidor, which is hueing (sic) toward...The new Red brand aims to leverage Radisson's ...The new Red brand aims to leverage Radisson's ...0.8484051.0000000.9150660.916667
7When are the peak blooms expected during the N...[Washington (CNN)There should be plenty of pen...The peak blooms are expected between April 11 ...The peak blooms are expected between April 11 ...0.2500001.0000000.9235101.000000
8What is Zhanna Nemtsova's opinion on the Russi...[of the case except what is in the media. \"I d...Zhanna Nemtsova has no faith in the Russian in...Zhanna Nemtsova said she has no faith in the R...0.7331501.0000000.9357311.000000
9Who was one of Putin's harshest critics?[be heading an opposition party and do what I'...One of Putin's harshest critics was Boris Nemt...Boris Nemtsov0.2296241.0000001.0000001.000000
10What factors could affect the selection of the...[no longer believed that organized labor was a...The selection of the new Senate Democrats lead...Left-leaning activists and organized labor's c...0.6464621.0000000.9377060.679167
11How did the USS Sterett and USS New York coord...[(CNN)The U.S. Navy rescued two Saudi Arabia a...The USS Sterett coordinated the search for the...The USS Sterett, a guided-missile destroyer, c...0.2437230.2500000.9416121.000000
12When will Hillary Clinton announce her 2016 pr...[(CNN)Sen. Harry's Reid's perfunctory announce...I don't know when Hillary Clinton will announc...Hillary Clinton is expected to announce her 20...0.5365820.6666670.0000001.000000
13What does King Abdullah II think about ISIS es...[have that Syrian President Bashar al-Assad is...King Abdullah II believes that ISIS's establis...During his interview with Zakaria, the King re...0.4667880.5000000.9617260.804167
14What nationalities were the Bardo Museum attac...[Museum. Protesters held banners that said \"We...The victims of the Bardo Museum attack include...The victims of the Bardo Museum attack include...0.7433561.0000000.9352581.000000
15How did W Hotels become a pioneer in the hospi...[in pre-social media 1998. \"Starwood was the f...W Hotels became a pioneer in the hospitality i...W Hotels became a pioneer in the hospitality i...0.6541351.0000000.8822601.000000
16Why were the first cherry trees in Washington,...[Taft planted one of the first of more than 3,...The first cherry trees in Washington, D.C. wer...The first cherry trees in Washington, D.C. wer...0.6775631.0000000.9615640.833333
17How can the government protect middle class jo...[of unions is one area where the government ca...The government can protect middle class jobs a...Shoring up the strength of unions is one area ...0.6949701.0000001.0000000.950000
18Why was the Bardo Museum targeted in the terro...[Museum. Protesters held banners that said \"We...The Bardo Museum was targeted in the terrorist...The Bardo Museum was targeted in the terrorist...0.7792830.3000000.9581511.000000
\n", + "" + ], + "text/plain": [ + " question \\\n", + "0 What is the significance of cherry trees in Wa... \n", + "1 What is one of the events that Hillary Clinton... \n", + "2 What role did the United States-led coalition ... \n", + "3 What can viewers expect from the season finale... \n", + "4 What is the significance of the Tunisian jihad... \n", + "5 How did the undercover FBI informant play a ro... \n", + "6 How does the new Red brand aim to leverage Rad... \n", + "7 When are the peak blooms expected during the N... \n", + "8 What is Zhanna Nemtsova's opinion on the Russi... \n", + "9 Who was one of Putin's harshest critics? \n", + "10 What factors could affect the selection of the... \n", + "11 How did the USS Sterett and USS New York coord... \n", + "12 When will Hillary Clinton announce her 2016 pr... \n", + "13 What does King Abdullah II think about ISIS es... \n", + "14 What nationalities were the Bardo Museum attac... \n", + "15 How did W Hotels become a pioneer in the hospi... \n", + "16 Why were the first cherry trees in Washington,... \n", + "17 How can the government protect middle class jo... \n", + "18 Why was the Bardo Museum targeted in the terro... \n", + "\n", + " contexts \\\n", + "0 [Washington (CNN)There should be plenty of pen... \n", + "1 [(CNN)After a handful events in two months, Hi... \n", + "2 [(CNN)In his first interview since Islamic Sta... \n", + "3 [(CNN)Have Rick and his fellow survivors final... \n", + "4 [Museum. Protesters held banners that said \"We... \n", + "5 [(CNN)An Army National Guard member and his co... \n", + "6 [Carlson Rezidor, which is hueing (sic) toward... \n", + "7 [Washington (CNN)There should be plenty of pen... \n", + "8 [of the case except what is in the media. \"I d... \n", + "9 [be heading an opposition party and do what I'... \n", + "10 [no longer believed that organized labor was a... \n", + "11 [(CNN)The U.S. Navy rescued two Saudi Arabia a... \n", + "12 [(CNN)Sen. Harry's Reid's perfunctory announce... \n", + "13 [have that Syrian President Bashar al-Assad is... \n", + "14 [Museum. Protesters held banners that said \"We... \n", + "15 [in pre-social media 1998. \"Starwood was the f... \n", + "16 [Taft planted one of the first of more than 3,... \n", + "17 [of unions is one area where the government ca... \n", + "18 [Museum. Protesters held banners that said \"We... \n", + "\n", + " answer \\\n", + "0 The cherry trees in Washington, D.C., are sign... \n", + "1 Hillary Clinton will be participating in a pai... \n", + "2 The United States-led coalition conducted airs... \n", + "3 Viewers can expect a 90-minute season finale f... \n", + "4 The Tunisian jihadist's significance lies in h... \n", + "5 The undercover FBI informant played a crucial ... \n", + "6 The new Red brand aims to leverage Radisson's ... \n", + "7 The peak blooms are expected between April 11 ... \n", + "8 Zhanna Nemtsova has no faith in the Russian in... \n", + "9 One of Putin's harshest critics was Boris Nemt... \n", + "10 The selection of the new Senate Democrats lead... \n", + "11 The USS Sterett coordinated the search for the... \n", + "12 I don't know when Hillary Clinton will announc... \n", + "13 King Abdullah II believes that ISIS's establis... \n", + "14 The victims of the Bardo Museum attack include... \n", + "15 W Hotels became a pioneer in the hospitality i... \n", + "16 The first cherry trees in Washington, D.C. wer... \n", + "17 The government can protect middle class jobs a... \n", + "18 The Bardo Museum was targeted in the terrorist... \n", + "\n", + " ground_truth answer_correctness \\\n", + "0 Cherry trees hold great significance in Washin... 0.243916 \n", + "1 Hillary Clinton will be participating in the A... 0.998523 \n", + "2 The United States-led coalition played a role ... 0.866016 \n", + "3 The season finale of \"The Walking Dead\" is exp... 0.223431 \n", + "4 The significance of the Tunisian jihadist in r... 0.673266 \n", + "5 The undercover FBI informant played a role in ... 0.829241 \n", + "6 The new Red brand aims to leverage Radisson's ... 0.848405 \n", + "7 The peak blooms are expected between April 11 ... 0.250000 \n", + "8 Zhanna Nemtsova said she has no faith in the R... 0.733150 \n", + "9 Boris Nemtsov 0.229624 \n", + "10 Left-leaning activists and organized labor's c... 0.646462 \n", + "11 The USS Sterett, a guided-missile destroyer, c... 0.243723 \n", + "12 Hillary Clinton is expected to announce her 20... 0.536582 \n", + "13 During his interview with Zakaria, the King re... 0.466788 \n", + "14 The victims of the Bardo Museum attack include... 0.743356 \n", + "15 W Hotels became a pioneer in the hospitality i... 0.654135 \n", + "16 The first cherry trees in Washington, D.C. wer... 0.677563 \n", + "17 Shoring up the strength of unions is one area ... 0.694970 \n", + "18 The Bardo Museum was targeted in the terrorist... 0.779283 \n", + "\n", + " faithfulness answer_relevancy context_precision \n", + "0 0.833333 0.983714 1.000000 \n", + "1 1.000000 0.944543 0.950000 \n", + "2 0.800000 0.887886 1.000000 \n", + "3 1.000000 0.973545 0.916667 \n", + "4 1.000000 0.901839 1.000000 \n", + "5 0.714286 0.852989 1.000000 \n", + "6 1.000000 0.915066 0.916667 \n", + "7 1.000000 0.923510 1.000000 \n", + "8 1.000000 0.935731 1.000000 \n", + "9 1.000000 1.000000 1.000000 \n", + "10 1.000000 0.937706 0.679167 \n", + "11 0.250000 0.941612 1.000000 \n", + "12 0.666667 0.000000 1.000000 \n", + "13 0.500000 0.961726 0.804167 \n", + "14 1.000000 0.935258 1.000000 \n", + "15 1.000000 0.882260 1.000000 \n", + "16 1.000000 0.961564 0.833333 \n", + "17 1.000000 1.000000 0.950000 \n", + "18 0.300000 0.958151 1.000000 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "basic_rag_results" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "basic_rag_results.to_csv(\"data/evaluation_results/bm_1_baseline.csv\")" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/src/ragas/ragas_utils.py b/src/ragas/ragas_utils.py index d64f375..e428a53 100644 --- a/src/ragas/ragas_utils.py +++ b/src/ragas/ragas_utils.py @@ -2,11 +2,11 @@ from datasets import Dataset from ragas.integrations.langsmith import upload_dataset -EVALUATION_FILE_PATH = "data/evaluation_set.csv" +EVALUATION_FILE_PATH = "data/evaluation_sets/evaluation_set_20d20.csv" DATASET_NAME = "CNN DailyMail Evaluation Dataset" DATASET_DESC = "A dataset of questions and ground truth answers from the CNN DailyMail dataset for evaluation purposes." -def load_evaluation_data(csv_file_path: str = "data/evaluation_set.csv") -> dict: +def load_evaluation_data(csv_file_path: str = EVALUATION_FILE_PATH) -> dict: """Loads evaluation data from a CSV file and returns questions and ground truths. Args: