diff --git a/datasets/mediasum-corpus/convert_mediasum-corpus.ipynb b/datasets/mediasum-corpus/convert_mediasum-corpus.ipynb new file mode 100644 index 00000000..dcb33787 --- /dev/null +++ b/datasets/mediasum-corpus/convert_mediasum-corpus.ipynb @@ -0,0 +1,1442 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "28d4a9804792413d", + "metadata": {}, + "source": [ + "# Creating Convokit Corpus element\n", + "according to https://github.com/CornellNLP/ConvoKit/blob/master/examples/converting_movie_corpus.ipynb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e3c3f33-f3bb-4c65-99c9-0a6b7c0614e1", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install torch torchvision\n", + "!pip install convokit\n", + "!pip install datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "362ef5498fa51d2", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T13:47:31.398864Z", + "start_time": "2025-08-21T13:47:31.396352Z" + } + }, + "outputs": [], + "source": [ + "from convokit import Corpus, Speaker, Utterance\n", + "import pandas as pd\n", + "import tqdm\n", + "import ast" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9527f1ed47847a79", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T13:47:48.826642Z", + "start_time": "2025-08-21T13:47:34.037108Z" + } + }, + "outputs": [], + "source": [ + "media_sum_path = \"data/MediaSum/news_dialogue.json\"\n", + "media_sum_json = pd.read_json(media_sum_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d4783945f0f36a2e", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T13:47:48.853529Z", + "start_time": "2025-08-21T13:47:48.841408Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idprogramdateurltitlesummaryuttspeaker
0NPR-1News & Notes2007-11-28https://www.npr.org/templates/story/story.php?...Black Actors Give Bible Star AppealMore than 400 black actors, artists and minist...[Now, moving on, Forest Whitaker as Moses, Tis...[FARAI CHIDEYA, host, FARAI CHIDEYA, host, Mr....
1NPR-2Weekend Edition Sunday2016-10-23https://www.npr.org/2016/10/23/499042298/young...Young, First-Time Voters Share Views On Electi...NPR's Rachel Martin speaks with young voters w...[You have heard it again and again - this is a...[RACHEL MARTIN, HOST, ASHANTI MARTINEZ, LAUREN...
2NPR-3News & Notes2007-11-30https://www.npr.org/templates/story/story.php?...Snapshots: On Solid GroundIn this week's snapshot, actor and playwright ...[I came close to running out of luck, when I a...[Mr. JEFF OBAFEMI CARR (Actor, Playwright), CH...
3NPR-4News & Notes2007-11-30https://www.npr.org/templates/story/story.php?...Washington, D.C. Facing HIV/AIDS EpidemicA new study says one in 50 people in the natio...[This is NEWS & NOTES. I'm Farai Chideya., In ...[FARAI CHIDEYA, host, FARAI CHIDEYA, host, Dr....
4NPR-5News & Notes2007-11-30https://www.npr.org/templates/story/story.php?...Coping When AIDS Hits Your Family: Part IIWhen a family member is diagnosed with HIV/AID...[I'm Farai Chideya and this is NEWS & NOTES., ...[FARAI CHIDEYA, host, FARAI CHIDEYA, host, FAR...
...........................
463591CNN-414237CNN NEWSROOM2020-10-25http://transcripts.cnn.com/TRANSCRIPTS/2010/25...NaNU.S. Officials: Russia, Iran Have Stolen Voter...[Welcome back to our viewers in the United Sta...[BRUNHUBER, NATASHA CHEN, CNN CORRESPONDENT, W...
463592CNN-414238CNN NEWSROOM2020-10-25http://transcripts.cnn.com/TRANSCRIPTS/2010/25...NaNNigerian Police Force Mobilize To Quell Worst ...[In Nigeria, chaotic scenes of looting and des...[BRUNHUBER, BRUNHUBER (voice-over), BRUNHUBER ...
463593CNN-414239CNN NEWSROOM2020-10-25http://transcripts.cnn.com/TRANSCRIPTS/2010/25...NaNCOVID-19 Triggers Rise In Asian American Unemp...[Officials in the U.S. are worried about wides...[BRUNHUBER, AMARA WALKER, CNN ANCHOR (voice-ov...
463594CNN-414240STATE OF THE UNION2020-10-25http://transcripts.cnn.com/TRANSCRIPTS/2010/25...NaNCOVID-19 Outbreak Hits Vice President Pence's ...[Dark winter? U.S. COVID cases hit a new daily...[JAKE TAPPER, CNN HOST (voice-over), DONALD TR...
463595CNN-414241STATE OF THE UNION2020-10-25http://transcripts.cnn.com/TRANSCRIPTS/2010/25...NaNInterview With Rep. Alexandria Ocasio-Cortez (...[Welcome back the STATE OF THE UNION. I'm Jake...[TAPPER, REP. ALEXANDRIA OCASIO-CORTEZ (D-NY),...
\n", + "

463596 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " id program date \\\n", + "0 NPR-1 News & Notes 2007-11-28 \n", + "1 NPR-2 Weekend Edition Sunday 2016-10-23 \n", + "2 NPR-3 News & Notes 2007-11-30 \n", + "3 NPR-4 News & Notes 2007-11-30 \n", + "4 NPR-5 News & Notes 2007-11-30 \n", + "... ... ... ... \n", + "463591 CNN-414237 CNN NEWSROOM 2020-10-25 \n", + "463592 CNN-414238 CNN NEWSROOM 2020-10-25 \n", + "463593 CNN-414239 CNN NEWSROOM 2020-10-25 \n", + "463594 CNN-414240 STATE OF THE UNION 2020-10-25 \n", + "463595 CNN-414241 STATE OF THE UNION 2020-10-25 \n", + "\n", + " url \\\n", + "0 https://www.npr.org/templates/story/story.php?... \n", + "1 https://www.npr.org/2016/10/23/499042298/young... \n", + "2 https://www.npr.org/templates/story/story.php?... \n", + "3 https://www.npr.org/templates/story/story.php?... \n", + "4 https://www.npr.org/templates/story/story.php?... \n", + "... ... \n", + "463591 http://transcripts.cnn.com/TRANSCRIPTS/2010/25... \n", + "463592 http://transcripts.cnn.com/TRANSCRIPTS/2010/25... \n", + "463593 http://transcripts.cnn.com/TRANSCRIPTS/2010/25... \n", + "463594 http://transcripts.cnn.com/TRANSCRIPTS/2010/25... \n", + "463595 http://transcripts.cnn.com/TRANSCRIPTS/2010/25... \n", + "\n", + " title \\\n", + "0 Black Actors Give Bible Star Appeal \n", + "1 Young, First-Time Voters Share Views On Electi... \n", + "2 Snapshots: On Solid Ground \n", + "3 Washington, D.C. Facing HIV/AIDS Epidemic \n", + "4 Coping When AIDS Hits Your Family: Part II \n", + "... ... \n", + "463591 NaN \n", + "463592 NaN \n", + "463593 NaN \n", + "463594 NaN \n", + "463595 NaN \n", + "\n", + " summary \\\n", + "0 More than 400 black actors, artists and minist... \n", + "1 NPR's Rachel Martin speaks with young voters w... \n", + "2 In this week's snapshot, actor and playwright ... \n", + "3 A new study says one in 50 people in the natio... \n", + "4 When a family member is diagnosed with HIV/AID... \n", + "... ... \n", + "463591 U.S. Officials: Russia, Iran Have Stolen Voter... 
\n", + "463592 Nigerian Police Force Mobilize To Quell Worst ... \n", + "463593 COVID-19 Triggers Rise In Asian American Unemp... \n", + "463594 COVID-19 Outbreak Hits Vice President Pence's ... \n", + "463595 Interview With Rep. Alexandria Ocasio-Cortez (... \n", + "\n", + " utt \\\n", + "0 [Now, moving on, Forest Whitaker as Moses, Tis... \n", + "1 [You have heard it again and again - this is a... \n", + "2 [I came close to running out of luck, when I a... \n", + "3 [This is NEWS & NOTES. I'm Farai Chideya., In ... \n", + "4 [I'm Farai Chideya and this is NEWS & NOTES., ... \n", + "... ... \n", + "463591 [Welcome back to our viewers in the United Sta... \n", + "463592 [In Nigeria, chaotic scenes of looting and des... \n", + "463593 [Officials in the U.S. are worried about wides... \n", + "463594 [Dark winter? U.S. COVID cases hit a new daily... \n", + "463595 [Welcome back the STATE OF THE UNION. I'm Jake... \n", + "\n", + " speaker \n", + "0 [FARAI CHIDEYA, host, FARAI CHIDEYA, host, Mr.... \n", + "1 [RACHEL MARTIN, HOST, ASHANTI MARTINEZ, LAUREN... \n", + "2 [Mr. JEFF OBAFEMI CARR (Actor, Playwright), CH... \n", + "3 [FARAI CHIDEYA, host, FARAI CHIDEYA, host, Dr.... \n", + "4 [FARAI CHIDEYA, host, FARAI CHIDEYA, host, FAR... \n", + "... ... \n", + "463591 [BRUNHUBER, NATASHA CHEN, CNN CORRESPONDENT, W... \n", + "463592 [BRUNHUBER, BRUNHUBER (voice-over), BRUNHUBER ... \n", + "463593 [BRUNHUBER, AMARA WALKER, CNN ANCHOR (voice-ov... \n", + "463594 [JAKE TAPPER, CNN HOST (voice-over), DONALD TR... \n", + "463595 [TAPPER, REP. ALEXANDRIA OCASIO-CORTEZ (D-NY),... \n", + "\n", + "[463596 rows x 8 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "media_sum_json" + ] + }, + { + "cell_type": "markdown", + "id": "1d6a199f40f8e82d", + "metadata": {}, + "source": [ + "## 1. 
Create speakers" + ] + }, + { + "cell_type": "markdown", + "id": "860b57775acb98b2", + "metadata": {}, + "source": [ + "**Note**: In the speaker list, the same speaker sometimes appears under several different identifiers (e.g., ‘STEVE PROFFITT’, ‘PROFFITT’ and ‘S. PROFFITT’ refer to the same speaker). See example below. Currently I **do not** address this: I count each unique identifier as a different speaker. Conversely, I treat an identifier that occurs in several conversations as the same speaker across all of them. This might be incorrect for cases like below with 'UNIDENTIFIED MALE' or 'UNIDENTIFIED FEMALE', but I will not address this for now." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "cfaf7e5b64da5f44", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T13:47:49.684433Z", + "start_time": "2025-08-21T13:47:49.681481Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['CUOMO',\n", + " 'ED LAVANDERA, CNN CORRESPONDENT',\n", + " 'LAVANDERA (voice-over)',\n", + " 'ERIC HOLDER, U.S. 
ATTORNEY GENERAL',\n", + " 'LAVANDERA',\n", + " 'UNIDENTIFIED FEMALE',\n", + " 'UNIDENTIFIED MALE',\n", + " 'UNIDENTIFIED MALE',\n", + " 'LAVANDERA',\n", + " 'HOLDER',\n", + " 'LAVANDERA',\n", + " 'LAVANDERA',\n", + " 'PEREIRA',\n", + " 'PASTOR ROBERT WHITE, PEACE OF MIND CHURCH OF HAPPINESS',\n", + " 'PEREIRA',\n", + " 'MO IVORY, ATTORNEY/RADIO PERSONALITY',\n", + " 'PEREIRA',\n", + " 'IVORY',\n", + " 'PEREIRA',\n", + " 'IVORY',\n", + " 'PEREIRA',\n", + " 'WHITE',\n", + " 'PEREIRA',\n", + " 'WHITE',\n", + " 'PEREIRA',\n", + " 'WHITE',\n", + " 'PEREIRA',\n", + " 'WHITE',\n", + " 'PEREIRA',\n", + " 'IVORY',\n", + " 'WHITE',\n", + " 'IVORY',\n", + " 'PEREIRA',\n", + " 'IVORY',\n", + " 'PEREIRA',\n", + " 'WHITE',\n", + " 'PEREIRA',\n", + " 'WHITE',\n", + " 'PEREIRA',\n", + " 'CUOMO',\n", + " 'BERMAN']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "media_sum_json[\"speaker\"][300000]" + ] + }, + { + "cell_type": "markdown", + "id": "75a4a58ad5b962fb", + "metadata": {}, + "source": [ + "Thus, I use the incorrect **assumption that each element in the speaker list is a string that is the only unique string for this speaker across the whole dataset**." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d6722413f58d0d68", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T13:47:51.200620Z", + "start_time": "2025-08-21T13:47:49.720406Z" + } + }, + "outputs": [], + "source": [ + "# get all speakers from the speaker column\n", + "speakers = media_sum_json['speaker']\n", + "unique_speakers = sorted(set(name for sublist in speakers for name in sublist))" + ] + }, + { + "cell_type": "markdown", + "id": "22759b46fd4a6c84", + "metadata": {}, + "source": [ + "I create a speaker object that only includes the speaker name as information and identifier." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "62de82aa-5608-43c9-bac2-3e8cb8b6f743", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "718483\n" + ] + } + ], + "source": [ + "print(len(unique_speakers))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "377a07f9c845f339", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T13:47:56.477476Z", + "start_time": "2025-08-21T13:47:51.211363Z" + } + }, + "outputs": [], + "source": [ + "corpus_speakers = {speaker_name: Speaker(id = speaker_name, meta ={'name': speaker_name}) for speaker_name in unique_speakers}" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "32fa1c93d8106ee3", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T13:47:56.561297Z", + "start_time": "2025-08-21T13:47:56.558512Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Speaker({'obj_type': 'speaker', 'vectors': [], 'owner': None, 'id': 'LAVANDERA', 'temp_backend': {}, 'meta': {'name': 'LAVANDERA'}})" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "corpus_speakers['LAVANDERA']" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "91385288c5f914fc", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T13:47:56.643238Z", + "start_time": "2025-08-21T13:47:56.640915Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Speaker({'obj_type': 'speaker', 'vectors': [], 'owner': None, 'id': 'ED LAVANDERA, CNN CORRESPONDENT', 'temp_backend': {}, 'meta': {'name': 'ED LAVANDERA, CNN CORRESPONDENT'}})" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "corpus_speakers['ED LAVANDERA, CNN CORRESPONDENT']" + ] + }, + { + "cell_type": "markdown", + "id": "d04dd3a28f43ac6b", + "metadata": {}, + "source": [ + "## 2. 
Creating utterance objects" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "a5e278626779f655", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T13:47:56.725389Z", + "start_time": "2025-08-21T13:47:56.722713Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "list" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(media_sum_json['utt'][0])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "763bc995ed61198d", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T13:49:00.308204Z", + "start_time": "2025-08-21T13:47:56.804792Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 463596/463596 [01:06<00:00, 6972.92it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total number of utterances: 13919244\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "utterance_corpus = {}\n", + "conversation_meta = {}\n", + "\n", + "count = 0\n", + "# iterate over each row in the dataframe\n", + "for index, row in tqdm.tqdm(media_sum_json.iterrows(), total=media_sum_json.shape[0]):\n", + " # get the conversation id\n", + " conversation_id = row['id']\n", + " program = row['program']\n", + " date = row['date']\n", + " summary = row['summary']\n", + " url = row['url']\n", + " title = row['title']\n", + "\n", + " conversation_meta[conversation_id] = {\n", + " 'program': program,\n", + " 'date': date,\n", + " 'summary': summary,\n", + " 'url': url,\n", + " 'title': title,\n", + " 'broadcaster': conversation_id.split('-')[0], # should be either NPR or CNN\n", + " }\n", + "\n", + " # get utterance information\n", + " utterance_list = row['utt']\n", + " speaker_list = row['speaker']\n", + "\n", + " for i, utt in 
enumerate(utterance_list):\n", + " # create a unique identifier for the utterance as in https://aclanthology.org/2024.emnlp-main.52.pdf\n", + " # i.e., from the code base ID of the form 'CNN-67148-13' where 'CNN-67148' is the identifier as used in MediaSum and 13 is the index of the utterance in the original utterance list\n", + " utterance_id = f\"{conversation_id}-{i}\"\n", + " utt_speaker = corpus_speakers[speaker_list[i]]\n", + " utt_text = utt\n", + " reply_to = None if i == 0 else f\"{conversation_id}-{i-1}\" # reply_to is None for the first utterance in the conversation\n", + " # timestamp is not provided\n", + "\n", + " utterance_corpus[utterance_id] = Utterance(\n", + " id=utterance_id,\n", + " speaker=utt_speaker,\n", + " conversation_id=conversation_id,\n", + " reply_to=reply_to,\n", + " text=utt_text,\n", + " )\n", + "\n", + "print(f\"Total number of utterances: {len(utterance_corpus)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "58376a2c-2096-4aa6-8ed2-bb08a24f106e", + "metadata": {}, + "source": [ + "Note: Due to the format of the original dataset, **the same speaker can have several turns in a row** in an interview. For example:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "d8c58719-3fb1-4a86-bd2a-08bd5bbfbbb0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Utterance(id: 'NPR-4-0', conversation_id: NPR-4, reply-to: None, speaker: Speaker(id: 'FARAI CHIDEYA, host', vectors: [], meta: {'name': 'FARAI CHIDEYA, host'}), timestamp: None, text: \"This is NEWS & NOTES. I'm Farai Chideya.\", vectors: [], meta: {})\n", + "Utterance(id: 'NPR-4-1', conversation_id: NPR-4, reply-to: NPR-4-0, speaker: Speaker(id: 'FARAI CHIDEYA, host', vectors: [], meta: {'name': 'FARAI CHIDEYA, host'}), timestamp: None, text: \"In the nation's capital, a killer is on the loose. It's been operating in America for decades now. We're talking about AIDS. Tomorrow is World AIDS Day. 
Today, we'll discuss staggering new information on how prevalent AIDS is in Washington D.C., particularly among African-Americans. Overall, the rate of AIDS cases in Washington D.C. is about 10 times higher than in the United States. Dr. Shannon Hader is the director of the D.C. HIV/AIDS Administration. Welcome.\", vectors: [], meta: {})\n" + ] + } + ], + "source": [ + "# example utterance\n", + "print(utterance_corpus['NPR-4-0'])\n", + "print(utterance_corpus['NPR-4-1'])" + ] + }, + { + "cell_type": "markdown", + "id": "4cc58a0b-ce42-4adf-bad4-f1ea4fcab674", + "metadata": {}, + "source": [ + "We keep this original formatting and do not merge consecutive turns by the same speaker." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "b6ae30ffc04296a9", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T13:49:02.010504Z", + "start_time": "2025-08-21T13:49:02.007258Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Utterance({'obj_type': 'utterance', 'vectors': [], 'speaker_': Speaker({'obj_type': 'speaker', 'vectors': [], 'owner': None, 'id': 'CLARK', 'temp_backend': {}, 'meta': {'name': 'CLARK'}}), 'owner': None, 'id': 'CNN-67148-13', 'temp_backend': {'speaker_id': 'CLARK', 'conversation_id': 'CNN-67148', 'reply_to': 'CNN-67148-12', 'timestamp': None, 'text': \"Well, I don't think -- as far as I know, we're not paying anything to Saudi Arabia, for example, right now. In fact, they're still buying weapons. They are having economic difficulties, but they do have oil. But the other countries in the region are in one way or another in financial trouble, and have been for a long time. They've been sustained on a diet of expectations of economic growth, funded by taking short and long term loans that come from commercial banks, sometimes guaranteed by governments. And then they have to repay these loans. And repaying these loans consumes their foreign exchange earnings from their exports and from remitted earnings to their workers to send the money back home. 
And they can't get themselves out of the hole easily.\"}, 'meta': {}})" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# example utterance\n", + "utterance_corpus['CNN-67148-13']" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "bf21a57d4bf2205a", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T13:49:03.719167Z", + "start_time": "2025-08-21T13:49:03.715981Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Utterance({'obj_type': 'utterance', 'vectors': [], 'speaker_': Speaker({'obj_type': 'speaker', 'vectors': [], 'owner': None, 'id': 'FARAI CHIDEYA, host', 'temp_backend': {}, 'meta': {'name': 'FARAI CHIDEYA, host'}}), 'owner': None, 'id': 'NPR-1-0', 'temp_backend': {'speaker_id': 'FARAI CHIDEYA, host', 'conversation_id': 'NPR-1', 'reply_to': None, 'timestamp': None, 'text': 'Now, moving on, Forest Whitaker as Moses, Tisha Campbell Martin as Mary Magdalene - well, that\\'s all in \"The Bible Experience.\" A New Testament edition was released in 2006. This edition is billed as \"The Complete Bible.\" It doesn\\'t have one person reading the gospels. It features nearly 400 African-American artists, actors and ministers, plus sound effects.'}, 'meta': {}})" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# example utterance\n", + "utterance_corpus[\"NPR-1-0\"]" + ] + }, + { + "cell_type": "markdown", + "id": "1127ee513645c9b2", + "metadata": {}, + "source": [ + "## 3. 
Creating corpus from list of utterances" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "38aee32017526f43", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T13:49:05.391920Z", + "start_time": "2025-08-21T13:49:05.390253Z" + } + }, + "outputs": [], + "source": [ + "utterance_list = utterance_corpus.values()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "fe4913b41071bc95", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T13:57:43.305833Z", + "start_time": "2025-08-21T13:53:30.300245Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No configuration file found at /Users/Wegma003/.convokit/config.yml; writing with contents: \n", + "# Default Backend Parameters\n", + "db_host: localhost:27017\n", + "data_directory: ~/.convokit/saved-corpora\n", + "model_directory: ~/.convokit/saved-models\n", + "default_backend: mem\n" + ] + } + ], + "source": [ + "media_sum_corpus = Corpus(utterances=utterance_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "caa212f55d9ccdc4", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T13:59:46.870152Z", + "start_time": "2025-08-21T13:59:46.159878Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "number of conversations in the dataset = 463596\n" + ] + } + ], + "source": [ + "print(\"number of conversations in the dataset = {}\".format(len(media_sum_corpus.get_conversation_ids())))" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "13cdd362d6870e29", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T14:01:50.549811Z", + "start_time": "2025-08-21T14:01:49.822313Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sample conversation 0:\n", + "['NPR-1-0', 'NPR-1-1', 'NPR-1-2', 'NPR-1-3', 'NPR-1-4', 'NPR-1-5', 'NPR-1-6', 'NPR-1-7', 'NPR-1-8', 'NPR-1-9', 'NPR-1-10', 'NPR-1-11', 'NPR-1-12', 
'NPR-1-13', 'NPR-1-14', 'NPR-1-15', 'NPR-1-16', 'NPR-1-17', 'NPR-1-18', 'NPR-1-19', 'NPR-1-20', 'NPR-1-21', 'NPR-1-22', 'NPR-1-23', 'NPR-1-24', 'NPR-1-25', 'NPR-1-26', 'NPR-1-27', 'NPR-1-28', 'NPR-1-29', 'NPR-1-30', 'NPR-1-31', 'NPR-1-32', 'NPR-1-33', 'NPR-1-34', 'NPR-1-35', 'NPR-1-36', 'NPR-1-37', 'NPR-1-38', 'NPR-1-39', 'NPR-1-40', 'NPR-1-41', 'NPR-1-42', 'NPR-1-43', 'NPR-1-44', 'NPR-1-45', 'NPR-1-46', 'NPR-1-47']\n", + "sample conversation 1:\n", + "['NPR-2-0', 'NPR-2-1', 'NPR-2-2', 'NPR-2-3', 'NPR-2-4', 'NPR-2-5', 'NPR-2-6', 'NPR-2-7', 'NPR-2-8', 'NPR-2-9', 'NPR-2-10', 'NPR-2-11', 'NPR-2-12', 'NPR-2-13', 'NPR-2-14', 'NPR-2-15', 'NPR-2-16', 'NPR-2-17', 'NPR-2-18', 'NPR-2-19', 'NPR-2-20', 'NPR-2-21', 'NPR-2-22', 'NPR-2-23', 'NPR-2-24', 'NPR-2-25', 'NPR-2-26', 'NPR-2-27', 'NPR-2-28', 'NPR-2-29', 'NPR-2-30', 'NPR-2-31', 'NPR-2-32', 'NPR-2-33', 'NPR-2-34', 'NPR-2-35', 'NPR-2-36', 'NPR-2-37', 'NPR-2-38', 'NPR-2-39', 'NPR-2-40', 'NPR-2-41', 'NPR-2-42', 'NPR-2-43', 'NPR-2-44', 'NPR-2-45', 'NPR-2-46', 'NPR-2-47', 'NPR-2-48', 'NPR-2-49', 'NPR-2-50', 'NPR-2-51', 'NPR-2-52']\n", + "sample conversation 2:\n", + "['NPR-3-0', 'NPR-3-1', 'NPR-3-2', 'NPR-3-3', 'NPR-3-4', 'NPR-3-5', 'NPR-3-6', 'NPR-3-7', 'NPR-3-8', 'NPR-3-9', 'NPR-3-10', 'NPR-3-11', 'NPR-3-12']\n", + "sample conversation 3:\n", + "['NPR-4-0', 'NPR-4-1', 'NPR-4-2', 'NPR-4-3', 'NPR-4-4', 'NPR-4-5', 'NPR-4-6', 'NPR-4-7', 'NPR-4-8', 'NPR-4-9', 'NPR-4-10', 'NPR-4-11', 'NPR-4-12', 'NPR-4-13', 'NPR-4-14', 'NPR-4-15', 'NPR-4-16', 'NPR-4-17', 'NPR-4-18', 'NPR-4-19', 'NPR-4-20', 'NPR-4-21', 'NPR-4-22', 'NPR-4-23']\n", + "sample conversation 4:\n", + "['NPR-5-0', 'NPR-5-1', 'NPR-5-2', 'NPR-5-3', 'NPR-5-4', 'NPR-5-5', 'NPR-5-6', 'NPR-5-7', 'NPR-5-8', 'NPR-5-9', 'NPR-5-10', 'NPR-5-11', 'NPR-5-12', 'NPR-5-13', 'NPR-5-14', 'NPR-5-15', 'NPR-5-16', 'NPR-5-17', 'NPR-5-18', 'NPR-5-19', 'NPR-5-20', 'NPR-5-21', 'NPR-5-22', 'NPR-5-23']\n" + ] + } + ], + "source": [ + "convo_ids = media_sum_corpus.get_conversation_ids()\n", + "for i, 
convo_idx in enumerate(convo_ids[0:5]):\n", + " print(\"sample conversation {}:\".format(i))\n", + " print(media_sum_corpus.get_conversation(convo_idx).get_utterance_ids())" + ] + }, + { + "cell_type": "markdown", + "id": "bb3ad8f5cb406065", + "metadata": {}, + "source": [ + "## 4. Updating Conversation and Corpus level metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "b97c4667e4a4f01c", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T14:04:17.339564Z", + "start_time": "2025-08-21T14:03:57.316632Z" + } + }, + "outputs": [], + "source": [ + "for convo in media_sum_corpus.iter_conversations():\n", + " # get the conversation id by checking from utterance info\n", + " convo_id = convo.get_id()\n", + "\n", + " # update meta with additional conversation information\n", + " convo.meta.update(conversation_meta[convo_id])" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "9a5e4909560cc603", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T14:06:18.831685Z", + "start_time": "2025-08-21T14:06:18.807937Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "ConvoKitMeta({'program': 'CNN SATURDAY NIGHT', 'date': '2003-2-22', 'summary': 'How Much Will War With Iraq Cost?', 'url': 'http://transcripts.cnn.com/TRANSCRIPTS/0302/22/stn.02.html', 'title': nan, 'broadcaster': 'CNN'})" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "media_sum_corpus.get_conversation(\"CNN-67148\").meta" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "cc3f26788e1b6bab", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T14:08:32.789841Z", + "start_time": "2025-08-21T14:08:32.774543Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "ConvoKitMeta({'program': 'News & Notes', 'date': '2007-11-28', 'summary': 'More than 400 black actors, artists and ministers are bringing the Gospel to life in the audio book, The Bible 
Experience:The Complete Bible. Farai Chideya talks with producer Kyle Bowser and actress Wendy Raquel Robinson, who lends her voice to the project.', 'url': 'https://www.npr.org/templates/story/story.php?storyId=16697288', 'title': 'Black Actors Give Bible Star Appeal', 'broadcaster': 'NPR'})" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "media_sum_corpus.get_conversation(\"NPR-1\").meta" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "13e26722-482d-4c48-9ce4-8ae9ee9c17cd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Speaker({'obj_type': 'speaker', 'vectors': [], 'owner': , 'id': 'ED LAVANDERA, CNN CORRESPONDENT', 'meta': ConvoKitMeta({'name': 'ED LAVANDERA, CNN CORRESPONDENT'})})" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "media_sum_corpus.get_speaker(\"ED LAVANDERA, CNN CORRESPONDENT\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "ca8f7db9db3ff3bb", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T14:10:39.493234Z", + "start_time": "2025-08-21T14:10:39.477045Z" + } + }, + "outputs": [], + "source": [ + "# add name\n", + "media_sum_corpus.meta['name'] = 'MediaSum Corpus'" + ] + }, + { + "cell_type": "markdown", + "id": "669eef79f1a77d5e", + "metadata": {}, + "source": [ + "## 5. Adding Paraphrase annotations" + ] + }, + { + "cell_type": "markdown", + "id": "4a950483-63ab-4a65-8da8-9f82d501a942", + "metadata": {}, + "source": [ + "Annotations are saved as lists with one entry per whitespace-separated token of the utterance text, i.e., aligned with the output of utt.text.split()." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "23d3dbe48af33196", + "metadata": { + "jupyter": { + "is_executing": true + } + }, + "outputs": [], + "source": [ + "# load annotations from huggingface dataset\n", + "from datasets import load_dataset\n", + "dataset = load_dataset(\"AnnaWegmann/Paraphrases-in-Interviews\")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "1ce4dba9c660a47c", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T13:34:39.123696Z", + "start_time": "2025-08-21T13:34:38.889030Z" + } + }, + "outputs": [], + "source": [ + "# load into one dataframe\n", + "split_names = list(dataset.keys())\n", + "dataframes = [dataset[split].to_pandas() for split in split_names]\n", + "df = pd.concat(dataframes, ignore_index=True) # if you just need one split: dataset['train'].to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "d3adaa8e781d1e39", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T13:05:24.069712Z", + "start_time": "2025-08-21T13:05:24.061503Z" + } + }, + "outputs": [], + "source": [ + "utterance = media_sum_corpus.get_utterance(\"CNN-177596-7\")" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "604756181fbe1d72", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T13:07:36.312399Z", + "start_time": "2025-08-21T13:07:36.292547Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'This is not good.'" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "utterance.text" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "08406ffe-e6c7-49d4-b753-114afbae630c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "112" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unique_annotators = set(df['Annotator'])\n", + "len(unique_annotators)" + ] + }, + { + "cell_type": 
"code", + "execution_count": 30, + "id": "f1f729ded115626c", + "metadata": {}, + "outputs": [], + "source": [ + "# get all unique pairs or QIDs that were annotated for paraphrases\n", + "unique_qids = set(df['QID'].unique())" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "2bcac2d8733adeb3", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T13:32:14.082618Z", + "start_time": "2025-08-21T13:32:12.806715Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 600/600 [00:01<00:00, 330.18it/s]\n" + ] + } + ], + "source": [ + "pairs = []\n", + "paraphrase_labels = []\n", + "# go over the unique QIDs\n", + "for q_id in tqdm.tqdm(unique_qids):\n", + " group = df[df['QID'] == q_id]\n", + " # Compute total votes and paraphrase votes\n", + " total_votes = len(group)\n", + " paraphrase_votes = group['Is Paraphrase'].astype(int).sum()\n", + "\n", + " meta_info = {\n", + " 'paraphrase_number_votes': int(total_votes), # the number of annotators that rated in total paraphrase and not\n", + " 'paraphrase_votes': int(paraphrase_votes), # the number of annotators voting \n", + " 'paraphrase_ratio': float(paraphrase_votes / total_votes if total_votes > 0 else 0)\n", + " }\n", + "\n", + " # paraphrase label\n", + " paraphrase_labels.append(meta_info['paraphrase_ratio'])\n", + " \n", + " # Process Guest Highlights\n", + " guest_highlights_list = group['Guest Highlights'].apply(ast.literal_eval).tolist()\n", + " guest_highlights_sums = [sum(x)/int(total_votes) for x in zip(*guest_highlights_list)]\n", + "\n", + " # Process Host Highlights\n", + " host_highlights_list = group['Host Highlights'].apply(ast.literal_eval).tolist()\n", + " host_highlights_sums = [sum(x)/int(total_votes) for x in zip(*host_highlights_list)]\n", + "\n", + " cur_utt = media_sum_corpus.get_utterance(q_id)\n", + " utt_number = 
int(q_id.split(\"-\")[2])\n",
+    "    # guest_speaker = cur_utt.speaker.id\n",
+    "    # Annotator ids in row order. guest_highlights_list / host_highlights_list were built\n",
+    "    # from the same rows of `group`, so zip() pairs every id with its own parsed marks\n",
+    "    # and the per-row ast.literal_eval does not have to be repeated for each utterance.\n",
+    "    annotators = group['Annotator'].tolist()\n",
+    "    cur_pair = [[], []]\n",
+    "\n",
+    "    # Guest side: the pair-level highlight vectors hold one entry per whitespace token and\n",
+    "    # may span several consecutive utterances, starting at the utterance named by q_id.\n",
+    "    cur_id = 0\n",
+    "    while cur_id < len(guest_highlights_sums):\n",
+    "        cur_utt_text_len = len(cur_utt.text.split())\n",
+    "        # This utterance owns tokens [cur_id, cur_end) of the pair-level vectors. The end\n",
+    "        # must be an absolute offset: slicing with the bare token count is only right\n",
+    "        # for the first utterance and truncates or empties every later one.\n",
+    "        cur_end = cur_id + cur_utt_text_len\n",
+    "        cur_utt.add_meta('paraphrase_guest_highlights', guest_highlights_sums[cur_id:cur_end])\n",
+    "        # cur_utt.add_meta('paraphrase_guest_words'], cur_utt.text.split())\n",
+    "        cur_utt.add_meta('paraphrase_is_host', False)\n",
+    "        # print(meta_info)\n",
+    "        for key in meta_info.keys():\n",
+    "            cur_utt.add_meta(key, meta_info[key])\n",
+    "        for annotator, highlights in zip(annotators, guest_highlights_list):\n",
+    "            cur_utt.add_meta(\"paraphrase_\" + annotator, highlights[cur_id:cur_end])\n",
+    "        cur_pair[0].append((f\"{cur_utt.conversation_id}-{utt_number}\"))\n",
+    "        utt_number+=1\n",
+    "        # Always step forward here: once the guest tokens are used up, the utterance\n",
+    "        # fetched now is the one the host loop below starts from.\n",
+    "        cur_utt = media_sum_corpus.get_utterance(f\"{cur_utt.conversation_id}-{utt_number}\")\n",
+    "        cur_id = cur_end\n",
+    "\n",
+    "    # Host side: same walk over the host highlight vectors, continuing from cur_utt.\n",
+    "    cur_id = 0\n",
+    "    while cur_id < len(host_highlights_sums):\n",
+    "        cur_utt_text_len = len(cur_utt.text.split())\n",
+    "        cur_end = cur_id + cur_utt_text_len\n",
+    "        cur_utt.add_meta(\"paraphrase_host_highlights\", host_highlights_sums[cur_id:cur_end])\n",
+    "        # cur_utt.add_meta('paraphrase_host_words', cur_utt.text.split())\n",
+    "        cur_utt.add_meta('paraphrase_is_host', True)\n",
+    "        for key in meta_info.keys():\n",
+    "            cur_utt.add_meta(key, meta_info[key])\n",
+    "        for annotator, highlights in zip(annotators, host_highlights_list):\n",
+    "            cur_utt.add_meta(\"paraphrase_\" + annotator, highlights[cur_id:cur_end])\n",
+    "        cur_pair[1].append((f\"{cur_utt.conversation_id}-{utt_number}\"))\n",
+    "        utt_number+=1\n",
+    "        cur_id = cur_end\n",
+    "        # Only look ahead while host tokens remain; nothing reads cur_utt after the last\n",
+    "        # host utterance, and that id may not exist when the host turn ends the\n",
+    "        # conversation (NOTE(review): assumes get_utterance raises on unknown ids).\n",
+    "        if cur_id < len(host_highlights_sums):\n",
+    "            cur_utt = media_sum_corpus.get_utterance(f\"{cur_utt.conversation_id}-{utt_number}\")\n",
+    "    pairs.append(cur_pair)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "561ce72b-75b8-4c45-a497-1ee2b3ca6ff4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+
"output_type": "stream", + "text": [ + "Utterance(id: 'CNN-177596-7', conversation_id: CNN-177596, reply-to: CNN-177596-6, speaker: Speaker(id: 'JOHNS', vectors: [], meta: ConvoKitMeta({'name': 'JOHNS'})), timestamp: None, text: 'This is not good.', vectors: [], meta: ConvoKitMeta({'paraphrase_guest_highlights': [0.5, 0.45, 0.45, 0.45], 'paraphrase_is_host': False, 'paraphrase_number_votes': 20, 'paraphrase_votes': 10, 'paraphrase_ratio': 0.5, 'paraphrase_PROLIFIC_1': [0, 0, 0, 0], 'paraphrase_PROLIFIC_2': [1, 1, 1, 1], 'paraphrase_PROLIFIC_3': [0, 0, 0, 0], 'paraphrase_PROLIFIC_4': [0, 0, 0, 0], 'paraphrase_PROLIFIC_5': [0, 0, 0, 0], 'paraphrase_PROLIFIC_6': [1, 1, 1, 1], 'paraphrase_PROLIFIC_7': [1, 0, 0, 0], 'paraphrase_PROLIFIC_8': [1, 1, 1, 1], 'paraphrase_PROLIFIC_9': [0, 0, 0, 0], 'paraphrase_PROLIFIC_10': [0, 0, 0, 0], 'paraphrase_PROLIFIC_11': [1, 1, 1, 1], 'paraphrase_PROLIFIC_12': [0, 0, 0, 0], 'paraphrase_PROLIFIC_13': [1, 1, 1, 1], 'paraphrase_PROLIFIC_14': [0, 0, 0, 0], 'paraphrase_PROLIFIC_15': [0, 0, 0, 0], 'paraphrase_PROLIFIC_16': [1, 1, 1, 1], 'paraphrase_PROLIFIC_17': [0, 0, 0, 0], 'paraphrase_PROLIFIC_18': [1, 1, 1, 1], 'paraphrase_PROLIFIC_19': [1, 1, 1, 1], 'paraphrase_PROLIFIC_20': [1, 1, 1, 1]}))\n" + ] + } + ], + "source": [ + "utterance = media_sum_corpus.get_utterance(\"CNN-177596-7\")\n", + "print(utterance)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "5042c5f66b3411df", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T13:21:04.286844Z", + "start_time": "2025-08-21T13:21:04.274565Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(ConvoKitMeta({'paraphrase_host_highlights': [0.45, 0.4, 0.45, 0.45, 0.45, 0.45, 0.45, 0.35, 0.35, 0.35, 0.05], 'paraphrase_is_host': True, 'paraphrase_number_votes': 20, 'paraphrase_votes': 10, 'paraphrase_ratio': 0.5, 'paraphrase_PROLIFIC_1': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'paraphrase_PROLIFIC_2': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], 
'paraphrase_PROLIFIC_3': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'paraphrase_PROLIFIC_4': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'paraphrase_PROLIFIC_5': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'paraphrase_PROLIFIC_6': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], 'paraphrase_PROLIFIC_7': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'paraphrase_PROLIFIC_8': [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0], 'paraphrase_PROLIFIC_9': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'paraphrase_PROLIFIC_10': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'paraphrase_PROLIFIC_11': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'paraphrase_PROLIFIC_12': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'paraphrase_PROLIFIC_13': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], 'paraphrase_PROLIFIC_14': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'paraphrase_PROLIFIC_15': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'paraphrase_PROLIFIC_16': [0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0], 'paraphrase_PROLIFIC_17': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'paraphrase_PROLIFIC_18': [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0], 'paraphrase_PROLIFIC_19': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], 'paraphrase_PROLIFIC_20': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]}),\n", + " \"This is what you don't want happening with your menorah, folks.\")" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "utterance = media_sum_corpus.get_utterance(\"CNN-177596-8\")\n", + "utterance.meta, utterance.text" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "33eebc45-1b81-42b5-aa05-680abbaead96", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(ConvoKitMeta({'paraphrase_guest_highlights': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 
0.3333333333333333, 0.3333333333333333, 0.0], 'paraphrase_is_host': False, 'paraphrase_number_votes': 3, 'paraphrase_votes': 3, 'paraphrase_ratio': 1.0, 'paraphrase_PROLIFIC_36': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], 'paraphrase_PROLIFIC_40': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], 'paraphrase_PROLIFIC_53': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]}),\n", + " \"It's a positive sign, I think. I was encouraged to see that. And people always prefer, of course, to see the pope as the principal celebrant of the mass. So that's good. That'll be tonight. And it will be his 26th mass and it will be the 40th or, rather, the 30th time that this is offered in round the world transmission. 
And it will be my 20th time in doing it as a television commentator from Rome so.\")" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "utterance = media_sum_corpus.get_utterance(\"CNN-80522-7\")\n", + "utterance.meta, utterance.text" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "e9595dc5-c37d-4162-b335-05eddfe3ca9d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(ConvoKitMeta({'paraphrase_host_highlights': [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 'paraphrase_is_host': True, 'paraphrase_number_votes': 3, 'paraphrase_votes': 3, 'paraphrase_ratio': 1.0, 'paraphrase_PROLIFIC_36': [0, 1, 1, 1, 1, 1, 1, 1, 1], 'paraphrase_PROLIFIC_40': [0, 1, 1, 1, 1, 1, 1, 1, 1], 'paraphrase_PROLIFIC_53': [0, 1, 1, 1, 1, 1, 1, 1, 1]}),\n", + " \"Yes, you've been doing this for a while now.\")" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "utterance = media_sum_corpus.get_utterance(\"CNN-80522-8\")\n", + "utterance.meta, utterance.text" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "74bc55cd-f0fd-435f-8176-a42639d5e671", + "metadata": {}, + "outputs": [], + "source": [ + "# add the pair IDs that were annotated for paraphrases to the media_sum_corpus metadata\n", + "media_sum_corpus.add_meta(\"paraphrase_pairs\", pairs)\n", + "media_sum_corpus.add_meta(\"paraphrase_labels\", paraphrase_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "ad014e02-dace-4314-8e33-0a184cf54de9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ConvoKitMeta({'name': 'MediaSum Corpus', 'paraphrase_pairs': [[['NPR-15505-5'], ['NPR-15505-6']], [['NPR-29413-12'], ['NPR-29413-13']], [['NPR-32322-5'], ['NPR-32322-6']], [['NPR-36238-12'], ['NPR-36238-13']], [['CNN-79698-3'], ['CNN-79698-4']], [['NPR-4258-5'], ['NPR-4258-6']], [['CNN-74539-5'], ['CNN-74539-6']], 
[['CNN-187362-5'], ['CNN-187362-6']], [['CNN-235767-5'], ['CNN-235767-6']], [['NPR-24552-10'], ['NPR-24552-11']], [['CNN-236636-5'], ['CNN-236636-6']], [['CNN-26647-3'], ['CNN-26647-4']], [['CNN-395861-10'], ['CNN-395861-11']], [['CNN-300212-13'], ['CNN-300212-14']], [['CNN-72381-11'], ['CNN-72381-12']], [['CNN-319588-9'], ['CNN-319588-10']], [['CNN-363088-3'], ['CNN-363088-4']], [['NPR-26301-10', 'NPR-26301-11'], ['NPR-26301-12']], [['CNN-198991-3'], ['CNN-198991-4']], [['CNN-390120-3'], ['CNN-390120-4']], [['CNN-64125-11'], ['CNN-64125-12']], [['NPR-58-11'], ['NPR-58-12']], [['CNN-319869-5'], ['CNN-319869-6']], [['CNN-33404-7'], ['CNN-33404-8']], [['NPR-44959-24', 'NPR-44959-25', 'NPR-44959-26'], ['NPR-44959-27']], [['CNN-378349-3'], ['CNN-378349-4']], [['NPR-8248-3', 'NPR-8248-4'], ['NPR-8248-5']], [['NPR-25659-9'], ['NPR-25659-10']], [['NPR-32992-15', 'NPR-32992-16'], ['NPR-32992-17']], [['CNN-22810-11'], ['CNN-22810-12']], [['CNN-224806-11'], ['CNN-224806-12']], [['NPR-14546-4'], ['NPR-14546-5']], [['NPR-39280-9', 'NPR-39280-10', 'NPR-39280-11'], ['NPR-39280-12']], [['CNN-378542-11'], ['CNN-378542-12']], [['CNN-31436-3'], ['CNN-31436-4']], [['CNN-126364-12'], ['CNN-126364-13']], [['CNN-11952-3'], ['CNN-11952-4']], [['NPR-15761-9'], ['NPR-15761-10']], [['CNN-135292-8', 'CNN-135292-9'], ['CNN-135292-10']], [['CNN-162108-3'], ['CNN-162108-4']], [['CNN-320073-7'], ['CNN-320073-8']], [['CNN-188650-8'], ['CNN-188650-9']], [['CNN-22692-3'], ['CNN-22692-4']], [['CNN-80522-7'], ['CNN-80522-8']], [['CNN-13079-5'], ['CNN-13079-6']], [['CNN-148766-13'], ['CNN-148766-14']], [['NPR-4329-4'], ['NPR-4329-5']], [['NPR-16405-4', 'NPR-16405-5', 'NPR-16405-6'], ['NPR-16405-7', 'NPR-16405-8']], [['NPR-5966-3', 'NPR-5966-4', 'NPR-5966-5', 'NPR-5966-6', 'NPR-5966-7'], ['NPR-5966-8']], [['NPR-39855-9'], ['NPR-39855-10']], [['CNN-29888-9'], ['CNN-29888-10']], [['CNN-185406-7'], ['CNN-185406-8']], [['CNN-58823-9'], ['CNN-58823-10']], [['CNN-182566-9'], ['CNN-182566-10']], 
[['NPR-21694-3'], ['NPR-21694-4']], [['CNN-147734-3'], ['CNN-147734-4']], [['CNN-169621-5'], ['CNN-169621-6']], [['CNN-75616-9'], ['CNN-75616-10']], [['CNN-11451-3'], ['CNN-11451-4']], [['NPR-4238-7', 'NPR-4238-8'], ['NPR-4238-9']], [['CNN-69574-17'], ['CNN-69574-18']], [['CNN-178518-5'], ['CNN-178518-6']], [['CNN-54317-5'], ['CNN-54317-6']], [['CNN-70652-7'], ['CNN-70652-8']], [['CNN-13309-5'], ['CNN-13309-6']], [['NPR-58-5'], ['NPR-58-6']], [['CNN-398359-4'], ['CNN-398359-5']], [['NPR-26677-5'], ['NPR-26677-6']], [['CNN-8208-5'], ['CNN-8208-6']], [['NPR-9347-17', 'NPR-9347-18', 'NPR-9347-19'], ['NPR-9347-20']], [['CNN-259380-5'], ['CNN-259380-6']], [['CNN-228174-6'], ['CNN-228174-7']], [['CNN-24667-5'], ['CNN-24667-6']], [['CNN-197484-7'], ['CNN-197484-8']], [['CNN-44456-3'], ['CNN-44456-4']], [['CNN-56267-3'], ['CNN-56267-4']], [['CNN-386671-9'], ['CNN-386671-10']], [['CNN-108240-7'], ['CNN-108240-8']], [['CNN-174464-13'], ['CNN-174464-14']], [['NPR-45424-5'], ['NPR-45424-6']], [['NPR-9347-10', 'NPR-9347-11'], ['NPR-9347-12']], [['NPR-42397-7'], ['NPR-42397-8']], [['CNN-226205-5'], ['CNN-226205-6']], [['CNN-111610-7'], ['CNN-111610-8']], [['NPR-27923-3', 'NPR-27923-4', 'NPR-27923-5'], ['NPR-27923-6']], [['CNN-159830-3'], ['CNN-159830-4']], [['CNN-1649-7'], ['CNN-1649-8']], [['CNN-374873-12'], ['CNN-374873-13']], [['CNN-401386-3'], ['CNN-401386-4']], [['CNN-408093-3'], ['CNN-408093-4']], [['NPR-4581-4'], ['NPR-4581-5']], [['NPR-29986-8', 'NPR-29986-9'], ['NPR-29986-10']], [['NPR-35888-5', 'NPR-35888-6'], ['NPR-35888-7']], [['CNN-136098-3'], ['CNN-136098-4']], [['NPR-20571-5'], ['NPR-20571-6']], [['NPR-7508-9', 'NPR-7508-10'], ['NPR-7508-11']], [['CNN-168166-4'], ['CNN-168166-5']], [['CNN-87376-7'], ['CNN-87376-8']], [['NPR-32992-12', 'NPR-32992-13'], ['NPR-32992-14']], [['NPR-6595-7'], ['NPR-6595-8']], [['NPR-14986-14'], ['NPR-14986-15']], [['CNN-374557-6'], ['CNN-374557-7']], [['CNN-62768-13'], ['CNN-62768-14']], [['CNN-68683-5'], ['CNN-68683-6']], 
[['CNN-42770-7'], ['CNN-42770-8']], [['CNN-327899-3'], ['CNN-327899-4']], [['NPR-23442-5'], ['NPR-23442-6']], [['CNN-190136-7'], ['CNN-190136-8']], [['CNN-26241-5'], ['CNN-26241-6']], [['CNN-339958-3'], ['CNN-339958-4']], [['NPR-20426-6'], ['NPR-20426-7']], [['CNN-69574-13'], ['CNN-69574-14']], [['NPR-27311-4', 'NPR-27311-5', 'NPR-27311-6'], ['NPR-27311-7']], [['CNN-95407-11'], ['CNN-95407-12']], [['CNN-319588-7'], ['CNN-319588-8']], [['CNN-323891-3'], ['CNN-323891-4']], [['CNN-212822-9'], ['CNN-212822-10']], [['NPR-11551-6'], ['NPR-11551-7']], [['CNN-10486-11'], ['CNN-10486-12']], [['CNN-178518-3'], ['CNN-178518-4']], [['CNN-24678-5'], ['CNN-24678-6']], [['CNN-22088-3'], ['CNN-22088-4']], [['CNN-22326-7'], ['CNN-22326-8']], [['CNN-241881-3'], ['CNN-241881-4']], [['NPR-15761-3'], ['NPR-15761-4']], [['CNN-52317-7'], ['CNN-52317-8']], [['CNN-114925-5'], ['CNN-114925-6']], [['CNN-18448-7'], ['CNN-18448-8']], [['NPR-13536-7'], ['NPR-13536-8']], [['CNN-13309-11'], ['CNN-13309-12']], [['CNN-61505-15'], ['CNN-61505-16']], [['NPR-32479-7', 'NPR-32479-8', 'NPR-32479-9', 'NPR-32479-10'], ['NPR-32479-11']], [['CNN-235909-21'], ['CNN-235909-22']], [['NPR-32322-11'], ['NPR-32322-12']], [['CNN-371716-3'], ['CNN-371716-4']], [['CNN-83-7'], ['CNN-83-8']], [['CNN-293273-3'], ['CNN-293273-4']], [['CNN-40490-3'], ['CNN-40490-4']], [['NPR-42283-9', 'NPR-42283-10'], ['NPR-42283-11']], [['NPR-4581-6'], ['NPR-4581-7']], [['NPR-20491-7'], ['NPR-20491-8']], [['NPR-41516-15'], ['NPR-41516-16']], [['CNN-193731-3'], ['CNN-193731-4']], [['NPR-29986-4'], ['NPR-29986-5']], [['NPR-18055-16'], ['NPR-18055-17']], [['CNN-349401-8'], ['CNN-349401-9']], [['CNN-283887-9'], ['CNN-283887-10']], [['CNN-73833-11'], ['CNN-73833-12']], [['NPR-636-7', 'NPR-636-8'], ['NPR-636-9']], [['NPR-19489-5'], ['NPR-19489-6']], [['CNN-192572-5'], ['CNN-192572-6']], [['CNN-221236-3'], ['CNN-221236-4']], [['CNN-75803-3'], ['CNN-75803-4']], [['NPR-795-9'], ['NPR-795-10']], [['NPR-30457-5', 'NPR-30457-6'], ['NPR-30457-7']], 
[['NPR-45051-11'], ['NPR-45051-12']], [['CNN-41861-3'], ['CNN-41861-4']], [['NPR-14133-4'], ['NPR-14133-5']], [['CNN-313218-3'], ['CNN-313218-4']], [['CNN-82534-9'], ['CNN-82534-10']], [['CNN-198991-5'], ['CNN-198991-6']], [['NPR-32322-3'], ['NPR-32322-4']], [['CNN-385021-10'], ['CNN-385021-11']], [['CNN-3409-7'], ['CNN-3409-8']], [['CNN-144616-5'], ['CNN-144616-6']], [['CNN-51112-7'], ['CNN-51112-8']], [['CNN-329196-7'], ['CNN-329196-8']], [['CNN-32771-11'], ['CNN-32771-12']], [['CNN-333433-3'], ['CNN-333433-4']], [['CNN-25526-3'], ['CNN-25526-4']], [['CNN-159391-15'], ['CNN-159391-16']], [['NPR-44902-13'], ['NPR-44902-14']], [['CNN-390220-5'], ['CNN-390220-6']], [['CNN-224826-5'], ['CNN-224826-6']], [['CNN-265170-3'], ['CNN-265170-4']], [['CNN-44522-8'], ['CNN-44522-9']], [['CNN-78023-3'], ['CNN-78023-4']], [['NPR-9809-7'], ['NPR-9809-8']], [['CNN-345614-9'], ['CNN-345614-10']], [['NPR-13735-6'], ['NPR-13735-7']], [['CNN-154710-20'], ['CNN-154710-21']], [['NPR-42649-11'], ['NPR-42649-12']], [['NPR-26677-3'], ['NPR-26677-4']], [['NPR-39794-10'], ['NPR-39794-11']], [['NPR-19802-5'], ['NPR-19802-6']], [['NPR-13735-12', 'NPR-13735-13', 'NPR-13735-14'], ['NPR-13735-15']], [['NPR-38181-5', 'NPR-38181-6'], ['NPR-38181-7']], [['CNN-257156-5'], ['CNN-257156-6']], [['CNN-14483-5'], ['CNN-14483-6']], [['CNN-321514-9'], ['CNN-321514-10']], [['CNN-350238-9'], ['CNN-350238-10']], [['CNN-155836-23'], ['CNN-155836-24']], [['NPR-18056-8', 'NPR-18056-9'], ['NPR-18056-10']], [['NPR-13227-12', 'NPR-13227-13'], ['NPR-13227-14']], [['NPR-13639-11', 'NPR-13639-12'], ['NPR-13639-13']], [['CNN-143964-5'], ['CNN-143964-6']], [['NPR-14986-10'], ['NPR-14986-11']], [['NPR-733-15'], ['NPR-733-16']], [['NPR-15724-4', 'NPR-15724-5', 'NPR-15724-6'], ['NPR-15724-7', 'NPR-15724-8']], [['CNN-68742-3'], ['CNN-68742-4']], [['NPR-46180-5'], ['NPR-46180-6']], [['NPR-8617-9', 'NPR-8617-10'], ['NPR-8617-11']], [['CNN-17933-5'], ['CNN-17933-6']], [['CNN-327899-9'], ['CNN-327899-10']], [['NPR-10683-9'], 
['NPR-10683-10']], [['NPR-26945-7'], ['NPR-26945-8']], [['CNN-108240-3'], ['CNN-108240-4']], [['CNN-323090-3'], ['CNN-323090-4']], [['CNN-345032-3'], ['CNN-345032-4']], [['CNN-65775-3'], ['CNN-65775-4']], [['NPR-35389-10'], ['NPR-35389-11']], [['NPR-32322-9'], ['NPR-32322-10']], [['CNN-98333-7'], ['CNN-98333-8']], [['NPR-42763-7', 'NPR-42763-8'], ['NPR-42763-9']], [['CNN-145687-3'], ['CNN-145687-4']], [['CNN-243827-11'], ['CNN-243827-12']], [['CNN-170110-7'], ['CNN-170110-8']], [['NPR-48919-5'], ['NPR-48919-6']], [['CNN-80402-3'], ['CNN-80402-4']], [['CNN-148030-7'], ['CNN-148030-8']], [['CNN-187362-11'], ['CNN-187362-12']], [['CNN-50949-3'], ['CNN-50949-4']], [['NPR-8678-6', 'NPR-8678-7'], ['NPR-8678-8']], [['CNN-14551-3'], ['CNN-14551-4']], [['CNN-28077-7'], ['CNN-28077-8']], [['CNN-187362-7'], ['CNN-187362-8']], [['CNN-131966-5'], ['CNN-131966-6']], [['CNN-6322-5'], ['CNN-6322-6']], [['NPR-44579-3'], ['NPR-44579-4']], [['NPR-19540-5'], ['NPR-19540-6']], [['NPR-13476-4'], ['NPR-13476-5']], [['NPR-14546-8'], ['NPR-14546-9']], [['NPR-20086-7'], ['NPR-20086-8']], [['CNN-270198-3'], ['CNN-270198-4']], [['CNN-111931-7'], ['CNN-111931-8']], [['CNN-245013-12'], ['CNN-245013-13']], [['CNN-32771-3'], ['CNN-32771-4']], [['CNN-378542-13'], ['CNN-378542-14']], [['CNN-258563-3'], ['CNN-258563-4']], [['CNN-195140-5'], ['CNN-195140-6']], [['CNN-341938-3'], ['CNN-341938-4']], [['NPR-24016-4'], ['NPR-24016-5']], [['CNN-74126-3'], ['CNN-74126-4']], [['NPR-42758-10', 'NPR-42758-11'], ['NPR-42758-12']], [['CNN-393323-3'], ['CNN-393323-4']], [['CNN-40557-5'], ['CNN-40557-6']], [['CNN-64921-5'], ['CNN-64921-6']], [['CNN-166487-12'], ['CNN-166487-13']], [['NPR-20491-4', 'NPR-20491-5'], ['NPR-20491-6']], [['CNN-370927-11'], ['CNN-370927-12']], [['CNN-187940-6'], ['CNN-187940-7']], [['NPR-8339-8', 'NPR-8339-9'], ['NPR-8339-10']], [['CNN-312194-3'], ['CNN-312194-4']], [['NPR-10742-4'], ['NPR-10742-5']], [['CNN-221020-3'], ['CNN-221020-4']], [['CNN-349795-9'], ['CNN-349795-10']], 
[['CNN-255832-9'], ['CNN-255832-10']], [['NPR-12213-3'], ['NPR-12213-4']], [['NPR-10997-3'], ['NPR-10997-4']], [['CNN-376903-8'], ['CNN-376903-9']], [['NPR-45706-6'], ['NPR-45706-7']], [['CNN-254573-3'], ['CNN-254573-4']], [['NPR-22848-7', 'NPR-22848-8'], ['NPR-22848-9']], [['CNN-55894-3'], ['CNN-55894-4']], [['CNN-84854-5'], ['CNN-84854-6']], [['NPR-12213-7'], ['NPR-12213-8']], [['CNN-378674-9'], ['CNN-378674-10']], [['CNN-327899-13'], ['CNN-327899-14']], [['CNN-395861-6'], ['CNN-395861-7']], [['CNN-303095-5'], ['CNN-303095-6']], [['NPR-17734-16'], ['NPR-17734-17']], [['CNN-39232-11'], ['CNN-39232-12']], [['CNN-80984-3'], ['CNN-80984-4']], [['CNN-126364-6'], ['CNN-126364-7']], [['CNN-44404-7'], ['CNN-44404-8']], [['CNN-61505-19'], ['CNN-61505-20']], [['CNN-155750-6'], ['CNN-155750-7']], [['CNN-52275-17'], ['CNN-52275-18']], [['CNN-166930-9'], ['CNN-166930-10']], [['CNN-8419-9'], ['CNN-8419-10']], [['CNN-231880-9'], ['CNN-231880-10']], [['CNN-162898-12'], ['CNN-162898-13']], [['NPR-11045-5', 'NPR-11045-6'], ['NPR-11045-7']], [['CNN-371078-3'], ['CNN-371078-4']], [['NPR-20286-8'], ['NPR-20286-9']], [['NPR-17919-9', 'NPR-17919-10', 'NPR-17919-11'], ['NPR-17919-12', 'NPR-17919-13', 'NPR-17919-14']], [['CNN-50501-5'], ['CNN-50501-6']], [['CNN-140088-3'], ['CNN-140088-4']], [['CNN-22554-3'], ['CNN-22554-4']], [['CNN-146084-3'], ['CNN-146084-4']], [['CNN-30274-13'], ['CNN-30274-14']], [['NPR-40990-7'], ['NPR-40990-8', 'NPR-40990-9']], [['NPR-12431-13'], ['NPR-12431-14']], [['CNN-220275-3'], ['CNN-220275-4']], [['CNN-52217-3'], ['CNN-52217-4']], [['CNN-187587-5'], ['CNN-187587-6']], [['NPR-35922-5', 'NPR-35922-6', 'NPR-35922-7'], ['NPR-35922-8']], [['CNN-285008-4'], ['CNN-285008-5']], [['CNN-41259-4'], ['CNN-41259-5']], [['CNN-42770-17'], ['CNN-42770-18']], [['NPR-2705-4', 'NPR-2705-5'], ['NPR-2705-6']], [['NPR-17496-10'], ['NPR-17496-11']], [['CNN-38805-3'], ['CNN-38805-4']], [['CNN-57033-5'], ['CNN-57033-6']], [['CNN-214354-5'], ['CNN-214354-6']], [['CNN-45579-5'], 
['CNN-45579-6']], [['CNN-82012-6'], ['CNN-82012-7']], [['CNN-21949-9'], ['CNN-21949-10']], [['NPR-3764-12'], ['NPR-3764-13']], [['CNN-78827-5'], ['CNN-78827-6']], [['NPR-35389-8'], ['NPR-35389-9']], [['CNN-9396-3'], ['CNN-9396-4']], [['CNN-268015-12'], ['CNN-268015-13']], [['CNN-70747-7'], ['CNN-70747-8']], [['CNN-19564-5'], ['CNN-19564-6']], [['NPR-41134-14'], ['NPR-41134-15']], [['CNN-148718-9'], ['CNN-148718-10']], [['CNN-162898-6'], ['CNN-162898-7']], [['CNN-155730-5'], ['CNN-155730-6']], [['CNN-98051-7'], ['CNN-98051-8']], [['CNN-61850-5'], ['CNN-61850-6']], [['NPR-20376-7'], ['NPR-20376-8']], [['CNN-11451-13'], ['CNN-11451-14']], [['CNN-72706-11'], ['CNN-72706-12']], [['CNN-5386-3'], ['CNN-5386-4']], [['CNN-111931-3'], ['CNN-111931-4']], [['NPR-17702-8', 'NPR-17702-9'], ['NPR-17702-10']], [['NPR-6494-5'], ['NPR-6494-6']], [['CNN-50898-3'], ['CNN-50898-4']], [['NPR-40437-10', 'NPR-40437-11'], ['NPR-40437-12']], [['CNN-378674-17'], ['CNN-378674-18']], [['CNN-269414-5'], ['CNN-269414-6']], [['CNN-274148-7'], ['CNN-274148-8']], [['CNN-24376-3'], ['CNN-24376-4']], [['NPR-44121-10', 'NPR-44121-11', 'NPR-44121-12'], ['NPR-44121-13']], [['CNN-63439-7'], ['CNN-63439-8']], [['NPR-8339-15', 'NPR-8339-16'], ['NPR-8339-17']], [['CNN-301662-4'], ['CNN-301662-5']], [['CNN-35423-9'], ['CNN-35423-10']], [['CNN-21377-3'], ['CNN-21377-4']], [['NPR-15761-7'], ['NPR-15761-8']], [['NPR-5871-3'], ['NPR-5871-4']], [['CNN-10486-9'], ['CNN-10486-10']], [['NPR-47914-8', 'NPR-47914-9', 'NPR-47914-10'], ['NPR-47914-11']], [['NPR-23442-3'], ['NPR-23442-4']], [['CNN-326433-7'], ['CNN-326433-8']], [['CNN-80402-7'], ['CNN-80402-8']], [['NPR-9809-12', 'NPR-9809-13'], ['NPR-9809-14']], [['CNN-62525-9'], ['CNN-62525-10']], [['CNN-164977-3'], ['CNN-164977-4']], [['CNN-112560-7'], ['CNN-112560-8']], [['NPR-7619-6'], ['NPR-7619-7']], [['CNN-58514-3'], ['CNN-58514-4']], [['CNN-175637-9'], ['CNN-175637-10']], [['CNN-95407-3'], ['CNN-95407-4']], [['NPR-2995-10'], ['NPR-2995-11']], [['CNN-316325-9'], 
['CNN-316325-10']], [['NPR-48741-7'], ['NPR-48741-8']], [['CNN-378275-5'], ['CNN-378275-6']], [['CNN-55879-13'], ['CNN-55879-14']], [['NPR-2622-10'], ['NPR-2622-11']], [['CNN-363203-3'], ['CNN-363203-4']], [['CNN-266429-3'], ['CNN-266429-4']], [['NPR-18917-10'], ['NPR-18917-11']], [['CNN-22088-7'], ['CNN-22088-8']], [['NPR-23790-5'], ['NPR-23790-6']], [['CNN-191293-5'], ['CNN-191293-6']], [['NPR-45810-9'], ['NPR-45810-10']], [['NPR-23296-9'], ['NPR-23296-10']], [['NPR-16222-4'], ['NPR-16222-5']], [['CNN-171567-3'], ['CNN-171567-4']], [['CNN-229741-7'], ['CNN-229741-8']], [['NPR-252-13'], ['NPR-252-14']], [['CNN-384940-15'], ['CNN-384940-16']], [['CNN-19396-7'], ['CNN-19396-8']], [['NPR-49275-3'], ['NPR-49275-4']], [['CNN-192500-5'], ['CNN-192500-6']], [['CNN-6197-7'], ['CNN-6197-8']], [['CNN-28089-7'], ['CNN-28089-8']], [['NPR-27311-8'], ['NPR-27311-9']], [['NPR-28157-4'], ['NPR-28157-5']], [['NPR-39855-3'], ['NPR-39855-4']], [['NPR-29413-7', 'NPR-29413-8'], ['NPR-29413-9']], [['NPR-8150-9'], ['NPR-8150-10']], [['CNN-388007-10'], ['CNN-388007-11']], [['CNN-154452-5'], ['CNN-154452-6']], [['CNN-166512-5'], ['CNN-166512-6']], [['NPR-20086-15'], ['NPR-20086-16']], [['CNN-350653-7', 'CNN-350653-8'], ['CNN-350653-9']], [['CNN-370927-7'], ['CNN-370927-8']], [['NPR-14122-9'], ['NPR-14122-10']], [['NPR-16743-3'], ['NPR-16743-4']], [['CNN-231880-5'], ['CNN-231880-6']], [['CNN-235767-9'], ['CNN-235767-10']], [['NPR-6831-29'], ['NPR-6831-30']], [['CNN-114404-5'], ['CNN-114404-6']], [['CNN-36090-3'], ['CNN-36090-4']], [['CNN-227609-5'], ['CNN-227609-6']], [['CNN-44456-5'], ['CNN-44456-6']], [['NPR-2541-23'], ['NPR-2541-24']], [['CNN-6322-7'], ['CNN-6322-8']], [['NPR-33017-3'], ['NPR-33017-4']], [['NPR-20855-6'], ['NPR-20855-7']], [['NPR-986-18'], ['NPR-986-19']], [['CNN-84604-9'], ['CNN-84604-10']], [['CNN-36090-7'], ['CNN-36090-8']], [['CNN-31436-9'], ['CNN-31436-10']], [['CNN-372119-11'], ['CNN-372119-12']], [['CNN-55331-7'], ['CNN-55331-8']], [['CNN-388007-7', 
'CNN-388007-8'], ['CNN-388007-9']], [['CNN-377264-11'], ['CNN-377264-12']], [['NPR-25926-9'], ['NPR-25926-10']], [['NPR-58-9'], ['NPR-58-10']], [['CNN-237525-9'], ['CNN-237525-10']], [['NPR-35889-8'], ['NPR-35889-9']], [['CNN-261527-7'], ['CNN-261527-8']], [['NPR-45810-15'], ['NPR-45810-16']], [['CNN-26711-3'], ['CNN-26711-4']], [['CNN-34495-8'], ['CNN-34495-9']], [['CNN-50501-3'], ['CNN-50501-4']], [['NPR-6494-7'], ['NPR-6494-8']], [['NPR-12431-3'], ['NPR-12431-4']], [['NPR-22815-3'], ['NPR-22815-4']], [['NPR-6131-9', 'NPR-6131-10'], ['NPR-6131-11']], [['CNN-34967-7'], ['CNN-34967-8']], [['NPR-19127-3', 'NPR-19127-4'], ['NPR-19127-5']], [['CNN-328670-9'], ['CNN-328670-10']], [['CNN-350238-7'], ['CNN-350238-8']], [['CNN-177596-7'], ['CNN-177596-8']], [['NPR-21694-7', 'NPR-21694-8', 'NPR-21694-9'], ['NPR-21694-10']], [['CNN-80157-3'], ['CNN-80157-4']], [['CNN-189512-7'], ['CNN-189512-8']], [['CNN-44986-5'], ['CNN-44986-6']], [['CNN-103838-3'], ['CNN-103838-4']], [['CNN-58823-3'], ['CNN-58823-4']], [['CNN-234343-5'], ['CNN-234343-6']], [['NPR-22359-11'], ['NPR-22359-12']], [['CNN-412181-3'], ['CNN-412181-4']], [['CNN-80984-7'], ['CNN-80984-8']], [['NPR-11718-10'], ['NPR-11718-11']], [['CNN-33404-5'], ['CNN-33404-6']], [['CNN-374557-10'], ['CNN-374557-11']], [['CNN-18712-3'], ['CNN-18712-4']], [['NPR-15913-5'], ['NPR-15913-6']], [['NPR-8365-7', 'NPR-8365-8'], ['NPR-8365-9']], [['CNN-179766-6'], ['CNN-179766-7']], [['CNN-372414-5'], ['CNN-372414-6']], [['CNN-165408-7'], ['CNN-165408-8']], [['CNN-26647-5'], ['CNN-26647-6']], [['CNN-384581-5'], ['CNN-384581-6']], [['CNN-226205-3'], ['CNN-226205-4']], [['NPR-27460-13'], ['NPR-27460-14']], [['NPR-37713-3'], ['NPR-37713-4']], [['CNN-19396-9'], ['CNN-19396-10']], [['NPR-40147-15'], ['NPR-40147-16', 'NPR-40147-17']], [['CNN-7965-7'], ['CNN-7965-8']], [['CNN-66474-3'], ['CNN-66474-4']], [['NPR-39531-4'], ['NPR-39531-5']], [['CNN-13892-3'], ['CNN-13892-4']], [['CNN-22232-3'], ['CNN-22232-4']], [['NPR-30517-3'], 
['NPR-30517-4']], [['CNN-76224-3'], ['CNN-76224-4']], [['CNN-38493-11'], ['CNN-38493-12']], [['NPR-34690-4', 'NPR-34690-5', 'NPR-34690-6', 'NPR-34690-7'], ['NPR-34690-8']], [['NPR-44959-16', 'NPR-44959-17'], ['NPR-44959-18']], [['NPR-20431-5'], ['NPR-20431-6']], [['NPR-16481-3'], ['NPR-16481-4']], [['CNN-134633-5'], ['CNN-134633-6']], [['CNN-9750-11'], ['CNN-9750-12']], [['NPR-7886-6'], ['NPR-7886-7']], [['CNN-264364-11'], ['CNN-264364-12']], [['CNN-148030-5'], ['CNN-148030-6']], [['CNN-67068-5'], ['CNN-67068-6']], [['NPR-17376-15'], ['NPR-17376-16']], [['NPR-27460-5'], ['NPR-27460-6']], [['CNN-74624-5'], ['CNN-74624-6']], [['NPR-30121-10', 'NPR-30121-11'], ['NPR-30121-12']], [['NPR-14806-5'], ['NPR-14806-6']], [['CNN-368578-5'], ['CNN-368578-6']], [['CNN-36590-14'], ['CNN-36590-15']], [['CNN-141300-3'], ['CNN-141300-4']], [['NPR-6831-23'], ['NPR-6831-24']], [['CNN-168600-3'], ['CNN-168600-4']], [['NPR-46081-4', 'NPR-46081-5', 'NPR-46081-6'], ['NPR-46081-7', 'NPR-46081-8', 'NPR-46081-9']], [['CNN-332405-27'], ['CNN-332405-28']], [['CNN-33404-3'], ['CNN-33404-4']], [['CNN-33404-9', 'CNN-33404-10'], ['CNN-33404-11']], [['CNN-154879-3'], ['CNN-154879-4']], [['NPR-12123-3'], ['NPR-12123-4']], [['NPR-15308-5'], ['NPR-15308-6']], [['NPR-23629-3'], ['NPR-23629-4']], [['NPR-22815-7'], ['NPR-22815-8']], [['CNN-70652-3'], ['CNN-70652-4']], [['CNN-253680-3'], ['CNN-253680-4']], [['NPR-5531-10', 'NPR-5531-11', 'NPR-5531-12'], ['NPR-5531-13']], [['CNN-344762-3'], ['CNN-344762-4']], [['CNN-154452-7'], ['CNN-154452-8']], [['CNN-174603-13'], ['CNN-174603-14']], [['CNN-98051-5'], ['CNN-98051-6']], [['CNN-4434-3'], ['CNN-4434-4']], [['CNN-66127-3'], ['CNN-66127-4']], [['CNN-30808-15'], ['CNN-30808-16']], [['NPR-44230-10', 'NPR-44230-11'], ['NPR-44230-12']], [['NPR-13225-5'], ['NPR-13225-6']], [['CNN-350097-7'], ['CNN-350097-8']], [['CNN-122586-9'], ['CNN-122586-10']], [['CNN-30274-10', 'CNN-30274-11'], ['CNN-30274-12']], [['CNN-196162-3'], ['CNN-196162-4']], [['CNN-390220-9'], 
['CNN-390220-10']], [['CNN-377264-3'], ['CNN-377264-4']], [['CNN-276969-5'], ['CNN-276969-6']], [['CNN-254880-9'], ['CNN-254880-10']], [['NPR-47942-9'], ['NPR-47942-10']], [['NPR-30473-7'], ['NPR-30473-8']], [['CNN-123568-3'], ['CNN-123568-4']], [['CNN-196862-3'], ['CNN-196862-4']], [['CNN-61850-11'], ['CNN-61850-12']], [['CNN-217116-9'], ['CNN-217116-10']], [['CNN-98333-3'], ['CNN-98333-4']], [['NPR-23296-7'], ['NPR-23296-8']], [['NPR-24380-6'], ['NPR-24380-7']], [['CNN-64921-3'], ['CNN-64921-4']], [['CNN-130492-3'], ['CNN-130492-4']], [['NPR-27690-13', 'NPR-27690-14'], ['NPR-27690-15']], [['CNN-208335-11'], ['CNN-208335-12']], [['CNN-139432-8'], ['CNN-139432-9']], [['CNN-382242-8'], ['CNN-382242-9', 'CNN-382242-10']], [['CNN-274148-3'], ['CNN-274148-4']], [['CNN-321443-13'], ['CNN-321443-14']], [['NPR-10683-11'], ['NPR-10683-12']], [['CNN-151830-18'], ['CNN-151830-19']], [['CNN-32269-13'], ['CNN-32269-14']], [['CNN-413106-9'], ['CNN-413106-10']], [['CNN-51112-5'], ['CNN-51112-6']], [['NPR-658-8', 'NPR-658-9'], ['NPR-658-10']], [['CNN-347633-7'], ['CNN-347633-8']], [['CNN-51636-9'], ['CNN-51636-10']], [['CNN-217116-7'], ['CNN-217116-8']], [['NPR-9355-6'], ['NPR-9355-7']], [['NPR-1945-4', 'NPR-1945-5'], ['NPR-1945-6']], [['CNN-59546-7'], ['CNN-59546-8']], [['CNN-108240-11'], ['CNN-108240-12']], [['CNN-15655-13'], ['CNN-15655-14']], [['CNN-395083-3'], ['CNN-395083-4']], [['CNN-65775-5'], ['CNN-65775-6']], [['NPR-8365-10', 'NPR-8365-11'], ['NPR-8365-12']], [['NPR-3135-3'], ['NPR-3135-4']], [['CNN-166930-17'], ['CNN-166930-18']], [['NPR-2656-11'], ['NPR-2656-12']], [['CNN-227626-5', 'CNN-227626-6'], ['CNN-227626-7']], [['CNN-111931-5'], ['CNN-111931-6']], [['NPR-33294-11'], ['NPR-33294-12']], [['NPR-23790-3'], ['NPR-23790-4']], [['CNN-235909-17'], ['CNN-235909-18']], [['NPR-41743-13', 'NPR-41743-14'], ['NPR-41743-15']], [['CNN-286124-3'], ['CNN-286124-4']], [['NPR-4855-7'], ['NPR-4855-8']], [['NPR-19980-12', 'NPR-19980-13'], ['NPR-19980-14']], [['CNN-185406-9'], 
['CNN-185406-10']], [['CNN-90580-5'], ['CNN-90580-6']], [['CNN-245013-8'], ['CNN-245013-9']], [['CNN-3199-7'], ['CNN-3199-8']], [['NPR-10997-5'], ['NPR-10997-6']], [['NPR-9269-7'], ['NPR-9269-8']], [['CNN-48129-9'], ['CNN-48129-10']], [['CNN-112560-5'], ['CNN-112560-6']], [['CNN-303095-9'], ['CNN-303095-10']], [['CNN-286939-3'], ['CNN-286939-4']], [['NPR-3764-14'], ['NPR-3764-15']], [['CNN-384602-3'], ['CNN-384602-4']], [['CNN-321443-15'], ['CNN-321443-16']], [['CNN-286787-5'], ['CNN-286787-6']], [['CNN-26711-7'], ['CNN-26711-8']], [['NPR-2979-7'], ['NPR-2979-8']], [['NPR-23790-7'], ['NPR-23790-8']], [['NPR-16222-13'], ['NPR-16222-14']], [['CNN-74084-5'], ['CNN-74084-6']], [['CNN-14551-7'], ['CNN-14551-8']], [['CNN-178381-17'], ['CNN-178381-18']], [['NPR-45058-8'], ['NPR-45058-9']], [['NPR-9347-13', 'NPR-9347-14', 'NPR-9347-15'], ['NPR-9347-16']], [['CNN-142850-8'], ['CNN-142850-9']], [['NPR-16222-6'], ['NPR-16222-7']], [['CNN-175637-3'], ['CNN-175637-4']], [['CNN-279640-5'], ['CNN-279640-6']], [['NPR-9269-19'], ['NPR-9269-20']], [['NPR-9401-3'], ['NPR-9401-4']], [['CNN-73548-5'], ['CNN-73548-6']], [['NPR-34092-16', 'NPR-34092-17'], ['NPR-34092-18']], [['NPR-20376-3'], ['NPR-20376-4']], [['CNN-271388-12'], ['CNN-271388-13']], [['NPR-15848-3'], ['NPR-15848-4', 'NPR-15848-5']], [['NPR-15505-7', 'NPR-15505-8'], ['NPR-15505-9']], [['NPR-18917-8'], ['NPR-18917-9']], [['NPR-2622-6', 'NPR-2622-7', 'NPR-2622-8'], ['NPR-2622-9']], [['NPR-8339-5', 'NPR-8339-6'], ['NPR-8339-7']], [['CNN-35048-3'], ['CNN-35048-4']], [['NPR-45810-13'], ['NPR-45810-14']], [['NPR-12399-3'], ['NPR-12399-4']], [['NPR-2541-21'], ['NPR-2541-22']]], 'paraphrase_labels': [0.0, 0.7, 1.0, 0.5333333333333333, 0.5, 0.0, 0.0, 0.0, 1.0, 0.6, 0.4, 1.0, 1.0, 0.85, 0.8, 0.5333333333333333, 0.55, 0.8, 1.0, 1.0, 0.7, 0.0, 0.0, 0.2, 0.95, 0.2, 0.95, 0.5, 1.0, 0.2, 0.7333333333333333, 0.8, 0.8, 0.3333333333333333, 0.5333333333333333, 0.0, 0.2, 0.8, 0.4666666666666667, 0.6, 0.8, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 
0.8, 1.0, 0.4, 1.0, 0.0, 1.0, 0.0, 0.6666666666666666, 0.05, 0.0, 1.0, 0.8, 0.0, 1.0, 0.2, 0.0, 0.5333333333333333, 0.0, 0.55, 1.0, 0.5333333333333333, 0.4666666666666667, 1.0, 0.5333333333333333, 0.6, 0.23076923076923078, 0.6, 0.2, 0.8, 0.35, 0.0, 0.0, 0.8, 0.85, 1.0, 0.0, 0.8, 1.0, 0.8, 0.0, 0.2, 0.0, 1.0, 0.0, 0.0, 0.5333333333333333, 0.0, 0.2, 0.6, 0.4666666666666667, 0.2, 0.8, 0.23076923076923078, 0.0, 0.0, 0.0, 0.0, 0.6, 0.65, 0.8, 0.5333333333333333, 1.0, 0.3333333333333333, 0.0, 1.0, 0.0, 0.0, 0.8, 0.0, 0.7777777777777778, 0.3333333333333333, 0.5333333333333333, 0.0, 0.5333333333333333, 0.4, 0.6666666666666666, 1.0, 0.2, 0.7692307692307693, 0.2, 1.0, 0.2, 1.0, 0.5333333333333333, 0.7, 1.0, 0.3333333333333333, 1.0, 0.8, 0.3333333333333333, 0.4, 0.4666666666666667, 0.4, 1.0, 0.8, 0.3333333333333333, 0.3333333333333333, 0.42857142857142855, 1.0, 0.0, 1.0, 0.4666666666666667, 0.3333333333333333, 0.8, 0.0, 0.2, 1.0, 0.5714285714285714, 1.0, 0.5238095238095238, 0.5333333333333333, 1.0, 0.0, 1.0, 0.4666666666666667, 0.5333333333333333, 0.6, 0.2, 0.6428571428571429, 0.1, 0.42857142857142855, 1.0, 0.4666666666666667, 0.2, 0.0, 0.4, 0.0, 0.8, 1.0, 0.35, 1.0, 1.0, 0.0, 0.5333333333333333, 1.0, 0.6, 0.5333333333333333, 0.6666666666666666, 0.6666666666666666, 0.5333333333333333, 1.0, 0.4666666666666667, 0.9, 0.4, 0.6, 0.5333333333333333, 0.6666666666666666, 1.0, 0.2, 0.5, 0.8, 0.4, 1.0, 0.8, 0.4666666666666667, 0.0, 1.0, 0.6, 0.9523809523809523, 0.7, 1.0, 1.0, 0.35, 1.0, 0.3, 0.2, 1.0, 0.4, 0.0, 0.2, 0.0, 0.0, 0.2, 0.6666666666666666, 0.25, 0.3, 1.0, 0.0, 0.0, 0.0, 0.8, 0.4666666666666667, 0.0, 0.5333333333333333, 1.0, 0.7, 0.05, 0.2222222222222222, 0.5, 0.7692307692307693, 1.0, 0.1, 0.2, 1.0, 0.95, 0.6666666666666666, 0.6666666666666666, 0.0, 0.6666666666666666, 0.8, 0.7777777777777778, 1.0, 1.0, 0.5333333333333333, 1.0, 0.0, 1.0, 1.0, 0.3333333333333333, 0.0, 0.0, 0.95, 1.0, 0.6666666666666666, 0.25, 0.7, 0.5333333333333333, 1.0, 0.6666666666666666, 0.0, 0.2, 0.0, 
0.1, 0.2, 1.0, 0.7333333333333333, 0.0, 0.5333333333333333, 0.4666666666666667, 0.8, 1.0, 0.2, 1.0, 0.2, 0.8, 0.3, 0.0, 0.4, 1.0, 1.0, 0.6666666666666666, 0.5333333333333333, 0.2, 0.7777777777777778, 0.8, 0.2, 1.0, 0.2, 0.23076923076923078, 0.8, 0.3333333333333333, 0.8, 0.75, 0.4, 0.2, 0.0, 0.3333333333333333, 1.0, 0.6, 0.8, 0.2222222222222222, 0.2, 0.8, 1.0, 0.5, 0.0, 0.3333333333333333, 0.5333333333333333, 0.3333333333333333, 0.0, 0.26666666666666666, 0.047619047619047616, 0.8, 0.8, 0.5333333333333333, 0.2, 0.2, 0.9047619047619048, 1.0, 0.5, 0.2, 0.4666666666666667, 0.2, 0.0, 0.0, 1.0, 0.0, 0.4666666666666667, 0.2, 1.0, 0.0, 0.0, 1.0, 0.1, 1.0, 0.26666666666666666, 0.1, 0.8, 0.26666666666666666, 0.95, 0.0, 0.5333333333333333, 0.7692307692307693, 0.8, 0.0, 0.5333333333333333, 0.4666666666666667, 0.2222222222222222, 0.8, 0.3333333333333333, 1.0, 0.6, 0.2, 0.4, 0.45, 0.2, 0.8, 0.6666666666666666, 0.0, 1.0, 0.7777777777777778, 1.0, 1.0, 0.0, 1.0, 1.0, 0.35, 1.0, 1.0, 0.7619047619047619, 0.1, 1.0, 0.8, 0.2, 0.55, 0.6, 0.0, 0.4, 0.0, 0.6, 0.0, 0.2, 0.6, 0.0, 0.2, 0.2, 1.0, 0.85, 0.3333333333333333, 0.8, 0.8, 0.0, 0.75, 0.8, 0.8, 0.0, 0.2, 0.8, 0.5714285714285714, 0.2, 0.4, 0.0, 0.4, 0.0, 1.0, 0.0, 0.8, 0.0, 1.0, 0.6666666666666666, 0.6, 1.0, 0.6, 0.2222222222222222, 1.0, 0.0, 1.0, 0.6, 0.8, 0.5, 0.5, 1.0, 1.0, 0.65, 0.8, 0.4, 0.0, 0.6, 1.0, 0.8095238095238095, 0.6, 0.0, 0.2, 0.4, 1.0, 0.9523809523809523, 0.8, 0.15, 0.2, 0.9, 0.2, 0.5333333333333333, 0.2, 1.0, 0.7333333333333333, 0.9, 0.0, 0.2, 0.1, 1.0, 0.1, 0.35714285714285715, 0.0, 0.9, 0.0, 1.0, 0.3, 1.0, 0.4, 0.2, 1.0, 0.2, 1.0, 0.0, 0.4, 1.0, 0.8, 1.0, 0.26666666666666666, 0.0, 0.35, 0.0, 0.1, 0.7333333333333333, 1.0, 1.0, 0.5, 0.7692307692307693, 0.3333333333333333, 0.0, 1.0, 0.0, 1.0, 0.4666666666666667, 0.0, 0.0, 1.0, 0.4, 0.5, 0.5333333333333333, 0.3, 0.45, 0.75, 0.2, 0.7333333333333333, 0.5333333333333333, 0.95, 0.0, 0.0, 0.0, 0.35714285714285715, 0.4, 0.2, 0.0, 0.8333333333333334, 0.8, 0.3, 1.0, 0.0, 0.0, 
0.5, 0.8, 0.875, 0.7777777777777778, 0.2222222222222222, 0.8, 0.1, 1.0, 0.6666666666666666, 1.0, 1.0, 0.0, 0.4, 0.6666666666666666, 0.6, 0.0, 0.8, 0.0, 0.8, 1.0, 1.0, 0.8, 0.0, 0.65, 0.45, 0.4666666666666667, 0.3333333333333333, 0.4666666666666667, 0.1, 0.7692307692307693, 0.8, 1.0, 0.2, 0.8, 0.0, 0.05, 1.0, 0.0, 0.0, 0.95, 1.0, 0.0, 0.2, 0.8, 0.0, 0.15, 0.6666666666666666, 0.4666666666666667, 0.6, 0.35714285714285715, 0.2222222222222222, 0.15, 1.0, 0.4, 0.0, 1.0, 0.2, 0.8, 1.0, 0.05, 1.0, 0.8, 0.0, 0.6666666666666666, 0.5333333333333333, 0.2, 0.4, 1.0, 0.8, 0.8, 0.5333333333333333, 0.7333333333333333, 0.26666666666666666, 0.4666666666666667, 0.8, 0.4666666666666667, 0.0, 1.0, 0.8, 0.0, 0.8, 0.0]})" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "media_sum_corpus.meta" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "adcf2fcb-8933-49aa-96ef-b4e3fb8b71e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[['NPR-15505-5'], ['NPR-15505-6']]" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "media_sum_corpus.meta['paraphrase_pairs'][0]" + ] + }, + { + "cell_type": "markdown", + "id": "c0803b46-2f3a-4309-99b7-f0f8bb3806d6", + "metadata": {}, + "source": [ + "## provide function for pretty printing of annotations" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "0545c28f-9a45-4752-9820-4a6a58277e86", + "metadata": {}, + "outputs": [], + "source": [ + "from itertools import chain\n", + "def get_paraphrase_pair_info(corpus, pair_id):\n", + " \"\"\"Get text, paraphrase ratio, and highlighting for a paraphrase pair.\"\"\"\n", + " pairs = corpus.meta['paraphrase_pairs']\n", + " labels = corpus.meta['paraphrase_labels']\n", + " \n", + " pair = pairs[pair_id]\n", + " group1_text = \" \".join([corpus.get_utterance(uid).text for uid in pair[0]])\n", + " group2_text = \" 
\".join([corpus.get_utterance(uid).text for uid in pair[1]])\n", + " \n", + " # Get highlighting from all utterances in each group\n", + " group1_highlights = list(chain.from_iterable(corpus.get_utterance(uid).meta['paraphrase_guest_highlights'] for uid in pair[0]))\n", + " group2_highlights = list(chain.from_iterable(corpus.get_utterance(uid).meta['paraphrase_host_highlights'] for uid in pair[1]))\n", + " \n", + " return {\n", + " 'pair_id': pairs[pair_id],\n", + " 'text1': group1_text,\n", + " 'text2': group2_text,\n", + " 'paraphrase_ratio': corpus.meta[\"paraphrase_labels\"][pair_id],\n", + " 'is_paraphrase': corpus.meta[\"paraphrase_labels\"][pair_id] >= 0.5,\n", + " 'guest_highlights': group1_highlights,\n", + " 'host_highlights': group2_highlights,\n", + " }\n", + "def print_highlighted_pair(pair_info):\n", + " \"\"\"Print paraphrase pair with token-level highlighting -- upper casing if >= 0.5 and emphasis if >= 0.4\"\"\"\n", + " \n", + " def highlight_text(text, highlights):\n", + " tokens = text.split()\n", + " return \" \".join(\n", + " token.upper() if score >= 0.5 \n", + " else f\"\\033[1m{token}\\033[0m\" if score >= 0.4 \n", + " else token\n", + " for token, score in zip(tokens, highlights)\n", + " )\n", + " \n", + " print(f\"=== Pair {pair_info['pair_id']} ===\")\n", + " print(f\"Paraphrase ratio: {pair_info['paraphrase_ratio']:.3f} ({'PARAPHRASE' if pair_info['is_paraphrase'] else 'NOT PARAPHRASE'})\")\n", + " print(f\"\\nGuest:\\n{highlight_text(pair_info['text1'], pair_info['guest_highlights'])}\")\n", + " print(f\"\\nHost:\\n{highlight_text(pair_info['text2'], pair_info['host_highlights'])}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "ccaeece5-f9f0-4808-b77b-f76f885502c2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Pair [['NPR-24552-10'], ['NPR-24552-11']] ===\n", + "Paraphrase ratio: 0.600 (PARAPHRASE)\n", + "\n", + "Guest:\n", + "Oh, yeah. 
Roger Goodell has been paid $79 million in the last two years to pretend to be a person running a philanthropy that serves the public interest. Obviously, he's not. The only reason is that IRS regulations attached to nonprofit status require this. And he's not the only one in NFL Headquarters with a million dollars-plus salary, he's just the most prominent one. The reason the league is \u001b[1mgiving\u001b[0m \u001b[1mup\u001b[0m \u001b[1mits\u001b[0m \u001b[1mtax\u001b[0m \u001b[1mexemption\u001b[0m is so that they can stop \u001b[1mdisclosing\u001b[0m \u001b[1mthe\u001b[0m \u001b[1mamounts\u001b[0m \u001b[1mof\u001b[0m \u001b[1mmoney\u001b[0m \u001b[1mthat\u001b[0m \u001b[1mGoodell\u001b[0m \u001b[1mand\u001b[0m \u001b[1mthe\u001b[0m \u001b[1mother\u001b[0m \u001b[1mtop\u001b[0m \u001b[1mofficials\u001b[0m in the league make.\n", + "\n", + "Host:\n", + "Are there any areas apart from \u001b[1mdisclosure\u001b[0m \u001b[1mof\u001b[0m \u001b[1msalaries\u001b[0m \u001b[1mof\u001b[0m \u001b[1mtheir\u001b[0m \u001b[1mtop\u001b[0m \u001b[1mexecutives\u001b[0m - any areas of the NFL's activities or any of its alleged defaults that would be changed because of this change in tax status?\n", + "\n" + ] + } + ], + "source": [ + "nbr = 9\n", + "print_highlighted_pair(get_paraphrase_pair_info(media_sum_corpus, nbr))" + ] + }, + { + "cell_type": "markdown", + "id": "34999977889e081e", + "metadata": {}, + "source": [ + "## 6. 
Saving created datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "77f03f8d48e36317", + "metadata": {}, + "outputs": [], + "source": [ + "media_sum_corpus.dump(\"mediasum-corpus\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33a19808-9ba4-46fd-b346-fd4382f5dc85", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "662c4f9a-0d60-411a-8ef9-3495cc69bd87", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/source/datasets.rst b/docs/source/datasets.rst index 20ecdc2a..6f613bc2 100644 --- a/docs/source/datasets.rst +++ b/docs/source/datasets.rst @@ -32,3 +32,4 @@ Datasets Federal Open Market Committee Corpus FORA Corpus DeliData Corpus + MediaSum Corpus diff --git a/docs/source/mediasum.rst b/docs/source/mediasum.rst new file mode 100644 index 00000000..883f201d --- /dev/null +++ b/docs/source/mediasum.rst @@ -0,0 +1,180 @@ +MediaSum Corpus +=============== + +A collection of interview transcripts from CNN / NPR. + +In total, there are 463,596 conversations with ~49.4K +NPR transcripts and ~414.2K CNN transcripts. There are a total of 13,919,244 utterances and 718,483 speakers. + +Across 600 pairs of utterances, there are 5,581 annotations done by 112 annotators on whether the second utterance is a paraphrase of the first. + +The original dataset is available `here `_. 
It was originally distributed with `MediaSum: A Large-scale Media Interview Dataset for Dialogue Summarization `_, Chenguang Zhu, Yang Liu, Jie Mei, Michael Zeng. Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL'21, 2021. + +The original annotation of paraphrases is available `here `_. It was originally distributed with `What's Mine becomes Yours: Detecting Context-Dependent Paraphrases in News Interview Dialogs `_, Anna Wegmann, Tijs A. van den Broek, Dong Nguyen, Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, EMNLP'24, 2024. + +Dataset details +--------------- + +Speaker-level information +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Speakers in this dataset are participants in an interview. These could be interview hosts or guests. The original dataset provides each speaker's name as a string, e.g. "ED LAVANDERA, CNN CORRESPONDENT". We index Speakers by these strings. + +Note: In the speaker list, the same speaker sometimes appears under several different identifiers (e.g., ‘ED LAVANDERA, CNN CORRESPONDENT’, ‘LAVANDERA’ or ‘E. LAVANDERA’ refer to the same speaker). Further, each identifier that is the same in one conversation as in another is considered the same speaker. This might be incorrect for cases like 'UNIDENTIFIED MALE' or 'UNIDENTIFIED FEMALE' that are sometimes used in interviews. + + +Utterance-level information +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For each Utterance we provide: + +- id: ````, the index of the utterance in the format `BROADCASTER-CONVONBR-UTTNBR`, where *BROADCASTER* is NPR or CNN, *CONVONBR* is the conversation number, and *UTTNBR* is the utterance number (e.g. *CNN-177596-7* or *NPR-4-1*). +- conversation_id: ````, the id of the conversation in the format `BROADCASTER-CONVONBR` (e.g. *CNN-177596* or *NPR-4*). This corresponds to the original ids in the MediaSum dataset. 
+- speaker: ````, the speaker object who authored the utterance; the name is available via .speaker.id +- reply_to: ````, the id of the utterance to which this utterance replies. `None` if the utterance is the first in a conversation. +- timestamp: ``None``. Our dataset does not contain timestamp information for utterances. +- text: ````, the textual content of the utterance. +- meta: ````, a dictionary containing additional metadata about the utterance. See below for details. +- vectors: empty list. No precomputed vectors are provided. + +We additionally provide metadata for paraphrase annotations in some 2-person interviews. For 600 utterance pairs (u, v), there are annotations on whether v contains a paraphrase of u. Annotations contain paraphrase spans selected by Prolific annotators. v is always spoken by the interview host. The annotations are stored in the following utterance metadata fields: + +- paraphrase_is_host: ````, whether the utterance is spoken by the host of the interview. This also tells us whether the utterance is the second in a pair of utterances annotated for paraphrases. +- paraphrase_number_votes: ````, the number of annotators who annotated this utterance pair. +- paraphrase_votes: ````, the number of annotators who indicated that the second utterance contains a paraphrase of the first. +- paraphrase_ratio: ````, the ratio of paraphrase votes to total votes. +- paraphrase_guest_highlights: ``list <[float]>``, a list based on the tokens that can be created from utterance.text.split(). Each entry is between 0 and 1, indicating the ratio of annotators that highlighted that token as part of a paraphrase for the next utterance. +- paraphrase_host_highlights: ``list <[float]>``, a list based on the tokens that can be created from utterance.text.split(). 
Each entry is between 0 and 1, indicating the ratio of annotators that highlighted that token as part of a paraphrase of the previous utterance. +- paraphrase_PROLIFIC_X: ``list ``, for X in [1, 2, ..., 112], a list of length len(utterance.text.split()) indicating whether annotator X highlighted token i as part of a paraphrase (1) or not (0). If annotator X did not annotate this utterance pair, this key does not exist in the dict. + +Example of utterance "CNN-177596-7": + + +>>> print(media_sum_corpus.get_utterance("CNN-177596-7")) + Utterance(id: 'CNN-177596-7', conversation_id: CNN-177596, reply-to: CNN-177596-6, + speaker: Speaker(id: 'JOHNS', vectors: [], meta: ConvoKitMeta({'name': 'JOHNS'})), + timestamp: None, text: 'This is not good.', vectors: [], + meta: ConvoKitMeta({ + 'paraphrase_guest_highlights': [0.5, 0.45, 0.45, 0.45], 'paraphrase_is_host': False, + 'paraphrase_number_votes': 20, 'paraphrase_votes': 10, 'paraphrase_ratio': 0.5, + 'paraphrase_PROLIFIC_1': [0, 0, 0, 0], 'paraphrase_PROLIFIC_2': [1, 1, 1, 1], 'paraphrase_PROLIFIC_3': [0, 0, 0, 0], 'paraphrase_PROLIFIC_4': [0, 0, 0, 0], 'paraphrase_PROLIFIC_5': [0, 0, 0, 0], 'paraphrase_PROLIFIC_6': [1, 1, 1, 1], 'paraphrase_PROLIFIC_7': [1, 0, 0, 0], 'paraphrase_PROLIFIC_8': [1, 1, 1, 1], 'paraphrase_PROLIFIC_9': [0, 0, 0, 0], 'paraphrase_PROLIFIC_10': [0, 0, 0, 0], 'paraphrase_PROLIFIC_11': [1, 1, 1, 1], 'paraphrase_PROLIFIC_12': [0, 0, 0, 0], 'paraphrase_PROLIFIC_13': [1, 1, 1, 1], 'paraphrase_PROLIFIC_14': [0, 0, 0, 0], 'paraphrase_PROLIFIC_15': [0, 0, 0, 0], 'paraphrase_PROLIFIC_16': [1, 1, 1, 1], 'paraphrase_PROLIFIC_17': [0, 0, 0, 0], 'paraphrase_PROLIFIC_18': [1, 1, 1, 1], 'paraphrase_PROLIFIC_19': [1, 1, 1, 1], 'paraphrase_PROLIFIC_20': [1, 1, 1, 1] + })) + +Conversation-level information +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Conversations represent interviews on NPR and CNN. They are indexed by their conversation_id, an id in the format `BROADCASTER-CONVONBR` (e.g. *CNN-177596* or *NPR-4*). 
This corresponds to the original ids in the MediaSum dataset. + +- program: ````, the name of the program that the interview is a part of, e.g., 'CNN SATURDAY NIGHT'. +- date: ````, the date the interview aired, in the format 'YYYY-MM-DD' or 'YYYY-M-DD', e.g., '2003-2-22' or '2007-11-28'. +- summary: ````, a summary of the interview or the topic of the interview. The level of detail varies, e.g., 'How Much Will War With Iraq Cost?' in "CNN-67148" and 'More than 400 black actors, artists and ministers are bringing the Gospel to life in the audio book, The Bible Experience:The Complete Bible. Farai Chideya talks with producer Kyle Bowser and actress Wendy Raquel Robinson, who lends her voice to the project.' in "NPR-1". +- url: ````, the URL of the interview transcript on the broadcaster's website, e.g., 'http://transcripts.cnn.com/TRANSCRIPTS/0302/22/stn.02.html' for "CNN-67148". +- title: ````, the title of the interview, e.g., 'Black Actors Give Bible Star Appeal' in "NPR-1". +- broadcaster: ````, the broadcaster of the interview, either 'CNN' or 'NPR'. + +Corpus-level information +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +- name: ````, the name of the corpus, 'media-sum-corpus'. +- paraphrase_pairs: ``list>>``, a list of lists containing the pairs of utterance ids (u, v) that are annotated for paraphrases. There are 600 such pairs in total. An entry in the list of paraphrase pairs can look like this: [['NPR-35922-5', 'NPR-35922-6', 'NPR-35922-7'], ['NPR-35922-8']], i.e., an "utterance" for the annotations can consist of multiple utterances according to the corpus utterance ids. This happens if the same speaker speaks multiple times in a row. +- paraphrase_labels: ``list``, a list of floats between 0 and 1 indicating whether the second utterance in each paraphrase pair contains a paraphrase of the first (1) or not (0). The order corresponds to the order of paraphrase_pairs. 
Values are floats because they represent the ratio of annotators who indicated that the second utterance contains a paraphrase of the first. + + + +Usage +----- + +To download directly with ConvoKit: + +>>> from convokit import Corpus, download +>>> corpus = Corpus(filename=download("mediasum-corpus")) + + +For some quick stats: + +>>> corpus.print_summary_stats() +Number of Speakers: 718483 +Number of Utterances: 13919244 +Number of Conversations: 463596 + +Get all paraphrase pairs from the corpus metadata: + +>>> paraphrase_pairs = corpus.meta['paraphrase_pairs'] +>>> print(f"Total paraphrase pairs: {len(paraphrase_pairs)}") +Total paraphrase pairs: 600 + +Useful functions for working with paraphrase pairs: + +.. code-block:: python + + from itertools import chain + + def get_paraphrase_pair_info(corpus, pair_id): + """Get text, paraphrase ratio, and highlighting for a paraphrase pair.""" + pairs = corpus.meta['paraphrase_pairs'] + labels = corpus.meta['paraphrase_labels'] + + pair = pairs[pair_id] + group1_text = " ".join([corpus.get_utterance(uid).text for uid in pair[0]]) + group2_text = " ".join([corpus.get_utterance(uid).text for uid in pair[1]]) + + # Get highlighting from all utterances in each group + group1_highlights = list(chain.from_iterable(corpus.get_utterance(uid).meta['paraphrase_guest_highlights'] for uid in pair[0])) + group2_highlights = list(chain.from_iterable(corpus.get_utterance(uid).meta['paraphrase_host_highlights'] for uid in pair[1])) + + return { + 'pair_id': pairs[pair_id], + 'text1': group1_text, + 'text2': group2_text, + 'paraphrase_ratio': corpus.meta["paraphrase_labels"][pair_id], + 'is_paraphrase': corpus.meta["paraphrase_labels"][pair_id] >= 0.5, + 'guest_highlights': group1_highlights, + 'host_highlights': group2_highlights, + } + def print_highlighted_pair(pair_info): + """Print paraphrase pair with token-level highlighting -- upper casing if >= 0.5 and emphasis if >= 0.4""" + + def highlight_text(text, highlights): + tokens = 
text.split() + return " ".join( + token.upper() if score >= 0.5 + else f"\033[1m{token}\033[0m" if score >= 0.4 + else token + for token, score in zip(tokens, highlights) + ) + + print(f"=== Pair {pair_info['pair_id']} ===") + print(f"Paraphrase ratio: {pair_info['paraphrase_ratio']:.3f} ({'PARAPHRASE' if pair_info['is_paraphrase'] else 'NOT PARAPHRASE'})") + print(f"\nGuest:\n{highlight_text(pair_info['text1'], pair_info['guest_highlights'])}") + print(f"\nHost:\n{highlight_text(pair_info['text2'], pair_info['host_highlights'])}\n") + +Example use: + +.. code-block:: python + + >>> print_highlighted_pair(get_paraphrase_pair_info(media_sum_corpus, 9)) + === Pair [['CNN-350238-7'], ['CNN-350238-8']] === + Paraphrase ratio: 0.500 (PARAPHRASE) + + Guest: + I want to applaud THE WORK OF THE TEXAS RANGERS and the sheriff's office and DPS in bringing this man into custody. + + Host: + Do you think -- last quick question -- no, it's been EXTRAORDINARY WORK for you guys in Texas. Had this most recent woman not escaped, what are her chances that she could have been next? + + +Additional note +--------------- + +Data License +^^^^^^^^^^^^ + +Research-only + +Details: +Regarding license, the `Mediasum paper `_ reads: "We have used only the publicly available transcripts data from the media sources and adhere to their only-for-research-purpose guideline." and "Please restrict your usage of this dataset to research purpose only" on `their GitHub `_. The annotations are also shared with a research-only license, see `GitHub `_. + + +Contact +^^^^^^^ + +Please email any questions to Anna Wegmann (a.m.wegmann@uu.nl). 
diff --git a/download_config.json b/download_config.json index af06ea3f..a309eac3 100644 --- a/download_config.json +++ b/download_config.json @@ -35,7 +35,8 @@ "wiki-articles-for-deletion-corpus": 1, "casino-corpus": 1, "wiki-sampled-en-corpus": 0, - "wiki-sampled-zh-corpus": 0 + "wiki-sampled-zh-corpus": 0, + "mediasum-corpus": 0 }, "DatasetURLs": { "chromium-corpus": "http://zissou.infosci.cornell.edu/convokit/datasets/chromium-corpus/chromium-corpus.zip", @@ -104,7 +105,8 @@ "wiki-articles-for-deletion-corpus": "https://zissou.infosci.cornell.edu/convokit/datasets/wiki-articles-for-deletion-corpus/wiki-articles-for-deletion-corpus.zip", "casino-corpus": "https://zissou.infosci.cornell.edu/convokit/datasets/casino-corpus/casino-corpus.zip", "wiki-sampled-en-corpus": "https://zissou.infosci.cornell.edu/convokit/datasets/wiki-sampled-en-corpus/wiki-sampled-en-corpus.zip", - "wiki-sampled-zh-corpus": "https://zissou.infosci.cornell.edu/convokit/datasets/wiki-sampled-zh-corpus/wiki-sampled-zh-corpus.zip" + "wiki-sampled-zh-corpus": "https://zissou.infosci.cornell.edu/convokit/datasets/wiki-sampled-zh-corpus/wiki-sampled-zh-corpus.zip", + "mediasum-corpus": "https://zissou.infosci.cornell.edu/convokit/datasets/mediasum-corpus/mediasum-corpus.zip" }, "ModelURLS": { "craft-wiki-pretrained": [