diff --git a/datasets/mediasum-corpus/convert_mediasum-corpus.ipynb b/datasets/mediasum-corpus/convert_mediasum-corpus.ipynb new file mode 100644 index 00000000..dcb33787 --- /dev/null +++ b/datasets/mediasum-corpus/convert_mediasum-corpus.ipynb @@ -0,0 +1,1442 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "28d4a9804792413d", + "metadata": {}, + "source": [ + "# Creating Convokit Corpus element\n", + "according to https://github.com/CornellNLP/ConvoKit/blob/master/examples/converting_movie_corpus.ipynb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e3c3f33-f3bb-4c65-99c9-0a6b7c0614e1", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install torch torchvision\n", + "!pip install convokit\n", + "!pip install datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "362ef5498fa51d2", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T13:47:31.398864Z", + "start_time": "2025-08-21T13:47:31.396352Z" + } + }, + "outputs": [], + "source": [ + "from convokit import Corpus, Speaker, Utterance\n", + "import pandas as pd\n", + "import tqdm\n", + "import ast" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9527f1ed47847a79", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T13:47:48.826642Z", + "start_time": "2025-08-21T13:47:34.037108Z" + } + }, + "outputs": [], + "source": [ + "media_sum_path = \"data/MediaSum/news_dialogue.json\"\n", + "media_sum_json = pd.read_json(media_sum_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d4783945f0f36a2e", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T13:47:48.853529Z", + "start_time": "2025-08-21T13:47:48.841408Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | id | \n", + "program | \n", + "date | \n", + "url | \n", + "title | \n", + "summary | \n", + "utt | \n", + "speaker | \n", + "
|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "NPR-1 | \n", + "News & Notes | \n", + "2007-11-28 | \n", + "https://www.npr.org/templates/story/story.php?... | \n", + "Black Actors Give Bible Star Appeal | \n", + "More than 400 black actors, artists and minist... | \n", + "[Now, moving on, Forest Whitaker as Moses, Tis... | \n", + "[FARAI CHIDEYA, host, FARAI CHIDEYA, host, Mr.... | \n", + "
| 1 | \n", + "NPR-2 | \n", + "Weekend Edition Sunday | \n", + "2016-10-23 | \n", + "https://www.npr.org/2016/10/23/499042298/young... | \n", + "Young, First-Time Voters Share Views On Electi... | \n", + "NPR's Rachel Martin speaks with young voters w... | \n", + "[You have heard it again and again - this is a... | \n", + "[RACHEL MARTIN, HOST, ASHANTI MARTINEZ, LAUREN... | \n", + "
| 2 | \n", + "NPR-3 | \n", + "News & Notes | \n", + "2007-11-30 | \n", + "https://www.npr.org/templates/story/story.php?... | \n", + "Snapshots: On Solid Ground | \n", + "In this week's snapshot, actor and playwright ... | \n", + "[I came close to running out of luck, when I a... | \n", + "[Mr. JEFF OBAFEMI CARR (Actor, Playwright), CH... | \n", + "
| 3 | \n", + "NPR-4 | \n", + "News & Notes | \n", + "2007-11-30 | \n", + "https://www.npr.org/templates/story/story.php?... | \n", + "Washington, D.C. Facing HIV/AIDS Epidemic | \n", + "A new study says one in 50 people in the natio... | \n", + "[This is NEWS & NOTES. I'm Farai Chideya., In ... | \n", + "[FARAI CHIDEYA, host, FARAI CHIDEYA, host, Dr.... | \n", + "
| 4 | \n", + "NPR-5 | \n", + "News & Notes | \n", + "2007-11-30 | \n", + "https://www.npr.org/templates/story/story.php?... | \n", + "Coping When AIDS Hits Your Family: Part II | \n", + "When a family member is diagnosed with HIV/AID... | \n", + "[I'm Farai Chideya and this is NEWS & NOTES., ... | \n", + "[FARAI CHIDEYA, host, FARAI CHIDEYA, host, FAR... | \n", + "
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 463591 | \n", + "CNN-414237 | \n", + "CNN NEWSROOM | \n", + "2020-10-25 | \n", + "http://transcripts.cnn.com/TRANSCRIPTS/2010/25... | \n", + "NaN | \n", + "U.S. Officials: Russia, Iran Have Stolen Voter... | \n", + "[Welcome back to our viewers in the United Sta... | \n", + "[BRUNHUBER, NATASHA CHEN, CNN CORRESPONDENT, W... | \n", + "
| 463592 | \n", + "CNN-414238 | \n", + "CNN NEWSROOM | \n", + "2020-10-25 | \n", + "http://transcripts.cnn.com/TRANSCRIPTS/2010/25... | \n", + "NaN | \n", + "Nigerian Police Force Mobilize To Quell Worst ... | \n", + "[In Nigeria, chaotic scenes of looting and des... | \n", + "[BRUNHUBER, BRUNHUBER (voice-over), BRUNHUBER ... | \n", + "
| 463593 | \n", + "CNN-414239 | \n", + "CNN NEWSROOM | \n", + "2020-10-25 | \n", + "http://transcripts.cnn.com/TRANSCRIPTS/2010/25... | \n", + "NaN | \n", + "COVID-19 Triggers Rise In Asian American Unemp... | \n", + "[Officials in the U.S. are worried about wides... | \n", + "[BRUNHUBER, AMARA WALKER, CNN ANCHOR (voice-ov... | \n", + "
| 463594 | \n", + "CNN-414240 | \n", + "STATE OF THE UNION | \n", + "2020-10-25 | \n", + "http://transcripts.cnn.com/TRANSCRIPTS/2010/25... | \n", + "NaN | \n", + "COVID-19 Outbreak Hits Vice President Pence's ... | \n", + "[Dark winter? U.S. COVID cases hit a new daily... | \n", + "[JAKE TAPPER, CNN HOST (voice-over), DONALD TR... | \n", + "
| 463595 | \n", + "CNN-414241 | \n", + "STATE OF THE UNION | \n", + "2020-10-25 | \n", + "http://transcripts.cnn.com/TRANSCRIPTS/2010/25... | \n", + "NaN | \n", + "Interview With Rep. Alexandria Ocasio-Cortez (... | \n", + "[Welcome back the STATE OF THE UNION. I'm Jake... | \n", + "[TAPPER, REP. ALEXANDRIA OCASIO-CORTEZ (D-NY),... | \n", + "
463596 rows × 8 columns
\n", + "