Skip to content

Commit

Permalink
Merge pull request #11 from colobas/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
colobas authored Feb 14, 2024
2 parents a4f3c03 + c69b36b commit c61a09d
Show file tree
Hide file tree
Showing 29 changed files with 1,743 additions and 127 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,18 @@ jobs:
cache-environment: true
post-cleanup: 'all'

- name: Setup Rclone
uses: AnimMouse/setup-rclone@v1
with:
rclone_config: ${{ secrets.RCLONE_CONFIG }}

- name: Extract gcloud config from rclone config
run: echo "${{ secrets.RCLONE_CONFIG }}" | python parse_rclone_conf.py > gcloud.json

- name: Process Debates
run: python process_debates.py debates.yaml public/debates
env:
CLIENT_SECRETS_FILE: gcloud.json
run: python process_debates.py public/debates --skip-transcription

- name: Commit and Push Changes
uses: EndBug/add-and-commit@v7
Expand All @@ -32,3 +42,9 @@ jobs:
add: 'public/debates/'
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

- name: Refresh Rclone Config
uses: AnimMouse/setup-rclone/update-config@v1
with:
rclone_config_secret_name: RCLONE_CONFIG
token: ${{ secrets.GITHUB_TOKEN }}
File renamed without changes.
48 changes: 0 additions & 48 deletions debates.yaml

This file was deleted.

26 changes: 26 additions & 0 deletions parse_rclone_conf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import configparser
import json
import base64


def main():
    """Convert a base64-encoded rclone config read from stdin into JSON on stdout.

    The decoded text is parsed as an INI file. From its ``[debates]`` section
    the ``client_id``, ``client_secret`` and the ``refresh_token`` field of the
    JSON-encoded ``token`` value are extracted and printed as one JSON object.

    Raises:
        binascii.Error: if stdin does not contain valid base64.
        KeyError: if the ``debates`` section or one of the expected keys is
            missing.
        json.JSONDecodeError: if the ``token`` value is not valid JSON.
    """
    # Local import keeps this block self-contained; the module's top-level
    # imports do not include ``sys``.
    import sys

    # interpolation=None: credential values are opaque strings, so a literal
    # "%" in them must not be interpreted as configparser interpolation syntax.
    config = configparser.ConfigParser(interpolation=None)

    # Read *all* of stdin rather than a single line: base64 output is often
    # wrapped across several lines, and b64decode ignores the embedded
    # newlines (and the trailing one added by ``echo``).
    encoded = sys.stdin.read()
    conf_str = base64.b64decode(encoded).decode("utf-8")
    config.read_string(conf_str)
    conf = config["debates"]

    # The "token" option holds a JSON document; only its refresh token is kept.
    refresh_token = json.loads(conf["token"])["refresh_token"]

    out = {
        "client_id": conf["client_id"],
        "client_secret": conf["client_secret"],
        "refresh_token": refresh_token,
    }

    print(json.dumps(out))


if __name__ == "__main__":
    main()
72 changes: 58 additions & 14 deletions process_debates.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
import webvtt

from video_utils import upload_to_gdrive, get_file_ids, direct_link
from speaker_party_conversion import speaker_party_conversion


def convert_to_seconds(time):
"""
Expand Down Expand Up @@ -155,7 +157,7 @@ def slugify(title):
Turn a title into a slug
"""

return title.lower().replace(" ", "-")
return title.lower().replace(" - ", "-vs-")


def transcribe_audio(audio_path, output_root, mp3_direct_link):
Expand Down Expand Up @@ -213,12 +215,18 @@ def process_debate(*, title, url, output_root, gdrive_service, skip_transcriptio
Process a debate from the input data
"""

slug = slugify(title)
rev_slug = "-".join(slug.split("-")[::-1])

# check if reverse slug exists, if so, slug is the reverse slug
if (output_root / f"{rev_slug}.json").exists():
slug = rev_slug

m3u8_url, thumbnail_url = find_m3u8_and_thumbnail(url)
if m3u8_url is None or thumbnail_url is None:
logging.warning(f"Could not find m3u8 or thumbnail for {url}")
return {"title": title, "thumbnail": thumbnail_url, "slug": slug}

slug = slugify(title)
audio_path = output_root / f"media/{slug}.mp3"

if "rtp.pt" in url:
Expand Down Expand Up @@ -262,8 +270,52 @@ def process_debate(*, title, url, output_root, gdrive_service, skip_transcriptio
return {"title": title, "thumbnail": thumbnail_url, "slug": slug}


def scrape_page(url):
    """Scrape one RTP Play page and return its linked episodes.

    Fetches ``url``, looks for the ``div`` with id ``listProgramsContent`` and
    walks its ``<article>`` elements (one per episode). Each episode title of
    the form ``"A - B"`` is rewritten by mapping every ``" - "``-separated part
    through ``speaker_party_conversion``.

    Episodes that lack a title heading, an ``a.episode-item`` anchor, or an
    ``href`` on that anchor are skipped instead of raising.

    Args:
        url: Address of the page to scrape.

    Returns:
        dict mapping the converted title to the absolute episode URL. Empty if
        the episode list container is not present on the page.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        KeyError: if a title part is not a key of ``speaker_party_conversion``.
    """
    debates = {}

    # A timeout keeps a stalled connection from hanging the whole run.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    page = bs4.BeautifulSoup(response.text, "html.parser")

    # find div with id "listProgramsContent"
    div = page.find("div", id="listProgramsContent")
    if div is None:
        logging.warning(f"Could not find episode list in {url}")
        return debates

    # find <article> tags -> one per "episode"
    for article in div.find_all("article"):
        heading = article.find("h4", class_="episode-title")
        anchor = article.find("a", class_="episode-item")
        if heading is None or anchor is None:
            continue

        # .get() returns None for a missing attribute, whereas subscripting
        # would raise before any "is not None" check could run.
        href = anchor.get("href")
        if not href:
            continue

        title = " - ".join(
            speaker_party_conversion[part.strip()]
            for part in heading.text.strip().split(" - ")
        )
        debates[title] = "https://www.rtp.pt" + href

    return debates


def scrape_debate_links():
    """Collect every debate episode from the RTP Play debate listings.

    Each root listing is scraped twice: once directly, and once via the page
    of the first episode it returned. Per the original author's note, the
    episode currently selected on a page is rendered without a link, so the
    second pass (from a different episode's page) picks up the one missed by
    the first.

    Returns:
        list of ``{"title": ..., "url": ...}`` dicts, one per distinct title.
        When two passes yield the same title, the later URL wins.
    """
    ROOT_URLS = [
        "https://www.rtp.pt/play/p12900/debates-legislativas-2024",
        "https://www.rtp.pt/play/p12899/debates-legislativas-2024-sicsic-noticias",
        "https://www.rtp.pt/play/p12901/debates-legislativas-2024-tvicnn",
    ]

    debates = {}

    for url in ROOT_URLS:
        new_debates = scrape_page(url)
        debates.update(new_debates)

        if not new_debates:
            # Nothing to pivot on for the second pass; indexing the empty
            # result would raise IndexError and abort the remaining listings.
            logging.warning(f"No debates found at {url}")
            continue

        # Second pass from the first episode's own page (any episode would
        # do), because the selected episode of a page has no link.
        second_pass_url = next(iter(new_debates.values()))
        debates.update(scrape_page(second_pass_url))

    return [{"title": title, "url": link} for title, link in debates.items()]


def main(args):
input_path = Path(args.input)
output_root = Path(args.output_root)
output_root.mkdir(exist_ok=True, parents=True)

Expand All @@ -277,19 +329,11 @@ def main(args):
else:
creds, gdrive_service = None, None

with open(input_path, "r") as f:
data = yaml.safe_load(f)

master_json = []
for debate in data:
output_path = output_root / f"{debate}.json"

if output_path.exists() and not args.force:
continue

if debate is None:
continue
debates = scrape_debate_links()

master_json = []
for debate in debates:
summary = process_debate(**debate, output_root=output_root, skip_transcription=args.skip_transcription, gdrive_service=gdrive_service, skip_upload=args.skip_upload)
master_json.append(summary)

Expand Down
2 changes: 1 addition & 1 deletion public/debates/be-vs-chega.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"slug": "be-vs-chega",
"title": "BE vs Chega",
"title": "Chega - BE",
"original_url": "https://www.rtp.pt/play/p12900/e747851/debates-legislativas-2024",
"audio_url": "https://drive.google.com/uc?id=1Tk9gV0e_r_H638ZQsB0ume7r8K-ZzwRP"
}
2 changes: 1 addition & 1 deletion public/debates/be-vs-livre.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"slug": "be-vs-livre",
"title": "BE vs Livre",
"title": "BE - Livre",
"original_url": "https://www.rtp.pt/play/p12899/e746909/debates-legislativas-2024-sicsic-noticias",
"audio_url": "https://drive.google.com/uc?id=18VRopVy-VbjNLKYkUU9TRtyo2JdGowv0"
}
2 changes: 1 addition & 1 deletion public/debates/be-vs-pcp.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"slug": "be-vs-pcp",
"title": "BE vs PCP",
"title": "BE - PCP",
"original_url": "https://www.rtp.pt/play/p12899/e747442/debates-legislativas-2024-sicsic-noticias",
"audio_url": "https://drive.google.com/uc?id=1-A5Q4Jc6FY4TN2kKbN16gKlh2SV1tfX5"
}
4 changes: 2 additions & 2 deletions public/debates/be-vs-psd.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"slug": "be-vs-psd",
"title": "BE vs PSD",
"original_url": "https://sicnoticias.pt/especiais/eleicoes-legislativas/2024-02-06-Debate-entre-BE-e-PSD-Quem-e-que-sabe-salvar-o-SNS--a252ab7c",
"title": "PSD - BE",
"original_url": "https://www.rtp.pt/play/p12901/e746363/debates-legislativas-2024-tvicnn",
"audio_url": "https://drive.google.com/uc?id=1EIbaHmM2Xk-RFsIgYVf9BsH-xgrnntTt"
}
2 changes: 1 addition & 1 deletion public/debates/chega-vs-il.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"slug": "chega-vs-il",
"title": "Chega vs IL",
"title": "Chega - IL",
"original_url": "https://www.rtp.pt/play/p12899/e746368/debates-legislativas-2024-sicsic-noticias",
"audio_url": "https://drive.google.com/uc?id=1Z_VuncGFLxSYo_pV2irq-9hPydvN1o8-"
}
2 changes: 1 addition & 1 deletion public/debates/chega-vs-pan.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"slug": "chega-vs-pan",
"title": "Chega vs PAN",
"title": "Chega - PAN",
"original_url": "https://www.rtp.pt/play/p12900/e746061/debates-legislativas-2024",
"audio_url": "https://drive.google.com/uc?id=16V6_OfgkVBkZl4cVW29QUbPZtfg5jNiQ"
}
2 changes: 1 addition & 1 deletion public/debates/chega-vs-pcp.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"slug": "chega-vs-pcp",
"title": "Chega vs PCP",
"title": "Chega - PCP",
"original_url": "https://www.rtp.pt/play/p12901/e747268/debates-legislativas-2024-tvicnn",
"audio_url": "https://drive.google.com/uc?id=1aa_ln4ZOb6gDvh3iorstQ-GS2lZE6Hyj"
}
2 changes: 1 addition & 1 deletion public/debates/il-vs-pan.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"slug": "il-vs-pan",
"title": "IL vs PAN",
"title": "IL - PAN",
"original_url": "https://www.rtp.pt/play/p12899/e747269/debates-legislativas-2024-sicsic-noticias",
"audio_url": "https://drive.google.com/uc?id=1tWrPr9c-oQz202wU0jjnsqoMkxIrnLtW"
}
2 changes: 1 addition & 1 deletion public/debates/livre-vs-il.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"slug": "livre-vs-il",
"title": "Livre vs IL",
"title": "IL - Livre",
"original_url": "https://www.rtp.pt/play/p12901/e746631/debates-legislativas-2024-tvicnn",
"audio_url": "https://drive.google.com/uc?id=1ExCAfb6cx3Cm3LyduuC3F9F1HBUHGjVL"
}
6 changes: 6 additions & 0 deletions public/debates/livre-vs-pan.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"slug": "livre-vs-pan",
"title": "Livre - PAN",
"original_url": "https://www.rtp.pt/play/p12900/e748150/debates-legislativas-2024",
"audio_url": "https://drive.google.com/uc?id=1b65VelhrrUcAiaZu6lAHgUV5SbRLEDrk"
}
2 changes: 1 addition & 1 deletion public/debates/livre-vs-ps.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"slug": "livre-vs-ps",
"title": "Livre vs PS",
"title": "PS - Livre",
"original_url": "https://www.rtp.pt/play/p12900/e747215/debates-legislativas-2024",
"audio_url": "https://drive.google.com/uc?id=1204EOcVfCSOwsm_W1BMIV-LTJtuFWI7F"
}
Loading

0 comments on commit c61a09d

Please sign in to comment.