Merge pull request #11 from colobas/dev

Dev
colobas · Feb 14, 2024 · c61a09d · c61a09d
2 parents a4f3c03 + c69b36b
commit c61a09d
Show file tree

Hide file tree

Showing 29 changed files with 1,743 additions and 127 deletions.
diff --git a/.github/workflows/process-debates.yaml → .github/workflows/upload-debates.yaml b/.github/workflows/process-debates.yaml → .github/workflows/upload-debates.yaml
@@ -22,8 +22,18 @@ jobs:
           cache-environment: true
           post-cleanup: 'all'
 
+      - name: Setup Rclone
+        uses: AnimMouse/setup-rclone@v1
+        with:
+          rclone_config: ${{ secrets.RCLONE_CONFIG }}
+
+      - name: Extract gcloud config from rclone config
+        run: echo "${{ secrets.RCLONE_CONFIG }}" | python parse_rclone_conf.py > gcloud.json
+
       - name: Process Debates
-        run: python process_debates.py debates.yaml public/debates
+        env:
+          CLIENT_SECRETS_FILE: gcloud.json
+        run: python process_debates.py public/debates --skip-transcription
 
       - name: Commit and Push Changes
         uses: EndBug/add-and-commit@v7
@@ -32,3 +42,9 @@ jobs:
           add: 'public/debates/'
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Refresh Rclone Config
+        uses: AnimMouse/setup-rclone/update-config@v1
+        with:
+          rclone_config_secret_name: RCLONE_CONFIG
+          token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/cloudfare-corsproxy.json → cloudfare-corsproxy.js b/cloudfare-corsproxy.json → cloudfare-corsproxy.js
diff --git a/debates.yaml b/debates.yaml
diff --git a/parse_rclone_conf.py b/parse_rclone_conf.py
@@ -0,0 +1,26 @@
+import configparser
+import json
+import base64
+
+
+def main():
+    """
+    Turn a base64-encoded rclone config (read from stdin) into a JSON credentials blob
+
+    The decoded config must contain a ``debates`` section with ``client_id``,
+    ``client_secret`` and a ``token`` entry whose value is a JSON object with a
+    ``refresh_token`` field. The three values are printed to stdout as a single
+    JSON object.
+
+    Raises:
+        KeyError: if the ``debates`` section or one of the expected keys is missing.
+    """
+    import sys
+
+    # Read the whole of stdin instead of a single line: base64 payloads are
+    # frequently line-wrapped, and b64decode (validate=False) discards the
+    # newline characters, so the full payload is decoded either way.
+    encoded = sys.stdin.read()
+    conf_str = base64.b64decode(encoded).decode("utf-8")
+
+    config = configparser.ConfigParser()
+    config.read_string(conf_str)
+    conf = config["debates"]
+
+    # the "token" value is itself a JSON document
+    refresh_token = json.loads(conf["token"])["refresh_token"]
+
+    out = {
+        "client_id": conf["client_id"],
+        "client_secret": conf["client_secret"],
+        "refresh_token": refresh_token,
+    }
+
+    print(json.dumps(out))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/process_debates.py b/process_debates.py
@@ -15,6 +15,8 @@
 import webvtt
 
 from video_utils import upload_to_gdrive, get_file_ids, direct_link
+from speaker_party_conversion import speaker_party_conversion
+
 
 def convert_to_seconds(time):
     """
@@ -155,7 +157,7 @@ def slugify(title):
     Turn a title into a slug
     """
 
-    return title.lower().replace(" ", "-")
+    return title.lower().replace(" - ", "-vs-")
 
 
 def transcribe_audio(audio_path, output_root, mp3_direct_link):
@@ -213,12 +215,18 @@ def process_debate(*, title, url, output_root, gdrive_service, skip_transcriptio
     Process a debate from the input data
     """
 
+    slug = slugify(title)
+    rev_slug = "-".join(slug.split("-")[::-1])
+
+    # check if reverse slug exists, if so, slug is the reverse slug
+    if (output_root / f"{rev_slug}.json").exists():
+        slug = rev_slug
+
     m3u8_url, thumbnail_url = find_m3u8_and_thumbnail(url)
     if m3u8_url is None or thumbnail_url is None:
         logging.warning(f"Could not find m3u8 or thumbnail for {url}")
         return {"title": title, "thumbnail": thumbnail_url, "slug": slug}
 
-    slug = slugify(title)
     audio_path = output_root / f"media/{slug}.mp3"
 
     if "rtp.pt" in url:
@@ -262,8 +270,52 @@ def process_debate(*, title, url, output_root, gdrive_service, skip_transcriptio
     return {"title": title, "thumbnail": thumbnail_url, "slug": slug}
 
 
+def scrape_page(url):
+    """
+    Scrape one programme page and collect the episodes listed on it
+
+    Looks for the ``listProgramsContent`` div and reads one episode per
+    ``<article>`` tag. Each episode title is split on " - " and every part is
+    looked up in ``speaker_party_conversion`` (presumably speaker name -> party
+    name; verify against that module) before being joined back with " - ".
+
+    Returns a dict mapping the converted title to the absolute episode URL.
+    Articles without a title or without a link are skipped, and an empty dict
+    is returned when the listing container is not present on the page.
+
+    Raises:
+        KeyError: if a title part is not a key of ``speaker_party_conversion``.
+    """
+    debates = {}
+
+    # never wait forever on the remote server
+    response = requests.get(url, timeout=30)
+    soup = bs4.BeautifulSoup(response.text, "html.parser")
+
+    # find div with id "listProgramsContent"
+    listing = soup.find("div", id="listProgramsContent")
+    if listing is None:
+        logging.warning(f"Could not find episode list in {url}")
+        return debates
+
+    # find <article> tags -> one per "episode"
+    for article in listing.find_all("article"):
+        heading = article.find("h4", class_="episode-title")
+        link = article.find("a", class_="episode-item")
+        if heading is None or link is None:
+            continue
+
+        # Tag.get returns None for a missing attribute instead of raising,
+        # which makes the "no link" check below actually reachable.
+        href = link.get("href")
+        if href is None:
+            continue
+
+        parts = heading.text.strip().split(" - ")
+        title = " - ".join(speaker_party_conversion[s.strip()] for s in parts)
+        debates[title] = "https://www.rtp.pt" + href
+
+    return debates
+
+
+def scrape_debate_links():
+    """
+    Collect the debates listed on the hard-coded programme pages
+
+    Every root page is scraped, then the first episode page found on it is
+    scraped as well, and the results are merged by title (later entries
+    overwrite earlier ones with the same title).
+
+    Returns a list of ``{"title": ..., "url": ...}`` dicts.
+    """
+    ROOT_URLS = [
+        "https://www.rtp.pt/play/p12900/debates-legislativas-2024",
+        "https://www.rtp.pt/play/p12899/debates-legislativas-2024-sicsic-noticias",
+        "https://www.rtp.pt/play/p12901/debates-legislativas-2024-tvicnn",
+    ]
+
+    debates = {}
+
+    for url in ROOT_URLS:
+        new_debates = scrape_page(url)
+        debates.update(new_debates)
+
+        # take the first episode url from new_debates, for the second pass
+        # (because one of the episodes will be selected and not have a link)
+        episode_url = next(iter(new_debates.values()), None)
+        if episode_url is None:
+            # nothing was found on this page, so there is no episode to revisit
+            logging.warning(f"No debates found on {url}")
+            continue
+
+        debates.update(scrape_page(episode_url))
+
+    return [{"title": title, "url": link} for title, link in debates.items()]
+
+
 def main(args):
-    input_path = Path(args.input)
     output_root = Path(args.output_root)
     output_root.mkdir(exist_ok=True, parents=True)
 
@@ -277,19 +329,11 @@ def main(args):
     else:
         creds, gdrive_service = None, None
 
-    with open(input_path, "r") as f:
-        data = yaml.safe_load(f)
 
-    master_json = []
-    for debate in data:
-        output_path = output_root / f"{debate}.json"
-
-        if output_path.exists() and not args.force:
-            continue
-
-        if debate is None:
-            continue
+    debates = scrape_debate_links()
 
+    master_json = []
+    for debate in debates:
         summary = process_debate(**debate, output_root=output_root, skip_transcription=args.skip_transcription, gdrive_service=gdrive_service, skip_upload=args.skip_upload)
         master_json.append(summary)
 

diff --git a/public/debates/be-vs-chega.json b/public/debates/be-vs-chega.json
@@ -1,6 +1,6 @@
 {
     "slug": "be-vs-chega",
-    "title": "BE vs Chega",
+    "title": "Chega - BE",
     "original_url": "https://www.rtp.pt/play/p12900/e747851/debates-legislativas-2024",
     "audio_url": "https://drive.google.com/uc?id=1Tk9gV0e_r_H638ZQsB0ume7r8K-ZzwRP"
 }
diff --git a/public/debates/be-vs-livre.json b/public/debates/be-vs-livre.json
@@ -1,6 +1,6 @@
 {
     "slug": "be-vs-livre",
-    "title": "BE vs Livre",
+    "title": "BE - Livre",
     "original_url": "https://www.rtp.pt/play/p12899/e746909/debates-legislativas-2024-sicsic-noticias",
     "audio_url": "https://drive.google.com/uc?id=18VRopVy-VbjNLKYkUU9TRtyo2JdGowv0"
 }
diff --git a/public/debates/be-vs-pcp.json b/public/debates/be-vs-pcp.json
@@ -1,6 +1,6 @@
 {
     "slug": "be-vs-pcp",
-    "title": "BE vs PCP",
+    "title": "BE - PCP",
     "original_url": "https://www.rtp.pt/play/p12899/e747442/debates-legislativas-2024-sicsic-noticias",
     "audio_url": "https://drive.google.com/uc?id=1-A5Q4Jc6FY4TN2kKbN16gKlh2SV1tfX5"
 }
diff --git a/public/debates/be-vs-psd.json b/public/debates/be-vs-psd.json
@@ -1,6 +1,6 @@
 {
     "slug": "be-vs-psd",
-    "title": "BE vs PSD",
-    "original_url": "https://sicnoticias.pt/especiais/eleicoes-legislativas/2024-02-06-Debate-entre-BE-e-PSD-Quem-e-que-sabe-salvar-o-SNS--a252ab7c",
+    "title": "PSD - BE",
+    "original_url": "https://www.rtp.pt/play/p12901/e746363/debates-legislativas-2024-tvicnn",
     "audio_url": "https://drive.google.com/uc?id=1EIbaHmM2Xk-RFsIgYVf9BsH-xgrnntTt"
 }
diff --git a/public/debates/chega-vs-il.json b/public/debates/chega-vs-il.json
@@ -1,6 +1,6 @@
 {
     "slug": "chega-vs-il",
-    "title": "Chega vs IL",
+    "title": "Chega - IL",
     "original_url": "https://www.rtp.pt/play/p12899/e746368/debates-legislativas-2024-sicsic-noticias",
     "audio_url": "https://drive.google.com/uc?id=1Z_VuncGFLxSYo_pV2irq-9hPydvN1o8-"
 }
diff --git a/public/debates/chega-vs-pan.json b/public/debates/chega-vs-pan.json
@@ -1,6 +1,6 @@
 {
     "slug": "chega-vs-pan",
-    "title": "Chega vs PAN",
+    "title": "Chega - PAN",
     "original_url": "https://www.rtp.pt/play/p12900/e746061/debates-legislativas-2024",
     "audio_url": "https://drive.google.com/uc?id=16V6_OfgkVBkZl4cVW29QUbPZtfg5jNiQ"
 }
diff --git a/public/debates/chega-vs-pcp.json b/public/debates/chega-vs-pcp.json
@@ -1,6 +1,6 @@
 {
     "slug": "chega-vs-pcp",
-    "title": "Chega vs PCP",
+    "title": "Chega - PCP",
     "original_url": "https://www.rtp.pt/play/p12901/e747268/debates-legislativas-2024-tvicnn",
     "audio_url": "https://drive.google.com/uc?id=1aa_ln4ZOb6gDvh3iorstQ-GS2lZE6Hyj"
 }
diff --git a/public/debates/il-vs-pan.json b/public/debates/il-vs-pan.json
@@ -1,6 +1,6 @@
 {
     "slug": "il-vs-pan",
-    "title": "IL vs PAN",
+    "title": "IL - PAN",
     "original_url": "https://www.rtp.pt/play/p12899/e747269/debates-legislativas-2024-sicsic-noticias",
     "audio_url": "https://drive.google.com/uc?id=1tWrPr9c-oQz202wU0jjnsqoMkxIrnLtW"
 }
diff --git a/public/debates/livre-vs-il.json b/public/debates/livre-vs-il.json
@@ -1,6 +1,6 @@
 {
     "slug": "livre-vs-il",
-    "title": "Livre vs IL",
+    "title": "IL - Livre",
     "original_url": "https://www.rtp.pt/play/p12901/e746631/debates-legislativas-2024-tvicnn",
     "audio_url": "https://drive.google.com/uc?id=1ExCAfb6cx3Cm3LyduuC3F9F1HBUHGjVL"
 }
diff --git a/public/debates/livre-vs-pan.json b/public/debates/livre-vs-pan.json
@@ -0,0 +1,6 @@
+{
+    "slug": "livre-vs-pan",
+    "title": "Livre - PAN",
+    "original_url": "https://www.rtp.pt/play/p12900/e748150/debates-legislativas-2024",
+    "audio_url": "https://drive.google.com/uc?id=1b65VelhrrUcAiaZu6lAHgUV5SbRLEDrk"
+}
diff --git a/public/debates/livre-vs-ps.json b/public/debates/livre-vs-ps.json
@@ -1,6 +1,6 @@
 {
     "slug": "livre-vs-ps",
-    "title": "Livre vs PS",
+    "title": "PS - Livre",
     "original_url": "https://www.rtp.pt/play/p12900/e747215/debates-legislativas-2024",
     "audio_url": "https://drive.google.com/uc?id=1204EOcVfCSOwsm_W1BMIV-LTJtuFWI7F"
 }