From 1e21b7c406ce7821e0fc25f63c27b4746fb03558 Mon Sep 17 00:00:00 2001
From: Romain Gehrig <romain.gehrig@gmail.com>
Date: Mon, 16 May 2022 16:07:13 +0200
Subject: [PATCH 1/2] PoC for downloading Telegram files

---
 config.py.example |  2 ++
 telegram2org.py   | 78 +++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 71 insertions(+), 9 deletions(-)

diff --git a/config.py.example b/config.py.example
index 163f655..4acfa87 100644
--- a/config.py.example
+++ b/config.py.example
@@ -14,6 +14,8 @@ GROUP_NAME = 'Todos'
 # file tags for org file; can be None if you don't want a tag
 ORG_TAG = "telegram2org"
 
+# directory where downloaded Telegram media (photos, documents) is stored; created if missing
+MEDIA_DIR = "media/"
 
 # org mode uses local timezone and telegram uses UTC, so we have to specify it
 TIMEZONE = 'Europe/London'
diff --git a/telegram2org.py b/telegram2org.py
index b7286db..7488a4b 100755
--- a/telegram2org.py
+++ b/telegram2org.py
@@ -11,22 +11,24 @@
 That way you keep your focus while not being mean ignoring your friends' messages.
 """
 
+from pathlib import Path
 from datetime import datetime
 import logging
 import re
-from typing import List, Dict, Tuple, Collection, Set
+from typing import Collection, Dict, List, Optional, Set, Tuple, Union
+import os
 import pytz
 
-import telethon.sync # type: ignore
-from telethon import TelegramClient # type: ignore
-from telethon.tl.types import MessageMediaWebPage, MessageMediaPhoto, MessageMediaDocument, MessageMediaVenue # type: ignore
-from telethon.tl.types import MessageService, WebPageEmpty # type: ignore
+import telethon.sync  # type: ignore
+from telethon import TelegramClient  # type: ignore
+from telethon.tl.types import MessageMediaWebPage, MessageMediaPhoto, MessageMediaDocument, MessageMediaVenue  # type: ignore
+from telethon.tl.types import Message, MessageService, WebPageEmpty  # type: ignore
 
 from orger import InteractiveView
 from orger.common import todo
 from orger.inorganic import link
 
-from config import ORG_TAG, TG_APP_HASH, TG_APP_ID, TELETHON_SESSION, GROUP_NAME, TIMEZONE, NAME_TO_TAG
+from config import ORG_TAG, TG_APP_HASH, TG_APP_ID, TELETHON_SESSION, GROUP_NAME, TIMEZONE, NAME_TO_TAG, MEDIA_DIR
 
 
 Timestamp = int
@@ -34,6 +36,45 @@
 Lines = List[str]
 Tags = Set[str]
 
+SAVE_DIR = Path(MEDIA_DIR)
+
+
+def simple_download_progress(filename: str):
+    """Return a progress callback that prints download progress for *filename*."""
+    try:
+        from humanize.filesize import naturalsize  # optional dependency
+    except ImportError:
+        naturalsize = lambda x: f"{x:.2f} bytes"
+
+    def callback(current, total):
+        ratio = current / total if total else 0.0  # avoid ZeroDivisionError when total is 0
+        print(f"[{filename}] Downloaded {naturalsize(current)} / {naturalsize(total)} [{ratio:.2%}]")
+
+    return callback
+
+
+def download_document_if_not_present(
+    message: Message, filename: str, logger
+) -> Optional[Path]:
+    """Download the media of *message* into SAVE_DIR unless it is already there.
+
+    Returns the local path, or None when the target is unusable or the download fails.
+    """
+    SAVE_DIR.mkdir(parents=True, exist_ok=True)
+    # file names come from Telegram (untrusted): keep only the final path component
+    destination = SAVE_DIR / Path(filename).name
+    if destination.is_dir():
+        logger.error(f"Could not save file as {destination} as it is a directory.")
+        return None
+    if destination.exists():
+        logger.info(f"File {destination} exists already, skipping download.")
+        return destination
+    progress = simple_download_progress(destination.as_posix())
+    if message.download_media(file=destination, progress_callback=progress) is None:
+        logger.error(f"Could not download media to {destination}.")
+        return None
+    return destination
+
 
 def format_group(group: List, dialog, logger) -> Tuple[Timestamp, From, Tags, Lines]:
     date = int(group[0].date.timestamp())
@@ -51,6 +92,8 @@ def get_from(m):
         u = fw.sender
         if u.username is not None:
             return u.username
+        elif u.last_name is None:
+            return f"{u.first_name}"
         else:
             return f"{u.first_name} {u.last_name}"
 
@@ -79,11 +122,28 @@ def get_from(m):
                     uu += ' ' + page.description
             texts.append(uu)
         elif isinstance(e, MessageMediaPhoto):
-            # TODO no file location? :(
-            texts.append("*PHOTO*")
+            # name the local copy after the photo id
+            photo_name = f"{e.photo.id}.jpg"
+            saved_location = download_document_if_not_present(message=m, filename=photo_name, logger=logger)
+            if saved_location is not None:
+                texts.append(f"[[file:{saved_location.as_posix()}]]")
+            else:
+                texts.append(f"ERROR SAVING PHOTO {e.photo.id}")
             # print(vars(e))
         elif isinstance(e, MessageMediaDocument):
-            texts.append("*DOCUMENT*")
+            # use the first attribute that actually carries a file name (it need not be attributes[0])
+            names = (getattr(a, "file_name", None) for a in e.document.attributes)
+            original_file_name = next((n for n in names if n), None)
+            if original_file_name is None:
+                # no name available: fall back to "<document id>.<mime subtype>"
+                naive_file_ext = e.document.mime_type.split("/")[-1]
+                original_file_name = f"{e.document.id}.{naive_file_ext}"
+
+            saved_location = download_document_if_not_present(message=m, filename=original_file_name, logger=logger)
+            if saved_location is not None:
+                texts.append(f"[[file:{saved_location.as_posix()}]]")
+            else:
+                texts.append(f"ERROR SAVING DOCUMENT {e.document.id}")
             # print(vars(e.document))
         elif isinstance(e, MessageMediaVenue):
             texts.append("*VENUE* " + e.title)

From 4d321deb463f042f36cd6026e08dae20910cd520 Mon Sep 17 00:00:00 2001
From: Romain Gehrig <romain.gehrig@gmail.com>
Date: Mon, 16 May 2022 16:22:55 +0200
Subject: [PATCH 2/2] Skip file names in header

---
 telegram2org.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/telegram2org.py b/telegram2org.py
index 7488a4b..bf1b6f4 100755
--- a/telegram2org.py
+++ b/telegram2org.py
@@ -164,6 +164,9 @@ def get_from(m):
     LIMIT = 400
     lines = '\n'.join(texts).splitlines() # meh
     for line in lines:
+        # Skip file names in header
+        if line.startswith("[[file:") and line.endswith("]]"):
+            continue
         if len(heading) + len(line) <= LIMIT:
             heading += " " + line
         else: