From 1e21b7c406ce7821e0fc25f63c27b4746fb03558 Mon Sep 17 00:00:00 2001 From: Romain Gehrig Date: Mon, 16 May 2022 16:07:13 +0200 Subject: [PATCH 1/2] PoC for downloading Telegram files --- config.py.example | 2 ++ telegram2org.py | 78 +++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 71 insertions(+), 9 deletions(-) diff --git a/config.py.example b/config.py.example index 163f655..4acfa87 100644 --- a/config.py.example +++ b/config.py.example @@ -14,6 +14,8 @@ GROUP_NAME = 'Todos' # file tags for org file; can be None if you don't want a tag ORG_TAG = "telegram2org" +# directory where the files are downloaded +MEDIA_DIR = "media/" # org mode uses local timezone and telegram uses UTC, so we have to specify it TIMEZONE = 'Europe/London' diff --git a/telegram2org.py b/telegram2org.py index b7286db..7488a4b 100755 --- a/telegram2org.py +++ b/telegram2org.py @@ -11,22 +11,24 @@ That way you keep your focus while not being mean ignoring your friends' messages. """ +from pathlib import Path from datetime import datetime import logging import re -from typing import List, Dict, Tuple, Collection, Set +from typing import Collection, Dict, List, Optional, Set, Tuple, Union +import os import pytz -import telethon.sync # type: ignore -from telethon import TelegramClient # type: ignore -from telethon.tl.types import MessageMediaWebPage, MessageMediaPhoto, MessageMediaDocument, MessageMediaVenue # type: ignore -from telethon.tl.types import MessageService, WebPageEmpty # type: ignore +import telethon.sync # type: ignore +from telethon import TelegramClient # type: ignore +from telethon.tl.types import MessageMediaWebPage, MessageMediaPhoto, MessageMediaDocument, MessageMediaVenue # type: ignore +from telethon.tl.types import Message, MessageService, WebPageEmpty # type: ignore from orger import InteractiveView from orger.common import todo from orger.inorganic import link -from config import ORG_TAG, TG_APP_HASH, TG_APP_ID, TELETHON_SESSION, GROUP_NAME, TIMEZONE, 
NAME_TO_TAG +from config import ORG_TAG, TG_APP_HASH, TG_APP_ID, TELETHON_SESSION, GROUP_NAME, TIMEZONE, NAME_TO_TAG, MEDIA_DIR Timestamp = int @@ -34,6 +36,45 @@ Lines = List[str] Tags = Set[str] +SAVE_DIR = Path(MEDIA_DIR) + + +def simple_download_progress(filename: str): + try: + from humanize.filesize import naturalsize + except ImportError: # humanize is optional; fall back to a plain byte count + naturalsize = lambda x: f"{x:.2f} bytes" + + def callback(current, total): + print( + f"[{filename}] Downloaded {naturalsize(current)} / {naturalsize(total)} [{(current / total if total else 0):.2%}]" + ) + + return callback + + +def download_document_if_not_present( + message: Message, filename: str, logger +) -> Optional[Path]: + SAVE_DIR.mkdir(parents=True, exist_ok=True) + + destination = SAVE_DIR / filename + + if destination.exists(): + if destination.is_dir(): + logger.error(f"Could not save file as {destination} as it is a directory.") + return None + + logger.info(f"File {destination} exists already, skipping download.") + return destination + + saved_dest = message.download_media( + file=destination, + progress_callback=simple_download_progress(destination.as_posix()), + ) + + return destination if saved_dest is not None else None # None tells the caller the download failed + def format_group(group: List, dialog, logger) -> Tuple[Timestamp, From, Tags, Lines]: date = int(group[0].date.timestamp()) @@ -51,6 +92,8 @@ def get_from(m): u = fw.sender if u.username is not None: return u.username + elif u.last_name is None: + return f"{u.first_name}" else: return f"{u.first_name} {u.last_name}" @@ -79,11 +122,28 @@ def get_from(m): uu += ' ' + page.description texts.append(uu) elif isinstance(e, MessageMediaPhoto): - # TODO no file location? 
:( - texts.append("*PHOTO*") + saved_location = download_document_if_not_present( + message=m, filename=f"{e.photo.id}.jpg", logger=logger + ) + if saved_location is not None: + texts.append(f"[[file:{saved_location.as_posix()}]]") + else: + texts.append(f"ERROR SAVING PHOTO {e.photo.id}") # print(vars(e)) elif isinstance(e, MessageMediaDocument): - texts.append("*DOCUMENT*") + try: + original_file_name = Path(e.document.attributes[0].file_name).name # remote-supplied name: keep only the final path component + except (AttributeError, IndexError, TypeError): # no usable file name attribute + naive_file_ext = e.document.mime_type.split("/")[-1] + original_file_name = "{}.{}".format(e.document.id, naive_file_ext) + + saved_location = download_document_if_not_present( + message=m, filename=original_file_name, logger=logger + ) + if saved_location is not None: + texts.append(f"[[file:{saved_location.as_posix()}]]") + else: + texts.append(f"ERROR SAVING DOCUMENT {e.document.id}") # print(vars(e.document)) elif isinstance(e, MessageMediaVenue): texts.append("*VENUE* " + e.title) From 4d321deb463f042f36cd6026e08dae20910cd520 Mon Sep 17 00:00:00 2001 From: Romain Gehrig Date: Mon, 16 May 2022 16:22:55 +0200 Subject: [PATCH 2/2] Skip file names in header --- telegram2org.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/telegram2org.py b/telegram2org.py index 7488a4b..bf1b6f4 100755 --- a/telegram2org.py +++ b/telegram2org.py @@ -164,6 +164,9 @@ def get_from(m): LIMIT = 400 lines = '\n'.join(texts).splitlines() # meh for line in lines: + # Skip file names in header + if line.startswith("[[file:") and line.endswith("]]"): + continue if len(heading) + len(line) <= LIMIT: heading += " " + line else: