|
import json
import logging
import os
import sys
from collections import defaultdict

import pandas as pd
from dateutil.parser import parse

from parsers.config import config
from parsers.utils import export_dataframe, detect_language
| 9 | + |
| 10 | +log = logging.getLogger(__name__) |
| 11 | + |
| 12 | + |
def main(own_name, file_path, max_exported_messages):
    """Parse a Telegram JSON export and save its messages as a pickled DataFrame.

    :param own_name: Display name of the archive owner; when None it is
        resolved from the archive's ``personal_information`` block.
    :param file_path: Path to the Telegram ``result.json`` export file.
    :param max_exported_messages: Hard cap on the number of messages parsed.
    """
    global MAX_EXPORTED_MESSAGES
    MAX_EXPORTED_MESSAGES = max_exported_messages
    # Fixed: this log line previously claimed "Google Hangouts" although the
    # whole pipeline (config keys, platform column) targets Telegram.
    log.info('Parsing Telegram data...')
    if not os.path.isfile(file_path):
        log.error(f'No input file under {file_path}')
        # Fixed: a missing input file is an error condition and must not
        # report success (was exit(0)).
        sys.exit(1)
    archive = read_archive(file_path)
    if own_name is None:
        # NOTE(review): assumes both first_name and last_name are present and
        # non-None in the export — confirm for accounts without a last name.
        personal = archive["personal_information"]
        own_name = " ".join([personal["first_name"], personal["last_name"]])
    own_id = archive["personal_information"]["user_id"]
    data = parse_messages(archive, own_id)
    log.info('{:,} messages parsed.'.format(len(data)))
    if len(data) < 1:
        # Nothing parsed is a valid (empty) outcome, so exit cleanly.
        log.info('Nothing to save.')
        sys.exit(0)
    log.info('Converting to DataFrame...')
    df = pd.DataFrame(data, columns=config['ALL_COLUMNS'])
    df['platform'] = 'telegram'
    log.info('Detecting languages...')
    df = detect_language(df)
    export_dataframe(df, config['telegram_json']['OUTPUT_PICKLE_NAME'])
    log.info('Done.')
| 36 | + |
| 37 | + |
def _fragments_to_text(fragments):
    """Flatten a Telegram "text entities" list into a plain string.

    Telegram encodes formatted message text as a list whose items are either
    plain strings or dicts carrying a "text" key (links, mentions, code, ...).
    """
    parts = []
    for fragment in fragments:
        if isinstance(fragment, dict):
            parts.append(fragment["text"])
        else:
            parts.append(fragment)
    return "".join(parts)


def parse_messages(archive, own_id, max_exported_messages=None):
    """Extract message rows from a parsed Telegram archive.

    :param archive: Dict loaded from Telegram's JSON export (``result.json``).
    :param own_id: The archive owner's user id, used to flag outgoing messages.
    :param max_exported_messages: Optional cap on the number of rows returned;
        defaults to the module-level MAX_EXPORTED_MESSAGES set by main()
        (kept for backward compatibility with existing callers).
    :return: List of rows ``[timestamp, conversation id, conversation name,
        sender name, outgoing flag, text, '', '']`` matching
        ``config['ALL_COLUMNS']``.
    """
    limit = MAX_EXPORTED_MESSAGES if max_exported_messages is None else max_exported_messages
    data = []
    log.info('Extracting messages...')
    for chat in archive["chats"]["list"]:
        # Only private conversations are exported; channels etc. are skipped.
        if chat["type"] not in ("personal_chat", "private_group", "private_supergroup"):
            continue
        conversation_with_id = chat["id"]
        conversation_with_name = chat["name"]
        for message in chat["messages"]:
            # Service entries (joins, pins, calls, ...) are not text messages.
            if message["type"] != "message":
                continue
            timestamp = parse(message["date"]).timestamp()
            # skip text from forwarded messages
            text = message["text"] if "forwarded_from" not in message else ""
            if "sticker_emoji" in message:
                # A sticker's emoji stands in for its (absent) text.
                text = message["sticker_emoji"]
            if isinstance(text, list):
                text = _fragments_to_text(text)
            sender_name = message["from"]
            sender_id = message["from_id"]
            if sender_name is None:
                # Unknown sender (e.g. deleted account); the row is still kept.
                log.error(f"No senderName could be found for senderId ({sender_id})")

            # NOTE(review): newer Telegram exports emit prefixed string ids
            # (e.g. "user123") for from_id while personal_information.user_id
            # is numeric — confirm this equality holds for the targeted
            # export format, otherwise `outgoing` is always False.
            outgoing = sender_id == own_id
            data += [[timestamp, conversation_with_id, conversation_with_name, sender_name, outgoing, text, '', '']]

            if len(data) >= limit:
                log.warning(f'Reached max exported messages limit of {limit}. Increase limit in order to parse all messages.')
                return data
    return data
| 79 | + |
| 80 | + |
def read_archive(file_path):
    """Load a Telegram JSON export from disk and return it as a dict."""
    log.info(f'Reading archive file {file_path}...')
    with open(file_path, encoding='utf-8') as f:
        return json.load(f)