|
import json
import logging
import os
import sys
from collections import defaultdict

import pandas as pd
from dateutil.parser import parse

from parsers.config import config
from parsers.utils import export_dataframe, detect_language
| 9 | + |
| 10 | +log = logging.getLogger(__name__) |
| 11 | + |
| 12 | + |
def main(own_name, file_path, max_exported_messages):
    """Parse a Telegram JSON export and save its messages as a pickled DataFrame.

    :param own_name: Display name of the archive owner; when None it is
        resolved from the archive's ``personal_information`` block.
    :param file_path: Path to the Telegram ``result.json`` export file.
    :param max_exported_messages: Hard cap on the number of messages parsed.
    """
    global MAX_EXPORTED_MESSAGES
    MAX_EXPORTED_MESSAGES = max_exported_messages
    # Fixed: this log line previously claimed "Google Hangouts" although the
    # whole pipeline (config keys, platform column) targets Telegram.
    log.info('Parsing Telegram data...')
    if not os.path.isfile(file_path):
        log.error(f'No input file under {file_path}')
        # Fixed: a missing input file is an error condition and must not
        # report success (was exit(0)).
        sys.exit(1)
    archive = read_archive(file_path)
    if own_name is None:
        # NOTE(review): assumes both first_name and last_name are present and
        # non-None in the export — confirm for accounts without a last name.
        personal = archive["personal_information"]
        own_name = " ".join([personal["first_name"], personal["last_name"]])
    own_id = archive["personal_information"]["user_id"]
    data = parse_messages(archive, own_id)
    log.info('{:,} messages parsed.'.format(len(data)))
    if len(data) < 1:
        # Nothing parsed is a valid (empty) outcome, so exit cleanly.
        log.info('Nothing to save.')
        sys.exit(0)
    log.info('Converting to DataFrame...')
    df = pd.DataFrame(data, columns=config['ALL_COLUMNS'])
    df['platform'] = 'telegram'
    log.info('Detecting languages...')
    df = detect_language(df)
    export_dataframe(df, config['telegram_json']['OUTPUT_PICKLE_NAME'])
    log.info('Done.')
| 36 | + |
| 37 | + |
def _fragments_to_text(fragments):
    """Flatten a Telegram "text entities" list into a plain string.

    Telegram encodes formatted message text as a list whose items are either
    plain strings or dicts carrying a "text" key (links, mentions, code, ...).
    """
    parts = []
    for fragment in fragments:
        if isinstance(fragment, dict):
            parts.append(fragment["text"])
        else:
            parts.append(fragment)
    return "".join(parts)


def parse_messages(archive, own_id, max_exported_messages=None):
    """Extract message rows from a parsed Telegram archive.

    :param archive: Dict loaded from Telegram's JSON export (``result.json``).
    :param own_id: The archive owner's user id, used to flag outgoing messages.
    :param max_exported_messages: Optional cap on the number of rows returned;
        defaults to the module-level MAX_EXPORTED_MESSAGES set by main()
        (kept for backward compatibility with existing callers).
    :return: List of rows ``[timestamp, conversation id, conversation name,
        sender name, outgoing flag, text, '', '']`` matching
        ``config['ALL_COLUMNS']``.
    """
    limit = MAX_EXPORTED_MESSAGES if max_exported_messages is None else max_exported_messages
    data = []
    log.info('Extracting messages...')
    for chat in archive["chats"]["list"]:
        # Only private conversations are exported; channels etc. are skipped.
        if chat["type"] not in ("personal_chat", "private_group", "private_supergroup"):
            continue
        conversation_with_id = chat["id"]
        conversation_with_name = chat["name"]
        for message in chat["messages"]:
            # Service entries (joins, pins, calls, ...) are not text messages.
            if message["type"] != "message":
                continue
            timestamp = parse(message["date"]).timestamp()
            # skip text from forwarded messages
            text = message["text"] if "forwarded_from" not in message else ""
            if "sticker_emoji" in message:
                # A sticker's emoji stands in for its (absent) text.
                text = message["sticker_emoji"]
            if isinstance(text, list):
                text = _fragments_to_text(text)
            sender_name = message["from"]
            sender_id = message["from_id"]
            if sender_name is None:
                # Unknown sender (e.g. deleted account); the row is still kept.
                log.error(f"No senderName could be found for senderId ({sender_id})")

            # NOTE(review): newer Telegram exports emit prefixed string ids
            # (e.g. "user123") for from_id while personal_information.user_id
            # is numeric — confirm this equality holds for the targeted
            # export format, otherwise `outgoing` is always False.
            outgoing = sender_id == own_id
            data += [[timestamp, conversation_with_id, conversation_with_name, sender_name, outgoing, text, '', '']]

            if len(data) >= limit:
                log.warning(f'Reached max exported messages limit of {limit}. Increase limit in order to parse all messages.')
                return data
    return data
| 79 | + |
| 80 | + |
def read_archive(file_path):
    """Load a Telegram JSON export from disk and return it as a dict."""
    log.info(f'Reading archive file {file_path}...')
    with open(file_path, encoding='utf-8') as f:
        return json.load(f)