@@ -34,55 +34,118 @@ def main(own_name, file_path, max_exported_messages):
3434
3535
def parse_messages(archive, own_name):
    """
    Extract the one-to-one chat messages from a Google Hangouts archive.

    Google Hangouts exports all the chat logs in a single JSON file. The
    format is highly redundant and contains a lot of meta-information:

    {
        "conversations": [
            {
                "conversation": {
                    "conversation_id": {
                        "id": "<conversation_id>"
                    },
                    "conversation": {
                        "id": {
                            "id": "<conversation_id>"
                        },
                        "type": "TEXT",
                        "participant_data": [
                            {
                                "id": {<user ids>},
                                "fallback_name": "<plaintext name>",
                            },
                            ...data for each participant...
                        ]
                        ...more conversation-specific data...
                    }
                },
                "events": [
                    {
                        "conversation_id": {
                            "id": "<conversation_id>"
                        },
                        "timestamp": "<timestamp>",
                        "chat_message": {
                            "message_content": {
                                "segment": [
                                    {
                                        "type": "TEXT",
                                        "text": "<actual text>"
                                    }
                                ]
                            }
                        },
                        ...other message-specific data...
                    },
                    ...other messages/events...
                ]
            },
            ...other conversations...
        ]
    }

    Only conversations with exactly two current participants are exported;
    group chats are not supported and are skipped.

    Args:
        archive: The decoded JSON archive, shaped as shown above.
        own_name: Name of the archive owner. It is compared with the
            participants' "fallback_name" to tell outgoing messages from
            incoming ones.

    Returns:
        A list of rows
        [timestamp, conversation_id, conversation_with_name, sender_name,
        outgoing, text, '', ''], where timestamp is the event timestamp
        divided by 1,000,000 (presumably microseconds to seconds -- confirm
        against the export format) and the two trailing columns are left
        empty. Parsing stops early once MAX_EXPORTED_MESSAGES rows have been
        collected.

    Raises:
        SystemExit: With status 1 when a message was sent by someone who is
            neither own_name nor the interlocutor, which usually means that
            own_name is wrong.
    """
    id_to_name_map = {}

    def id_to_name(_id):
        # None signals an id for which no fallback_name was ever registered.
        return id_to_name_map.get(_id)

    def save_name_for_id(name, _id):
        # The first name seen for an id wins; a later, different name is only
        # reported and never overwrites the stored one.
        known_name = id_to_name_map.setdefault(_id, name)
        if known_name != name:
            log.info(f'Assuming {name} is {known_name}')

    data = []
    log.info('Extracting messages...')
    for conversation in archive['conversations']:
        if 'conversation' not in conversation['conversation']:
            # Without the nested metadata there is no participant list, so
            # neither the interlocutor nor the sender could be resolved.
            continue
        metadata = conversation['conversation']['conversation']

        # saves the fallback_name of all participants
        for participant in metadata['participant_data']:
            if 'fallback_name' in participant:
                save_name_for_id(participant['fallback_name'], participant['id']['chat_id'])

        # no support for group chat
        participants = metadata.get('current_participant', ())
        if len(participants) != 2:
            continue

        # The interlocutor is the participant whose name is not own_name. It
        # is the same for every event, so it is resolved once per conversation.
        conversation_with_id = ''
        for participant in participants:
            if id_to_name(participant['chat_id']) != own_name:
                conversation_with_id = participant['chat_id']
        conversation_with_name = id_to_name(conversation_with_id)

        for event in conversation['events']:
            if 'chat_message' not in event:
                continue
            content = event['chat_message']['message_content']
            if 'segment' not in content:
                continue
            segments = content['segment']
            if not segments:
                # Nothing to export for a message without any segment.
                continue

            # A message can be split into several segments; keeping only the
            # first one would truncate it.
            text = ''.join(segment.get('text', '') for segment in segments)
            conversation_id = event['conversation_id']['id']
            sender_id = event['sender_id']['chat_id']
            sender_name = id_to_name(sender_id)

            if sender_name is None and conversation_with_name is None:
                # unknown sender
                log.error(f'No sender_name could be found for either sender_id ({sender_id}) or conversation_with_id ({conversation_with_id})')
            else:
                # checks that the sender is either own_name or the interlocutor
                if sender_name != own_name and sender_id != conversation_with_id:
                    log.error(f'Parsing error. Is your own_name {own_name} correct?')
                    log.error(f'Problem: this message was sent by {sender_name}, who is not you ({own_name})')
                    log.error(f'It was sent by id {sender_id}, who is not your current interlocutor either ({conversation_with_id})')
                    log.error(f'Participants are: {participants}')
                    # Abort with a failure status: this is an error, not a
                    # normal termination.
                    raise SystemExit(1)

                # saves the message
                timestamp = int(event['timestamp']) / 1000000
                outgoing = sender_name == own_name
                data.append([
                    timestamp,
                    conversation_id,
                    conversation_with_name if conversation_with_name is not None else '',
                    sender_name if sender_name is not None else '',
                    outgoing,
                    text,
                    '',
                    '',
                ])

            if len(data) >= MAX_EXPORTED_MESSAGES:
                log.warning(f'Reached max exported messages limit of {MAX_EXPORTED_MESSAGES}. Increase limit in order to parse all messages.')
                return data
    return data
@@ -97,15 +160,15 @@ def read_archive(file_path):
97160
98161
99162def infer_own_name (archive , min_conversations = 2 ):
100- """ Infers own name from multiple conversations by finding the person who participated most in the conversations"""
163+ ''' Infers own name from multiple conversations by finding the person who participated most in the conversations'''
101164 participants_conversation_count = defaultdict (int )
102165 num_conversations = 0
103166 log .info ('Trying to infer own_name from data...' )
104- for conversation in archive [" conversations" ]:
167+ for conversation in archive [' conversations' ]:
105168 conversation_with_id = ''
106169 conversationWithName = ''
107- if " conversation" in conversation [" conversation" ]:
108- participants = conversation [" conversation" ][ " conversation" ][ " participant_data" ]
170+ if ' conversation' in conversation [' conversation' ]:
171+ participants = conversation [' conversation' ][ ' conversation' ][ ' participant_data' ]
109172 participants = [p ['fallback_name' ] for p in participants if 'fallback_name' in p ]
110173 if len (participants ) >= 2 :
111174 num_conversations += 1
0 commit comments