Skip to content

Commit e8c148e

Browse files
committed
Add documentation and more detailed error messages for Hangouts
1 parent 6068c27 commit e8c148e

File tree

1 file changed

+92
-29
lines changed

1 file changed

+92
-29
lines changed

parsers/hangouts.py

Lines changed: 92 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -34,55 +34,118 @@ def main(own_name, file_path, max_exported_messages):
3434

3535

3636
def parse_messages(archive, own_name):
37+
"""
38+
Google Hangouts exports all chat logs in a single JSON file. The format is highly redundant and contains a lot of meta-information; an abbreviated outline of its structure is shown below.
39+
{
40+
"conversations": [
41+
{
42+
"conversation": {
43+
"conversation_id": {
44+
"id": "<conversation_id>"
45+
},
46+
"conversation": {
47+
"id": {
48+
"id": "<conversation_id>"
49+
},
50+
"type": "TEXT",
51+
"participant_data": [
52+
{
53+
"id": {<user ids>},
54+
"fallback_name": "<plaintext name>",
55+
},
56+
...data for each participant...
57+
]
58+
...more conversation-specific data...
59+
}
60+
},
61+
"events": [
62+
{
63+
"conversation_id": {
64+
"id": "<conversation_id>"
65+
},
66+
"timestamp": "<timestamp>",
67+
"chat_message": {
68+
"message_content": {
69+
"segment": [
70+
{
71+
"type": "TEXT",
72+
"text": "<actual text>"
73+
}
74+
]
75+
}
76+
},
77+
...other message-specific data...
78+
},
79+
...other messages/events...
80+
]
81+
},
82+
...other conversations...
83+
]
84+
}
85+
"""
86+
id_to_name_map = {}
87+
3788
def id_to_name(_id):
38-
if _id in names:
39-
return names[_id]
89+
if _id in id_to_name_map:
90+
return id_to_name_map[_id]
4091
else:
4192
return None
4293

4394
def save_name_for_id(name, _id):
44-
if not _id in names:
45-
names[_id] = name
46-
elif names[_id] != name:
47-
log.info(f'Assuming {name} is {names[_id]}')
95+
if _id not in id_to_name_map:
96+
id_to_name_map[_id] = name
97+
elif id_to_name_map[_id] != name:
98+
log.info(f'Assuming {name} is {id_to_name_map[_id]}')
4899

49-
names = {}
50100
data = []
51101
log.info('Extracting messages...')
52-
for conversation in archive["conversations"]:
102+
for conversation in archive['conversations']:
53103
conversation_with_id = ''
54-
conversationWithName = ''
55-
if "conversation" in conversation["conversation"]:
56-
for participant in conversation["conversation"]["conversation"]["participant_data"]:
57-
if "fallback_name" in participant:
58-
save_name_for_id(participant["fallback_name"], participant["id"]["chat_id"])
59-
for event in conversation["events"]:
60-
if "chat_message" in event and "segment" in event["chat_message"]["message_content"]:
61-
timestamp = int(event["timestamp"])
62-
content = event["chat_message"]["message_content"]
63-
text = content["segment"][0]["text"]
64-
conversationId = event["conversation_id"]["id"]
65-
sender_id = event["sender_id"]["chat_id"]
66-
participants = conversation["conversation"]["conversation"]["current_participant"]
104+
105+
# saves the fallback_name of all participants
106+
if 'conversation' in conversation['conversation']:
107+
for participant in conversation['conversation']['conversation']['participant_data']:
108+
if 'fallback_name' in participant:
109+
save_name_for_id(participant['fallback_name'], participant['id']['chat_id'])
110+
111+
for event in conversation['events']:
112+
if 'chat_message' in event and 'segment' in event['chat_message']['message_content']:
113+
timestamp = int(event['timestamp'])
114+
content = event['chat_message']['message_content']
115+
text = content['segment'][0]['text']
116+
conversation_id = event['conversation_id']['id']
117+
sender_id = event['sender_id']['chat_id']
118+
participants = conversation['conversation']['conversation']['current_participant']
119+
120+
# no support for group chat
67121
if len(participants) == 2:
68122
for participant in participants:
69-
if id_to_name(participant["chat_id"]) != own_name:
70-
conversation_with_id = participant["chat_id"]
123+
if id_to_name(participant['chat_id']) != own_name:
124+
conversation_with_id = participant['chat_id']
125+
71126
sender_name = id_to_name(sender_id)
72127
conversation_with_name = id_to_name(conversation_with_id)
128+
73129
if sender_name is not None or conversation_with_name is not None:
130+
131+
# checks that the sender is either own_name or the interlocutor
74132
if sender_name != own_name and sender_id != conversation_with_id:
75133
log.error(f'Parsing error. Is your own_name {own_name} correct?')
134+
log.error(f'Problem: this message was sent by {sender_name}, who is not you ({own_name})')
135+
log.error(f'It was sent by id {sender_id}, who is not your current interlocutor either ({conversation_with_id})')
136+
log.error(f'Participants are: {participants}')
76137
exit(0)
138+
77139
# saves the message
78140
timestamp = timestamp / 1000000
79141
outgoing = sender_name == own_name
80142
conversation_with_name = conversation_with_name if conversation_with_name is not None else ''
81143
sender_name = sender_name if sender_name is not None else ''
82-
data += [[timestamp, conversationId, conversation_with_name, sender_name, outgoing, text, '', '']]
144+
data += [[timestamp, conversation_id, conversation_with_name, sender_name, outgoing, text, '', '']]
83145
else:
84146
# unknown sender
85-
log.error(f"No senderName could be found for either senderId ({sender_id}) or ConversationWithId ({conversation_with_id})")
147+
log.error(f'No sender_name could be found for either sender_id ({sender_id}) or conversation_with_id ({conversation_with_id})')
148+
86149
if len(data) >= MAX_EXPORTED_MESSAGES:
87150
log.warning(f'Reached max exported messages limit of {MAX_EXPORTED_MESSAGES}. Increase limit in order to parse all messages.')
88151
return data
@@ -97,15 +160,15 @@ def read_archive(file_path):
97160

98161

99162
def infer_own_name(archive, min_conversations=2):
100-
"""Infers own name from multiple conversations by finding the person who participated most in the conversations"""
163+
'''Infers own name from multiple conversations by finding the person who participated most in the conversations'''
101164
participants_conversation_count = defaultdict(int)
102165
num_conversations = 0
103166
log.info('Trying to infer own_name from data...')
104-
for conversation in archive["conversations"]:
167+
for conversation in archive['conversations']:
105168
conversation_with_id = ''
106169
conversationWithName = ''
107-
if "conversation" in conversation["conversation"]:
108-
participants = conversation["conversation"]["conversation"]["participant_data"]
170+
if 'conversation' in conversation['conversation']:
171+
participants = conversation['conversation']['conversation']['participant_data']
109172
participants = [p['fallback_name'] for p in participants if 'fallback_name' in p]
110173
if len(participants) >= 2:
111174
num_conversations += 1

0 commit comments

Comments
 (0)