Skip to content

Commit 6068c27

Browse files
authored
Merge pull request #54 from MasterScrat/fix-whatsapp-datetime-am-pm
Add AM/PM support to the datetime format parsing in the WhatsApp parser
2 parents 9debb3d + 4a0164b commit 6068c27

File tree

3 files changed

+36
-2
lines changed

3 files changed

+36
-2
lines changed

parsers/whatsapp.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
log = logging.getLogger(__name__)
1515
regex_left = r'[\u0000-\u001F\u0100-\uFFFF]?'
16-
regex_datetime = r'[^\w]?([0-9./\-]{6,10},?[\sT][0-9:]{5,8})[^\w]?\s[\-]?\s?'
16+
regex_datetime = r'[^\w]?([0-9./\-]{6,10},?[\sT][0-9:]{5,8}\s?[AP]?M?)[^\w]?\s?[\-\–]?\s'
1717
regex_right = r'(([^:]+):\s)?(.*)'
1818
regex_message = re.compile(f'^{regex_left}{regex_datetime}{regex_right}$')
1919
MAX_EXPORTED_MESSAGES = 1000000
@@ -26,7 +26,7 @@ def infer_datetime_regex(f_path, max_messages=100):
2626
for c, line in enumerate(f):
2727
if c == max_messages:
2828
break;
29-
matches = regex_message.search(line)
29+
matches = regex_message.search(line.upper())
3030
if matches:
3131
pattern = ""
3232
first = True
@@ -50,6 +50,9 @@ def infer_datetime_regex(f_path, max_messages=100):
5050
if l in '.*+[]{}()\\|':
5151
pattern += '\\'
5252
pattern += l
53+
if i > 0 and pattern[-2:] in ['AM','PM']:
54+
pattern = pattern[:-2] + '[APap][Mm]'
55+
last = len(pattern)
5356
pattern = pattern[0:last] + ')' + pattern[last:]
5457
patterns[pattern] += 1
5558
if len(patterns) > 0:
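_chat 4.txt (new test data file read by tests/test_whatsapp.py; its directory is not shown on this page)

Lines changed: 3 additions & 0 deletions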
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
09/17/19, 8:30:52 AM – John Doe: US datetime format
2+
09/17/19, 4:30:10 PM – John Doe: US datetime format
3+
09/18/19, 4:50:32 pm – John Doe: US datetime format

tests/test_whatsapp.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,24 @@
5858
]
5959
}
6060

61+
ground_truth_chat4 = {
62+
'datetime': [
63+
datetime(2019, 9, 17, 8, 30, 52),
64+
datetime(2019, 9, 17, 16, 30, 10),
65+
datetime(2019, 9, 18, 16, 50, 32),
66+
],
67+
'text': [
68+
'US datetime format',
69+
'US datetime format',
70+
'US datetime format',
71+
],
72+
'senderName': [
73+
'John Doe',
74+
'John Doe',
75+
'John Doe',
76+
]
77+
}
78+
6179

6280
def test_parse_chat_info_chat1():
6381
data = parse_messages([os.path.join(TEST_DATA_LOCATION, '_chat.txt')], 'John Doe', True)
@@ -83,3 +101,13 @@ def test_parse_us_datetime_chat3():
83101
assert len(df_truth) == len(df)
84102
for i, row in df.iloc[:len(df_truth)].iterrows():
85103
assert row.timestamp == df_truth.iloc[i].datetime.timestamp()
104+
105+
def test_parse_us_datetime_chat4():
106+
data = parse_messages([os.path.join(TEST_DATA_LOCATION, '_chat 4.txt')], 'John Doe', True)
107+
df = pd.DataFrame(data, columns=config['ALL_COLUMNS'])
108+
df_truth = pd.DataFrame(ground_truth_chat4)
109+
assert len(df_truth) == len(df)
110+
for i, row in df.iloc[:len(df_truth)].iterrows():
111+
assert row.timestamp == df_truth.iloc[i].datetime.timestamp()
112+
assert row.text == df_truth.iloc[i].text
113+
assert row.senderName == df_truth.iloc[i].senderName

0 commit comments

Comments
 (0)