Skip to content

Commit 85344f3

Browse files
authored
Merge pull request #153 from lucasrodes/release/0.7.1
v0.7.1
2 parents bbf7fe3 + c279111 commit 85344f3

File tree

7 files changed

+42
-9
lines changed

7 files changed

+42
-9
lines changed

.bumpversion.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 0.7.0
2+
current_version = 0.7.1
33
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(.(?P<pre>[a-z]+)(?P<prenum>\d+))?
44
serialize =
55
{major}.{minor}.{patch}.{pre}{prenum}

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
</h1>
66
<p align="left">
77
<a href="#">
8-
<img alt="Package version" src="https://img.shields.io/badge/pypi-0.7.0-blue.svg?&color=25D366&logo=whatsapp&">
8+
<img alt="Package version" src="https://img.shields.io/badge/pypi-0.7.1-blue.svg?&color=25D366&logo=whatsapp&">
99
</a>
1010
</p>
1111
<!-- style=for-the-badge -->

docs/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
author = 'lucasrodes'
3030

3131
# The full version, including alpha/beta/rc tags
32-
version = 'v0.7.0'
32+
version = 'v0.7.1'
3333

3434

3535
# -- General configuration ---------------------------------------------------

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545

4646
setup(
4747
name='whatstk',
48-
version="0.7.0",
48+
version="0.7.1",
4949
description="Parser and analytics tools for WhatsApp group chats",
5050
long_description=long_description,
5151
long_description_content_type='text/markdown',

whatstk/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
name = "whatstk"
1414

15-
__version__ = "0.7.0"
15+
__version__ = "0.7.1"
1616

1717
__all__ = [
1818
"WhatsAppChat",

whatstk/whatsapp/auto_header.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import logging
55
import re
66
from typing import List, Tuple, Optional
7-
87
import pandas as pd
98

109
from whatstk.utils.exceptions import RegexError
@@ -109,7 +108,10 @@ def _extract_possible_header_from_line(line: str) -> str:
109108
# possible header
110109
header = line_split[0]
111110
if not header.isprintable():
112-
header = header.replace("\u200e", "").replace("\u202e", "")
111+
print("""
112+
There is some unprintable character in the header.
113+
Please report this in https://github.com/lucasrodes/whatstk.
114+
""")
113115
if header[-1] != ":":
114116
header += ":"
115117
return header

whatstk/whatsapp/parser.py

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from pathlib import Path
88
import tempfile
99
from typing import Any, Optional, Tuple, List, Dict
10+
import unicodedata
1011
from urllib.request import urlopen
1112
import warnings
1213
import zipfile
@@ -27,8 +28,8 @@
2728
"%I": r"(?P<hour>\d{1,2})",
2829
"%M": r"(?P<minutes>\d{2})",
2930
"%S": r"(?P<seconds>\d{2})",
30-
"%P": r"(?P<ampm>[AaPp].? ?[Mm].?)",
31-
"%p": r"(?P<ampm>[AaPp].? ?[Mm].?)",
31+
"%P": r"(?P<ampm>[AaPp]\.?\s?[Mm].?)",
32+
"%p": r"(?P<ampm>[AaPp]\.?\s?[Mm]\.?)",
3233
"%name": rf"(?P<{COLNAMES_DF.USERNAME}>[^:]*)",
3334
}
3435

@@ -122,6 +123,9 @@ def df_from_whatsapp(
122123
# Read local file
123124
text = _str_from_file(filepath, encoding)
124125

126+
# Clean text from unwanted unicode characters
127+
text = _clean_text(text)
128+
125129
# Build dataframe
126130
df = _df_from_str(text, auto_header, hformat)
127131

@@ -298,6 +302,33 @@ def _parse_chat(text: str, regex: str) -> pd.DataFrame:
298302
return df_chat
299303

300304

305+
def _clean_text(text: str) -> str:
306+
# List of additional unwanted Unicode characters
307+
unwanted_chars = [
308+
'\u200B', # Zero Width Space
309+
'\u200C', # Zero Width Non-Joiner
310+
'\u200D', # Zero Width Joiner
311+
'\u202A', # Left-to-Right Embedding
312+
'\u202B', # Right-to-Left Embedding
313+
'\u202C', # Pop Directional Formatting
314+
'\u202D', # Left-to-Right Override
315+
'\u202E', # Right-To-Left Override
316+
'\u200E', # Left-To-Right Mark
317+
'\u200F', # Right-to-Left Mark
318+
'\u00AD', # Soft Hyphen
319+
]
320+
321+
# Create a regex pattern from the list
322+
pattern = '[' + ''.join(unwanted_chars) + ']'
323+
324+
# Remove unwanted characters
325+
text = re.sub(pattern, '', text)
326+
327+
text = unicodedata.normalize('NFKD', text)
328+
329+
return text
330+
331+
301332
def _add_schema(df: pd.DataFrame) -> pd.DataFrame:
302333
"""Add default chat schema to df.
303334

0 commit comments

Comments
 (0)