Skip to content

Commit

Permalink
Merge pull request #153 from lucasrodes/release/0.7.1
Browse files Browse the repository at this point in the history
v0.7.1
  • Loading branch information
lucasrodes authored Jun 29, 2024
2 parents bbf7fe3 + c279111 commit 85344f3
Show file tree
Hide file tree
Showing 7 changed files with 42 additions and 9 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.7.0
current_version = 0.7.1
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(.(?P<pre>[a-z]+)(?P<prenum>\d+))?
serialize =
{major}.{minor}.{patch}.{pre}{prenum}
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
</h1>
<p align="left">
<a href="#">
<img alt="Package version" src="https://img.shields.io/badge/pypi-0.7.0-blue.svg?&color=25D366&logo=whatsapp&">
<img alt="Package version" src="https://img.shields.io/badge/pypi-0.7.1-blue.svg?&color=25D366&logo=whatsapp&">
</a>
</p>
<!-- style=for-the-badge -->
Expand Down
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
author = 'lucasrodes'

# The full version, including alpha/beta/rc tags
version = 'v0.7.0'
version = 'v0.7.1'


# -- General configuration ---------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@

setup(
name='whatstk',
version="0.7.0",
version="0.7.1",
description="Parser and analytics tools for WhatsApp group chats",
long_description=long_description,
long_description_content_type='text/markdown',
Expand Down
2 changes: 1 addition & 1 deletion whatstk/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

name = "whatstk"

__version__ = "0.7.0"
__version__ = "0.7.1"

__all__ = [
"WhatsAppChat",
Expand Down
6 changes: 4 additions & 2 deletions whatstk/whatsapp/auto_header.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import logging
import re
from typing import List, Tuple, Optional

import pandas as pd

from whatstk.utils.exceptions import RegexError
Expand Down Expand Up @@ -109,7 +108,10 @@ def _extract_possible_header_from_line(line: str) -> str:
# possible header
header = line_split[0]
if not header.isprintable():
header = header.replace("\u200e", "").replace("\u202e", "")
print("""
There is some unprintable character in the header.
Please report this in https://github.com/lucasrodes/whatstk.
""")
if header[-1] != ":":
header += ":"
return header
Expand Down
35 changes: 33 additions & 2 deletions whatstk/whatsapp/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from pathlib import Path
import tempfile
from typing import Any, Optional, Tuple, List, Dict
import unicodedata
from urllib.request import urlopen
import warnings
import zipfile
Expand All @@ -27,8 +28,8 @@
"%I": r"(?P<hour>\d{1,2})",
"%M": r"(?P<minutes>\d{2})",
"%S": r"(?P<seconds>\d{2})",
"%P": r"(?P<ampm>[AaPp].? ?[Mm].?)",
"%p": r"(?P<ampm>[AaPp].? ?[Mm].?)",
"%P": r"(?P<ampm>[AaPp]\.?\s?[Mm].?)",
"%p": r"(?P<ampm>[AaPp]\.?\s?[Mm]\.?)",
"%name": rf"(?P<{COLNAMES_DF.USERNAME}>[^:]*)",
}

Expand Down Expand Up @@ -122,6 +123,9 @@ def df_from_whatsapp(
# Read local file
text = _str_from_file(filepath, encoding)

# Clean text from unwanted unicode characters
text = _clean_text(text)

# Build dataframe
df = _df_from_str(text, auto_header, hformat)

Expand Down Expand Up @@ -298,6 +302,33 @@ def _parse_chat(text: str, regex: str) -> pd.DataFrame:
return df_chat


def _clean_text(text: str) -> str:
# List of additional unwanted Unicode characters
unwanted_chars = [
'\u200B', # Zero Width Space
'\u200C', # Zero Width Non-Joiner
'\u200D', # Zero Width Joiner
'\u202A', # Left-to-Right Embedding
'\u202B', # Right-to-Left Embedding
'\u202C', # Pop Directional Formatting
'\u202D', # Left-to-Right Override
'\u202E', # Right-To-Left Override
'\u200E', # Left-To-Right Mark
'\u200F', # Right-to-Left Mark
'\u00AD', # Soft Hyphen
]

# Create a regex pattern from the list
pattern = '[' + ''.join(unwanted_chars) + ']'

# Remove unwanted characters
text = re.sub(pattern, '', text)

text = unicodedata.normalize('NFKD', text)

return text


def _add_schema(df: pd.DataFrame) -> pd.DataFrame:
"""Add default chat schema to df.
Expand Down

0 comments on commit 85344f3

Please sign in to comment.