Skip to content

Commit

Permalink
Added entry_points configuration to setup.py to enable running the package as a command-line tool after installation.
Browse files Browse the repository at this point in the history

Add README

Simplified the CLI by removing the --no-split argument and keeping only the --split argument.

Added a new mode option 'judgement' to the command-line interface.

Implemented optional --no-split argument to disable document segmentation.

Changed the default mode from 'judgement' to 'label-only' in the command-line interface.

Fix division by zero problem

Fix types

Replaced string-based judgement types with a StrEnum type for better type safety and self-documentation.

Add enum type

Added JudgementType import to test_detector and test_judge files.

Replaced types in test files to use new JudgementType enum.
  • Loading branch information
laubonghaudoi committed Jul 3, 2024
1 parent b67cf25 commit 664b61d
Show file tree
Hide file tree
Showing 10 changed files with 92 additions and 60 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@ __pycache__/
# Ignore Python bytecode files
*.pyc
__pycache__/
build*
*.egg-info/
14 changes: 9 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,20 +43,20 @@ The classifiers output four (coarse) or six (fine-grained) categories. The label
1. `MixedQuotesInSWC` : 書面中文,引文入面係 `Mixed` | `Mixed` contents quoted within SWC text
1. `CantoneseQuotesInSWC` : 書面中文,引文入面係純粵文 `cantonese` | `Cantonese` contents quoted within SWC text

## 用法 Usage

### 系統要求 Requirements

Python >= 3.11

### 安裝 Installation

首先用 pip 安裝

```bash
pip install cantonesedetect
```

## 用法 Usage

可以通過 Python 函數嚟引用,亦可以直接 CLI 調用。

### Python

Use `judge()`
Expand All @@ -72,4 +72,8 @@ print(judge('去學校讀書')[0]) # Neutral

### CLI

待補充 to be added.
最簡單用法:

```bash
cantonesedetect --input input.txt
```
49 changes: 26 additions & 23 deletions cantonesedetect/Detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from typing import Tuple

from cantonesedetect.SegmentFeatures import SegmentFeatures
from cantonesedetect.JudgementTypes import JudgementType

# Cantonese characters not found in SWC
CANTO_FEATURE_RE = re.compile(
Expand Down Expand Up @@ -142,7 +143,7 @@ def _get_segment_features(self, segment: str) -> SegmentFeatures:

return segment_features

def _judge_single_segment(self, segment: str) -> str:
def _judge_single_segment(self, segment: str) -> JudgementType:
"""
Determine the language of a segment based on the presence of Cantonese and SWC features.
Expand All @@ -160,6 +161,8 @@ def _judge_single_segment(self, segment: str) -> str:
and the length of the segment in Han characters.
"""
features: SegmentFeatures = self._get_segment_features(segment)
if features.length == 0:
return JudgementType.NEUTRAL

num_all_features: int = features.canto_feature_count + features.swc_feature_count

Expand All @@ -169,7 +172,7 @@ def _judge_single_segment(self, segment: str) -> str:
self.canto_tolerance * features.length)

if num_all_features == 0 or (lack_canto and lack_swc):
return "Neutral"
return JudgementType.NEUTRAL
else:
has_canto: bool = features.canto_feature_count >= math.ceil(
self.canto_presence * features.length)
Expand All @@ -182,13 +185,13 @@ def _judge_single_segment(self, segment: str) -> str:
features.canto_feature_count / num_all_features > 0.9

if canto_pref and not has_swc:
return "Cantonese"
return JudgementType.CANTONESE
elif swc_pref and not has_canto:
return "SWC"
return JudgementType.SWC
else:
return "Mixed"
return JudgementType.MIXED

def _judge_segments(self, document: str) -> str:
def _judge_segments(self, document: str) -> JudgementType:
"""
Given a list of segments:
1. If >95% of the segments are Neutral, the overall judgement is Neutral
Expand All @@ -210,9 +213,9 @@ def _judge_segments(self, document: str) -> str:

judgements_counter: Counter = Counter(judgements)

canto_seg_count: int = judgements_counter["Cantonese"]
swc_seg_count: int = judgements_counter["SWC"]
neutral_seg_count: int = judgements_counter["Neutral"]
canto_seg_count: int = judgements_counter[JudgementType.CANTONESE]
swc_seg_count: int = judgements_counter[JudgementType.SWC]
neutral_seg_count: int = judgements_counter[JudgementType.NEUTRAL]

# 95% threshold
threshold = math.ceil(sum(judgements_counter.values()) * 0.95)
Expand All @@ -222,21 +225,21 @@ def _judge_segments(self, document: str) -> str:
neutral_only: bool = neutral_seg_count >= threshold

if neutral_only:
return "Neutral"
return JudgementType.NEUTRAL
elif canto_only:
return "Cantonese"
return JudgementType.CANTONESE
elif swc_only:
return "SWC"
return JudgementType.SWC
else:
return "Mixed"
return JudgementType.MIXED

def _judge_document(self, document: str) -> str:
def _judge_document(self, document: str) -> JudgementType:
if self.split_seg:
return self._judge_segments(document)
else:
return self._judge_single_segment(document)

def _judge_matrix_quotes(self, document: str) -> str:
def _judge_matrix_quotes(self, document: str) -> JudgementType:
"""
Judge the language of a document with quotes.
Expand All @@ -259,22 +262,22 @@ def _judge_matrix_quotes(self, document: str) -> str:

if matrix_judgement == quotes_judgement:
return matrix_judgement
elif matrix_judgement == 'Neutral':
elif matrix_judgement == JudgementType.NEUTRAL:
return quotes_judgement
elif quotes_judgement == 'Neutral':
elif quotes_judgement == JudgementType.NEUTRAL:
return matrix_judgement
elif matrix_judgement == 'SWC' and quotes_judgement == 'Cantonese':
judgement = "CantoneseQuotesInSWC"
elif matrix_judgement == 'SWC' and quotes_judgement == 'Mixed':
judgement = "MixedQuotesInSWC"
elif matrix_judgement == JudgementType.SWC and quotes_judgement == JudgementType.CANTONESE:
judgement = JudgementType.CANTONESE_QUOTES_IN_SWC
elif matrix_judgement == JudgementType.SWC and quotes_judgement == JudgementType.MIXED:
judgement = JudgementType.MIXED_QUOTES_IN_SWC
else:
judgement = "Mixed"
judgement = JudgementType.MIXED

# canto_ratio = f'[M]{_c1}:[Q]{_c2}'
# swc_ratio = f'[M]{_s1}:[Q]{_s2}'
return judgement

def judge(self, document: str) -> str:
def judge(self, document: str) -> JudgementType:
"""
The only exposed api. Judge the language of a document.
Expand Down
10 changes: 10 additions & 0 deletions cantonesedetect/JudgementTypes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from enum import StrEnum, auto


class JudgementType(StrEnum):
CANTONESE = auto()
SWC = auto()
NEUTRAL = auto()
MIXED = auto()
CANTONESE_QUOTES_IN_SWC = auto()
MIXED_QUOTES_IN_SWC = auto()
5 changes: 3 additions & 2 deletions cantonesedetect/SegmentFeatures.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@ def __init__(self, segment, canto_feature, canto_exclude, swc_feature,

self.canto_feature_count: int = canto_feature_count
self.swc_feature_count: int = swc_feature_count
# Input with no Han characters will have a length of 0.
self.length: int = length

self.canto_ratio: float = canto_feature_count / length
self.swc_ratio: float = swc_feature_count / length
self.canto_ratio: float = canto_feature_count / length if length > 0 else 0
self.swc_ratio: float = swc_feature_count / length if length > 0 else 0

def print_analysis(self, print_features=False) -> None:
"""
Expand Down
5 changes: 4 additions & 1 deletion cantonesedetect/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def main():
argparser.add_argument(
'--quotes', help='Separate quotes from matrix and judge them separately.', action='store_true')
argparser.add_argument(
'--split', help='Split the document into segments if True', action='store_true')
'--split', help='Split the document into segments', action='store_true', default=False)
args = argparser.parse_args()

detector = CantoneseDetector(split_seg=args.split, get_quote=args.quotes)
Expand All @@ -37,3 +37,6 @@ def main():

# Names exported by ``from cantonesedetect.cli import *``. Declared
# unconditionally: ``__all__`` only affects star-imports, so it is harmless
# when this file is executed directly.
__all__ = ['main']

if __name__ == '__main__':
    # Executed directly as a script rather than imported.
    main()
5 changes: 5 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,9 @@
long_description=long_description,
long_description_content_type='text/markdown',
test_suite='tests',
entry_points={
'console_scripts': [
'cantonesedetect=cantonesedetect.cli:main',
],
},
)
25 changes: 14 additions & 11 deletions tests/test_detector.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import unittest
import pytest
from cantonesedetect.Detector import CantoneseDetector
from cantonesedetect.JudgementTypes import JudgementType


class TestCantoneseDetector(unittest.TestCase):
Expand Down Expand Up @@ -30,33 +31,35 @@ def test_get_segment_features(self):
features = self.detector._get_segment_features(segment)
self.assertEqual(features.canto_feature_count, 2) # 哋、邊度
self.assertEqual(features.swc_feature_count, 2) # 們、哪裏
self.assertEqual(features.length, 18)
self.assertEqual(features.length, 16)

@pytest.mark.private
def test_judge_single_segment(self):
self.assertEqual(self.detector._judge_single_segment(
"我哋去邊度食飯?"), "Cantonese")
"我哋去邊度食飯?"), "cantonese")
self.assertEqual(
self.detector._judge_single_segment("我們去哪裏吃飯?"), "SWC")
self.assertEqual(self.detector._judge_single_segment("你好"), "Neutral")
self.assertEqual(self.detector._judge_single_segment("是咁的"), "Mixed")
self.detector._judge_single_segment("我們去哪裏吃飯?"), "swc")
self.assertEqual(self.detector._judge_single_segment("你好"), "neutral")
self.assertEqual(self.detector._judge_single_segment("是咁的"), "mixed")

@pytest.mark.private
def test_judge_segments(self):
self.assertEqual(self.detector._judge_segments(
"我哋去邊度?我们去哪里?Hello!"), "Mixed")
"我哋去邊度?我们去哪里?Hello!"), "mixed")

@pytest.mark.private
def test_judge_matrix_quotes(self):
self.assertEqual(self.detector._judge_matrix_quotes(
"他說「係噉嘅」"), "CantoneseQuotesInSWC")
"他說「係噉嘅」"), JudgementType.CANTONESE_QUOTES_IN_SWC)
self.assertEqual(self.detector._judge_matrix_quotes(
"他說「是咁的」"), "MixedQuotesInSWC")
"他說「是咁的」"), JudgementType.MIXED_QUOTES_IN_SWC)

def test_judge(self):
self.assertEqual(self.detector.judge("我哋去邊度?"), "Cantonese")
self.assertEqual(self.detector.judge("我们去哪里?"), "SWC")
self.assertEqual(self.detector.judge("Hello World!"), "Neutral")
self.assertEqual(self.detector.judge(
"我哋去邊度?"), JudgementType.CANTONESE)
self.assertEqual(self.detector.judge("我们去哪里?"), JudgementType.SWC)
self.assertEqual(self.detector.judge(
"Hello World!"), JudgementType.NEUTRAL)


if __name__ == '__main__':
Expand Down
3 changes: 2 additions & 1 deletion tests/test_judge.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import unittest
from cantonesedetect.Detector import CantoneseDetector
from cantonesedetect.JudgementTypes import JudgementType


def load_test_sentences(file_path):
Expand All @@ -25,7 +26,7 @@ def test_judge(self):
for sentence, quotemode, expected in test_cases:
result = self.detector.judge(sentence)
self.assertEqual(
result, expected, f"Failed for input: {sentence}. Expected: {expected}, Quote Mode: {quotemode} but got: {result}")
result, JudgementType(expected), f"Failed for input: {sentence}. Expected: {expected}, Quote Mode: {quotemode} but got: {result}")


if __name__ == "__main__":
Expand Down
34 changes: 17 additions & 17 deletions tests/test_judge_sentences.txt
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
你喺邊度|NoQuote|Cantonese
乜你今日唔使返學咩|NoQuote|Cantonese
今日好可能會嚟唔到|NoQuote|Cantonese
我哋影張相留念|NoQuote|Cantonese
你在哪裏|NoQuote|SWC
家長也應做好家居防蚊措施|NoQuote|SWC
教育不只是為了傳授知識|NoQuote|SWC
是咁的|NoQuote|Mixed
佢在屋企吃飯|NoQuote|Mixed
去學校讀書|NoQuote|Neutral
做人最重要開心|NoQuote|Neutral
外交部駐香港特別行政區特派員公署副特派員|NoQuote|Neutral
全日制或大學生於晚市星期一至星期四一天前訂座|NoQuote|Neutral
這就是「你哋都戇鳩嘅」的意思 |Quote|CantoneseQuotesInSWC
今天我是一個「冇嘢好做」的狀態 |Quote|CantoneseQuotesInSWC
他們跟我說:「是咁的,即係噉講」 |Quote|MixedQuotesInSWC
他說:「佢在屋企吃飯」 |Quote|MixedQuotesInSWC
你喺邊度|NoQuote|cantonese
乜你今日唔使返學咩|NoQuote|cantonese
今日好可能會嚟唔到|NoQuote|cantonese
我哋影張相留念|NoQuote|cantonese
你在哪裏|NoQuote|swc
家長也應做好家居防蚊措施|NoQuote|swc
教育不只是為了傳授知識|NoQuote|swc
是咁的|NoQuote|mixed
佢在屋企吃飯|NoQuote|mixed
去學校讀書|NoQuote|neutral
做人最重要開心|NoQuote|neutral
外交部駐香港特別行政區特派員公署副特派員|NoQuote|neutral
全日制或大學生於晚市星期一至星期四一天前訂座|NoQuote|neutral
這就是「你哋都戇鳩嘅」的意思 |Quote|cantonese_quotes_in_swc
今天我是一個「冇嘢好做」的狀態 |Quote|cantonese_quotes_in_swc
他們跟我說:「是咁的,即係噉講」 |Quote|mixed_quotes_in_swc
他說:「佢在屋企吃飯」 |Quote|mixed_quotes_in_swc

0 comments on commit 664b61d

Please sign in to comment.