Added entry_points configuration to setup.py to enable running the package as a command-line tool after installation.

Add README. Simplified the CLI by removing the --no-split argument and keeping only the --split argument. Added a new mode option 'judgement' to the command-line interface. Implemented optional --no-split argument to disable document segmentation. Changed the default mode from 'judgement' to 'label-only' in the command-line interface. Fix division by zero problem. Fix types. Replaced string-based judgement types with a StrEnum type for better type safety and self-documentation. Add enum type. Added JudgementType import to test_detector and test_judge files. Replaced types in test files to use new JudgementType enum.
CanCLID · Jul 3, 2024 · 664b61d · 664b61d
1 parent b67cf25
commit 664b61d
Show file tree

Hide file tree

Showing 10 changed files with 92 additions and 60 deletions.
diff --git a/.gitignore b/.gitignore
@@ -4,3 +4,5 @@ __pycache__/
 # Ignore Python bytecode files
 *.pyc
 __pycache__/
+build*
+*.egg-info/
diff --git a/README.md b/README.md
@@ -43,20 +43,20 @@ The classifiers output four (coarse) or six (fine-grained) categories. The label
 1. `MixedQuotesInSWC` : 書面中文，引文入面係 `Mixed` | `Mixed` contents quoted within SWC text
 1. `CantoneseQuotesInSWC` : 書面中文，引文入面係純粵文 `cantonese` | `Cantonese` contents quoted within SWC text
 
-## 用法 Usage
-
 ### 系統要求 Requirement
 
 Python >= 3.11
 
 ### 安裝 Installation
 
-首先用 pip 安裝
-
 ```bash
 pip install cantonesedetect
 ```
 
+## 用法 Usage
+
+可以通過 Python 函數嚟引用，亦可以直接 CLI 調用。
+
 ### Python
 
 Use `judge()`
@@ -72,4 +72,8 @@ print(judge('去學校讀書')[0])  # Neutral
 
 ### CLI
 
-待補充 to be added.
+最簡單用法：
+
+```bash
+cantonesedetect --input input.txt
+```
diff --git a/cantonesedetect/Detector.py b/cantonesedetect/Detector.py
@@ -10,6 +10,7 @@
 from typing import Tuple
 
 from cantonesedetect.SegmentFeatures import SegmentFeatures
+from cantonesedetect.JudgementTypes import JudgementType
 
 # Cantonese characters not found in SWC
 CANTO_FEATURE_RE = re.compile(
@@ -142,7 +143,7 @@ def _get_segment_features(self, segment: str) -> SegmentFeatures:
 
         return segment_features
 
-    def _judge_single_segment(self, segment: str) -> str:
+    def _judge_single_segment(self, segment: str) -> JudgementType:
         """
         Determine the language of a segment based on the presence of Cantonese and SWC features.
 
@@ -160,6 +161,8 @@ def _judge_single_segment(self, segment: str) -> str:
                 and the length of the segment in Han characters.
         """
         features: SegmentFeatures = self._get_segment_features(segment)
+        if features.length == 0:
+            return JudgementType.NEUTRAL
 
         num_all_features: int = features.canto_feature_count + features.swc_feature_count
 
@@ -169,7 +172,7 @@ def _judge_single_segment(self, segment: str) -> str:
             self.canto_tolerance * features.length)
 
         if num_all_features == 0 or (lack_canto and lack_swc):
-            return "Neutral"
+            return JudgementType.NEUTRAL
         else:
             has_canto: bool = features.canto_feature_count >= math.ceil(
                 self.canto_presence * features.length)
@@ -182,13 +185,13 @@ def _judge_single_segment(self, segment: str) -> str:
                 features.canto_feature_count / num_all_features > 0.9
 
             if canto_pref and not has_swc:
-                return "Cantonese"
+                return JudgementType.CANTONESE
             elif swc_pref and not has_canto:
-                return "SWC"
+                return JudgementType.SWC
             else:
-                return "Mixed"
+                return JudgementType.MIXED
 
-    def _judge_segments(self, document: str) -> str:
+    def _judge_segments(self, document: str) -> JudgementType:
         """
         Given a list of segments:
         1. If >95% of the segments are Neutral, the overall judgement is Neutral
@@ -210,9 +213,9 @@ def _judge_segments(self, document: str) -> str:
 
         judgements_counter: Counter = Counter(judgements)
 
-        canto_seg_count: int = judgements_counter["Cantonese"]
-        swc_seg_count: int = judgements_counter["SWC"]
-        neutral_seg_count: int = judgements_counter["Neutral"]
+        canto_seg_count: int = judgements_counter[JudgementType.CANTONESE]
+        swc_seg_count: int = judgements_counter[JudgementType.SWC]
+        neutral_seg_count: int = judgements_counter[JudgementType.NEUTRAL]
 
         # 95% threshold
         threshold = math.ceil(sum(judgements_counter.values()) * 0.95)
@@ -222,21 +225,21 @@ def _judge_segments(self, document: str) -> str:
         neutral_only: bool = neutral_seg_count >= threshold
 
         if neutral_only:
-            return "Neutral"
+            return JudgementType.NEUTRAL
         elif canto_only:
-            return "Cantonese"
+            return JudgementType.CANTONESE
         elif swc_only:
-            return "SWC"
+            return JudgementType.SWC
         else:
-            return "Mixed"
+            return JudgementType.MIXED
 
-    def _judge_document(self, document: str) -> str:
+    def _judge_document(self, document: str) -> JudgementType:
         if self.split_seg:
             return self._judge_segments(document)
         else:
             return self._judge_single_segment(document)
 
-    def _judge_matrix_quotes(self, document: str) -> str:
+    def _judge_matrix_quotes(self, document: str) -> JudgementType:
         """
         Judge the language of a document with quotes.
 
@@ -259,22 +262,22 @@ def _judge_matrix_quotes(self, document: str) -> str:
 
             if matrix_judgement == quotes_judgement:
                 return matrix_judgement
-            elif matrix_judgement == 'Neutral':
+            elif matrix_judgement == JudgementType.NEUTRAL:
                 return quotes_judgement
-            elif quotes_judgement == 'Neutral':
+            elif quotes_judgement == JudgementType.NEUTRAL:
                 return matrix_judgement
-            elif matrix_judgement == 'SWC' and quotes_judgement == 'Cantonese':
-                judgement = "CantoneseQuotesInSWC"
-            elif matrix_judgement == 'SWC' and quotes_judgement == 'Mixed':
-                judgement = "MixedQuotesInSWC"
+            elif matrix_judgement == JudgementType.SWC and quotes_judgement == JudgementType.CANTONESE:
+                judgement = JudgementType.CANTONESE_QUOTES_IN_SWC
+            elif matrix_judgement == JudgementType.SWC and quotes_judgement == JudgementType.MIXED:
+                judgement = JudgementType.MIXED_QUOTES_IN_SWC
             else:
-                judgement = "Mixed"
+                judgement = JudgementType.MIXED
 
             # canto_ratio = f'[M]{_c1}:[Q]{_c2}'
             # swc_ratio = f'[M]{_s1}:[Q]{_s2}'
             return judgement
 
-    def judge(self, document: str) -> str:
+    def judge(self, document: str) -> JudgementType:
         """
         The only exposed api. Judge the language of a document.
 

diff --git a/cantonesedetect/JudgementTypes.py b/cantonesedetect/JudgementTypes.py
@@ -0,0 +1,10 @@
+from enum import StrEnum, auto
+
+
+class JudgementType(StrEnum):
+    CANTONESE = auto()
+    SWC = auto()
+    NEUTRAL = auto()
+    MIXED = auto()
+    CANTONESE_QUOTES_IN_SWC = auto()
+    MIXED_QUOTES_IN_SWC = auto()
diff --git a/cantonesedetect/SegmentFeatures.py b/cantonesedetect/SegmentFeatures.py
@@ -13,10 +13,11 @@ def __init__(self, segment, canto_feature, canto_exclude, swc_feature,
 
         self.canto_feature_count: int = canto_feature_count
         self.swc_feature_count: int = swc_feature_count
+        # Input with no Han characters will have a length of 0.
         self.length: int = length
 
-        self.canto_ratio: float = canto_feature_count / length
-        self.swc_ratio: float = swc_feature_count / length
+        self.canto_ratio: float = canto_feature_count / length if length > 0 else 0
+        self.swc_ratio: float = swc_feature_count / length if length > 0 else 0
 
     def print_analysis(self, print_features=False) -> None:
         """

diff --git a/cantonesedetect/cli.py b/cantonesedetect/cli.py
@@ -19,7 +19,7 @@ def main():
     argparser.add_argument(
         '--quotes', help='Separate quotes from matrix and judge them separately.', action='store_true')
     argparser.add_argument(
-        '--split', help='Split the document into segments if True', action='store_true')
+        '--split', help='Split the document into segments', action='store_true', default=False)
     args = argparser.parse_args()
 
     detector = CantoneseDetector(split_seg=args.split, get_quote=args.quotes)
@@ -37,3 +37,6 @@ def main():
 
 if __name__ == '__main__':
     main()
+else:
+    # This allows the script to be run as a module
+    __all__ = ['main']
diff --git a/setup.py b/setup.py
@@ -14,4 +14,9 @@
     long_description=long_description,
     long_description_content_type='text/markdown',
     test_suite='tests',
+    entry_points={
+        'console_scripts': [
+            'cantonesedetect=cantonesedetect.cli:main',
+        ],
+    },
 )
diff --git a/tests/test_detector.py b/tests/test_detector.py
@@ -1,6 +1,7 @@
 import unittest
 import pytest
 from cantonesedetect.Detector import CantoneseDetector
+from cantonesedetect.JudgementTypes import JudgementType
 
 
 class TestCantoneseDetector(unittest.TestCase):
@@ -30,33 +31,35 @@ def test_get_segment_features(self):
         features = self.detector._get_segment_features(segment)
         self.assertEqual(features.canto_feature_count, 2)  # 哋、邊度
         self.assertEqual(features.swc_feature_count, 2)  # 們、哪裏
-        self.assertEqual(features.length, 18)
+        self.assertEqual(features.length, 16)
 
     @pytest.mark.private
     def test_judge_single_segment(self):
         self.assertEqual(self.detector._judge_single_segment(
-            "我哋去邊度食飯？"), "Cantonese")
+            "我哋去邊度食飯？"), "cantonese")
         self.assertEqual(
-            self.detector._judge_single_segment("我們去哪裏吃飯？"), "SWC")
-        self.assertEqual(self.detector._judge_single_segment("你好"), "Neutral")
-        self.assertEqual(self.detector._judge_single_segment("是咁的"), "Mixed")
+            self.detector._judge_single_segment("我們去哪裏吃飯？"), "swc")
+        self.assertEqual(self.detector._judge_single_segment("你好"), "neutral")
+        self.assertEqual(self.detector._judge_single_segment("是咁的"), "mixed")
 
     @pytest.mark.private
     def test_judge_segments(self):
         self.assertEqual(self.detector._judge_segments(
-            "我哋去邊度？我们去哪里？Hello!"), "Mixed")
+            "我哋去邊度？我们去哪里？Hello!"), "mixed")
 
     @pytest.mark.private
     def test_judge_matrix_quotes(self):
         self.assertEqual(self.detector._judge_matrix_quotes(
-            "他說「係噉嘅」"), "CantoneseQuotesInSWC")
+            "他說「係噉嘅」"), JudgementType.CANTONESE_QUOTES_IN_SWC)
         self.assertEqual(self.detector._judge_matrix_quotes(
-            "他說「是咁的」"), "MixedQuotesInSWC")
+            "他說「是咁的」"), JudgementType.MIXED_QUOTES_IN_SWC)
 
     def test_judge(self):
-        self.assertEqual(self.detector.judge("我哋去邊度？"), "Cantonese")
-        self.assertEqual(self.detector.judge("我们去哪里？"), "SWC")
-        self.assertEqual(self.detector.judge("Hello World!"), "Neutral")
+        self.assertEqual(self.detector.judge(
+            "我哋去邊度？"), JudgementType.CANTONESE)
+        self.assertEqual(self.detector.judge("我们去哪里？"), JudgementType.SWC)
+        self.assertEqual(self.detector.judge(
+            "Hello World!"), JudgementType.NEUTRAL)
 
 
 if __name__ == '__main__':

diff --git a/tests/test_judge.py b/tests/test_judge.py
@@ -1,5 +1,6 @@
 import unittest
 from cantonesedetect.Detector import CantoneseDetector
+from cantonesedetect.JudgementTypes import JudgementType
 
 
 def load_test_sentences(file_path):
@@ -25,7 +26,7 @@ def test_judge(self):
         for sentence, quotemode, expected in test_cases:
             result = self.detector.judge(sentence)
             self.assertEqual(
-                result, expected, f"Failed for input: {sentence}. Expected: {expected}, Quote Mode: {quotemode} but got: {result}")
+                result, JudgementType(expected), f"Failed for input: {sentence}. Expected: {expected}, Quote Mode: {quotemode} but got: {result}")
 
 
 if __name__ == "__main__":

diff --git a/tests/test_judge_sentences.txt b/tests/test_judge_sentences.txt
@@ -1,17 +1,17 @@
-你喺邊度|NoQuote|Cantonese
-乜你今日唔使返學咩|NoQuote|Cantonese
-今日好可能會嚟唔到|NoQuote|Cantonese
-我哋影張相留念|NoQuote|Cantonese
-你在哪裏|NoQuote|SWC
-家長也應做好家居防蚊措施|NoQuote|SWC
-教育不只是為了傳授知識|NoQuote|SWC
-是咁的|NoQuote|Mixed
-佢在屋企吃飯|NoQuote|Mixed
-去學校讀書|NoQuote|Neutral
-做人最重要開心|NoQuote|Neutral
-外交部駐香港特別行政區特派員公署副特派員|NoQuote|Neutral
-全日制或大學生於晚市星期一至星期四一天前訂座|NoQuote|Neutral
-這就是「你哋都戇鳩嘅」的意思 |Quote|CantoneseQuotesInSWC
-今天我是一個「冇嘢好做」的狀態 |Quote|CantoneseQuotesInSWC
-他們跟我說：「是咁的，即係噉講」 |Quote|MixedQuotesInSWC
-他說：「佢在屋企吃飯」 |Quote|MixedQuotesInSWC
+你喺邊度|NoQuote|cantonese
+乜你今日唔使返學咩|NoQuote|cantonese
+今日好可能會嚟唔到|NoQuote|cantonese
+我哋影張相留念|NoQuote|cantonese
+你在哪裏|NoQuote|swc
+家長也應做好家居防蚊措施|NoQuote|swc
+教育不只是為了傳授知識|NoQuote|swc
+是咁的|NoQuote|mixed
+佢在屋企吃飯|NoQuote|mixed
+去學校讀書|NoQuote|neutral
+做人最重要開心|NoQuote|neutral
+外交部駐香港特別行政區特派員公署副特派員|NoQuote|neutral
+全日制或大學生於晚市星期一至星期四一天前訂座|NoQuote|neutral
+這就是「你哋都戇鳩嘅」的意思 |Quote|cantonese_quotes_in_swc
+今天我是一個「冇嘢好做」的狀態 |Quote|cantonese_quotes_in_swc
+他們跟我說：「是咁的，即係噉講」 |Quote|mixed_quotes_in_swc
+他說：「佢在屋企吃飯」 |Quote|mixed_quotes_in_swc