Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

close #2; fix: regular expression for 即係 with test cases #3

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
2 changes: 1 addition & 1 deletion regular.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ o 丫,吖
[無冇] [咁甘噤],冇咁
[無冇] [左咗唨],冇咗
[撲扑] 街,仆街
[姐遮] 係,即係
(^|[^小家表姑契阿堂姐雨把擔收]) [姐遮] 係,\1即係
[痴癡黐] [綫線],黐線
[痴癡黐] [撚𠹌能] [綫線],黐撚線
([出過落入用再]) 黎,\1嚟
Expand Down
49 changes: 49 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from rules import apply_contextual_rules
import re

# Compiled (pattern, replacement) pairs, kept in the order they appear in
# regular.txt.
regular_typos = []

# Read regular typos. Each rule line has the form "<regex>,<replacement>";
# a space inside the regex is expanded to "optional whitespace".
with open("./regular.txt", "r", encoding="utf-8") as rule_file:
    for line in rule_file:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on the two-name unpack below.
            continue
        typo, replace = line.split(",")
        # Raw string: "\s" in a normal string literal is an invalid escape.
        regular_typos.append(
            (re.compile(typo.replace(" ", r"\s*"), re.I), replace))

def fix_regular_typo(line: str) -> str:
    """
    Apply every loaded regular typo rule to ``line`` and return the result.

    A regular typo is one that a plain regular-expression substitution can
    repair by itself, without any information about the surrounding context.
    Rules are applied one after another, each on the output of the previous.
    """
    corrected = line
    for pattern, replacement in regular_typos:
        corrected = pattern.sub(replacement, corrected)
    return corrected

def correct(line):
    """Strip ``line``, fix its regular typos, then apply the contextual rules."""
    return apply_contextual_rules(fix_regular_typo(line.strip()))

def test_即係():
    """
    Check the 即係 correction.

    Each pair is (input, expected output of ``correct``). Inputs where
    遮係 / 姐係 is a typo must be rewritten to 即係; inputs where 遮 or 姐
    belongs to the preceding word must come back unchanged.
    """
    cases = [
        # 遮
        ("遮係噉", "即係噉"),
        ("哪!遮係噉", "哪!即係噉"),
        ("落雨要擔遮係嗎?", "落雨要擔遮係嗎?"),
        ("落雨要把遮係嗎?", "落雨要把遮係嗎?"),
        ("落雨要收遮係嗎?", "落雨要收遮係嗎?"),
        ("落雨要雨遮係嗎?", "落雨要雨遮係嗎?"),
        # 姐
        ("姐係噉", "即係噉"),
        ("哪!姐係噉", "哪!即係噉"),
        ("我家姐係我親人", "我家姐係我親人"),
        ("我表姐係我親人", "我表姐係我親人"),
        ("我堂姐係我親人", "我堂姐係我親人"),
        ("我契姐係我親人", "我契姐係我親人"),
        ("我姐姐係我親人", "我姐姐係我親人"),
        ("我姑姐係我親人", "我姑姐係我親人"),
        ("我小姐係我親人", "我小姐係我親人"),
    ]
    for text, expected in cases:
        assert correct(text) == expected

# Allow running this file directly as a script, without a test runner.
if __name__ == "__main__":
    test_即係()