-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathrules.py
268 lines (206 loc) · 7.19 KB
/
rules.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
from __future__ import annotations
import re
from typing import Callable
import pycantonese
# Character-class fragments (meant to be placed inside a regex ``[...]``)
# covering the code-point ranges this module treats as CJK text.
han = r"\u3006\u3007\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002a6df\U0002a700-\U0002ebef\U00030000-\U000323af"
full_width_punct = r"\uff00-\uffef"
cjk_punct = r"\u3000-\u303f"
kana = r"\u3040-\u309f\u30a0-\u30ff\u31f0-\u31ff"
hangul = r"\uac00-\ud7af\u1100-\u11ff"
# Regex source matching exactly one character from any of the ranges above.
cjk_regex = f"[{han}{full_width_punct}{cjk_punct}{kana}{hangul}]"
# Whitespace that sits directly between two CJK characters.
cjk_pattern = re.compile(rf"(?<={cjk_regex})\s+(?={cjk_regex})")
# Whitespace that sits directly between two digit / "." / "," characters.
number_pattern = re.compile(r"(?<={num})\s+(?={num})".format(num=r"[\d.,]"))
# Words containing 左 that must be left alone by the 左 -> 咗 fallback rule.
# NOTE: this must be a plain list of strings. A trailing comma after the
# closing bracket would wrap the list in a 1-tuple, and every
# ``word not in zo_words`` test would then be True.
zo_words = [
    "左側",
    "左右",
    "左邊",
    "左手",
    "左腳",
    "左方",
    "左鄰",
    "左翼",
    "左膠",
    "左面",
]
class Context:
    """Mutable cursor over a POS-tagged sentence that rule handlers receive.

    The object is created empty; the caller assigns ``pos_list`` and then
    updates the positional fields as it walks the sentence.
    """

    # (word, POS tag) pairs for the whole sentence.
    pos_list: list[tuple[str, str]]
    # Index into ``pos_list`` of the pair currently being examined.
    i: int
    # Word and tag of the current pair.
    word: str
    pos: str
    # Neighbouring word/tag; they default to empty strings.
    next_word: str = ""
    next_pos: str = ""
    prev_word: str = ""
    prev_pos: str = ""

    @property
    def sentence_remain(self):
        """Text of the sentence from the current word (inclusive) to the end."""
        pieces = []
        for entry in self.pos_list[self.i:]:
            pieces.append(entry[0])
        return "".join(pieces)

    def replace_word(self, word: str):
        """Overwrite the current word in ``pos_list`` while keeping its tag."""
        tag = self.pos_list[self.i][1]
        self.pos_list[self.i] = (word, tag)
class Pos:
    """String constants for the part-of-speech tags the rules compare against.

    ``NONE`` is the empty string and stands for "no tag available".
    NOTE(review): the names look like Universal Dependencies tags as emitted
    by ``pycantonese.pos_tag`` -- confirm against the pycantonese docs.
    """

    NONE = ""
    ADJ = "ADJ"
    ADP = "ADP"
    ADV = "ADV"
    AUX = "AUX"
    NOUN = "NOUN"
    PART = "PART"
    PRON = "PRON"
    PROPN = "PROPN"
    VERB = "VERB"
    X = "X"
# Registry mapping a trigger word to the rule function that handles it.
# Entries are added by the ``contextual_rule`` decorator and looked up by
# ``apply_contextual_rules`` for every word of a line.
_handlers: dict[str, Callable[[Context], None]] = {}
def segment_line(line: str) -> list[str]:
    """Split ``line`` into words.

    The line is first split on runs of whitespace. Chunks that contain at
    least one CJK character are segmented with ``pycantonese.segment``;
    every other chunk is kept whole, because segmenting non-CJK text would
    mess up its spacing.
    """
    contains_cjk = re.compile(cjk_regex).search
    tokens = []
    for chunk in re.split(r"\s+", line):
        pieces = pycantonese.segment(chunk) if contains_cjk(chunk) else [chunk]
        tokens.extend(pieces)
    return tokens
def fix_space(line: str) -> str:
    """
    Strip the whitespace that separates two CJK characters (Han, kana,
    hangul, CJK/full-width punctuation) and the whitespace that separates
    two digit / "." / "," characters, following the Chinese copywriting
    guidelines.

    See:
    https://sparanoid.com/note/chinese-copywriting-guidelines/
    """
    without_cjk_gaps = cjk_pattern.sub("", line)
    return number_pattern.sub("", without_cjk_gaps)
# For debugging: uncomment to dump the POS-tagged words of each line to pos.txt
# pos_file = open("pos.txt", "w", encoding="utf-8")
def apply_contextual_rules(line: str):
    """Apply every registered contextual rule to one line of text.

    The line is segmented with ``segment_line`` and tagged with
    ``pycantonese.pos_tag``. Each word is then visited in order: if a
    handler is registered for the word in ``_handlers`` it is called with
    the shared ``Context``; otherwise a fallback rewrites 左 to 咗 inside
    words that are not listed in ``zo_words`` when the previous word is
    tagged as a verb.

    Returns the resulting words joined with single spaces and passed
    through ``fix_space``.
    """
    ctx = Context()
    ctx.pos_list = pycantonese.pos_tag(segment_line(line))
    # For debugging:
    # print(ctx.pos_list, file=pos_file)
    for index, (token, tag) in enumerate(ctx.pos_list):
        ctx.i, ctx.word, ctx.pos = index, token, tag

        # Look one word ahead; past the end there is no word and no tag.
        if index + 1 < len(ctx.pos_list):
            upcoming = ctx.pos_list[index + 1]
        else:
            upcoming = ("", Pos.NONE)
        ctx.next_word, ctx.next_pos = upcoming

        handler = _handlers.get(token)
        if handler:
            # The word itself triggers a registered rule.
            handler(ctx)
        elif "左" in token and token not in zo_words and ctx.prev_pos == Pos.VERB:
            # This case cannot be keyed on a single trigger word, so it is
            # handled here instead of through the decorator.
            ctx.pos_list[index] = (
                token.replace("左", "咗"), ctx.pos_list[index][1])

        # The word just visited becomes the "previous" word of the next one.
        ctx.prev_word, ctx.prev_pos = ctx.word, ctx.pos

    return fix_space(" ".join(word for word, _tag in ctx.pos_list))
def contextual_rule(word: str, pos: set[str] = set()):
    """Return a decorator that registers a function as the rule for ``word``.

    The decorated function is stored in ``_handlers`` under ``word`` and is
    returned unchanged, so several ``contextual_rule`` decorators can be
    stacked on one function. ``pos`` is accepted but not used by this
    function.

    Raises:
        ValueError: when a rule for ``word`` has already been registered
            (raised when the decorator is applied).
    """
    def register(handler):
        if word not in _handlers:
            _handlers[word] = handler
            return handler
        raise ValueError(
            f"rule for {word} already exists: {_handlers[word]}")
    return register
@contextual_rule("比")
def _(c: Context):
    """
    比 -> 畀

    比 is kept when the rest of the sentence (from this word onwards)
    contains 仲 or 更, or contains any word tagged ADJ or ADV; presumably
    those cases are comparisons. Otherwise it is replaced with 畀.
    """
    words_ahead, tags_ahead = zip(*c.pos_list[c.i:])
    has_marker = any(marker in words_ahead for marker in ("仲", "更"))
    has_modifier = any(tag in tags_ahead for tag in (Pos.ADJ, Pos.ADV))
    if not (has_marker or has_modifier):
        c.replace_word("畀")
# @contextual_rule("俾")
# def _(c: Context):
# if c.next_word not in ("使", "能", "便", "斯麥", "路支"):
# c.replace_word("畀")
@contextual_rule("d")
@contextual_rule("D")
def _(c: Context):
    """d / D -> 啲

    Applied when the next word contains a CJK character, or when the
    previous word is tagged ADJ or ADV.
    """
    followed_by_cjk = re.search(cjk_regex, c.next_word)
    after_modifier = c.prev_pos in (Pos.ADJ, Pos.ADV)
    if not (followed_by_cjk or after_modifier):
        return
    c.replace_word("啲")
@contextual_rule("番")
def _(c: Context):
    """番 -> 返 when the previous word is tagged as a verb."""
    if c.prev_pos != Pos.VERB:
        return
    c.replace_word("返")
@contextual_rule("黎")
def _(c: Context):
    """黎 -> 嚟

    Applied when 黎 itself is tagged as a verb, or when the next word is
    tagged as a verb.
    """
    if Pos.VERB in (c.pos, c.next_pos):
        c.replace_word("嚟")
# @contextual_rule("咁")
# def _(c: Context):
# """咁 -> 噉
# 如果前面係形容詞、副詞,或者後面後動詞、名詞、代詞,就係 噉
# """
# # 句末直接當 噉
# if c.next_word == "":
# c.replace_word("噉")
# elif c.next_pos in (Pos.ADJ, Pos.ADV):
# return
# if (c.next_pos in (Pos.VERB, Pos.NOUN, Pos.PRON, Pos.PART, Pos.AUX) or
# c.prev_pos in (Pos.ADJ, Pos.ADV, Pos.NOUN)):
# c.replace_word("噉")
@contextual_rule("甘")
def _(c: Context):
    """甘 -> 咁 / 噉

    Nothing happens when 甘 itself is tagged VERB or NOUN.
    It becomes 咁 when the next word is tagged ADV or ADJ.
    It becomes 噉 at the end of the sentence, when the next word is tagged
    VERB, NOUN, PRON, PART or AUX, or when the previous word is tagged
    ADJ, ADV or NOUN.
    """
    if c.pos == Pos.VERB or c.pos == Pos.NOUN:
        return

    if c.next_word == "":
        # At the end of the sentence, treat it as 噉 straight away. The
        # final check below still runs afterwards.
        c.replace_word("噉")
    elif c.next_pos == Pos.ADV or c.next_pos == Pos.ADJ:
        c.replace_word("咁")
        return

    followed_by_trigger = c.next_pos in (
        Pos.VERB, Pos.NOUN, Pos.PRON, Pos.PART, Pos.AUX)
    preceded_by_trigger = c.prev_pos in (Pos.ADJ, Pos.ADV, Pos.NOUN)
    if followed_by_trigger or preceded_by_trigger:
        c.replace_word("噉")
@contextual_rule("既")
def _(c: Context):
    """既 -> 嘅

    Applied only when the previous word is tagged PROPN, PRON, NOUN, ADJ,
    ADV or VERB. Even then 既 is kept when the remaining sentence contains
    又 and the next word is NOT tagged ADJ, ADV or VERB.

    NOTE(review): the original comment describes the exception as "the
    sentence has no '又 ADV/ADJ/VERB' structure later on". The check here
    looks at the tag of the word right after 既, not the word after 又, and
    uses ``not in`` -- confirm that this matches the intended 既...又...
    handling.
    """
    preceded_by_content_word = c.prev_pos in (
        Pos.PROPN, Pos.PRON, Pos.NOUN, Pos.ADJ, Pos.ADV, Pos.VERB)
    if not preceded_by_content_word:
        return

    keep_original = (
        "又" in c.sentence_remain
        and c.next_pos not in (Pos.ADJ, Pos.ADV, Pos.VERB)
    )
    if not keep_original:
        c.replace_word("嘅")
@contextual_rule("果")
def _(c: Context):
    """果 -> 嗰 when the next word is tagged as a noun."""
    if c.next_pos != Pos.NOUN:
        return
    c.replace_word("嗰")
@contextual_rule("野")
def _(c: Context):
    """野 -> 嘢

    Applied when 野 itself is tagged PRON, NOUN, X or AUX, or when the
    previous word is tagged as a verb. The original note says the extra
    tags are accepted because pycantonese does not always tag this word as
    a noun.
    """
    tagged_as_accepted = c.pos in (Pos.PRON, Pos.NOUN, Pos.X, Pos.AUX)
    if tagged_as_accepted or c.prev_pos == Pos.VERB:
        c.replace_word("嘢")
@contextual_rule("無")
def _(c: Context):
    """無 -> 冇 when the next word is tagged NOUN or ADP."""
    if c.next_pos not in (Pos.NOUN, Pos.ADP):
        return
    c.replace_word("冇")
@contextual_rule("曬")
def _(c: Context):
    """曬 -> 晒 when 曬 itself is tagged as a verb."""
    if c.pos != Pos.VERB:
        return
    c.replace_word("晒")
@contextual_rule("哂")
def _(c: Context):
    """哂 -> 晒 unless the next word is 笑."""
    if c.next_word == "笑":
        return
    c.replace_word("晒")
@contextual_rule("左")
def _(c: Context):
    """左 -> 咗 when the previous word is tagged VERB, ADJ or ADV."""
    if c.prev_pos not in (Pos.VERB, Pos.ADJ, Pos.ADV):
        return
    c.replace_word("咗")
@contextual_rule("著")
def _(c: Context):
    """著 -> 着 / 住

    Becomes 着 when 著 itself is tagged as a verb; otherwise becomes 住
    when the previous word is tagged as a verb. Left unchanged in every
    other case.
    """
    if c.pos == Pos.VERB:
        replacement = "着"
    elif c.prev_pos == Pos.VERB:
        replacement = "住"
    else:
        return
    c.replace_word(replacement)