Skip to content

Commit

Permalink
Update
Browse files Browse the repository at this point in the history
  • Loading branch information
SkyEye-FAST committed Feb 10, 2025
1 parent ff7eb4e commit e71f34b
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 57 deletions.
32 changes: 14 additions & 18 deletions base.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
# -*- encoding: utf-8 -*-
"""基础文件,提供通用功能和数据结构定义。"""

from pathlib import Path
from typing import TypeAlias, Dict, Set, Tuple, Final
from typing import Final, TypeAlias

import ujson

# 类型别名和常量定义
Ldata: TypeAlias = Dict[str, str]
Ldata: TypeAlias = dict[str, str]
P: Final[Path] = Path(__file__).resolve().parent
LANG_FILES: Final[Tuple[str, ...]] = ("en_us", "zh_cn")
LANG_FILES: Final[tuple[str, ...]] = ("en_us", "zh_cn")


def load_json(file: str, folder: str = "data") -> Ldata:
Expand All @@ -20,23 +19,22 @@ def load_json(file: str, folder: str = "data") -> Ldata:
folder (str, optional): JSON文件所在文件夹路径,默认为"data"
Returns:
Dict[str, str]: 加载的JSON内容
dict[str, str]: 加载的JSON内容
"""
path = P / folder / f"{file}.json"
with path.open("r", encoding="utf-8", newline="\n") as f:
return ujson.load(f)


def save_to_json(
input_data: Tuple[Ldata, float],
input_data: tuple[Ldata, float],
output_file: str,
output_folder: str = "output",
) -> None:
"""
将生成的语言文件保存至JSON。
"""将生成的语言文件保存至JSON。
Args:
input_data (Tuple[Ldata, float]): 输入的数据
input_data (tuple[Ldata, float]): 输入的数据
output_file (str): 保存的文件名,无格式后缀
output_folder (str, optional): 保存的文件夹,默认为“output”
Expand All @@ -50,9 +48,7 @@ def save_to_json(
with open(file_path, "w", encoding="utf-8", newline="\n") as j:
ujson.dump(input_dict, j, indent=2, ensure_ascii=False)
size = file_size(file_path)
print(
f"已生成语言文件“{output_file}.json”,大小{size},耗时{elapsed_time:.2f} s。"
)
print(f"已生成语言文件“{output_file}.json”,大小{size},耗时{elapsed_time:.2f} s。")
except Exception as e:
raise OSError(f"保存至JSON失败:{str(e)}") from e

Expand All @@ -76,12 +72,12 @@ def file_size(p: Path) -> str:


# 语言文件数据
DATA: Final[Dict[str, Ldata]] = {
DATA: Final[dict[str, Ldata]] = {
lang_name: load_json(lang_name, "mc_lang/full") for lang_name in LANG_FILES
}

# 转换映射表
PINYIN_TO: Final[Dict[str, Ldata]] = {
PINYIN_TO: Final[dict[str, Ldata]] = {
"wadegiles": load_json("py2wg"),
"romatzyh": load_json("py2gr"),
"simp_romatzyh": load_json("py2sgr"),
Expand All @@ -95,7 +91,7 @@ def file_size(p: Path) -> str:
}

# 修正数据
fixed_zh: Dict[str, Ldata] = {
fixed_zh: dict[str, Ldata] = {
f"zh_{scheme}": load_json(f"fixed_zh_{scheme}", "data/fixed")
for scheme in [
"source", # 来源修正
Expand All @@ -114,8 +110,8 @@ def file_size(p: Path) -> str:
# 汉语拼音手动修正
fixed_zh["zh_py"].update(load_json("fixed_zh_py_manual", "data/fixed"))

gr_values: Set[str] = set(PINYIN_TO["romatzyh"].values()) # 国语罗马字的有效拼写
cy_values: Set[str] = set(PINYIN_TO["cyrillic"].values()) # 西里尔转写的有效拼写
gr_values: set[str] = set(PINYIN_TO["romatzyh"].values()) # 国语罗马字的有效拼写
cy_values: set[str] = set(PINYIN_TO["cyrillic"].values()) # 西里尔转写的有效拼写
TONE_TO_IPA: Final[Ldata] = {
"1": "˥",
"2": "˧˥",
Expand All @@ -125,6 +121,6 @@ def file_size(p: Path) -> str:
} # IPA声调

rep_zh: Ldata = load_json("rep_zh", "data/rep") # 连写的中文转写方案替换修正
PINYIN_FINALS: Final[Tuple[str, ...]] = tuple("aāááàoōóǒòeēéěè") # 可能的零声母开头
# Vowel letters that may begin a zero-initial pinyin syllable: a, o and e, each in
# its plain form and with all four tone marks. to_pinyin() passes this tuple to
# str.startswith() to decide whether a syllable needs a leading apostrophe.
# The previous literal listed "á" twice and omitted "ǎ", so third-tone syllables
# such as "ǎo" or "ǎn" were never recognised.
PINYIN_FINALS: Final[tuple[str, ...]] = tuple("aāáǎàoōóǒòeēéěè")

rep_ja_kk: Ldata = load_json("rep_ja_kk", "data/rep") # 片假名替换修正
81 changes: 43 additions & 38 deletions converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,12 @@ class BaseConverter:
"""

def __init__(self, data: Ldata, rep: Ldata) -> None:
    """Initialize the converter.

    Args:
        data (Ldata): Input language data dictionary.
        rep (Ldata): Dictionary of formatting content that needs to be replaced.
    """
    # Store both mappings on the instance for use by the conversion methods.
    self.data, self.rep = data, rep

Expand Down Expand Up @@ -149,7 +155,7 @@ def add_apostrophes(self, input_list: list[str], values: set[str]) -> list[str]:
Args:
input_list (list[str]): 需要转换的字符串
values (set[str]: 有效的拼写
values (set[str]): 有效的拼写
Returns:
list[str]: 处理结果
Expand Down Expand Up @@ -190,11 +196,12 @@ def convert(
start_time = time.time()

output_dict: Ldata = {}
for k, v in input_dict.items():
try:
try:
for k, v in input_dict.items():
output_dict[k] = func(v)
except Exception as e:
raise ConversionError(f"转换{k}时出现错误:{str(e)}") from e
except Exception as e:
current_key = list(input_dict.keys())[len(output_dict)]
raise ConversionError(f"转换{current_key}时出现错误:{str(e)}") from e

if self.rep is rep_zh:
output_dict.update(fixed_zh_u)
Expand All @@ -209,20 +216,27 @@ def convert(


class EnglishConverter(BaseConverter):
"""
英文转换器。处理英文文本到其他格式的转换。
"""英文转换器。处理英文文本到其他格式的转换。
Attributes:
data (Ldata): 输入的英文语言数据
rep (Ldata, optional): 英文转写替换规则
"""

def __init__(self, data: Ldata, rep: Ldata = rep_ja_kk) -> None:
    """Initialize the English converter.

    Args:
        data (Ldata): Input English language data.
        rep (Ldata, optional): Replacement rules for English transliteration.
            Defaults to rep_ja_kk.
    """
    # All state lives in the base class; just forward the arguments.
    super().__init__(data=data, rep=rep)

def to_i7h(self, text: str) -> str:
"""将字符串中的所有单词缩写。
保留单词的首尾字符,中间用字符数替代。
长度为2或以下的单词保持不变。
Args:
Expand Down Expand Up @@ -281,6 +295,13 @@ class ChineseConverter(BaseConverter):
"""

def __init__(self, data: Ldata, rep: Ldata = rep_zh, auto_cut: bool = True) -> None:
    """Initialize the Chinese converter.

    Args:
        data (Ldata): Input Chinese language data.
        rep (Ldata, optional): Replacement rules for Chinese transliteration.
            Defaults to rep_zh.
        auto_cut (bool, optional): Whether to use automatic word segmentation.
            Defaults to True.
    """
    # The base class stores the data and replacement mappings.
    super().__init__(data=data, rep=rep)
    # Remember the segmentation preference on the instance.
    self.auto_cut = auto_cut

Expand Down Expand Up @@ -310,16 +331,15 @@ def convert(
start_time = time.time()

output_dict: Ldata = {}
for k, v in input_dict.items():
try:
try:
for k, v in input_dict.items():
string = (
v.replace("为", "位")
if k in wei and func.__name__ != "to_split"
else v
v.replace("为", "位") if k in wei and func.__name__ != "to_split" else v
)
output_dict[k] = func(string)
except Exception as e:
raise ConversionError(f"转换{k}时出现错误:{str(e)}") from e
except Exception as e:
current_key = list(input_dict.keys())[len(output_dict)]
raise ConversionError(f"转换{current_key}时出现错误:{str(e)}") from e

if self.rep is rep_zh:
output_dict.update(fixed_zh_u)
Expand Down Expand Up @@ -362,9 +382,7 @@ def to_split(self, text: str) -> str:
"之物": "之 物",
}
)
return self.replace_multiple(
" ".join(self.segment_str(text)).replace(" 了", "了"), rep
)
return self.replace_multiple(" ".join(self.segment_str(text)).replace(" 了", "了"), rep)

def to_harmonic(self, text: str) -> str:
"""将字符串中的汉字按GB/Z 40637-2021和《通用规范汉字表》转换。
Expand Down Expand Up @@ -394,9 +412,7 @@ def to_pinyin(self, text: str) -> str:
pinyin_list = [
(
f"'{py}"
if i > 0
and py.startswith(PINYIN_FINALS)
and pinyin_list[i - 1][-1].isalpha()
if i > 0 and py.startswith(PINYIN_FINALS) and pinyin_list[i - 1][-1].isalpha()
else py
)
for i, py in enumerate(pinyin_list)
Expand All @@ -409,9 +425,7 @@ def to_pinyin(self, text: str) -> str:
else " "
)
result += "".join(pinyin_list)
return self.capitalize_lines(
self.capitalize_titles(self.replace_multiple(result[1:]))
)
return self.capitalize_lines(self.capitalize_titles(self.replace_multiple(result[1:])))

def pinyin_to_other(
self,
Expand All @@ -433,9 +447,7 @@ def pinyin_to_other(
result = ""

for i, seg in enumerate(seg_list):
pinyin_list = lazy_pinyin(
seg, style=Style.TONE3, neutral_tone_with_five=True
)
pinyin_list = lazy_pinyin(seg, style=Style.TONE3, neutral_tone_with_five=True)
result_list = [correspondence.get(p, p) for p in pinyin_list]
result += (
""
Expand All @@ -445,12 +457,11 @@ def pinyin_to_other(
else " "
)
result += delimiter.join(result_list)
return self.capitalize_lines(
self.capitalize_titles(self.replace_multiple(result[1:]))
)
return self.capitalize_lines(self.capitalize_titles(self.replace_multiple(result[1:])))

def to_ipa(self, text: str) -> str:
"""将字符串中的汉字转写为IPA,单字之间使用空格分开。
IPA数据来自@UntPhesoca,宽式标音。
Args:
Expand Down Expand Up @@ -504,17 +515,13 @@ def to_romatzyh(self, text: str) -> str:

for seg in seg_list:
seg = seg.replace("不", "bu")
pinyin_list = lazy_pinyin(
seg, style=Style.TONE3, neutral_tone_with_five=True
)
pinyin_list = lazy_pinyin(seg, style=Style.TONE3, neutral_tone_with_five=True)
gr_list = [PINYIN_TO["romatzyh"].get(p, p) for p in pinyin_list]
output_list.append("".join(self.add_apostrophes(gr_list, gr_values)))

result = " ".join(output_list)

return self.capitalize_lines(
self.capitalize_titles(self.replace_multiple(result))
)
return self.capitalize_lines(self.capitalize_titles(self.replace_multiple(result)))

def to_simp_romatzyh(self, text: str) -> str:
"""将字符串中的汉字转写为简化国语罗马字,词之间使用空格分开。
Expand Down Expand Up @@ -591,9 +598,7 @@ def to_cyrillic(self, text: str) -> str:
output_list.append("".join(self.add_apostrophes(cy_list, cy_values)))

result = " ".join(output_list)
return self.capitalize_lines(
self.capitalize_titles(self.replace_multiple(result))
)
return self.capitalize_lines(self.capitalize_titles(self.replace_multiple(result)))

def to_xiaojing(self, text: str) -> str:
"""将字符串中的汉字转写为小儿经,使用零宽不连字(U+200C)分开。
Expand Down
12 changes: 11 additions & 1 deletion ruff.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
line-length = 100

[lint]
select = ["F", "E", "W", "UP", "I", "N", "PERF"]
select = ["F", "E", "W", "UP", "I", "N", "PERF", "D"]
ignore = ["D415"]

[lint.per-file-ignores]
"__init__.py" = ["E402"]
"**/{tests,docs,tools}/*" = ["E402"]

[lint.pydocstyle]
convention = "google"

[format]
line-ending = "lf"
docstring-code-format = true

0 comments on commit e71f34b

Please sign in to comment.