diff --git a/base.py b/base.py index ee08f82..6d9eaff 100644 --- a/base.py +++ b/base.py @@ -1,15 +1,14 @@ -# -*- encoding: utf-8 -*- """基础文件,提供通用功能和数据结构定义。""" from pathlib import Path -from typing import TypeAlias, Dict, Set, Tuple, Final +from typing import Final, TypeAlias import ujson # 类型别名和常量定义 -Ldata: TypeAlias = Dict[str, str] +Ldata: TypeAlias = dict[str, str] P: Final[Path] = Path(__file__).resolve().parent -LANG_FILES: Final[Tuple[str, ...]] = ("en_us", "zh_cn") +LANG_FILES: Final[tuple[str, ...]] = ("en_us", "zh_cn") def load_json(file: str, folder: str = "data") -> Ldata: @@ -20,7 +19,7 @@ def load_json(file: str, folder: str = "data") -> Ldata: folder (str, optional): JSON文件所在文件夹路径,默认为"data" Returns: - Dict[str, str]: 加载的JSON内容 + dict[str, str]: 加载的JSON内容 """ path = P / folder / f"{file}.json" with path.open("r", encoding="utf-8", newline="\n") as f: @@ -28,15 +27,14 @@ def load_json(file: str, folder: str = "data") -> Ldata: def save_to_json( - input_data: Tuple[Ldata, float], + input_data: tuple[Ldata, float], output_file: str, output_folder: str = "output", ) -> None: - """ - 将生成的语言文件保存至JSON。 + """将生成的语言文件保存至JSON。 Args: - input_data (Tuple[Ldata, float]): 输入的数据 + input_data (tuple[Ldata, float]): 输入的数据 output_file (str): 保存的文件名,无格式后缀 output_folder (str, optional): 保存的文件夹,默认为“output” @@ -50,9 +48,7 @@ def save_to_json( with open(file_path, "w", encoding="utf-8", newline="\n") as j: ujson.dump(input_dict, j, indent=2, ensure_ascii=False) size = file_size(file_path) - print( - f"已生成语言文件“{output_file}.json”,大小{size},耗时{elapsed_time:.2f} s。" - ) + print(f"已生成语言文件“{output_file}.json”,大小{size},耗时{elapsed_time:.2f} s。") except Exception as e: raise OSError(f"保存至JSON失败:{str(e)}") from e @@ -76,12 +72,12 @@ def file_size(p: Path) -> str: # 语言文件数据 -DATA: Final[Dict[str, Ldata]] = { +DATA: Final[dict[str, Ldata]] = { lang_name: load_json(lang_name, "mc_lang/full") for lang_name in LANG_FILES } # 转换映射表 -PINYIN_TO: Final[Dict[str, Ldata]] = { +PINYIN_TO: Final[dict[str, Ldata]] = { "wadegiles": load_json("py2wg"), "romatzyh": load_json("py2gr"), "simp_romatzyh": load_json("py2sgr"), @@ -95,7 +91,7 @@ def file_size(p: Path) -> str: } # 修正数据 -fixed_zh: Dict[str, Ldata] = { +fixed_zh: dict[str, Ldata] = { f"zh_{scheme}": load_json(f"fixed_zh_{scheme}", "data/fixed") for scheme in [ "source", # 来源修正 @@ -114,8 +110,8 @@ def file_size(p: Path) -> str: # 汉语拼音手动修正 fixed_zh["zh_py"].update(load_json("fixed_zh_py_manual", "data/fixed")) -gr_values: Set[str] = set(PINYIN_TO["romatzyh"].values()) # 国语罗马字的有效拼写 -cy_values: Set[str] = set(PINYIN_TO["cyrillic"].values()) # 西里尔转写的有效拼写 +gr_values: set[str] = set(PINYIN_TO["romatzyh"].values()) # 国语罗马字的有效拼写 +cy_values: set[str] = set(PINYIN_TO["cyrillic"].values()) # 西里尔转写的有效拼写 TONE_TO_IPA: Final[Ldata] = { "1": "˥", "2": "˧˥", @@ -125,6 +121,6 @@ def file_size(p: Path) -> str: } # IPA声调 rep_zh: Ldata = load_json("rep_zh", "data/rep") # 连写的中文转写方案替换修正 -PINYIN_FINALS: Final[Tuple[str, ...]] = tuple("aāááàoōóǒòeēéěè") # 可能的零声母开头 +PINYIN_FINALS: Final[tuple[str, ...]] = tuple("aāááàoōóǒòeēéěè") # 可能的零声母开头 rep_ja_kk: Ldata = load_json("rep_ja_kk", "data/rep") # 片假名替换修正 diff --git a/converter.py b/converter.py index 63246b6..37eef0e 100644 --- a/converter.py +++ b/converter.py @@ -58,6 +58,12 @@ class BaseConverter: """ def __init__(self, data: Ldata, rep: Ldata) -> None: + """初始化转换器。 + + Args: + data (Ldata): 输入的语言数据字典 + rep (Ldata): 需要替换的格式内容字典 + """ self.data = data self.rep = rep @@ -149,7 +155,7 @@ def add_apostrophes(self, input_list: list[str], values: set[str]) -> list[str]: Args: input_list (list[str]): 需要转换的字符串 - values (set[str]: 有效的拼写 + values (set[str]): 有效的拼写 Returns: list[str]: 处理结果 @@ -190,11 +196,12 @@ def convert( start_time = time.time() output_dict: Ldata = {} - for k, v in input_dict.items(): - try: + try: + for k, v in input_dict.items(): output_dict[k] = func(v) - except Exception as e: - raise ConversionError(f"转换{k}时出现错误:{str(e)}") from e + except Exception as e: + current_key = list(input_dict.keys())[len(output_dict)] + raise ConversionError(f"转换{current_key}时出现错误:{str(e)}") from e if self.rep is rep_zh: output_dict.update(fixed_zh_u) @@ -209,8 +216,7 @@ def convert( class EnglishConverter(BaseConverter): - """ - 英文转换器。处理英文文本到其他格式的转换。 + """英文转换器。处理英文文本到其他格式的转换。 Attributes: data (Ldata): 输入的英文语言数据 @@ -218,11 +224,19 @@ class EnglishConverter(BaseConverter): """ def __init__(self, data: Ldata, rep: Ldata = rep_ja_kk) -> None: + """初始化英文转换器。 + + Args: + data (Ldata): 输入的英文语言数据 + rep (Ldata, optional): 英文转写替换规则,默认为rep_ja_kk + """ super().__init__(data, rep) def to_i7h(self, text: str) -> str: """将字符串中的所有单词缩写。 + 保留单词的首尾字符,中间用字符数替代。 + 长度为2或以下的单词保持不变。 Args: @@ -281,6 +295,13 @@ class ChineseConverter(BaseConverter): """ def __init__(self, data: Ldata, rep: Ldata = rep_zh, auto_cut: bool = True) -> None: + """初始化中文转换器。 + + Args: + data (Ldata): 输入的中文语言数据 + rep (Ldata, optional): 中文转写替换规则,默认为rep_zh + auto_cut (bool, optional): 是否使用自动分词,默认为True + """ super().__init__(data, rep) self.auto_cut = auto_cut @@ -310,16 +331,15 @@ def convert( start_time = time.time() output_dict: Ldata = {} - for k, v in input_dict.items(): - try: + try: + for k, v in input_dict.items(): string = ( - v.replace("为", "位") - if k in wei and func.__name__ != "to_split" - else v + v.replace("为", "位") if k in wei and func.__name__ != "to_split" else v ) output_dict[k] = func(string) - except Exception as e: - raise ConversionError(f"转换{k}时出现错误:{str(e)}") from e + except Exception as e: + current_key = list(input_dict.keys())[len(output_dict)] + raise ConversionError(f"转换{current_key}时出现错误:{str(e)}") from e if self.rep is rep_zh: output_dict.update(fixed_zh_u) @@ -362,9 +382,7 @@ def to_split(self, text: str) -> str: "之物": "之 物", } ) - return self.replace_multiple( - " ".join(self.segment_str(text)).replace(" 了", "了"), rep - ) + return self.replace_multiple(" ".join(self.segment_str(text)).replace(" 了", "了"), rep) def to_harmonic(self, text: str) -> str: """将字符串中的汉字按GB/Z 40637-2021和《通用规范汉字表》转换。 @@ -394,9 +412,7 @@ def to_pinyin(self, text: str) -> str: pinyin_list = [ ( f"'{py}" - if i > 0 - and py.startswith(PINYIN_FINALS) - and pinyin_list[i - 1][-1].isalpha() + if i > 0 and py.startswith(PINYIN_FINALS) and pinyin_list[i - 1][-1].isalpha() else py ) for i, py in enumerate(pinyin_list) @@ -409,9 +425,7 @@ def to_pinyin(self, text: str) -> str: else " " ) result += "".join(pinyin_list) - return self.capitalize_lines( - self.capitalize_titles(self.replace_multiple(result[1:])) - ) + return self.capitalize_lines(self.capitalize_titles(self.replace_multiple(result[1:]))) def pinyin_to_other( self, @@ -433,9 +447,7 @@ def pinyin_to_other( result = "" for i, seg in enumerate(seg_list): - pinyin_list = lazy_pinyin( - seg, style=Style.TONE3, neutral_tone_with_five=True - ) + pinyin_list = lazy_pinyin(seg, style=Style.TONE3, neutral_tone_with_five=True) result_list = [correspondence.get(p, p) for p in pinyin_list] result += ( "" @@ -445,12 +457,11 @@ def pinyin_to_other( else " " ) result += delimiter.join(result_list) - return self.capitalize_lines( - self.capitalize_titles(self.replace_multiple(result[1:])) - ) + return self.capitalize_lines(self.capitalize_titles(self.replace_multiple(result[1:]))) def to_ipa(self, text: str) -> str: """将字符串中的汉字转写为IPA,单字之间使用空格分开。 + IPA数据来自@UntPhesoca,宽式标音。 Args: @@ -504,17 +515,13 @@ def to_romatzyh(self, text: str) -> str: for seg in seg_list: seg = seg.replace("不", "bu") - pinyin_list = lazy_pinyin( - seg, style=Style.TONE3, neutral_tone_with_five=True - ) + pinyin_list = lazy_pinyin(seg, style=Style.TONE3, neutral_tone_with_five=True) gr_list = [PINYIN_TO["romatzyh"].get(p, p) for p in pinyin_list] output_list.append("".join(self.add_apostrophes(gr_list, gr_values))) result = " ".join(output_list) - return self.capitalize_lines( - self.capitalize_titles(self.replace_multiple(result)) - ) + return self.capitalize_lines(self.capitalize_titles(self.replace_multiple(result))) def to_simp_romatzyh(self, text: str) -> str: """将字符串中的汉字转写为简化国语罗马字,词之间使用空格分开。 @@ -591,9 +598,7 @@ def to_cyrillic(self, text: str) -> str: output_list.append("".join(self.add_apostrophes(cy_list, cy_values))) result = " ".join(output_list) - return self.capitalize_lines( - self.capitalize_titles(self.replace_multiple(result)) - ) + return self.capitalize_lines(self.capitalize_titles(self.replace_multiple(result))) def to_xiaojing(self, text: str) -> str: """将字符串中的汉字转写为小儿经,使用零宽不连字(U+200C)分开。 diff --git a/ruff.toml b/ruff.toml index 0fbe9ff..b312a2f 100644 --- a/ruff.toml +++ b/ruff.toml @@ -1,6 +1,16 @@ +line-length = 100 + [lint] -select = ["F", "E", "W", "UP", "I", "N", "PERF"] +select = ["F", "E", "W", "UP", "I", "N", "PERF", "D"] +ignore = ["D415"] [lint.per-file-ignores] "__init__.py" = ["E402"] "**/{tests,docs,tools}/*" = ["E402"] + +[lint.pydocstyle] +convention = "google" + +[format] +line-ending = "lf" +docstring-code-format = true