Fix

SkyEye-FAST · Aug 15, 2024 · afaf607 · afaf607
1 parent f8676f2
commit afaf607
Show file tree

Hide file tree

Showing 10 changed files with 213 additions and 248 deletions.
diff --git a/converter.py b/converter.py
@@ -4,7 +4,8 @@
 import json
 import re
 import time
-from typing import Callable, Optional, List, Set
+import inspect
+from typing import Dict, List, Set
 
 from romajitable import to_kana as tk
 from pypinyin import Style, lazy_pinyin, load_phrases_dict
@@ -23,8 +24,8 @@
 jieba.load_userdict(str(P / "data" / "dict.txt"))
 
 # 初始化其他自定义数据
+fixed_zh_u = load_json("fixed_zh_universal")
 tone_to_ipa: Ldata = {"1": "˥", "2": "˧˥", "3": "˨˩˦", "4": "˥˩", "5": ""}  # IPA声调
-
 rep_ja_kk: Ldata = load_json("rep_ja_kk")  # 片假名替换修正
 manyoganas_dict: Ldata = load_json("manyogana")  # 万叶假名
 
@@ -75,21 +76,10 @@ def capitalize_titles(text: str) -> str:
         str: 转换结果
     """
 
-    def title_case_content(content: str) -> str:
-        """
-        将书名号中的内容首字母大写。
-
-        Args:
-            content (str): 书名号中的内容。
-
-        Returns:
-            str: 首字母大写后的书名号内容。
-        """
-
-        return " ".join(word.capitalize() for word in content.split())
-
     return re.sub(
-        r"《(.*?)》", lambda match: f"《{title_case_content(match.group(1))}》", text
+        r"《(.*?)》",
+        lambda match: f"《{' '.join(word.capitalize() for word in match.group(1).split())}》",
+        text,
     )
 
 
@@ -102,7 +92,7 @@ def add_apostrophes(input_list: List[str], values: Set[str]) -> List[str]:
         values (Set[str]): 有效的拼写
 
     Returns:
-        list: 处理结果
+        List[str]: 处理结果
     """
 
     for i in range(1, len(input_list)):
@@ -116,6 +106,21 @@ def add_apostrophes(input_list: List[str], values: Set[str]) -> List[str]:
     return input_list
 
 
+def segment_str(text: str, auto_cut: bool = True) -> List[str]:
+    """
+    将字符串分词。
+
+    Args:
+        text (str): 需要分词的字符串
+        auto_cut (bool, optional): 是否自动分词，默认为True
+
+    Returns:
+        List[str]: 分词结果
+    """
+
+    return jieba.lcut(text) if auto_cut else text.split()
+
+
 def to_katakana(text: str) -> str:
     """
     将字符串中的英文转写为片假名。
@@ -141,35 +146,39 @@ def to_manyogana(text: str) -> str:
         str: 转换结果
     """
 
-    return "".join([manyoganas_dict.get(char, char) for char in to_katakana(text)])
+    return "".join(manyoganas_dict.get(char, char) for char in to_katakana(text))
 
 
-def to_pinyin(text: str) -> str:
+def to_pinyin(text: str, rep: Ldata, auto_cut: bool = True) -> str:
     """
     将字符串中的汉字转写为拼音，尝试遵循GB/T 16159-2012分词，词之间使用空格分开。
 
     Args:
         text (str): 需要转换的字符串
+        rep (Ldata): 需要替换格式的内容
+        auto_cut (bool, optional): 是否自动分词，默认为True
 
     Returns:
         str: 转换结果
     """
 
-    seg_list: List[str] = jieba.lcut(text)
+    seg_list = segment_str(text, auto_cut)
     output_list: List[str] = []
 
     for seg in seg_list:
         pinyin_list = lazy_pinyin(seg, style=Style.TONE)
-        # 处理隔音符号
-        for i, py in enumerate(pinyin_list[1:], 1):
-            if py.startswith(finals):
-                pinyin_list[i] = f"'{py}"
+        pinyin_list = [
+            (
+                f"'{py}"
+                if i > 0 and py.startswith(finals) and pinyin_list[i - 1][-1].isalpha()
+                else py
+            )
+            for i, py in enumerate(pinyin_list)
+        ]
         output_list.append("".join(pinyin_list))
 
-    # 调整格式
-    result = replace_multiple(" ".join(output_list), rep_zh)
-
-    return capitalize_lines(capitalize_titles(result))
+    result = " ".join(output_list)
+    return capitalize_lines(capitalize_titles(replace_multiple(result, rep)))
 
 
 def to_ipa(text: str) -> str:
@@ -186,7 +195,7 @@ def to_ipa(text: str) -> str:
 
     pinyin_list = lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True)
     ipa_list = [
-        f"{pinyin_to["ipa"].get(p[:-1], p[:-1])}{tone_to_ipa.get(p[-1], p[-1])}"
+        f"{pinyin_to['ipa'].get(p[:-1], p[:-1])}{tone_to_ipa.get(p[-1], p[-1])}"
         for p in pinyin_list
     ]
     return " ".join(ipa_list)
@@ -206,128 +215,140 @@ def to_bopomofo(text: str) -> str:
     return " ".join(lazy_pinyin(text, style=Style.BOPOMOFO))
 
 
-def to_wadegiles(text: str) -> str:
+def to_wadegiles(text: str, rep: Ldata, auto_cut: bool = True) -> str:
     """
     将字符串中的汉字转写为威妥玛拼音，单字之间使用连字符分开，词之间使用空格分开。
 
     Args:
         text (str): 需要转换的字符串
+        rep (Ldata): 需要替换格式的内容
+        auto_cut (bool, optional): 是否自动分词，默认为True
 
     Returns:
         str: 转换结果
     """
 
-    seg_list: List[str] = jieba.lcut(text)
+    seg_list = segment_str(text, auto_cut)
     output_list: List[str] = []
 
     for seg in seg_list:
         pinyin_list = lazy_pinyin(seg, style=Style.TONE3, neutral_tone_with_five=True)
-        gr_list = [pinyin_to["wadegiles"].get(p, p) for p in pinyin_list]
-        output_list.append("-".join(gr_list))
-
-    # 调整格式
-    result = replace_multiple(" ".join(output_list), rep_zh)
+        wg_list = [pinyin_to["wadegiles"].get(p, p) for p in pinyin_list]
+        output_list.append("-".join(wg_list))
 
-    return capitalize_lines(capitalize_titles(result))
+    result = " ".join(output_list)
+    return capitalize_lines(capitalize_titles(replace_multiple(result, rep)))
 
 
-def to_romatzyh(text: str) -> str:
+def to_romatzyh(text: str, rep: Ldata, auto_cut: bool = True) -> str:
     """
     将字符串中的汉字转写为国语罗马字，词之间使用空格分开。
 
     Args:
         text (str): 需要转换的字符串
+        rep (Ldata): 需要替换格式的内容
+        auto_cut (bool, optional): 是否自动分词，默认为True
 
     Returns:
         str: 转换结果
     """
 
-    seg_list: List[str] = jieba.lcut(text)
-    output_list: List[str] = []
+    seg_list = segment_str(text, auto_cut)
+    output_list = []
 
     for seg in seg_list:
         seg = seg.replace("不", "bu")
         pinyin_list = lazy_pinyin(seg, style=Style.TONE3, neutral_tone_with_five=True)
         gr_list = [pinyin_to["romatzyh"].get(p, p) for p in pinyin_list]
         output_list.append("".join(add_apostrophes(gr_list, gr_values)))
 
-    result = replace_multiple(" ".join(output_list), rep_zh)  # 调整格式
+    result = " ".join(output_list)
 
-    return capitalize_lines(capitalize_titles(result))
+    return capitalize_lines(capitalize_titles(replace_multiple(result, rep)))
 
 
-def to_cyrillic(text: str) -> str:
+def to_cyrillic(text: str, rep: Ldata, auto_cut: bool = True) -> str:
     """
-    将字符串中的汉字转写为西里尔字母，使用帕拉季音标体系，词之间使用空格分开。
+    将字符串中的汉字转写为西里尔字母，使用帕拉季音标体系。
 
     Args:
         text (str): 需要转换的字符串
+        rep (Ldata): 需要替换格式的内容
+        auto_cut (bool, optional): 是否自动分词，默认为True
 
     Returns:
         str: 转换结果
     """
 
-    seg_list: List[str] = jieba.lcut(text)
+    seg_list = segment_str(text, auto_cut)
     output_list: List[str] = []
 
     for seg in seg_list:
         pinyin_list = lazy_pinyin(seg)
         cy_list = [pinyin_to["cyrillic"].get(p, p) for p in pinyin_list]
         output_list.append("".join(add_apostrophes(cy_list, cy_values)))
 
-    result = replace_multiple(" ".join(output_list), rep_zh)  # 调整格式
-
-    return capitalize_lines(capitalize_titles(result))
+    result = " ".join(output_list)
+    return capitalize_lines(capitalize_titles(replace_multiple(result, rep)))
 
 
-def to_xiaojing(text: str) -> str:
+def to_xiaojing(text: str, rep: Ldata, auto_cut: bool = True) -> str:
     """
-    将字符串中的汉字转写为小儿经，单字之间使用零宽不连字（U+200C）分开，词之间使用空格分开。
+    将字符串中的汉字转写为小儿经，单字之间使用零宽不连字（U+200C）分开。
 
     Args:
         text (str): 需要转换的字符串
+        rep (Ldata): 需要替换格式的内容
+        auto_cut (bool, optional): 是否自动分词，默认为True
 
     Returns:
         str: 转换结果
     """
 
-    seg_list: List[str] = jieba.lcut(text)
-    output_list: List[str] = []
-
+    seg_list = segment_str(text, auto_cut)
+    output_list = []
     for seg in seg_list:
         pinyin_list = lazy_pinyin(seg)
         xj_list = [pinyin_to["xiaojing"].get(p, p) for p in pinyin_list]
         output_list.append("\u200c".join(xj_list))
-
-    return replace_multiple(" ".join(output_list), rep_zh)
+    return replace_multiple(" ".join(output_list), rep)
 
 
-def save_to_json(
-    input_dict: Ldata,
-    output_file: str,
-    func: Callable[[str], str],
-    fix_dict: Optional[Ldata] = None,
-    output_folder: str = "output",
-) -> None:
-    """
-    将生成的语言文件保存至JSON。
+def save_to_json(input_dict: Ldata, config: Dict) -> None:
+    """将生成的语言文件保存至JSON。
 
     Args:
         input_dict (Ldata): 输入的数据
-        output_file (str): 保存的文件名，无格式后缀
-        func (Callable[[str], str]): 生成语言文件所用的函数
-        fix_dict (Optional[Ldata], optional): 语言文件中需要修复的内容. 默认为None
-        output_folder (str, optional): 保存的文件夹，默认为“output”
+        config (Dict): 配置字典，必须包含“func”与“output_file”，可选“auto_cut”、“rep”、“fixed_dict”、“output_folder”
     """
 
     start_time = time.time()
-    full_file_name = f"{output_file}.json"
-    output_dict = {k: func(v) for k, v in input_dict.items()}
-    if fix_dict:
-        output_dict.update(fix_dict)
-    file_path = P / output_folder / full_file_name
+
+    func = config["func"]
+
+    auto_cut = config.get("auto_cut", True)
+    rep = config.get("rep", rep_zh)
+
+    output_dict = {}
+    for k, v in input_dict.items():
+        func_signature = inspect.signature(func)
+        kwargs = {}
+        if "auto_cut" in func_signature.parameters and auto_cut is not None:
+            kwargs["auto_cut"] = auto_cut
+        if "rep" in func_signature.parameters and rep is not None:
+            kwargs["rep"] = rep
+        output_dict[k] = func(v, **kwargs)
+
+    output_dict.update(fixed_zh_u)
+    if config.get("fixed_dict"):
+        output_dict.update(config["fixed_dict"])
+    file_path = (
+        P / config.get("output_folder", "output") / f"{config['output_file']}.json"
+    )
     with open(file_path, "w", encoding="utf-8") as j:
         json.dump(output_dict, j, indent=2, ensure_ascii=False)
     elapsed_time = time.time() - start_time
     size = f"{round(file_path.stat().st_size / 1024, 2)} KB"
-    print(f"已生成语言文件“{full_file_name}”，大小{size}，耗时{elapsed_time:.2f} s。")
+    print(
+        f"已生成语言文件“{config['output_file']}.json”，大小{size}，耗时{elapsed_time:.2f} s。"
+    )