Update

SkyEye-FAST · Aug 22, 2024 · 43c0ee1
1 parent 039eb64
commit 43c0ee1
Show file tree

Hide file tree

Showing 6 changed files with 7,371 additions and 57 deletions.
diff --git a/base.py b/base.py
@@ -56,22 +56,23 @@ def file_size(p: Path):
 
 # 初始化其他自定义数据
 pinyin_to: Dict[str, Ldata] = {}
+pinyin_to["wadegiles"] = load_json("py2wg")  # 汉语拼音至威妥玛拼音
+pinyin_to["romatzyh"] = load_json("py2gr")  # 汉语拼音至国语罗马字
 pinyin_to["mps2"] = load_json("py2mps2")  # 汉语拼音至注音二式
-pinyin_to["typy"] = load_json("py2ty")  # 汉语拼音至通用拼音
+pinyin_to["tongyong"] = load_json("py2ty")  # 汉语拼音至通用拼音
 pinyin_to["yale"] = load_json("py2yale")  # 汉语拼音至耶鲁拼音
 pinyin_to["ipa"] = load_json("py2ipa")  # 汉语拼音至IPA
-pinyin_to["wadegiles"] = load_json("py2wg")  # 汉语拼音至威妥玛拼音
-pinyin_to["romatzyh"] = load_json("py2gr")  # 汉语拼音至国语罗马字
+pinyin_to["katakana"] = load_json("py2kk")  # 汉语拼音至片假名转写
 pinyin_to["cyrillic"] = load_json("py2cy")  # 汉语拼音至西里尔转写
 pinyin_to["xiaojing"] = load_json("py2xj")  # 汉语拼音至小儿经
 
 fixed_zh: Dict[str, Ldata] = {}
 fixed_zh["zh_py"] = load_json("fixed_zh_py")  # 汉语拼音修正
+fixed_zh["zh_wg"] = load_json("fixed_zh_wg")  # 威妥玛拼音修正
+fixed_zh["zh_gr"] = load_json("fixed_zh_gr")  # 国语罗马字修正
 fixed_zh["zh_mps2"] = load_json("fixed_zh_mps2")  # 注音二式修正
 fixed_zh["zh_ty"] = load_json("fixed_zh_ty")  # 通用拼音修正
 fixed_zh["zh_yale"] = load_json("fixed_zh_yale")  # 耶鲁拼音修正
-fixed_zh["zh_wg"] = load_json("fixed_zh_wg")  # 威妥玛拼音修正
-fixed_zh["zh_gr"] = load_json("fixed_zh_gr")  # 国语罗马字修正
 fixed_zh["zh_cy"] = load_json("fixed_zh_cy")  # 西里尔转写修正
 fixed_zh["zh_xj"] = load_json("fixed_zh_xj")  # 小儿经转写修正
 

diff --git a/converter.py b/converter.py
@@ -239,14 +239,22 @@ def to_pinyin(text: str, rep: Ldata, auto_cut: bool = True) -> str:
     return capitalize_lines(capitalize_titles(replace_multiple(result, rep)))
 
 
-def to_mps2(text: str, rep: Ldata, auto_cut: bool = True) -> str:
+def pinyin_to_other(
+    correspondence: Ldata,
+    text: str,
+    rep: Ldata,
+    auto_cut: bool = True,
+    delimiter: str = "-",
+) -> str:
     """
-    将字符串中的汉字转写为注音符号第二式，单字之间使用连字符分开，词之间使用空格分开。
+    将字符串中的汉字转写，单字之间使用delimiter定义的符号分开，词之间使用空格分开。
 
     Args:
+        correspondence (Ldata): 对应关系
         text (str): 需要转换的字符串
         rep (Ldata): 需要替换格式的内容
         auto_cut (bool, optional): 是否自动分词，默认为True
+        delimiter (str, optional): 分隔符，默认为'-'
 
     Returns:
         str: 转换结果
@@ -257,16 +265,16 @@ def to_mps2(text: str, rep: Ldata, auto_cut: bool = True) -> str:
 
     for seg in seg_list:
         pinyin_list = lazy_pinyin(seg, style=Style.TONE3, neutral_tone_with_five=True)
-        mps2_list = [pinyin_to["mps2"].get(p, p) for p in pinyin_list]
-        output_list.append("-".join(mps2_list))
+        result_list = [correspondence.get(p, p) for p in pinyin_list]
+        output_list.append(delimiter.join(result_list))
 
     result = " ".join(output_list)
     return capitalize_lines(capitalize_titles(replace_multiple(result, rep)))
 
 
-def to_tongyong(text: str, rep: Ldata, auto_cut: bool = True) -> str:
+def to_mps2(text: str, rep: Ldata, auto_cut: bool = True) -> str:
     """
-    将字符串中的汉字转写为通用拼音，单字之间使用连字符分开，词之间使用空格分开。
+    将字符串中的汉字转写为注音符号第二式，单字之间使用连字符分开，词之间使用空格分开。
 
     Args:
         text (str): 需要转换的字符串
@@ -277,16 +285,23 @@ def to_tongyong(text: str, rep: Ldata, auto_cut: bool = True) -> str:
         str: 转换结果
     """
 
-    seg_list = segment_str(text, auto_cut)
-    output_list: List[str] = []
+    return pinyin_to_other(pinyin_to["mps2"], text, rep, auto_cut)
 
-    for seg in seg_list:
-        pinyin_list = lazy_pinyin(seg, style=Style.TONE3, neutral_tone_with_five=True)
-        typy_list = [pinyin_to["typy"].get(p, p) for p in pinyin_list]
-        output_list.append("-".join(typy_list))
 
-    result = " ".join(output_list)
-    return capitalize_lines(capitalize_titles(replace_multiple(result, rep)))
+def to_tongyong(text: str, rep: Ldata, auto_cut: bool = True) -> str:
+    """
+    将字符串中的汉字转写为通用拼音，单字之间使用连字符分开，词之间使用空格分开。
+
+    Args:
+        text (str): 需要转换的字符串
+        rep (Ldata): 需要替换格式的内容
+        auto_cut (bool, optional): 是否自动分词，默认为True
+
+    Returns:
+        str: 转换结果
+    """
+
+    return pinyin_to_other(pinyin_to["tongyong"], text, rep, auto_cut)
 
 
 def to_yale(text: str, rep: Ldata, auto_cut: bool = True) -> str:
@@ -302,16 +317,7 @@ def to_yale(text: str, rep: Ldata, auto_cut: bool = True) -> str:
         str: 转换结果
     """
 
-    seg_list = segment_str(text, auto_cut)
-    output_list: List[str] = []
-
-    for seg in seg_list:
-        pinyin_list = lazy_pinyin(seg, style=Style.TONE3, neutral_tone_with_five=True)
-        yale_list = [pinyin_to["yale"].get(p, p) for p in pinyin_list]
-        output_list.append("-".join(yale_list))
-
-    result = " ".join(output_list)
-    return capitalize_lines(capitalize_titles(replace_multiple(result, rep)))
+    return pinyin_to_other(pinyin_to["yale"], text, rep, auto_cut)
 
 
 def to_ipa(text: str) -> str:
@@ -363,16 +369,7 @@ def to_wadegiles(text: str, rep: Ldata, auto_cut: bool = True) -> str:
         str: 转换结果
     """
 
-    seg_list = segment_str(text, auto_cut)
-    output_list: List[str] = []
-
-    for seg in seg_list:
-        pinyin_list = lazy_pinyin(seg, style=Style.TONE3, neutral_tone_with_five=True)
-        wg_list = [pinyin_to["wadegiles"].get(p, p) for p in pinyin_list]
-        output_list.append("-".join(wg_list))
-
-    result = " ".join(output_list)
-    return capitalize_lines(capitalize_titles(replace_multiple(result, rep)))
+    return pinyin_to_other(pinyin_to["wadegiles"], text, rep, auto_cut)
 
 
 def to_romatzyh(text: str, rep: Ldata, auto_cut: bool = True) -> str:
@@ -402,6 +399,22 @@ def to_romatzyh(text: str, rep: Ldata, auto_cut: bool = True) -> str:
     return capitalize_lines(capitalize_titles(replace_multiple(result, rep)))
 
 
+def pinyin_to_katakana(text: str) -> str:
+    """
+    将字符串中的汉字转写为片假名。
+
+    Args:
+        text (str): 需要转换的字符串
+
+    Returns:
+        str: 转换结果
+    """
+
+    pinyin_list = lazy_pinyin(text)
+    kana_list = [f"{pinyin_to['katakana'].get(p, p)}" for p in pinyin_list]
+    return " ".join(kana_list)
+
+
 def to_cyrillic(text: str, rep: Ldata, auto_cut: bool = True) -> str:
     """
     将字符串中的汉字转写为西里尔字母，使用帕拉季音标体系。