Skip to content

Commit

Permalink
Update
Browse files Browse the repository at this point in the history
  • Loading branch information
SkyEye-FAST committed Aug 22, 2024
1 parent 039eb64 commit 43c0ee1
Show file tree
Hide file tree
Showing 6 changed files with 7,371 additions and 57 deletions.
11 changes: 6 additions & 5 deletions base.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,22 +56,23 @@ def file_size(p: Path):

# Initialize the other custom data tables
pinyin_to: Dict[str, Ldata] = {}
pinyin_to["wadegiles"] = load_json("py2wg") # Hanyu Pinyin to Wade-Giles
pinyin_to["romatzyh"] = load_json("py2gr") # Hanyu Pinyin to Gwoyeu Romatzyh
pinyin_to["mps2"] = load_json("py2mps2") # Hanyu Pinyin to Mandarin Phonetic Symbols II (MPS II)
pinyin_to["typy"] = load_json("py2ty") # Hanyu Pinyin to Tongyong Pinyin
pinyin_to["tongyong"] = load_json("py2ty") # Hanyu Pinyin to Tongyong Pinyin
pinyin_to["yale"] = load_json("py2yale") # Hanyu Pinyin to Yale romanization
pinyin_to["ipa"] = load_json("py2ipa") # Hanyu Pinyin to IPA
pinyin_to["wadegiles"] = load_json("py2wg") # Hanyu Pinyin to Wade-Giles
pinyin_to["romatzyh"] = load_json("py2gr") # Hanyu Pinyin to Gwoyeu Romatzyh
pinyin_to["katakana"] = load_json("py2kk") # Hanyu Pinyin to katakana transcription
pinyin_to["cyrillic"] = load_json("py2cy") # Hanyu Pinyin to Cyrillic transcription
pinyin_to["xiaojing"] = load_json("py2xj") # Hanyu Pinyin to Xiao'erjing

fixed_zh: Dict[str, Ldata] = {}
fixed_zh["zh_py"] = load_json("fixed_zh_py") # Hanyu Pinyin corrections
fixed_zh["zh_wg"] = load_json("fixed_zh_wg") # Wade-Giles corrections
fixed_zh["zh_gr"] = load_json("fixed_zh_gr") # Gwoyeu Romatzyh corrections
fixed_zh["zh_mps2"] = load_json("fixed_zh_mps2") # MPS II corrections
fixed_zh["zh_ty"] = load_json("fixed_zh_ty") # Tongyong Pinyin corrections
fixed_zh["zh_yale"] = load_json("fixed_zh_yale") # Yale romanization corrections
fixed_zh["zh_wg"] = load_json("fixed_zh_wg") # Wade-Giles corrections
fixed_zh["zh_gr"] = load_json("fixed_zh_gr") # Gwoyeu Romatzyh corrections
fixed_zh["zh_cy"] = load_json("fixed_zh_cy") # Cyrillic transcription corrections
fixed_zh["zh_xj"] = load_json("fixed_zh_xj") # Xiao'erjing transcription corrections

Expand Down
81 changes: 47 additions & 34 deletions converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,14 +239,22 @@ def to_pinyin(text: str, rep: Ldata, auto_cut: bool = True) -> str:
return capitalize_lines(capitalize_titles(replace_multiple(result, rep)))


def to_mps2(text: str, rep: Ldata, auto_cut: bool = True) -> str:
def pinyin_to_other(
correspondence: Ldata,
text: str,
rep: Ldata,
auto_cut: bool = True,
delimiter: str = "-",
) -> str:
"""
将字符串中的汉字转写为注音符号第二式,单字之间使用连字符分开,词之间使用空格分开。
将字符串中的汉字转写,单字之间使用delimiter定义的符号分开,词之间使用空格分开。
Args:
correspondence (Ldata): 对应关系
text (str): 需要转换的字符串
rep (Ldata): 需要替换格式的内容
auto_cut (bool, optional): 是否自动分词,默认为True
delimiter (str, optional): 分隔符,默认为'-'
Returns:
str: 转换结果
Expand All @@ -257,16 +265,16 @@ def to_mps2(text: str, rep: Ldata, auto_cut: bool = True) -> str:

for seg in seg_list:
pinyin_list = lazy_pinyin(seg, style=Style.TONE3, neutral_tone_with_five=True)
mps2_list = [pinyin_to["mps2"].get(p, p) for p in pinyin_list]
output_list.append("-".join(mps2_list))
result_list = [correspondence.get(p, p) for p in pinyin_list]
output_list.append(delimiter.join(result_list))

result = " ".join(output_list)
return capitalize_lines(capitalize_titles(replace_multiple(result, rep)))


def to_tongyong(text: str, rep: Ldata, auto_cut: bool = True) -> str:
def to_mps2(text: str, rep: Ldata, auto_cut: bool = True) -> str:
"""
将字符串中的汉字转写为通用拼音,单字之间使用连字符分开,词之间使用空格分开。
将字符串中的汉字转写为注音符号第二式,单字之间使用连字符分开,词之间使用空格分开。
Args:
text (str): 需要转换的字符串
Expand All @@ -277,16 +285,23 @@ def to_tongyong(text: str, rep: Ldata, auto_cut: bool = True) -> str:
str: 转换结果
"""

seg_list = segment_str(text, auto_cut)
output_list: List[str] = []
return pinyin_to_other(pinyin_to["mps2"], text, rep, auto_cut)

for seg in seg_list:
pinyin_list = lazy_pinyin(seg, style=Style.TONE3, neutral_tone_with_five=True)
typy_list = [pinyin_to["typy"].get(p, p) for p in pinyin_list]
output_list.append("-".join(typy_list))

result = " ".join(output_list)
return capitalize_lines(capitalize_titles(replace_multiple(result, rep)))
def to_tongyong(text: str, rep: Ldata, auto_cut: bool = True) -> str:
    """
    Transliterate the Chinese characters in a string into Tongyong Pinyin.

    Individual characters are separated by hyphens and words are separated
    by spaces.

    Args:
        text (str): The string to convert.
        rep (Ldata): The content whose formatting needs to be replaced.
        auto_cut (bool, optional): Whether to segment words automatically.
            Defaults to True.

    Returns:
        str: The conversion result.
    """

    # Delegate to the shared converter using the Tongyong correspondence table.
    tongyong_table = pinyin_to["tongyong"]
    return pinyin_to_other(tongyong_table, text, rep, auto_cut)


def to_yale(text: str, rep: Ldata, auto_cut: bool = True) -> str:
Expand All @@ -302,16 +317,7 @@ def to_yale(text: str, rep: Ldata, auto_cut: bool = True) -> str:
str: 转换结果
"""

seg_list = segment_str(text, auto_cut)
output_list: List[str] = []

for seg in seg_list:
pinyin_list = lazy_pinyin(seg, style=Style.TONE3, neutral_tone_with_five=True)
yale_list = [pinyin_to["yale"].get(p, p) for p in pinyin_list]
output_list.append("-".join(yale_list))

result = " ".join(output_list)
return capitalize_lines(capitalize_titles(replace_multiple(result, rep)))
return pinyin_to_other(pinyin_to["yale"], text, rep, auto_cut)


def to_ipa(text: str) -> str:
Expand Down Expand Up @@ -363,16 +369,7 @@ def to_wadegiles(text: str, rep: Ldata, auto_cut: bool = True) -> str:
str: 转换结果
"""

seg_list = segment_str(text, auto_cut)
output_list: List[str] = []

for seg in seg_list:
pinyin_list = lazy_pinyin(seg, style=Style.TONE3, neutral_tone_with_five=True)
wg_list = [pinyin_to["wadegiles"].get(p, p) for p in pinyin_list]
output_list.append("-".join(wg_list))

result = " ".join(output_list)
return capitalize_lines(capitalize_titles(replace_multiple(result, rep)))
return pinyin_to_other(pinyin_to["wadegiles"], text, rep, auto_cut)


def to_romatzyh(text: str, rep: Ldata, auto_cut: bool = True) -> str:
Expand Down Expand Up @@ -402,6 +399,22 @@ def to_romatzyh(text: str, rep: Ldata, auto_cut: bool = True) -> str:
return capitalize_lines(capitalize_titles(replace_multiple(result, rep)))


def pinyin_to_katakana(text: str) -> str:
    """
    Transliterate the Chinese characters in a string into katakana.

    Args:
        text (str): The string to convert.

    Returns:
        str: The conversion result, with the output for each item returned
            by ``lazy_pinyin`` joined by single spaces.
    """

    # Items with no entry in the katakana table are passed through unchanged.
    return " ".join(
        f"{pinyin_to['katakana'].get(syllable, syllable)}"
        for syllable in lazy_pinyin(text)
    )


def to_cyrillic(text: str, rep: Ldata, auto_cut: bool = True) -> str:
"""
将字符串中的汉字转写为西里尔字母,使用帕拉季音标体系。
Expand Down
Loading

0 comments on commit 43c0ee1

Please sign in to comment.