Skip to content

Commit

Permalink
Fix
Browse files Browse the repository at this point in the history
  • Loading branch information
SkyEye-FAST committed Aug 15, 2024
1 parent f8676f2 commit afaf607
Show file tree
Hide file tree
Showing 10 changed files with 213 additions and 248 deletions.
169 changes: 95 additions & 74 deletions converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import json
import re
import time
from typing import Callable, Optional, List, Set
import inspect
from typing import Dict, List, Set

from romajitable import to_kana as tk
from pypinyin import Style, lazy_pinyin, load_phrases_dict
Expand All @@ -23,8 +24,8 @@
jieba.load_userdict(str(P / "data" / "dict.txt"))

# Initialize other custom data
# NOTE(review): presumably entries applied to every generated file — confirm against save_to_json
fixed_zh_u = load_json("fixed_zh_universal")
tone_to_ipa: Ldata = {"1": "˥", "2": "˧˥", "3": "˨˩˦", "4": "˥˩", "5": ""} # IPA tone letters, keyed by tone digit

rep_ja_kk: Ldata = load_json("rep_ja_kk") # Katakana replacement fixes
manyoganas_dict: Ldata = load_json("manyogana") # Man'yogana

Expand Down Expand Up @@ -75,21 +76,10 @@ def capitalize_titles(text: str) -> str:
str: 转换结果
"""

def title_case_content(content: str) -> str:
"""
将书名号中的内容首字母大写。
Args:
content (str): 书名号中的内容。
Returns:
str: 首字母大写后的书名号内容。
"""

return " ".join(word.capitalize() for word in content.split())

return re.sub(
r"《(.*?)》", lambda match: f"《{title_case_content(match.group(1))}》", text
r"《(.*?)》",
lambda match: f"《{' '.join(word.capitalize() for word in match.group(1).split())}》",
text,
)


Expand All @@ -102,7 +92,7 @@ def add_apostrophes(input_list: List[str], values: Set[str]) -> List[str]:
values (Set[str]): 有效的拼写
Returns:
list: 处理结果
List[str]: 处理结果
"""

for i in range(1, len(input_list)):
Expand All @@ -116,6 +106,21 @@ def add_apostrophes(input_list: List[str], values: Set[str]) -> List[str]:
return input_list


def segment_str(text: str, auto_cut: bool = True) -> List[str]:
    """Split a string into segments.

    Args:
        text (str): The string to segment.
        auto_cut (bool, optional): When True (the default) the text is
            segmented with ``jieba.lcut``; otherwise it is split on
            whitespace with ``str.split``.

    Returns:
        List[str]: The segments, in their original order.
    """

    if auto_cut:
        return jieba.lcut(text)
    return text.split()


def to_katakana(text: str) -> str:
"""
将字符串中的英文转写为片假名。
Expand All @@ -141,35 +146,39 @@ def to_manyogana(text: str) -> str:
str: 转换结果
"""

return "".join([manyoganas_dict.get(char, char) for char in to_katakana(text)])
return "".join(manyoganas_dict.get(char, char) for char in to_katakana(text))


def to_pinyin(text: str, rep: Ldata, auto_cut: bool = True) -> str:
    """Transliterate the Chinese characters in a string into Pinyin.

    Tries to follow the word division of GB/T 16159-2012; segments are
    separated by spaces.

    Args:
        text (str): The string to convert.
        rep (Ldata): Replacements applied to the joined result to adjust
            its formatting.
        auto_cut (bool, optional): Whether to segment the text
            automatically. Defaults to True.

    Returns:
        str: The converted string.
    """

    output_list: List[str] = []

    for seg in segment_str(text, auto_cut):
        syllables = lazy_pinyin(seg, style=Style.TONE)
        # Insert the syllable-dividing apostrophe: a non-initial item that
        # starts with one of ``finals`` gets a leading "'" when the item
        # before it ends in a letter. ``[-1:]`` (rather than ``[-1]``) keeps
        # an empty preceding item from raising IndexError.
        marked = [
            (
                f"'{py}"
                if i > 0
                and py.startswith(finals)
                and syllables[i - 1][-1:].isalpha()
                else py
            )
            for i, py in enumerate(syllables)
        ]
        output_list.append("".join(marked))

    result = " ".join(output_list)
    return capitalize_lines(capitalize_titles(replace_multiple(result, rep)))


def to_ipa(text: str) -> str:
Expand All @@ -186,7 +195,7 @@ def to_ipa(text: str) -> str:

pinyin_list = lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True)
ipa_list = [
f"{pinyin_to["ipa"].get(p[:-1], p[:-1])}{tone_to_ipa.get(p[-1], p[-1])}"
f"{pinyin_to['ipa'].get(p[:-1], p[:-1])}{tone_to_ipa.get(p[-1], p[-1])}"
for p in pinyin_list
]
return " ".join(ipa_list)
Expand All @@ -206,128 +215,140 @@ def to_bopomofo(text: str) -> str:
return " ".join(lazy_pinyin(text, style=Style.BOPOMOFO))


def to_wadegiles(text: str, rep: Ldata, auto_cut: bool = True) -> str:
    """Transliterate the Chinese characters in a string into Wade-Giles.

    Syllables inside a segment are joined with hyphens; segments are
    separated by spaces.

    Args:
        text (str): The string to convert.
        rep (Ldata): Replacements applied to the joined result to adjust
            its formatting.
        auto_cut (bool, optional): Whether to segment the text
            automatically. Defaults to True.

    Returns:
        str: The converted string.
    """

    table = pinyin_to["wadegiles"]  # loop-invariant lookup
    output_list: List[str] = []

    for seg in segment_str(text, auto_cut):
        pinyin_list = lazy_pinyin(seg, style=Style.TONE3, neutral_tone_with_five=True)
        # Items without a table entry are passed through unchanged.
        wg_list = [table.get(p, p) for p in pinyin_list]
        output_list.append("-".join(wg_list))

    result = " ".join(output_list)
    return capitalize_lines(capitalize_titles(replace_multiple(result, rep)))


def to_romatzyh(text: str, rep: Ldata, auto_cut: bool = True) -> str:
    """Transliterate the Chinese characters in a string into Gwoyeu Romatzyh.

    Segments are separated by spaces.

    Args:
        text (str): The string to convert.
        rep (Ldata): Replacements applied to the joined result to adjust
            its formatting.
        auto_cut (bool, optional): Whether to segment the text
            automatically. Defaults to True.

    Returns:
        str: The converted string.
    """

    table = pinyin_to["romatzyh"]  # loop-invariant lookup
    output_list: List[str] = []

    for seg in segment_str(text, auto_cut):
        # NOTE(review): "不" is replaced by the literal "bu" before the pinyin
        # lookup — presumably to pin its spelling; confirm the intent.
        seg = seg.replace("不", "bu")
        pinyin_list = lazy_pinyin(seg, style=Style.TONE3, neutral_tone_with_five=True)
        # Items without a table entry are passed through unchanged.
        gr_list = [table.get(p, p) for p in pinyin_list]
        output_list.append("".join(add_apostrophes(gr_list, gr_values)))

    result = " ".join(output_list)
    return capitalize_lines(capitalize_titles(replace_multiple(result, rep)))


def to_cyrillic(text: str, rep: Ldata, auto_cut: bool = True) -> str:
    """Transliterate the Chinese characters in a string into Cyrillic.

    Uses the Palladius system; segments are separated by spaces.

    Args:
        text (str): The string to convert.
        rep (Ldata): Replacements applied to the joined result to adjust
            its formatting.
        auto_cut (bool, optional): Whether to segment the text
            automatically. Defaults to True.

    Returns:
        str: The converted string.
    """

    table = pinyin_to["cyrillic"]  # loop-invariant lookup
    output_list: List[str] = []

    for seg in segment_str(text, auto_cut):
        pinyin_list = lazy_pinyin(seg)
        # Items without a table entry are passed through unchanged.
        cy_list = [table.get(p, p) for p in pinyin_list]
        output_list.append("".join(add_apostrophes(cy_list, cy_values)))

    result = " ".join(output_list)
    return capitalize_lines(capitalize_titles(replace_multiple(result, rep)))


def to_xiaojing(text: str, rep: Ldata, auto_cut: bool = True) -> str:
    """Transliterate the Chinese characters in a string into Xiao'erjing.

    Items inside a segment are joined with a zero-width non-joiner (U+200C);
    segments are separated by spaces. Unlike the Latin/Cyrillic converters
    in this file, no capitalisation step is applied to the result.

    Args:
        text (str): The string to convert.
        rep (Ldata): Replacements applied to the joined result to adjust
            its formatting.
        auto_cut (bool, optional): Whether to segment the text
            automatically. Defaults to True.

    Returns:
        str: The converted string.
    """

    table = pinyin_to["xiaojing"]  # loop-invariant lookup
    output_list: List[str] = []

    for seg in segment_str(text, auto_cut):
        pinyin_list = lazy_pinyin(seg)
        # Items without a table entry are passed through unchanged.
        xj_list = [table.get(p, p) for p in pinyin_list]
        output_list.append("\u200c".join(xj_list))

    return replace_multiple(" ".join(output_list), rep)


def save_to_json(input_dict: Ldata, config: Dict) -> None:
    """Generate a language file and save it as JSON.

    Every value of ``input_dict`` is passed through ``config["func"]``; the
    entries of ``fixed_zh_u`` and then of ``config["fixed_dict"]`` (if given
    and non-empty) overwrite the generated ones. The result is written to
    ``P / output_folder / "<output_file>.json"`` and a summary line with the
    file size and elapsed time is printed.

    Args:
        input_dict (Ldata): The source data.
        config (Dict): Configuration. Keys read here:
            ``func`` (required): the conversion function.
            ``output_file`` (required): file name without extension.
            ``auto_cut`` (optional, default True): forwarded to ``func``
            if it accepts an ``auto_cut`` parameter and the value is not None.
            ``rep`` (optional, default ``rep_zh``): forwarded to ``func``
            if it accepts a ``rep`` parameter and the value is not None.
            ``fixed_dict`` (optional): extra entries overriding the output.
            ``output_folder`` (optional, default "output"): target folder.
    """

    start_time = time.time()

    func = config["func"]
    auto_cut = config.get("auto_cut", True)
    rep = config.get("rep", rep_zh)

    # The signature and the forwarded keyword arguments do not depend on the
    # item being converted, so work them out once instead of per entry.
    accepted = inspect.signature(func).parameters
    kwargs = {}
    if "auto_cut" in accepted and auto_cut is not None:
        kwargs["auto_cut"] = auto_cut
    if "rep" in accepted and rep is not None:
        kwargs["rep"] = rep

    output_dict = {k: func(v, **kwargs) for k, v in input_dict.items()}

    # Fixed entries win over generated ones; per-config fixes win over both.
    output_dict.update(fixed_zh_u)
    if config.get("fixed_dict"):
        output_dict.update(config["fixed_dict"])

    file_name = f"{config['output_file']}.json"
    file_path = P / config.get("output_folder", "output") / file_name
    with open(file_path, "w", encoding="utf-8") as j:
        json.dump(output_dict, j, indent=2, ensure_ascii=False)

    elapsed_time = time.time() - start_time
    size = f"{round(file_path.stat().st_size / 1024, 2)} KB"
    print(f"已生成语言文件“{file_name}”,大小{size},耗时{elapsed_time:.2f} s。")
Loading

0 comments on commit afaf607

Please sign in to comment.