Skip to content

Commit

Permalink
Update
Browse files Browse the repository at this point in the history
  • Loading branch information
SkyEye-FAST committed Aug 14, 2024
1 parent 4029e6a commit 038f6b7
Show file tree
Hide file tree
Showing 17 changed files with 2,924 additions and 2,297 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
__pycache__/

*.zip
55 changes: 55 additions & 0 deletions base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# -*- encoding: utf-8 -*-
"""基础文件"""

import json
from pathlib import Path
from typing import TypeAlias, Dict, Set, Tuple

# Absolute path of the directory that contains this file
P = Path(__file__).resolve().parent

# Type alias: a flat mapping from string keys to string values
Ldata: TypeAlias = Dict[str, str]


def load_json(file: str, folder: str = "data") -> Ldata:
    """
    Read a JSON file that lives below this module's directory.

    Args:
        file (str): Name of the file to load, without the ".json" suffix.
        folder (str, optional): Folder that holds the file, relative to
            this module's directory. Defaults to "data".

    Returns:
        Ldata: The parsed content of the file.
    """

    json_path = P / folder / f"{file}.json"
    with json_path.open("r", encoding="utf-8") as handle:
        content = json.load(handle)
    return content


# Load the language files
data: Dict[str, Ldata] = {
    "en_us": load_json("en_us", "mc_lang/full"),
    "zh_cn": load_json("zh_cn", "mc_lang/full"),
}

# Load the other custom data: conversion tables keyed by target scheme
pinyin_to: Dict[str, Ldata] = {
    "ipa": load_json("py2ipa"),  # Hanyu Pinyin to IPA
    "wadegiles": load_json("py2wg"),  # Hanyu Pinyin to Wade-Giles
    "romatzyh": load_json("py2gr"),  # Hanyu Pinyin to Gwoyeu Romatzyh
    "cyrillic": load_json("py2cy"),  # Hanyu Pinyin to Cyrillic transcription
    "xiaojing": load_json("py2xj"),  # Hanyu Pinyin to Xiao'erjing
}

# Manual corrections, keyed by generated language code
fixed_zh: Dict[str, Ldata] = {
    "zh_py": load_json("fixed_zh_py"),  # Hanyu Pinyin corrections
    "zh_wg": load_json("fixed_zh_wg"),  # Wade-Giles corrections
    "zh_gr": load_json("fixed_zh_gr"),  # Gwoyeu Romatzyh corrections
    "zh_cy": load_json("fixed_zh_cy"),  # Cyrillic transcription corrections
    "zh_xj": load_json("fixed_zh_xj"),  # Xiao'erjing corrections
}

# Valid spellings, collected from the values of the conversion tables
gr_values: Set[str] = {*pinyin_to["romatzyh"].values()}  # Gwoyeu Romatzyh
cy_values: Set[str] = {*pinyin_to["cyrillic"].values()}  # Cyrillic transcription

# Replacement fixes applied to the joined-up Chinese transcriptions
rep_zh: Ldata = load_json("rep_zh")
# Vowels that can start a zero-initial syllable: a, o and e, each in its
# plain form and with the four tone marks (macron, acute, caron, grave).
# The third-tone "ǎ" must be present, otherwise syllables such as "ǎi" are
# not recognised as zero-initial by str.startswith(finals).
finals: Tuple[str, ...] = tuple("aāáǎàoōóǒòeēéěè")
333 changes: 333 additions & 0 deletions converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,333 @@
# -*- encoding: utf-8 -*-
"""难视语言转换器"""

import json
import re
import time
from typing import Callable, Optional, List, Set

from romajitable import to_kana as tk
from pypinyin import Style, lazy_pinyin, load_phrases_dict
from pypinyin_dict.phrase_pinyin_data import cc_cedict, di
import jieba

from base import P, Ldata, load_json, pinyin_to, gr_values, cy_values, finals, rep_zh

# Initialise pypinyin.
# NOTE(review): cc_cedict.load() and di.load() presumably register the
# pypinyin_dict phrase data with pypinyin; the order of these calls and of
# load_phrases_dict below may matter for which entry wins - confirm before
# reordering.
cc_cedict.load()
di.load()
phrases = load_json("phrases")
# Each value in "phrases" is split on whitespace and every piece is wrapped in
# its own one-element list before being handed to load_phrases_dict.
load_phrases_dict({k: [[_] for _ in v.split()] for k, v in phrases.items()})

# Initialise jieba with the user dictionary at data/dict.txt
jieba.load_userdict(str(P / "data" / "dict.txt"))

# Initialise the other custom data
tone_to_ipa: Ldata = {"1": "˥", "2": "˧˥", "3": "˨˩˦", "4": "˥˩", "5": ""} # IPA tone letters, keyed by tone digit

rep_ja_kk: Ldata = load_json("rep_ja_kk") # Katakana replacement fixes
manyoganas_dict: Ldata = load_json("manyogana") # Man'yogana


def replace_multiple(text: str, replacements: Ldata) -> str:
    """
    Apply a series of substring replacements to a string.

    The replacements are applied one after another in the mapping's
    iteration order, so a later replacement also sees the output of the
    earlier ones.

    Args:
        text (str): The string to process.
        replacements (Ldata): Mapping of substring to its replacement.

    Returns:
        str: The string after all replacements.
    """

    result = text
    for target in replacements:
        result = result.replace(target, replacements[target])
    return result


def capitalize_lines(text: str) -> str:
    """
    Upper-case the first character of every line of a string.

    Lines are delimited by "\\n" only. Everything else, including a trailing
    newline, "\\r" and other Unicode line separators, is kept exactly as it
    was. Only the first character of each line is touched; the remainder of
    the line keeps its case.

    Args:
        text (str): The string to convert.

    Returns:
        str: The string with each line's first character upper-cased.
    """

    # str.splitlines() would drop a trailing newline and would also split on
    # (and then discard) "\r\n", "\r" and several other separators, so split
    # on "\n" explicitly to make the join an exact inverse.
    return "\n".join(line[:1].upper() + line[1:] for line in text.split("\n"))


def capitalize_titles(text: str) -> str:
    """
    Capitalise every word found between Chinese title marks (《 and 》).

    Args:
        text (str): The string to convert.

    Returns:
        str: The string with the words inside each pair of title marks
        capitalised and separated by single spaces.
    """

    def _retitle(found):
        """
        Rebuild one matched title with each of its words capitalised.

        Args:
            found: Regex match whose first group is the title content.

        Returns:
            str: The title marks around the capitalised content.
        """

        words = found.group(1).split()
        return "《" + " ".join(map(str.capitalize, words)) + "》"

    return re.sub(r"《(.*?)》", _retitle, text)


def add_apostrophes(input_list: List[str], values: Set[str]) -> List[str]:
    """
    Insert syllable-separating apostrophes where a boundary is ambiguous.

    A boundary between two neighbouring syllables is ambiguous when the
    previous syllable can be cut into a non-empty head and a non-empty tail
    such that the head is a valid spelling and the tail followed by the
    current syllable is a valid spelling too. In that case the joined text
    could be read with the boundary further to the left, so the current
    syllable gets a leading apostrophe.

    The list is modified in place and also returned.

    Args:
        input_list (List[str]): The syllables of one word, in order.
        values (Set[str]): The set of valid spellings.

    Returns:
        List[str]: The same list, with apostrophes added where needed.
    """

    if not input_list:
        return input_list

    # Always compare against the syllable as it was spelled originally; an
    # apostrophe that has already been prepended must not take part in the
    # check for the following boundary.
    previous = input_list[0]
    for i in range(1, len(input_list)):
        current = input_list[i]
        # Head and tail must partition the previous syllable exactly, so one
        # cut position drives both slices (a "[-j:]" slice with j == 0 would
        # silently yield the whole string).
        ambiguous = any(
            previous[:cut] in values and previous[cut:] + current in values
            for cut in range(1, len(previous))
        )
        if ambiguous:
            input_list[i] = f"'{current}"
        previous = current

    return input_list


def to_katakana(text: str) -> str:
    """
    Transcribe the English in a string into katakana.

    Args:
        text (str): The string to convert.

    Returns:
        str: The katakana transcription after the replacement fixes.
    """

    katakana = tk(text).katakana
    return replace_multiple(katakana, rep_ja_kk)


def to_manyogana(text: str) -> str:
    """
    Transcribe a string into man'yogana by way of katakana.

    The text is first converted with to_katakana; every resulting character
    that has an entry in the man'yogana table is replaced, all others are
    kept unchanged.

    Args:
        text (str): The string to convert.

    Returns:
        str: The converted string.
    """

    converted = []
    for kana in to_katakana(text):
        converted.append(manyoganas_dict.get(kana, kana))
    return "".join(converted)


def to_pinyin(text: str) -> str:
    """
    Transcribe the Chinese characters in a string into Hanyu Pinyin.

    Word segmentation attempts to follow GB/T 16159-2012; words are
    separated by spaces.

    Args:
        text (str): The string to convert.

    Returns:
        str: The converted string.
    """

    words: List[str] = []

    for word in jieba.lcut(text):
        syllables = lazy_pinyin(word, style=Style.TONE)
        # Apostrophe before every non-initial syllable that begins with one
        # of the characters in "finals"
        marked = [
            syllable if index == 0 or not syllable.startswith(finals) else f"'{syllable}"
            for index, syllable in enumerate(syllables)
        ]
        words.append("".join(marked))

    # Tidy up the formatting
    formatted = replace_multiple(" ".join(words), rep_zh)
    titled = capitalize_titles(formatted)
    return capitalize_lines(titled)


def to_ipa(text: str) -> str:
    """
    Transcribe the Chinese characters in a string into IPA.

    Items are separated by spaces. The IPA data comes from @UntPhesoca and
    is a broad transcription.

    Only tokens that look like a numbered-tone syllable (alphabetic base
    followed by a tone digit known to tone_to_ipa) are converted. Every
    other token returned by lazy_pinyin is passed through unchanged, so
    digits and other non-Chinese text in the input are not mistaken for
    tone numbers.

    Args:
        text (str): The string to convert.

    Returns:
        str: The converted string.
    """

    ipa_table = pinyin_to["ipa"]
    ipa_list: List[str] = []

    for p in lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True):
        # Slicing (rather than indexing) keeps an empty token from raising.
        base, tone = p[:-1], p[-1:]
        if tone in tone_to_ipa and base.isalpha():
            # Syllables missing from the table keep their pinyin base, but
            # still get the IPA tone letters.
            ipa_list.append(ipa_table.get(base, base) + tone_to_ipa[tone])
        else:
            # Not a syllable (e.g. "5", "%1$s", "ok"): leave it untouched.
            ipa_list.append(p)

    return " ".join(ipa_list)


def to_bopomofo(text: str) -> str:
    """
    Transcribe the Chinese characters in a string into Bopomofo.

    Items are separated by spaces.

    Args:
        text (str): The string to convert.

    Returns:
        str: The converted string.
    """

    symbols = lazy_pinyin(text, style=Style.BOPOMOFO)
    return " ".join(symbols)


def to_wadegiles(text: str) -> str:
    """
    Transcribe the Chinese characters in a string into Wade-Giles.

    Syllables within a word are joined with hyphens; words are separated by
    spaces.

    Args:
        text (str): The string to convert.

    Returns:
        str: The converted string.
    """

    words: List[str] = []

    for word in jieba.lcut(text):
        syllables = lazy_pinyin(word, style=Style.TONE3, neutral_tone_with_five=True)
        # Syllables without a table entry are kept as they are
        words.append(
            "-".join(pinyin_to["wadegiles"].get(syllable, syllable) for syllable in syllables)
        )

    # Tidy up the formatting
    formatted = replace_multiple(" ".join(words), rep_zh)
    titled = capitalize_titles(formatted)
    return capitalize_lines(titled)


def to_romatzyh(text: str) -> str:
    """
    Transcribe the Chinese characters in a string into Gwoyeu Romatzyh.

    Words are separated by spaces.

    Args:
        text (str): The string to convert.

    Returns:
        str: The converted string.
    """

    words: List[str] = []

    for word in jieba.lcut(text):
        # The character "不" is swapped for the literal "bu" before the
        # pinyin lookup
        prepared = word.replace("不", "bu")
        syllables = lazy_pinyin(prepared, style=Style.TONE3, neutral_tone_with_five=True)
        spelled = [pinyin_to["romatzyh"].get(syllable, syllable) for syllable in syllables]
        separated = add_apostrophes(spelled, gr_values)
        words.append("".join(separated))

    # Tidy up the formatting
    formatted = replace_multiple(" ".join(words), rep_zh)
    titled = capitalize_titles(formatted)
    return capitalize_lines(titled)


def to_cyrillic(text: str) -> str:
    """
    Transcribe the Chinese characters in a string into Cyrillic.

    Uses the Palladius system; words are separated by spaces.

    Args:
        text (str): The string to convert.

    Returns:
        str: The converted string.
    """

    words: List[str] = []

    for word in jieba.lcut(text):
        spelled = [
            pinyin_to["cyrillic"].get(syllable, syllable) for syllable in lazy_pinyin(word)
        ]
        separated = add_apostrophes(spelled, cy_values)
        words.append("".join(separated))

    # Tidy up the formatting
    formatted = replace_multiple(" ".join(words), rep_zh)
    titled = capitalize_titles(formatted)
    return capitalize_lines(titled)


def to_xiaojing(text: str) -> str:
    """
    Transcribe the Chinese characters in a string into Xiao'erjing.

    Syllables within a word are joined with a zero-width non-joiner
    (U+200C); words are separated by spaces.

    Args:
        text (str): The string to convert.

    Returns:
        str: The converted string.
    """

    words: List[str] = []

    for word in jieba.lcut(text):
        spelled = (
            pinyin_to["xiaojing"].get(syllable, syllable) for syllable in lazy_pinyin(word)
        )
        words.append("\u200c".join(spelled))

    joined = " ".join(words)
    return replace_multiple(joined, rep_zh)


def save_to_json(
    input_dict: Ldata,
    output_file: str,
    func: Callable[[str], str],
    fix_dict: Optional[Ldata] = None,
    output_folder: str = "output",
) -> None:
    """
    Generate a language file and save it as JSON.

    Every value of the input mapping is converted with "func"; entries of
    "fix_dict", when given, then override the converted values. The result
    is written below this module's directory and a summary line with the
    file size and the elapsed time is printed.

    Args:
        input_dict (Ldata): The source data.
        output_file (str): Name of the file to write, without the ".json"
            suffix.
        func (Callable[[str], str]): Function used to convert each value.
        fix_dict (Optional[Ldata], optional): Entries that replace the
            generated ones. Defaults to None.
        output_folder (str, optional): Folder to write into, relative to
            this module's directory. Defaults to "output".
    """

    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments.
    start_time = time.perf_counter()
    full_file_name = f"{output_file}.json"
    output_dict = {k: func(v) for k, v in input_dict.items()}
    if fix_dict:
        output_dict.update(fix_dict)
    file_path = P / output_folder / full_file_name
    # Create the target folder on demand instead of failing on a fresh checkout.
    file_path.parent.mkdir(parents=True, exist_ok=True)
    with open(file_path, "w", encoding="utf-8") as j:
        json.dump(output_dict, j, indent=2, ensure_ascii=False)
    elapsed_time = time.perf_counter() - start_time
    size = f"{round(file_path.stat().st_size / 1024, 2)} KB"
    print(f"已生成语言文件“{full_file_name}”,大小{size},耗时{elapsed_time:.2f} s。")
Loading

0 comments on commit 038f6b7

Please sign in to comment.