Skip to content

Commit

Permalink
Update
Browse files Browse the repository at this point in the history
  • Loading branch information
SkyEye-FAST committed Aug 15, 2024
1 parent afaf607 commit 9c7d75e
Show file tree
Hide file tree
Showing 7 changed files with 87 additions and 108 deletions.
2 changes: 2 additions & 0 deletions base.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,5 @@ def load_json(file: str, folder: str = "data") -> Ldata:

rep_zh: Ldata = load_json("rep_zh") # 连写的中文转写方案替换修正
# Vowels that may begin a zero-initial syllable: a, o and e, each listed bare
# and with all four tone marks (macron, acute, caron, grave). The previous
# literal repeated "á" and omitted the third-tone "ǎ".
finals: Tuple[str, ...] = tuple("aāáǎàoōóǒòeēéěè")  # possible zero-initial openings

rep_ja_kk: Ldata = load_json("rep_ja_kk") # 片假名替换修正
78 changes: 52 additions & 26 deletions converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import re
import time
import inspect
from typing import Dict, List, Set
from typing import List, Set, Tuple, Callable, Optional

from romajitable import to_kana as tk
from pypinyin import Style, lazy_pinyin, load_phrases_dict
Expand All @@ -26,7 +26,6 @@
# 初始化其他自定义数据
fixed_zh_u = load_json("fixed_zh_universal")
tone_to_ipa: Ldata = {"1": "˥", "2": "˧˥", "3": "˨˩˦", "4": "˥˩", "5": ""} # IPA声调
rep_ja_kk: Ldata = load_json("rep_ja_kk") # 片假名替换修正
manyoganas_dict: Ldata = load_json("manyogana") # 万叶假名


Expand Down Expand Up @@ -121,32 +120,34 @@ def segment_str(text: str, auto_cut: bool = True) -> List[str]:
return jieba.lcut(text) if auto_cut else text.split()


def to_katakana(text: str) -> str:
def to_katakana(text: str, rep: Ldata) -> str:
"""
将字符串中的英文转写为片假名。
Args:
text (str): 需要转换的字符串
rep (Ldata): 需要替换格式的内容
Returns:
str: 转换结果
"""

return replace_multiple(tk(text).katakana, rep_ja_kk)
return replace_multiple(tk(text).katakana, rep)


def to_manyogana(text: str) -> str:
def to_manyogana(text: str, rep: Ldata) -> str:
"""
将字符串中的片假名转写为万叶假名。
Args:
text (str): 需要转换的字符串
rep (Ldata): 需要替换格式的内容
Returns:
str: 转换结果
"""

return "".join(manyoganas_dict.get(char, char) for char in to_katakana(text))
return "".join(manyoganas_dict.get(char, char) for char in to_katakana(text, rep))


def to_pinyin(text: str, rep: Ldata, auto_cut: bool = True) -> str:
Expand Down Expand Up @@ -314,22 +315,30 @@ def to_xiaojing(text: str, rep: Ldata, auto_cut: bool = True) -> str:
return replace_multiple(" ".join(output_list), rep)


def save_to_json(input_dict: Ldata, config: Dict) -> None:
"""将生成的语言文件保存至JSON。
def convert(
input_dict: Ldata,
func: Callable[[str], str],
fix_dict: Optional[Ldata] = None,
auto_cut: bool = True,
rep: Ldata = rep_zh,
) -> Tuple[Ldata, float]:
"""
转换语言数据。
Args:
input_dict (Ldata): 输入的数据
config (Dict): 含有配置的字典
func (Callable[[str], str]): 生成语言文件所用的函数
fix_dict (Optional[Ldata], optional): 语言文件中需要修复的内容. 默认为None
auto_cut (bool, optional): 是否自动分词,默认为True
rep (Ldata, optional): 需要替换的内容,默认为rep_zh的内容
Returns:
(Ldata, float): 转换结果及耗时
"""

start_time = time.time()

func = config["func"]

auto_cut = config.get("auto_cut", True)
rep = config.get("rep", rep_zh)

output_dict = {}
output_dict: Ldata = {}
for k, v in input_dict.items():
func_signature = inspect.signature(func)
kwargs = {}
Expand All @@ -339,16 +348,33 @@ def save_to_json(input_dict: Ldata, config: Dict) -> None:
kwargs["rep"] = rep
output_dict[k] = func(v, **kwargs)

output_dict.update(fixed_zh_u)
if config.get("fixed_dict"):
output_dict.update(config["fixed_dict"])
file_path = (
P / config.get("output_folder", "output") / f"{config['output_file']}.json"
)
with open(file_path, "w", encoding="utf-8") as j:
json.dump(output_dict, j, indent=2, ensure_ascii=False)
if rep is rep_zh:
output_dict.update(fixed_zh_u)

if fix_dict:
output_dict.update(fix_dict)

elapsed_time = time.time() - start_time

return output_dict, elapsed_time


def save_to_json(
input_data: Tuple[Ldata, float],
output_file: str,
output_folder: str = "output",
) -> None:
"""将生成的语言文件保存至JSON。
Args:
input_data (Tuple[Ldata, float]): 输入的数据
output_file (str): 保存的文件名,无格式后缀
output_folder (str, optional): 保存的文件夹,默认为“output”
"""

input_dict, elapsed_time = input_data
file_path = P / output_folder / f"{output_file}.json"
with open(file_path, "w", encoding="utf-8") as j:
json.dump(input_dict, j, indent=2, ensure_ascii=False)
size = f"{round(file_path.stat().st_size / 1024, 2)} KB"
print(
f"已生成语言文件“{config['output_file']}.json”,大小{size},耗时{elapsed_time:.2f} s。"
)
print(f"已生成语言文件“{output_file}.json”,大小{size},耗时{elapsed_time:.2f} s。")
3 changes: 2 additions & 1 deletion data/rep_ja_kk.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,6 @@
"サムエル・åベルグ": "サミュエル・オーバーグ",
"レナ・ライネ": "レナ・レイン",
"エン_ウス": "en_us",
"パラディストルäド": "パラダイスツリー"
"パラディストルäド": "パラダイスツリー",
"「・フ4・」":"[ F4 ]"
}
56 changes: 16 additions & 40 deletions fix_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from base import load_json
from converter import (
save_to_json,
convert,
to_pinyin,
to_wadegiles,
to_romatzyh,
Expand All @@ -15,52 +16,27 @@

fixed_zh_source = load_json("fixed_zh_source")
save_to_json(
fixed_zh_source,
{
"output_file": "fixed_zh_py",
"func": to_pinyin,
"output_folder": "data",
"auto_cut": False,
"rep": rep,
},
convert(fixed_zh_source, to_pinyin, auto_cut=False, rep=rep),
"fixed_zh_py",
"data",
)
save_to_json(
fixed_zh_source,
{
"output_file": "fixed_zh_wg",
"func": to_wadegiles,
"output_folder": "data",
"auto_cut": False,
"rep": rep,
},
convert(fixed_zh_source, to_wadegiles, auto_cut=False, rep=rep),
"fixed_zh_wg",
"data",
)
save_to_json(
fixed_zh_source,
{
"output_file": "fixed_zh_gr",
"func": to_romatzyh,
"output_folder": "data",
"auto_cut": False,
"rep": rep,
},
convert(fixed_zh_source, to_romatzyh, auto_cut=False, rep=rep),
"fixed_zh_gr",
"data",
)
save_to_json(
fixed_zh_source,
{
"output_file": "fixed_zh_cy",
"func": to_cyrillic,
"output_folder": "data",
"auto_cut": False,
"rep": rep,
},
convert(fixed_zh_source, to_cyrillic, auto_cut=False, rep=rep),
"fixed_zh_cy",
"data",
)
save_to_json(
fixed_zh_source,
{
"output_file": "fixed_zh_xj",
"func": to_xiaojing,
"output_folder": "data",
"auto_cut": False,
"rep": rep,
},
convert(fixed_zh_source, to_xiaojing, auto_cut=False, rep=rep),
"fixed_zh_xj",
"data",
)
4 changes: 2 additions & 2 deletions output/ja_kk.json
Original file line number Diff line number Diff line change
Expand Up @@ -4684,7 +4684,7 @@
"known_server_link.status": "スタツス",
"known_server_link.support": "スッポルト",
"known_server_link.website": "ヱブシテ",
"language.code": "zho-Hans_CN",
"language.code": "en_us",
"language.name": "エングリスホ",
"language.region": "ウニテド・スタテス",
"lanServer.otherPlayers": "セッチングス・フォル・オトヘル・プライェルス",
Expand Down Expand Up @@ -5547,7 +5547,7 @@
"painting.minecraft.sunflowers.author": "クリストッフェル・ゼッテルストランド",
"painting.minecraft.sunflowers.title": "スンフロヱルス",
"painting.minecraft.sunset.author": "クリストッフェル・ゼッテルストランド",
"painting.minecraft.sunset.title": "sunset_dense",
"painting.minecraft.sunset.title": "スンセト_デンセ",
"painting.minecraft.tides.author": "クリストッフェル・ゼッテルストランド",
"painting.minecraft.tides.title": "チデス",
"painting.minecraft.unpacked.author": "サラホ・ボエヴィング",
Expand Down
4 changes: 2 additions & 2 deletions output/ja_my.json
Original file line number Diff line number Diff line change
Expand Up @@ -4684,7 +4684,7 @@
"known_server_link.status": "須多川須",
"known_server_link.support": "須川保流止",
"known_server_link.website": "恵夫之天",
"language.code": "zho-Hans_CN",
"language.code": "en_us",
"language.name": "江尓具利須保",
"language.region": "宇仁天特・須多天須",
"lanServer.otherPlayers": "世川千尓具須・不於流・於止部流・不良伊江流須",
Expand Down Expand Up @@ -5547,7 +5547,7 @@
"painting.minecraft.sunflowers.author": "久利須止川不江流・是川天流須止良尓特",
"painting.minecraft.sunflowers.title": "須尓不呂恵流須",
"painting.minecraft.sunset.author": "久利須止川不江流・是川天流須止良尓特",
"painting.minecraft.sunset.title": "sunset_dense",
"painting.minecraft.sunset.title": "須尓世止_代尓世",
"painting.minecraft.tides.author": "久利須止川不江流・是川天流須止良尓特",
"painting.minecraft.tides.title": "千代須",
"painting.minecraft.unpacked.author": "散良保・番江无伊尓具",
Expand Down
48 changes: 11 additions & 37 deletions pack.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@
import time
import zipfile as zf

from base import P, data, fixed_zh
from base import P, data, fixed_zh, rep_ja_kk
from converter import (
save_to_json,
convert,
to_bopomofo,
to_cyrillic,
to_ipa,
Expand All @@ -26,42 +27,15 @@ def main() -> None:

# 生成语言文件
main_start_time = time.time()
save_to_json(
data["en_us"],
{"output_file": "ja_kk", "func": to_katakana},
)
save_to_json(
data["en_us"],
{"output_file": "ja_my", "func": to_manyogana},
)
save_to_json(
data["zh_cn"],
{"output_file": "zh_py", "func": to_pinyin, "fixed_dict": fixed_zh["zh_py"]},
)
save_to_json(
data["zh_cn"],
{"output_file": "zh_ipa", "func": to_ipa},
)
save_to_json(
data["zh_cn"],
{"output_file": "zh_bpmf", "func": to_bopomofo},
)
save_to_json(
data["zh_cn"],
{"output_file": "zh_wg", "func": to_wadegiles, "fixed_dict": fixed_zh["zh_wg"]},
)
save_to_json(
data["zh_cn"],
{"output_file": "zh_gr", "func": to_romatzyh, "fixed_dict": fixed_zh["zh_gr"]},
)
save_to_json(
data["zh_cn"],
{"output_file": "zh_cy", "func": to_cyrillic, "fixed_dict": fixed_zh["zh_cy"]},
)
save_to_json(
data["zh_cn"],
{"output_file": "zh_xj", "func": to_xiaojing, "fixed_dict": fixed_zh["zh_xj"]},
)
save_to_json(convert(data["en_us"], to_katakana, rep=rep_ja_kk), "ja_kk")
save_to_json(convert(data["en_us"], to_manyogana, rep=rep_ja_kk), "ja_my")
save_to_json(convert(data["zh_cn"], to_pinyin, fixed_zh["zh_py"]), "zh_py")
save_to_json(convert(data["zh_cn"], to_ipa), "zh_ipa")
save_to_json(convert(data["zh_cn"], to_bopomofo), "zh_bpmf")
save_to_json(convert(data["zh_cn"], to_wadegiles, fixed_zh["zh_wg"]), "zh_wg")
save_to_json(convert(data["zh_cn"], to_romatzyh, fixed_zh["zh_gr"]), "zh_gr")
save_to_json(convert(data["zh_cn"], to_cyrillic, fixed_zh["zh_cy"]), "zh_cy")
save_to_json(convert(data["zh_cn"], to_xiaojing, fixed_zh["zh_xj"]), "zh_xj")
main_elapsed_time = time.time() - main_start_time
print(f"\n语言文件生成完毕,共耗时{main_elapsed_time:.2f} s。")

Expand Down

0 comments on commit 9c7d75e

Please sign in to comment.