Skip to content

Commit

Permalink
Update
Browse files Browse the repository at this point in the history
  • Loading branch information
SkyEye-FAST committed Aug 17, 2024
1 parent a8a0d45 commit efb2bef
Show file tree
Hide file tree
Showing 5 changed files with 6,948 additions and 4 deletions.
20 changes: 20 additions & 0 deletions base.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,26 @@ def load_json(file: str, folder: str = "data") -> Ldata:
return json.load(f)


def file_size(p: Path) -> str:
    """
    Return the size of a file as a human-readable string.

    Sizes of 1 MiB (1,048,576 bytes) or more are reported in MB; anything
    smaller is reported in KB. Values are rounded to two decimal places.

    Args:
        p (Path): Path of the file whose size is measured.

    Returns:
        str: The formatted size, e.g. "1.5 MB" or "12.34 KB".

    Raises:
        OSError: If the file cannot be stat'ed (for example, it does not exist).
    """

    bytes_per_kb = 1024
    bytes_per_mb = 1024 * 1024

    size_in_bytes = p.stat().st_size
    # Use >= so a file of exactly 1 MiB reads "1.0 MB" rather than "1024.0 KB".
    if size_in_bytes >= bytes_per_mb:
        return f"{round(size_in_bytes / bytes_per_mb, 2)} MB"
    return f"{round(size_in_bytes / bytes_per_kb, 2)} KB"


# Load the language files
data: Dict[str, Ldata] = {
lang_name: load_json(lang_name, "mc_lang/full") for lang_name in ["en_us", "zh_cn"]
Expand Down
43 changes: 41 additions & 2 deletions converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,17 @@
import jieba
from opencc import OpenCC

from base import P, Ldata, load_json, pinyin_to, gr_values, cy_values, finals, rep_zh
from base import (
P,
Ldata,
load_json,
file_size,
pinyin_to,
gr_values,
cy_values,
finals,
rep_zh,
)

# Initialize the OpenCC converter from the config file at P/GujiCC/opencc/s2c.json
opencc_s2c = OpenCC(str(P / "GujiCC" / "opencc" / "s2c.json"))
Expand Down Expand Up @@ -124,6 +134,35 @@ def segment_str(text: str, auto_cut: bool = True) -> List[str]:
return jieba.lcut(text) if auto_cut else text.split()


def to_i7h(text: str) -> str:
    """
    Abbreviate every word in a string as a numeronym (e.g. "i18n").

    Each word keeps its first and last characters, and the characters in
    between are replaced by their count. Words of length 2 or less are left
    unchanged. A "word" is any maximal run matching the regex ``\\w+``; all
    other characters (spaces, punctuation) are preserved as-is.

    Args:
        text (str): The string to convert.

    Returns:
        str: The converted string.
    """

    def abbreviate(match: "re.Match") -> str:
        word = match.group()
        if len(word) <= 2:
            return word
        return f"{word[0]}{len(word) - 2}{word[-1]}"

    # Substitute in a single pass so each word is rewritten exactly where it
    # was matched. Repeated str.replace calls on the progressively modified
    # text could hit an earlier abbreviation instead of the intended word
    # (e.g. "hello h3o" would wrongly become "h1o h3o").
    return re.sub(r"\w+", abbreviate, text)


def to_katakana(text: str, rep: Ldata) -> str:
"""
将字符串中的英文转写为片假名。
Expand Down Expand Up @@ -394,5 +433,5 @@ def save_to_json(
file_path = P / output_folder / f"{output_file}.json"
with open(file_path, "w", encoding="utf-8") as j:
json.dump(input_dict, j, indent=2, ensure_ascii=False)
size = f"{round(file_path.stat().st_size / 1024, 2)} KB"
size = file_size(file_path)
print(f"已生成语言文件“{output_file}.json”,大小{size},耗时{elapsed_time:.2f} s。")
Loading

0 comments on commit efb2bef

Please sign in to comment.