Skip to content

Commit

Permalink
Attempts to use OCR to identify confused fonts
Browse files Browse the repository at this point in the history
  • Loading branch information
chettoy committed Apr 27, 2022
1 parent 21c01e2 commit 84a1402
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 2 deletions.
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ FxxkStar works with [Python](https://www.python.org/download/) version 3.10 or a
**requirements**

- `lxml`, `beautifulsoup4`, `requests`, `brotli`, `zstandard`
- `fonttools`, `pytesseract` (for identifying confused fonts)



You can install requirements with the following command (using TUNA mirror):

Expand All @@ -64,10 +67,14 @@ pip install -i https://pypi.tuna.tsinghua.edu.cn/simple brotli
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple lxml
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple beautifulsoup4
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple zstandard
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple fonttools
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pytesseract
```



Note: Installing `pytesseract` requires additional steps; see the [pytesseract package page](https://pypi.python.org/pypi/pytesseract) for more information.

## Contributing

We'd love to have your helping hand on `FxxkStar`!
Expand Down
129 changes: 127 additions & 2 deletions fxxkstar.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@
from concurrent.futures import Future, ThreadPoolExecutor
from dataclasses import dataclass
from typing import List
from collections import Counter
from fontTools.ttLib import TTFont
from PIL import Image, ImageDraw, ImageFont
import pytesseract


VERSION_NAME = "FxxkStar 0.9"
Expand All @@ -43,6 +47,7 @@
'auto_submit_work': True,

'video_only_mode': False,
'experimental_fix_fonts': False,

'save_paper_to_file': False,

Expand Down Expand Up @@ -1384,8 +1389,8 @@ def __init__(self, fxxkstar: FxxkStar, attachment_item: dict, card_info: dict, c
self.paper_html: str = ""
self._load()
if G_CONFIG['save_paper_to_file']:
suffix = "1" if self.is_marked else ""
file_name = f"work_{self.work_id}_{suffix}.html"
suffix = "_1" if self.is_marked else ""
file_name = f"work_{self.work_id}{suffix}.html"
with open(f"temp/work/{file_name}", "w") as f:
f.write(self.paper_html)
print("[Work] ", self.title, file_name, " saved")
Expand Down Expand Up @@ -1489,6 +1494,10 @@ class MarkResultItem:
is_correct: bool | None = None

soup = BeautifulSoup(paper_page_html, "lxml")
if G_CONFIG['experimental_fix_fonts'] and soup.find("div", class_="font-cxsecret"):
print("[INFO] detect secret font")
paper_page_html = test_fix_ttf(paper_page_html)
soup = BeautifulSoup(paper_page_html, "lxml")

# Parse paper status
top_div = soup.find("div", class_="ZyTop")
Expand Down Expand Up @@ -2623,6 +2632,122 @@ def choose_course_and_study(self) -> None:
print()


def test_fix_ttf(html_text: str) -> str:
    """Replace characters drawn with an embedded "secret" font by what OCR reads.

    The page is searched for a base64 TTF embedded as a ``data:`` URL.  Each
    glyph of that font is rendered to an image and read back with Tesseract
    (``chi_sim``); the resulting character-to-character table is then applied
    to the whole HTML text.

    Args:
        html_text: HTML source of the page.

    Returns:
        The HTML with every mapped character substituted.  The input is
        returned unchanged when no embedded font is found or when OCR yields
        no usable mapping.

    Side effects:
        Writes ``temp/cxsecret/tmp.ttf`` and the rendered glyph strips under
        ``temp/cxsecret/img/``, and prints OCR diagnostics to stdout.
    """
    # Local import so this block does not rely on a file-level ``import os``.
    import os

    def recog_glyph(utext_list, image_font, image_path) -> list:
        """Render ``utext_list`` side by side and OCR the strip.

        Returns a list of ``(source_char, recognized_char)`` tuples aligned
        with ``utext_list``, or ``[]`` when the OCR output length does not
        match the number of rendered characters (no alignment is possible).
        """
        cell = 40  # each glyph is drawn into a 40x40 px cell
        strip = Image.new(mode='L', size=(cell * len(utext_list), cell),
                          color=255)
        draw = ImageDraw.Draw(strip)
        for i, u_text in enumerate(utext_list):
            draw.text((i * cell, 0), u_text, font=image_font, fill=0)

        strip.save(image_path)
        # Re-read the saved file (as the original did) but make sure the
        # file handle is released afterwards.
        with Image.open(image_path) as img:
            text = pytesseract.image_to_string(img, lang="chi_sim")
            if not text:
                text = pytesseract.image_to_string(
                    img, lang="chi_sim", config='--psm 10')
        text = text.strip().replace('\n', '').replace(' ', '')
        print(text)
        print(len(text), len(utext_list))
        if len(text) != len(utext_list):
            return []
        return list(zip(utext_list, text))

    def translate(font_path) -> list:
        """Build ``[(font_char, recognized_char), ...]`` for the given font.

        Characters are OCR'ed in shuffled groups; a reading only counts when
        the same group read in reversed order gives the same result for that
        position.  The most frequent reading per character wins.
        """
        font = TTFont(font_path)
        image_font = ImageFont.truetype(font_path, size=40)

        # Decode glyph names such as "uniXXXX" into the characters they name.
        utext_list = []
        for name in font.getGlyphOrder():
            if name == '.notdef':
                continue
            if name[:3] == 'uni':
                escaped = name.replace('uni', '\\u')
            elif name[:2] == 'uF':
                escaped = name.replace('uF', '\\u')
            else:
                continue
            utext_list.append(json.loads(f'"{escaped}"'))

        # Without this guard the refill loop below would never terminate,
        # because extending with an empty list never reaches ``width``.
        if not utext_list:
            return []

        os.makedirs("temp/cxsecret/img", exist_ok=True)

        t_dict = {u_text: [] for u_text in utext_list}
        utext_remains = utext_list.copy()
        group_index = 0
        group_max = (len(utext_list) / 20 + 1) * 4
        width = 15
        while True:
            # Top the queue up with still-unsettled characters, then take the
            # next ``width`` of them in random order.
            while len(utext_remains) < width:
                utext_remains.extend(utext_list)
            process_list = utext_remains[:width]
            del utext_remains[:width]
            random.shuffle(process_list)

            image_path1 = f"temp/cxsecret/img/{group_index}.png"
            image_path2 = f"temp/cxsecret/img/{group_index}_r.png"

            current_result = recog_glyph(process_list, image_font, image_path1)
            current_result2 = recog_glyph(
                process_list[::-1], image_font, image_path2)
            current_result2.reverse()
            read_len = min(len(current_result), len(current_result2))

            failed_list = []
            for i, pending in enumerate(process_list):
                if i < read_len and current_result[i] == current_result2[i]:
                    u_text, recognized = current_result[i]
                    t_list = t_dict[u_text]
                    t_list.append(recognized)
                    # Stop re-queuing a character once it has enough samples,
                    # but keep a few around so groups can still be filled.
                    if (len(t_list) > 3 and u_text in utext_list
                            and len(utext_list) > 5):
                        utext_list.remove(u_text)
                else:
                    failed_list.append(pending)
            utext_remains.extend(failed_list)

            group_index += 1
            if group_index > group_max / 2:
                width = 10  # second half of the budget uses shorter strips
            if group_index > group_max:
                break

        result = []
        for u_text, t_list in t_dict.items():
            counter = Counter(t_list)
            print(u_text, counter)
            if not counter:
                # Never read consistently: leave this character untouched
                # instead of failing on an empty ``most_common`` result.
                continue
            result.append((u_text, counter.most_common(1)[0][0]))
        return result

    def fix_fonts(html):
        """Extract the embedded font from ``html`` and apply its OCR mapping."""
        secret_search = re.search(
            r"url\('data:application/font-ttf;charset=utf-8;base64,(.*?)'\)", html)
        if secret_search is None:
            # No embedded font in this page: nothing to fix.
            return html

        os.makedirs("temp/cxsecret", exist_ok=True)
        with open("temp/cxsecret/tmp.ttf", "wb") as f:
            f.write(base64.b64decode(secret_search.group(1)))

        text_map = translate("temp/cxsecret/tmp.ttf")
        mapping = {s1: s2 for (s1, s2) in text_map if s1 and s1 != s2}
        if not mapping:
            return html

        # Substitute in a single pass.  Chained ``str.replace`` calls would
        # re-translate a character that an earlier replacement produced
        # whenever that character is itself a key of the mapping.
        pattern = re.compile("|".join(
            re.escape(key) for key in sorted(mapping, key=len, reverse=True)))
        return pattern.sub(lambda m: mapping[m.group(0)], html)

    return fix_fonts(html_text)


def before_start() -> None:
"print some info before start"
print()
Expand Down

0 comments on commit 84a1402

Please sign in to comment.