Attempts to use OCR to identify confused fonts

chettoy · Apr 27, 2022 · 84a1402 · 84a1402
1 parent 21c01e2
commit 84a1402
Show file tree

Hide file tree

Showing 2 changed files with 134 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -55,6 +55,9 @@ FxxkStar works with [Python](https://www.python.org/download/) version 3.10 or a
 **requirements**
 
 - `lxml`, `beautifulsoup4`, `requests`, `brotli`, `zstandard`
+- `fonttools`, `pytesseract` (for identifying confused fonts)
+
+
 
 You can install requirements with the following command (using TUNA mirror):
 
@@ -64,10 +67,14 @@ pip install -i https://pypi.tuna.tsinghua.edu.cn/simple brotli
 pip install -i https://pypi.tuna.tsinghua.edu.cn/simple lxml
 pip install -i https://pypi.tuna.tsinghua.edu.cn/simple beautifulsoup4
 pip install -i https://pypi.tuna.tsinghua.edu.cn/simple zstandard
+pip install -i https://pypi.tuna.tsinghua.edu.cn/simple fonttools
+pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pytesseract
 ```
 
 
 
+Note: `pytesseract` needs additional setup beyond `pip install` (the Tesseract OCR engine and its `chi_sim` language data must be installed separately); see the [pytesseract package page](https://pypi.python.org/pypi/pytesseract) for more information.
+
 ## Contributing
 
 We'd love to have your helping hand on `FxxkStar`! 

diff --git a/fxxkstar.py b/fxxkstar.py
@@ -20,6 +20,10 @@
 from concurrent.futures import Future, ThreadPoolExecutor
 from dataclasses import dataclass
 from typing import List
+from collections import Counter
+from fontTools.ttLib import TTFont
+from PIL import Image, ImageDraw, ImageFont
+import pytesseract
 
 
 VERSION_NAME = "FxxkStar 0.9"
@@ -43,6 +47,7 @@
     'auto_submit_work': True,
 
     'video_only_mode': False,
+    'experimental_fix_fonts': False,
 
     'save_paper_to_file': False,
 
@@ -1384,8 +1389,8 @@ def __init__(self, fxxkstar: FxxkStar, attachment_item: dict, card_info: dict, c
         self.paper_html: str = ""
         self._load()
         if G_CONFIG['save_paper_to_file']:
-            suffix = "1" if self.is_marked else ""
-            file_name = f"work_{self.work_id}_{suffix}.html"
+            suffix = "_1" if self.is_marked else ""
+            file_name = f"work_{self.work_id}{suffix}.html"
             with open(f"temp/work/{file_name}", "w") as f:
                 f.write(self.paper_html)
             print("[Work] ", self.title, file_name, " saved")
@@ -1489,6 +1494,10 @@ class MarkResultItem:
             is_correct: bool | None = None
 
         soup = BeautifulSoup(paper_page_html, "lxml")
+        if G_CONFIG['experimental_fix_fonts'] and soup.find("div", class_="font-cxsecret"):
+            print("[INFO] detect secret font")
+            paper_page_html = test_fix_ttf(paper_page_html)
+            soup = BeautifulSoup(paper_page_html, "lxml")
 
         # Parse paper status
         top_div = soup.find("div", class_="ZyTop")
@@ -2623,6 +2632,122 @@ def choose_course_and_study(self) -> None:
             print()
 
 
+def test_fix_ttf(html_text: str) -> str:
+    """Replace characters drawn by an embedded font with OCR-recognized ones.
+
+    Experimental. The page is searched for a TrueType font embedded as a
+    base64 ``data:`` URL. Every glyph named ``uniXXXX`` or ``uFXXXX`` (four
+    hex digits) is rendered to an image and read back with Tesseract
+    (``chi_sim``). Each such character in the HTML is then replaced by the
+    character OCR reported most often for it.
+
+    Side effects: writes ``temp/cxsecret/tmp.ttf`` and debug images under
+    ``temp/cxsecret/img`` (directories are created when missing), and prints
+    OCR diagnostics to stdout.
+
+    Args:
+        html_text: HTML source of the page.
+
+    Returns:
+        The HTML with recognized characters substituted. The input is
+        returned unchanged when no embedded font is found or when the font
+        contains no usable glyph names.
+    """
+    # Local import so this function does not rely on a module-level ``os``.
+    import os
+
+    font_file = "temp/cxsecret/tmp.ttf"
+    image_dir = "temp/cxsecret/img"
+
+    def translate(font_path) -> list:
+        """Return ``(font_char, recognized_char)`` pairs for the font's glyphs."""
+        font = TTFont(font_path)
+        try:
+            glyph_names = font.getGlyphOrder()
+        finally:
+            # Release the file handle so tmp.ttf can be rewritten later.
+            font.close()
+        image_font = ImageFont.truetype(font_path, size=40)
+
+        # Only names of the form prefix + exactly four hex digits map to a
+        # single character; anything else ('.notdef', suffixed names, ...)
+        # is skipped instead of crashing or producing multi-character keys.
+        utext_list = []
+        for name in glyph_names:
+            matched = re.fullmatch(r"(?:uni|uF)([0-9A-Fa-f]{4})", name)
+            if matched:
+                utext_list.append(chr(int(matched.group(1), 16)))
+        if not utext_list:
+            # Nothing to recognize; the refill loop below would never end.
+            return []
+
+        # Votes collected per font character.
+        t_dict = {u_text: [] for u_text in utext_list}
+
+        utext_remains = utext_list.copy()
+        group_index = 0
+        group_max = (len(utext_list) / 20 + 1) * 4
+        width = 15
+        while True:
+            # Refill the queue so a full row of glyphs is always available.
+            while len(utext_remains) < width:
+                utext_remains.extend(utext_list)
+            process_list = utext_remains[:width]
+            del utext_remains[:width]
+            random.shuffle(process_list)
+
+            image_path1 = f"{image_dir}/{group_index}.png"
+            image_path2 = f"{image_dir}/{group_index}_r.png"
+
+            # Recognize the row in both directions; a reading only counts
+            # when both passes agree on it.
+            forward = recog_glyph(process_list, image_font, image_path1)
+            backward = recog_glyph(
+                process_list[::-1], image_font, image_path2)
+            backward.reverse()
+            read_len = min(len(forward), len(backward))
+
+            failed_list = []
+            for i, u_text in enumerate(process_list):
+                if i < read_len and forward[i] == backward[i]:
+                    t_list = t_dict[u_text]
+                    t_list.append(forward[i][1])
+                    # Stop re-queuing glyphs that already have enough votes,
+                    # but keep a few around so rows can still be filled.
+                    if (len(t_list) > 3 and u_text in utext_list
+                            and len(utext_list) > 5):
+                        utext_list.remove(u_text)
+                else:
+                    failed_list.append(u_text)
+            utext_remains.extend(failed_list)
+
+            group_index += 1
+            if group_index > group_max / 2:
+                # Use shorter rows for the second half of the rounds.
+                width = 10
+            if group_index > group_max:
+                break
+
+        result = []
+        for u_text, t_list in t_dict.items():
+            counter = Counter(t_list)
+            print(u_text, counter)
+            if not counter:
+                # Never recognized consistently: leave this character as is.
+                continue
+            result.append((u_text, counter.most_common(1)[0][0]))
+        return result
+
+    def recog_glyph(utext_list, image_font, image_path) -> list:
+        """OCR one row of glyphs.
+
+        Returns ``(font_char, recognized_char)`` pairs, or ``[]`` when the
+        number of recognized characters differs from the number drawn.
+        """
+        img = Image.new(mode='L', size=(40*len(utext_list), 40), color=255)
+        draw = ImageDraw.Draw(img)
+        for i, u_text in enumerate(utext_list):
+            draw.text((i*40, 0), u_text, font=image_font, fill=0)
+
+        # Saved for debugging only; OCR runs on the in-memory image so no
+        # extra file handle is opened and left unclosed.
+        img.save(image_path)
+        text = pytesseract.image_to_string(img, lang="chi_sim")
+        if not text.strip():
+            # Output containing only whitespace/page separators counts as
+            # empty; retry with a different page segmentation mode.
+            text = pytesseract.image_to_string(
+                img, lang="chi_sim", config='--psm 10')
+        text = text.strip().replace('\n', '').replace(' ', '')
+        print(text)
+        print(len(text), len(utext_list))
+        if len(text) != len(utext_list):
+            return []
+        return list(zip(utext_list, text))
+
+    def fix_fonts(html):
+        """Extract the embedded font, build the mapping and apply it."""
+        secret_search = re.search(
+            r"url\('data:application/font-ttf;charset=utf-8;base64,(.*?)'\)", html)
+        if secret_search is None:
+            # No embedded font in this page: nothing to fix.
+            return html
+        os.makedirs(image_dir, exist_ok=True)
+        with open(font_file, "wb") as f:
+            f.write(base64.b64decode(secret_search.group(1)))
+        text_map = translate(font_file)
+        # Single-pass translation: chained str.replace calls could replace a
+        # character that an earlier pair had just substituted in.
+        return html.translate(str.maketrans(dict(text_map)))
+
+    return fix_fonts(html_text)
+
+
 def before_start() -> None:
     "print some info before start"
     print()