Adding prototype translation functionality

lxyu · Nov 19, 2016 · 60dff93 · 60dff93
1 parent 287e34a
commit 60dff93
Show file tree

Hide file tree

Showing 7 changed files with 224 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,6 @@
 *.pyc
 MANIFEST
 dist
+*~
+\#*
+*.egg-info/
diff --git a/Makefile b/Makefile
@@ -0,0 +1,10 @@
+# None of these targets produces a file named after itself. Declare them
+# phony so that an existing file or directory called `tests`, `cedict` or
+# `pep8` can never make a target look up to date and be skipped.
+.PHONY: tests cedict pep8
+
+# Run the pinyin tests under both interpreters; the cedict tests are run
+# with python3 only.
+tests:
+	python test_pinyin.py
+	python3 test_pinyin.py
+	python3 test_cedict.py
+
+# Download the gzip-compressed CC-CEDICT export into the package directory.
+cedict:
+	wget https://www.mdbg.net/chindict/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz -O -  > pinyin/cedict.txt.gz
+
+# Style-check the top-level scripts and the package sources.
+pep8:
+	pep8 *py pinyin/*py
diff --git a/README.rst b/README.rst
@@ -41,3 +41,27 @@ Usage
 .. note::
 
     `format` must be one of: numerical/diacritical/strip
+
+Prototype Chinese->English
+--------------------------
+
+.. code:: python
+
+    >>> import pinyin.cedict
+    >>> pinyin.cedict.translate_word('你')
+    ['you (informal, as opposed to courteous 您[nin2])']
+    >>> pinyin.cedict.translate_word('你好')
+    ['Hello!', 'Hi!', 'How are you?']
+    >>> list(pinyin.cedict.all_phrase_translations('你好'))
+    [['你', ['you (informal, as opposed to courteous 您[nin2])']], ['你好', ['Hello!', 'Hi!', 'How are you?']], ['好', ['to be fond of', 'to have a tendency to', 'to be prone to']]]
+
+Note that this is a prototype, and it only works on Python 3.
+
+License
+-------
+
+pinyin is free software, under an MIT-style license. See LICENSE for
+details. The data file for translations is licensed under CC-BY-SA
+3.0. The translations are from the CC-CEDICT project
+(https://cc-cedict.org/wiki/), by Denisowski, Peterson, Brelsford, and
+others.
diff --git a/pinyin/cedict.py b/pinyin/cedict.py
@@ -0,0 +1,138 @@
+#!/usr/local/bin/python
+# -*- coding: utf-8 -*-
+
+'''
+A utility to translate Mandarin to English. The data file is from
+the CEDICT project. We keep it compressed for size and speed.
+
+Reading the dictionary takes a substantial amount of time (1-2
+seconds). That cost is paid the first time one of the lookup
+functions is called, not at import time, so importing this module
+is free.
+
+Note that this is a prototype. It is specific to Python 3. To bring
+this code to the quality level of the rest of the library, it would
+need to be backported to Python 2.
+
+In addition, it'd be nice if `all_phrase_translations` handled both
+traditional and simplified elegantly.
+'''
+
+import gzip
+import os.path
+import re
+import string
+
+import collections
+
+
+def Tree():
+    '''
+    Create a search-tree node: a `defaultdict` whose missing keys are
+    filled in with further `Tree` nodes, so a path of any depth can be
+    assigned without creating the intermediate levels by hand.
+    '''
+    node = collections.defaultdict(Tree)
+    return node
+
+
+# Used for single word lookup. None until init() has populated it; after
+# that it is keyed by 'traditional' / 'simplified', each mapping a word
+# to its list of meanings.
+dictionaries = None
+# Used for parsing. None until init() has populated it; after that it is
+# keyed by 'traditional' / 'simplified', each holding a character search
+# tree built from Tree() nodes.
+trees = None
+
+
+def _add_to_tree(tree, word, meaning):
+    '''
+    Store `meaning` in the search tree under the path spelled out by
+    the characters of `word`. Each character selects a child node, and
+    the meaning is kept under the empty-string key of the node reached
+    by the last character. For example:
+      你 You
+      你好 Hello
+    would build the tree
+      你 -+- '' -> You
+          +- 好 --- '' -> Hello
+    '''
+    node = tree
+    # Walk (and, for defaultdict nodes, create) one level per character.
+    for character in word:
+        node = node[character]
+    node[''] = meaning
+
+
+def init():
+    '''
+    Load in the Chinese-English dictionary. This takes 1-2 seconds. It
+    is done when the other functions are used, but this is public since
+    preloading sometimes makes sense.
+
+    Reads `cedict.txt.gz` from the directory containing this module and
+    fills the module-level `dictionaries` and `trees`, each keyed by
+    'traditional' and 'simplified'. Lines starting with '#' and lines
+    that do not look like a dictionary entry are skipped. When the same
+    word appears on several lines, the last one read wins.
+
+    Raises whatever `gzip.open` raises if the data file cannot be read;
+    in that case the module-level tables are left untouched.
+    '''
+    global dictionaries, trees
+
+    # Build into locals and publish only after a complete, successful
+    # load. Assigning the globals up front would leave truthy but empty
+    # tables behind if reading failed, and the lazy `if not ...: init()`
+    # checks in the lookup functions would then never retry.
+    loaded_dictionaries = {
+        'traditional': {},
+        'simplified': {}
+    }
+
+    loaded_trees = {
+        'traditional': Tree(),
+        'simplified': Tree()
+    }
+
+    # Entry format: "traditional simplified [pin1 yin1] /meaning/meaning/"
+    exp = re.compile(r"^([^ ]+) ([^ ]+) \[(.*)\] /(.+)/")
+    path = os.path.join(os.path.dirname(__file__), "cedict.txt.gz")
+
+    # The context manager closes the gzip handle even if parsing fails.
+    with gzip.open(path, mode='rt', encoding='utf-8') as lines:
+        for line in lines:
+            if line.startswith('#'):
+                continue
+            match = exp.match(line)
+            if match is None:
+                # Blank or malformed line: nothing to index.
+                continue
+            traditional, simplified, _pinyin, meaning = match.groups()
+            meaning = meaning.split('/')
+            loaded_dictionaries['traditional'][traditional] = meaning
+            loaded_dictionaries['simplified'][simplified] = meaning
+            _add_to_tree(loaded_trees['traditional'], traditional, meaning)
+            _add_to_tree(loaded_trees['simplified'], simplified, meaning)
+
+    dictionaries = loaded_dictionaries
+    trees = loaded_trees
+
+
+def translate_word(word, dictionary=('simplified',)):
+    '''
+    Return the list of translations for a single character or word, or
+    None if it is not in any of the requested dictionaries.
+
+    `dictionary` names the dictionaries to search, in order; the first
+    one containing `word` wins. Each name must be 'simplified' or
+    'traditional'. A single name may also be passed as a plain string.
+
+    The dictionary data is loaded on first use.
+    '''
+    if not dictionaries:
+        init()
+    # A bare string would otherwise be iterated character by character
+    # and fail with a KeyError on its first letter.
+    if isinstance(dictionary, str):
+        dictionary = (dictionary,)
+    for d in dictionary:
+        if word in dictionaries[d]:
+            return dictionaries[d][word]
+    return None
+
+
+def _words_at_the_beginning(word, tree, prefix=""):
+    '''
+    Collect every dictionary entry that starts at the root of `tree`
+    and continues along the leading characters of `word`. Each match
+    is returned as a `[text, meaning]` pair, shortest first, where
+    `text` is `prefix` plus the characters consumed so far. `prefix`
+    lets the caller supply the characters that led to `tree`.
+    '''
+    matches = []
+    node = tree
+    text = prefix
+    # The node we start on may itself complete a word.
+    if "" in node:
+        matches.append([text, node[""]])
+    for character in word:
+        if character not in node:
+            # No dictionary word continues with this character.
+            break
+        node = node[character]
+        text += character
+        if "" in node:
+            matches.append([text, node[""]])
+    return matches
+
+
+def all_phrase_translations(phrase):
+    '''
+    Yield the translations for all possible words in a full phrase, as
+    `[word, meanings]` pairs. The phrase is split on whitespace, and
+    for every character position the dictionary words starting there
+    are yielded, shortest first. Only the simplified dictionary is
+    searched.
+
+    Chinese is sometimes ambiguous. We do not attempt to disambiguate,
+    or handle unknown letters especially well: characters that start
+    no dictionary word are skipped. Full parsing is left to upstream
+    logic.
+
+    This is a generator; the dictionary data is loaded when iteration
+    starts, if it has not been loaded already.
+    '''
+    if not trees:
+        init()
+    tree = trees['simplified']
+    # split() with no argument splits on runs of any whitespace. Passing
+    # string.whitespace would look for that exact six-character string
+    # as the separator and so never split anything.
+    for word in phrase.split():
+        for start, character in enumerate(word):
+            # Test membership instead of indexing: the tree nodes create
+            # missing keys on access, so indexing an unknown character
+            # would add an empty node to the shared tree.
+            if character not in tree:
+                continue
+            for translation in _words_at_the_beginning(
+                    word[start+1:],
+                    tree[character],
+                    prefix=character):
+                yield translation
diff --git a/pinyin/cedict.txt.gz b/pinyin/cedict.txt.gz
diff --git a/setup.py b/setup.py
@@ -10,7 +10,9 @@
     author='Lx Yu',
     author_email='[email protected]',
     packages=['pinyin', ],
-    package_data={'': ['LICENSE'], 'pinyin': ['Mandarin.dat'], },
+    package_data={
+        '': ['LICENSE'],
+        'pinyin': ['Mandarin.dat', 'cedict.txt.gz'], },
     entry_points={"console_scripts": ["pinyin = pinyin.cmd:pinyin", ]},
     url='http://lxyu.github.io/pinyin/',
     license="BSD",

diff --git a/test_cedict.py b/test_cedict.py
@@ -0,0 +1,46 @@
+# -*- coding: utf-8 -*-
+
+import unittest
+
+import pinyin.cedict
+
+
+class BasicTestSuite(unittest.TestCase):
+    """Smoke tests for the CEDICT-backed translation prototype."""
+
+    def test_translate_word(self):
+        """Single-word lookups return the expected list of meanings."""
+        expectations = [
+            ("你好", ['Hello!', 'Hi!', 'How are you?']),
+            ("你", ['you (informal, as opposed to courteous 您[nin2])']),
+        ]
+        for word, meanings in expectations:
+            self.assertEqual(
+                list(pinyin.cedict.translate_word(word)),
+                meanings
+            )
+
+    def test_all_phrase_translations(self):
+        """Phrase parsing yields every dictionary word, in order."""
+        ni = ['你', ['you (informal, as opposed to courteous 您[nin2])']]
+        ni_hao = ['你好', ['Hello!', 'Hi!', 'How are you?']]
+        hao = ['好', [
+            'to be fond of', 'to have a tendency to',
+            'to be prone to']]
+        self.assertEqual(
+            list(pinyin.cedict.all_phrase_translations("你好")),
+            [ni, ni_hao, hao]
+        )
+
+        # The doubled character is reported once per occurrence.
+        guai = ['乖', [
+            '(of a child) obedient, well-behaved', 'clever',
+            'shrewd', 'alert', 'perverse', 'contrary to reason',
+            'irregular', 'abnormal']]
+        self.assertEqual(
+            list(pinyin.cedict.all_phrase_translations("小兔子乖乖")),
+            [['小', ['small', 'tiny', 'few', 'young']],
+             ['兔', ['rabbit']],
+             ['兔子', ['hare', 'rabbit', 'CL:隻|只[zhi1]']],
+             ['子', ['(noun suffix)']],
+             guai,
+             guai]
+        )
+
+# Run the suite when this file is executed directly as a script.
+if __name__ == '__main__':
+    unittest.main()