diff --git a/.gitignore b/.gitignore index a90dda7..63ecfd2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ *.pyc MANIFEST dist +*~ +\#* +*.egg-info/ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..23cf8f7 --- /dev/null +++ b/Makefile @@ -0,0 +1,10 @@ +tests: + python test_pinyin.py + python3 test_pinyin.py + python3 test_cedict.py + +cedict: + wget https://www.mdbg.net/chindict/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz -O - > pinyin/cedict.txt.gz + +pep8: + pep8 *py pinyin/*py diff --git a/README.rst b/README.rst index e93afc2..9b4fe36 100644 --- a/README.rst +++ b/README.rst @@ -41,3 +41,27 @@ Usage .. note:: `format` must be one of: numerical/diacritical/strip + +Prototype Chinese->English +-------------------------- + +.. code:: python + + >>> import pinyin.cedict + >>> pinyin.cedict.translate_word('你') + ['you (informal, as opposed to courteous 您[nin2])'] + >>> pinyin.cedict.translate_word('你好') + ['Hello!', 'Hi!', 'How are you?'] + >>> list(pinyin.cedict.all_phrase_translations('你好')) + [['你', ['you (informal, as opposed to courteous 您[nin2])']], ['你好', ['Hello!', 'Hi!', 'How are you?']], ['好', ['to be fond of', 'to have a tendency to', 'to be prone to']]] + +Note that this is a prototype, and it only works on Python 3. + +License +------- + +pinyin is free software, under an MIT-style license. See LICENSE for +details. The data file for translations is licensed under CC-BY-SA 3.0. The +translations are from the CC-CEDICT project +(https://cc-cedict.org/wiki/), by Denisowski, Peterson, Brelsford, and +others. diff --git a/pinyin/cedict.py b/pinyin/cedict.py new file mode 100644 index 0000000..69b6077 --- /dev/null +++ b/pinyin/cedict.py @@ -0,0 +1,138 @@ +#!/usr/local/bin/python +# -*- coding: utf-8 -*- + +''' +A utility to translate Mandarin to English. The data file is from +the CEDICT project. We keep it compressed for size and speed. + +Using this has substantial (1-2 second) init time for reading the +dictionary. 
import gzip
import os.path
import re
import string  # no longer used below; kept so the module namespace is unchanged

import collections


def Tree():
    '''
    Return a new, empty search-tree node.

    A node is a `defaultdict` whose missing keys are created on first
    subscript access as further `Tree` nodes, so paths can be built
    without explicit initialisation. Beware that *reading* `node[key]`
    for an absent key also creates it; use `in` or `.get()` for
    lookups that must not modify the tree.
    '''
    return collections.defaultdict(Tree)


dictionaries = None  # Used for single word lookup
trees = None  # Used for parsing.

# Shape of one CEDICT entry line:
#   TRADITIONAL SIMPLIFIED [pin1 yin1] /meaning 1/meaning 2/
# Raw string so that `\[` reaches the regex engine instead of being an
# invalid string escape.
_ENTRY_RE = re.compile(r"^([^ ]+) ([^ ]+) \[(.*)\] /(.+)/")


def _add_to_tree(tree, word, meaning):
    r'''
    Insert `word` into the search tree rooted at `tree`.

    We build word search trees, where we walk down
    the letters of a word. For example:
       你 You
       你好 Hello
    Would build the tree
                 你
                /  \
              You   好
                      \
                     Hello

    The meaning of a complete word is stored under the empty-string
    key of the node reached after consuming every character.
    '''
    node = tree
    for char in word:
        node = node[char]  # subscripting creates missing nodes on the way
    node[''] = meaning


def _parse_entries(lines):
    '''
    Yield `(traditional, simplified, pinyin, meanings)` for each entry
    in `lines`, an iterable of CEDICT text lines.

    `meanings` is the list obtained by splitting the definition field
    on `/`. Blank lines and `#` comment lines are skipped.

    Raises `ValueError` naming the offending line if a line is neither
    blank, a comment, nor a well-formed entry.
    '''
    for number, line in enumerate(lines, 1):
        if not line.strip() or line.startswith('#'):
            continue
        match = _ENTRY_RE.match(line)
        if match is None:
            raise ValueError(
                'Malformed CEDICT entry on line {0}: {1!r}'.format(
                    number, line))
        traditional, simplified, pinyin, meaning = match.groups()
        yield traditional, simplified, pinyin, meaning.split('/')


def init():
    '''
    Load in the Chinese-English dictionary. This takes 1-2 seconds. It
    is done when the other functions are used, but this is public since
    preloading sometimes makes sense.

    The module-level `dictionaries` and `trees` are only replaced once
    the whole data file has been read, so a failed load does not leave
    the module looking initialised but empty.
    '''
    global dictionaries, trees

    loaded_dictionaries = {
        'traditional': {},
        'simplified': {}
    }
    loaded_trees = {
        'traditional': Tree(),
        'simplified': Tree()
    }

    path = os.path.join(os.path.dirname(__file__), "cedict.txt.gz")
    # `with` guarantees the gzip handle is closed even if parsing fails.
    with gzip.open(path, mode='rt', encoding='utf-8') as lines:
        for traditional, simplified, _pinyin, meaning in \
                _parse_entries(lines):
            loaded_dictionaries['traditional'][traditional] = meaning
            loaded_dictionaries['simplified'][simplified] = meaning
            _add_to_tree(loaded_trees['traditional'], traditional, meaning)
            _add_to_tree(loaded_trees['simplified'], simplified, meaning)

    dictionaries = loaded_dictionaries
    trees = loaded_trees


def translate_word(word, dictionary=('simplified',)):
    '''
    Return the list of translations for a single character or word, or
    `None` if it is not in any of the requested dictionaries.

    `dictionary` is an iterable of dictionary names ('simplified'
    and/or 'traditional'), searched in order; a single name given as a
    plain string is accepted too.
    '''
    if not dictionaries:
        init()
    if isinstance(dictionary, str):
        # Iterating a bare string would look up one letter at a time.
        dictionary = (dictionary,)
    for name in dictionary:
        meanings = dictionaries[name].get(word)
        if meanings is not None:
            return meanings
    return None


def _words_at_the_beginning(word, tree, prefix=""):
    '''
    Return `[matched_word, meanings]` pairs for every dictionary word
    that starts at the beginning of `word`, shortest first.

    `tree` is the node already reached and `prefix` the text consumed
    to reach it, so the returned words are complete rather than just
    the remaining suffix.
    '''
    matches = []
    node = tree
    if "" in node:
        matches.append([prefix, node[""]])
    for char in word:
        # `in` (unlike subscripting) does not create a node for an
        # unknown character.
        if char not in node:
            break
        node = node[char]
        prefix += char
        if "" in node:
            matches.append([prefix, node[""]])
    return matches


def all_phrase_translations(phrase):
    '''
    Yield `[word, meanings]` for all possible words in a full phrase,
    using the simplified-character tree. Chinese is sometimes
    ambiguous. We do not attempt to disambiguate, or handle unknown
    letters especially well. Full parsing is left to upstream logic.

    The phrase is split on runs of whitespace; every starting position
    inside each chunk is tried, and characters with no dictionary entry
    are skipped without modifying the shared tree.
    '''
    if not trees:
        init()
    root = trees['simplified']
    for word in phrase.split():
        for start, char in enumerate(word):
            # `.get` never triggers the defaultdict factory, so unknown
            # characters do not pollute the tree.
            subtree = root.get(char)
            if subtree is None:
                continue
            for translation in _words_at_the_beginning(
                    word[start + 1:], subtree, prefix=char):
                yield translation
class BasicTestSuite(unittest.TestCase):
    """Basic checks of the CEDICT translation helpers."""

    def test_translate_word(self):
        """A known word or character returns its list of glosses."""
        hello = pinyin.cedict.translate_word("你好")
        self.assertEqual(list(hello), ['Hello!', 'Hi!', 'How are you?'])

        you = pinyin.cedict.translate_word("你")
        self.assertEqual(
            list(you),
            ['you (informal, as opposed to courteous 您[nin2])']
        )

    def test_all_phrase_translations(self):
        """Every dictionary word found inside a phrase is reported."""
        greeting = list(pinyin.cedict.all_phrase_translations("你好"))
        expected_greeting = [
            ['你', ['you (informal, as opposed to courteous 您[nin2])']],
            ['你好', ['Hello!', 'Hi!', 'How are you?']],
            ['好', [
                'to be fond of', 'to have a tendency to',
                'to be prone to']],
        ]
        self.assertEqual(greeting, expected_greeting)

        # 乖 occurs twice in the phrase, so its entry is expected twice.
        obedient = ['乖', [
            '(of a child) obedient, well-behaved', 'clever',
            'shrewd', 'alert', 'perverse', 'contrary to reason',
            'irregular', 'abnormal']]
        rhyme = list(pinyin.cedict.all_phrase_translations("小兔子乖乖"))
        expected_rhyme = [
            ['小', ['small', 'tiny', 'few', 'young']],
            ['兔', ['rabbit']],
            ['兔子', ['hare', 'rabbit', 'CL:隻|只[zhi1]']],
            ['子', ['(noun suffix)']],
            obedient,
            obedient,
        ]
        self.assertEqual(rhyme, expected_rhyme)


if __name__ == '__main__':
    unittest.main()