Skip to content

Commit

Permalink
Adding prototype translation functionality
Browse files Browse the repository at this point in the history
  • Loading branch information
pmitros committed Nov 19, 2016
1 parent 287e34a commit 60dff93
Show file tree
Hide file tree
Showing 7 changed files with 224 additions and 1 deletion.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
*.pyc
MANIFEST
dist
*~
\#*
*.egg-info/
10 changes: 10 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Run the test suites. test_pinyin.py is run under both `python` and
# `python3`; test_cedict.py is run under `python3` only.
tests:
python test_pinyin.py
python3 test_pinyin.py
python3 test_cedict.py

# Download the gzipped CC-CEDICT export from mdbg.net and store it as
# pinyin/cedict.txt.gz (wget streams to stdout, which is redirected).
cedict:
wget https://www.mdbg.net/chindict/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz -O - > pinyin/cedict.txt.gz

# Run the pep8 style checker over the top-level and pinyin/ Python files.
pep8:
pep8 *py pinyin/*py
24 changes: 24 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,27 @@ Usage
.. note::

`format` must be one of: numerical/diacritical/strip

Prototype Chinese->English
--------------------------

.. code:: python
>>> import pinyin.cedict
>>> pinyin.cedict.translate_word('你')
['you (informal, as opposed to courteous 您[nin2])']
>>> pinyin.cedict.translate_word('你好')
['Hello!', 'Hi!', 'How are you?']
>>> list(pinyin.cedict.all_phrase_translations('你好'))
[['你', ['you (informal, as opposed to courteous 您[nin2])']], ['你好', ['Hello!', 'Hi!', 'How are you?']], ['好', ['to be fond of', 'to have a tendency to', 'to be prone to']]]
Note that this is a prototype, and it only works on Python 3.

License
-------

pinyin is free software, under an MIT-style license. See LICENSE for
details. The data file for translations is licensed under CC-BY-SA 3.0.
The translations are from the CC-CEDICT project
(https://cc-cedict.org/wiki/), by Denisowski, Peterson, Brelsford, and
others.
138 changes: 138 additions & 0 deletions pinyin/cedict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#!/usr/local/bin/python
# -*- coding: utf-8 -*-

'''
A utility to translate Mandarin to English. The data file is from
the CEDICT project. We keep it compressed for size and speed.
Using this has substantial (1-2 second) init time for reading the
dictionary. This is spent the first time this module is used (it
is free to import).
Note that this is a prototype. It is specific to Python 3. To bring
this code to the quality level of the rest of the library, it would
need to be backported to Python 2.
In addition, it'd be nice if `all_phrase_translations` handled both
traditional and simplified elegantly.
'''

import gzip
import os.path
import re
import string

import collections


def Tree():
    '''
    Build an empty, self-nesting mapping.

    The result is a `defaultdict` whose missing keys are filled in with
    further `Tree` instances, so arbitrarily deep paths can be written
    without creating the intermediate levels by hand.
    '''
    node = collections.defaultdict(Tree)
    return node


# Both globals start as None and are filled in by init(), which the
# lookup functions call on first use. Each becomes a dict keyed by
# 'traditional' and 'simplified'.
dictionaries = None # Used for single word lookup
trees = None # Used for parsing.


def _add_to_tree(tree, word, meaning):
    '''
    Insert `word` into a character-by-character search tree.

    Each character of `word` selects a child node. Once the whole word
    has been consumed, `meaning` is stored on the node that was reached,
    under the empty-string key. For example, adding

        你    You
        你好  Hello

    results in

        tree['你']['']        -> You
        tree['你']['好']['']  -> Hello

    `tree` is expected to create missing children on access (as the
    mappings returned by `Tree` do); intermediate nodes are not created
    explicitly here.
    '''
    node = tree
    for character in word:
        node = node[character]
    node[''] = meaning


def init():
    '''
    Load in the Chinese-English dictionary. This takes 1-2 seconds. It
    is done when the other functions are used, but this is public since
    preloading sometimes makes sense.

    Reads `cedict.txt.gz` from the directory containing this module and
    fills the module-level `dictionaries` (word -> list of meanings) and
    `trees` (per-character search trees), each keyed by 'traditional'
    and 'simplified'.

    Comment lines (starting with '#'), blank lines, and lines that do
    not look like a CEDICT entry are skipped.
    '''
    global dictionaries, trees

    # Build into locals and publish only once loading has succeeded, so
    # a failed load leaves the globals untouched and a later call retries
    # instead of silently working from empty dictionaries.
    loaded_dictionaries = {
        'traditional': {},
        'simplified': {}
    }
    loaded_trees = {
        'traditional': Tree(),
        'simplified': Tree()
    }

    # Entry format: TRADITIONAL SIMPLIFIED [pin1 yin1] /meaning 1/meaning 2/
    exp = re.compile(r"^([^ ]+) ([^ ]+) \[(.*)\] /(.+)/")
    path = os.path.join(os.path.dirname(__file__), "cedict.txt.gz")

    # The context manager guarantees the file is closed, even on error.
    with gzip.open(path, mode='rt', encoding='utf-8') as lines:
        for line in lines:
            if line.startswith('#') or not line.strip():
                continue
            match = exp.match(line)
            if match is None:
                continue
            traditional, simplified, _pinyin, meaning = match.groups()
            meaning = meaning.split('/')
            loaded_dictionaries['traditional'][traditional] = meaning
            loaded_dictionaries['simplified'][simplified] = meaning
            _add_to_tree(loaded_trees['traditional'], traditional, meaning)
            _add_to_tree(loaded_trees['simplified'], simplified, meaning)

    dictionaries = loaded_dictionaries
    trees = loaded_trees


def translate_word(word, dictionary=('simplified',)):
    '''
    Return the list of translations for a single character or word, if
    available.

    `dictionary` names the dictionaries to consult, in order
    ('simplified' and/or 'traditional'). It may be an iterable of names
    or a single name given as a string. The first dictionary that
    contains `word` wins. Returns None when no dictionary has the word.

    The data is loaded on first use (see `init`).
    '''
    if not dictionaries:
        init()
    # A bare string would otherwise be iterated character by character.
    if isinstance(dictionary, str):
        dictionary = (dictionary,)
    for name in dictionary:
        meanings = dictionaries[name].get(word)
        if meanings is not None:
            return meanings
    return None


def _words_at_the_beginning(word, tree, prefix=""):
    '''
    Collect every dictionary entry that is a prefix of `word`.

    Starting at `tree`, follow the characters of `word` for as long as a
    matching child exists. Every node on that path (including the
    starting one) that carries a meaning under the empty-string key
    contributes a `[text, meaning]` pair, where `text` is `prefix` plus
    the characters consumed so far. Pairs are returned shortest first.
    '''
    matches = []
    node = tree
    matched = prefix
    if "" in node:
        matches.append([matched, node[""]])
    for character in word:
        if character not in node:
            break
        node = node[character]
        matched += character
        if "" in node:
            matches.append([matched, node[""]])
    return matches


def all_phrase_translations(phrase):
    '''
    Yield the translations for all possible words in a full phrase.

    The phrase is split on whitespace. For every position in each
    resulting chunk, every dictionary word that starts at that position
    is yielded as a `[word, meanings]` pair. Only the simplified
    dictionary is consulted.

    Chinese is sometimes ambiguous. We do not attempt to disambiguate,
    or handle unknown letters especially well (they simply produce no
    output). Full parsing is left to upstream logic.
    '''
    if not trees:
        init()
    simplified = trees['simplified']
    # split() with no argument splits on runs of any whitespace;
    # passing string.whitespace would only split on that exact
    # six-character sequence.
    for word in phrase.split():
        for start, character in enumerate(word):
            # .get() avoids the defaultdict inserting an empty subtree
            # into the shared tree for every unknown character.
            subtree = simplified.get(character)
            if subtree is None:
                continue
            # Explicit loop rather than `yield from`, so the module still
            # parses on interpreters without that syntax.
            for translation in _words_at_the_beginning(
                    word[start+1:],
                    subtree,
                    prefix=character):
                yield translation
Binary file added pinyin/cedict.txt.gz
Binary file not shown.
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
author='Lx Yu',
author_email='[email protected]',
packages=['pinyin', ],
package_data={'': ['LICENSE'], 'pinyin': ['Mandarin.dat'], },
package_data={
'': ['LICENSE'],
'pinyin': ['Mandarin.dat', 'cedict.txt.gz'], },
entry_points={"console_scripts": ["pinyin = pinyin.cmd:pinyin", ]},
url='http://lxyu.github.io/pinyin/',
license="BSD",
Expand Down
46 changes: 46 additions & 0 deletions test_cedict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-

import unittest

import pinyin.cedict


class BasicTestSuite(unittest.TestCase):
    """Basic test cases."""

    def test_translate_word(self):
        """Single-word lookups return the expected English meanings."""
        expectations = [
            ("你好", ['Hello!', 'Hi!', 'How are you?']),
            ("你", ['you (informal, as opposed to courteous 您[nin2])']),
        ]
        for word, meanings in expectations:
            found = list(pinyin.cedict.translate_word(word))
            self.assertEqual(found, meanings)

    def test_all_phrase_translations(self):
        """Phrases yield every dictionary word found at every position."""
        hello = list(pinyin.cedict.all_phrase_translations("你好"))
        expected_hello = [
            ['你', ['you (informal, as opposed to courteous 您[nin2])']],
            ['你好', ['Hello!', 'Hi!', 'How are you?']],
            ['好', [
                'to be fond of', 'to have a tendency to',
                'to be prone to']],
        ]
        self.assertEqual(hello, expected_hello)

        # The final character appears twice in the phrase, so its entry
        # is expected twice in the output.
        guai_meanings = [
            '(of a child) obedient, well-behaved', 'clever',
            'shrewd', 'alert', 'perverse', 'contrary to reason',
            'irregular', 'abnormal']
        rabbit = list(pinyin.cedict.all_phrase_translations("小兔子乖乖"))
        expected_rabbit = [
            ['小', ['small', 'tiny', 'few', 'young']],
            ['兔', ['rabbit']],
            ['兔子', ['hare', 'rabbit', 'CL:隻|只[zhi1]']],
            ['子', ['(noun suffix)']],
            ['乖', guai_meanings],
            ['乖', guai_meanings],
        ]
        self.assertEqual(rabbit, expected_rabbit)

if __name__ == '__main__':
unittest.main()

0 comments on commit 60dff93

Please sign in to comment.