-
Notifications
You must be signed in to change notification settings - Fork 47
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adding prototype translation functionality
- Loading branch information
Showing
7 changed files
with
224 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,6 @@ | ||
*.pyc | ||
MANIFEST | ||
dist | ||
*~ | ||
\#* | ||
*.egg-info/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# None of these targets produce a file of the same name, so mark them
# phony to make sure they always run.
.PHONY: tests cedict pep8

# Run the test suites (pinyin under both Python 2 and 3; cedict is
# Python 3 only).
tests:
	python test_pinyin.py
	python3 test_pinyin.py
	python3 test_cedict.py

# Download the compressed CC-CEDICT dictionary into the package.
cedict:
	wget https://www.mdbg.net/chindict/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz -O - > pinyin/cedict.txt.gz

# Style check.
pep8:
	pep8 *py pinyin/*py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
#!/usr/local/bin/python | ||
# -*- coding: utf-8 -*- | ||
|
||
''' | ||
A utility to translate Mandarin to English. The data file is from | ||
the CEDICT project. We keep it compressed for size and speed. | ||
Using this has substantial (1-2 second) init time for reading the | ||
dictionary. This is spent the first time this module is used (it | ||
is free to import). | ||
Note that this is a prototype. It is specific to Python 3. To bring | ||
this code to the quality level of the rest of the library, it would | ||
need to be backported to Python 2. | ||
In addition, it'd be nice if `all_phrase_translations` handled both | ||
traditional and simplified elegantly. | ||
''' | ||
|
||
import gzip | ||
import os.path | ||
import re | ||
import string | ||
|
||
import collections | ||
|
||
|
||
def Tree():
    '''
    Return an empty, self-nesting search tree.

    The tree is a `defaultdict` whose missing keys are filled in with
    further trees, so `tree[a][b][c]` creates every intermediate node
    on demand.
    '''
    return collections.defaultdict(Tree)
|
||
|
||
# Both tables stay None until init() has loaded the dictionary file.
dictionaries = None  # Used for single word lookup.
trees = None  # Used for parsing.
|
||
|
||
def _add_to_tree(tree, word, meaning):
    '''
    Insert `word` into the search tree `tree`, one character per level.

    We build word search trees, where we walk down the letters of a
    word. For example:

        你    You
        你好  Hello

    Would build the tree

            你
           /  \\
        You    好
                \\
                Hello

    The meaning of a word is stored under the empty-string key of the
    node reached after consuming all of its characters. An existing
    meaning for the same word is overwritten.

    `tree` is modified in place; intermediate nodes are obtained by
    plain indexing, so they must already exist or be created on access
    (as a `Tree()` does).
    '''
    node = tree
    for char in word:
        node = node[char]
    node[''] = meaning
|
||
|
||
def init():
    '''
    Load in the Chinese-English dictionary. This takes 1-2 seconds. It
    is done when the other functions are used, but this is public since
    preloading sometimes makes sense.

    Populates the module-level `dictionaries` (word -> list of
    meanings) and `trees` (character search trees), each keyed by
    'traditional' and 'simplified'. When a word appears on several
    lines of the data file, the last entry wins.
    '''
    global dictionaries, trees

    # Build into locals and publish only once loading has succeeded, so
    # a failed load does not leave truthy-but-empty tables behind that
    # would stop the lazy callers from retrying.
    loaded_dictionaries = {
        'traditional': {},
        'simplified': {}
    }
    loaded_trees = {
        'traditional': Tree(),
        'simplified': Tree()
    }

    path = os.path.join(os.path.dirname(__file__), "cedict.txt.gz")
    # One entry per line: TRADITIONAL SIMPLIFIED [pin1 yin1] /meaning/.../
    entry = re.compile(r"^([^ ]+) ([^ ]+) \[(.*)\] /(.+)/")

    with gzip.open(path, mode='rt', encoding='utf-8') as lines:
        for line in lines:
            if line.startswith('#'):
                continue  # Comment / header line.
            match = entry.match(line)
            if match is None:
                continue  # Blank or malformed line; nothing to load.
            traditional, simplified, _pinyin, meaning = match.groups()
            meanings = meaning.split('/')
            loaded_dictionaries['traditional'][traditional] = meanings
            loaded_dictionaries['simplified'][simplified] = meanings
            _add_to_tree(loaded_trees['traditional'], traditional, meanings)
            _add_to_tree(loaded_trees['simplified'], simplified, meanings)

    dictionaries = loaded_dictionaries
    trees = loaded_trees
|
||
|
||
def translate_word(word, dictionary=('simplified',)):
    '''
    Return the list of translations for a single character or word,
    or None if it is not in any of the requested dictionaries.

    `dictionary` is a dictionary name ('simplified' or 'traditional')
    or an iterable of such names; they are searched in order and the
    first hit is returned. The dictionary data is loaded on first use.
    '''
    if not dictionaries:
        init()
    # A bare name would otherwise be iterated character by character.
    if isinstance(dictionary, str):
        dictionary = (dictionary,)
    for name in dictionary:
        meanings = dictionaries[name].get(word)
        if meanings is not None:
            return meanings
    return None
|
||
|
||
def _words_at_the_beginning(word, tree, prefix=""):
    '''
    Return every dictionary word that `word` starts with.

    `tree` is the search-tree node to start from and `prefix` is the
    text already consumed to reach that node; it is prepended to each
    match so the results are meaningful words. The result is a list of
    `[matched_text, meanings]` pairs, shortest match first. The tree
    is only read, never modified.
    '''
    matches = []
    node = tree
    matched = prefix
    # The starting node may itself complete a word (the prefix).
    if "" in node:
        matches.append([matched, node[""]])
    for char in word:
        if char not in node:
            break
        node = node[char]
        matched += char
        if "" in node:
            matches.append([matched, node[""]])
    return matches
|
||
|
||
def all_phrase_translations(phrase):
    '''
    Yield the translations for all possible words in a full phrase.

    Each item is a `[word, meanings]` pair for a dictionary word
    starting at some position of the phrase; positions are visited
    left to right, shorter matches first. Chinese is sometimes
    ambiguous. We do not attempt to disambiguate, or handle unknown
    letters especially well. Full parsing is left to upstream logic.

    Only the simplified-character dictionary is consulted. The
    dictionary data is loaded on first use.
    '''
    if not trees:
        init()
    root = trees['simplified']
    # split() with no argument breaks on any run of whitespace; passing
    # string.whitespace would only split on that exact six-character
    # sequence.
    for word in phrase.split():
        for start, char in enumerate(word):
            # Test membership first: indexing the defaultdict with an
            # unknown character would insert an empty node for it.
            if char not in root:
                continue
            yield from _words_at_the_beginning(
                word[start+1:],
                root[char],
                prefix=char)
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,7 +10,9 @@ | |
author='Lx Yu', | ||
author_email='[email protected]', | ||
packages=['pinyin', ], | ||
package_data={'': ['LICENSE'], 'pinyin': ['Mandarin.dat'], }, | ||
package_data={ | ||
'': ['LICENSE'], | ||
'pinyin': ['Mandarin.dat', 'cedict.txt.gz'], }, | ||
entry_points={"console_scripts": ["pinyin = pinyin.cmd:pinyin", ]}, | ||
url='http://lxyu.github.io/pinyin/', | ||
license="BSD", | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
import unittest | ||
|
||
import pinyin.cedict | ||
|
||
|
||
class BasicTestSuite(unittest.TestCase):
    """Basic test cases."""

    def test_translate_word(self):
        """Single words and characters map to their list of meanings."""
        self.assertEqual(
            list(pinyin.cedict.translate_word("你好")),
            ['Hello!', 'Hi!', 'How are you?']
        )
        self.assertEqual(
            list(pinyin.cedict.translate_word("你")),
            ['you (informal, as opposed to courteous 您[nin2])']
        )

    def test_all_phrase_translations(self):
        """Every dictionary word found at each position is reported."""
        self.assertEqual(
            list(pinyin.cedict.all_phrase_translations("你好")),
            [['你', ['you (informal, as opposed to courteous 您[nin2])']],
             ['你好', ['Hello!', 'Hi!', 'How are you?']],
             ['好', [
                 'to be fond of', 'to have a tendency to',
                 'to be prone to']]]
        )
        self.assertEqual(
            list(pinyin.cedict.all_phrase_translations("小兔子乖乖")),
            [['小', ['small', 'tiny', 'few', 'young']],
             ['兔', ['rabbit']],
             ['兔子', ['hare', 'rabbit', 'CL:隻|只[zhi1]']],
             ['子', ['(noun suffix)']],
             ['乖', [
                 '(of a child) obedient, well-behaved', 'clever',
                 'shrewd', 'alert', 'perverse', 'contrary to reason',
                 'irregular', 'abnormal']],
             ['乖', [
                 '(of a child) obedient, well-behaved', 'clever',
                 'shrewd', 'alert', 'perverse', 'contrary to reason',
                 'irregular', 'abnormal']]]
        )
|
||
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()