From f8f8618da12eac2898559cdd0ff695399c168baa Mon Sep 17 00:00:00 2001
From: Piotr Mitros
Date: Sat, 2 Jan 2016 10:19:32 -0500
Subject: [PATCH] Adds diacritical marks to pinyin

---
 README.rst       | 15 +++++++++++----
 authors.txt      |  2 ++
 pinyin/pinyin.py | 41 +++++++++++++++++++++++++++++++----------
 setup.py         |  2 +-
 test_pinyin.py   | 31 +++++++++++++++++--------------
 5 files changed, 62 insertions(+), 29 deletions(-)
 create mode 100644 authors.txt

diff --git a/README.rst b/README.rst
index a96a905..b4c2d8f 100644
--- a/README.rst
+++ b/README.rst
@@ -26,7 +26,14 @@ Usage
 .. code:: python
 
     >>> import pinyin
-    >>> pinyin.get('你好')
-    'nihao'
-    >>> pinyin.get_initial('你好')
-    'n h'
+    >>> print pinyin.get('你 好')
+    nǐ hǎo
+
+    >>> print pinyin.get('你好', format="strip", delimiter=" ")
+    ni hao
+
+    >>> print pinyin.get('你好', format="numerical")
+    ni3hao3
+
+    >>> print pinyin.get_initial('你好')
+    n h
diff --git a/authors.txt b/authors.txt
new file mode 100644
index 0000000..d7c027e
--- /dev/null
+++ b/authors.txt
@@ -0,0 +1,2 @@
+Lx Yu
+Piotr Mitros
diff --git a/pinyin/pinyin.py b/pinyin/pinyin.py
index a5736e5..983f627 100644
--- a/pinyin/pinyin.py
+++ b/pinyin/pinyin.py
@@ -1,33 +1,53 @@
-__all__ = ['get', 'get_pinyin', 'get_initial']
+# -*- coding: utf-8 -*-
 
 import os
 
+import unicodedata
+from ._compat import u
+
+__all__ = ['get', 'get_pinyin', 'get_initial']
+
+tonemarks = ["", u("̄"), u("́"), u("̌"), u("̀"), ""]
+
 # init pinyin dict
 pinyin_dict = {}
+pinyin_tone = {}
 dat = os.path.join(os.path.dirname(__file__), "Mandarin.dat")
 with open(dat) as f:
     for line in f:
         k, v = line.strip().split('\t')
-        pinyin_dict[k] = v.lower().split(" ")[0][:-1]
-
-
-from ._compat import u
+        pinyin_dict[k] = u(v.lower().split(" ")[0][:-1])
+        pinyin_tone[k] = int(v.lower().split(" ")[0][-1])
 
 
-def _pinyin_generator(chars):
+def _pinyin_generator(chars, format):
     """Generate pinyin for chars, if char is not chinese character,
     itself will be returned.
     Chars must be unicode list.
     """
     for char in chars:
         key = "%X" % ord(char)
-        yield pinyin_dict.get(key, char)
+        pinyin = pinyin_dict.get(key, char)
+        tone = pinyin_tone.get(key, 0)
+        if tone == 0:
+            pass
+        elif format == "numerical":
+            pinyin = pinyin + str(tone)
+        elif format == "diacritical":
+            # Find first vowel -- we should put the diacritical mark
+            # just after
+            vowel = pinyin.index(next(x for x in pinyin if x in "aeiou")) + 1
+            pinyin = pinyin[:vowel] + tonemarks[tone] + pinyin[vowel:]
+        elif format != "strip":
+            error = "Format must be one of: numerical/diacritical/strip"
+            raise ValueError(error)
+        yield unicodedata.normalize('NFC', pinyin)
 
 
-def get(s, delimiter=''):
+def get(s, delimiter='', format="diacritical"):
     """Return pinyin of string, the string must be unicode
     """
-    return delimiter.join(_pinyin_generator(u(s)))
+    return delimiter.join(_pinyin_generator(u(s), format=format))
 
 
 def get_pinyin(s):
@@ -41,4 +61,5 @@ def get_pinyin(s):
 def get_initial(s, delimiter=' '):
     """Return the 1st char of pinyin of string, the string must be unicode
     """
-    return delimiter.join([p[0] for p in _pinyin_generator(u(s))])
+    initials = (p[0] for p in _pinyin_generator(u(s), format="strip"))
+    return delimiter.join(initials)
diff --git a/setup.py b/setup.py
index b6110d0..aa4295c 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name='pinyin',
-    version='0.2.5',
+    version='0.3',
     description='Translate chinese chars to pinyin based on Mandarin.dat',
     author='Lx Yu',
     author_email='github@lxyu.net',
diff --git a/test_pinyin.py b/test_pinyin.py
index c4d02bc..c53f000 100644
--- a/test_pinyin.py
+++ b/test_pinyin.py
@@ -10,33 +10,36 @@ class BasicTestSuite(unittest.TestCase):
     """Basic test cases."""
 
     def test_get(self):
-        self.assertEqual(pinyin.get('你好'), 'nihao')
-        self.assertEqual(pinyin.get(u('你好')), 'nihao')
-        self.assertEqual(pinyin.get('你好吗?'), 'nihaoma?')
-        self.assertEqual(pinyin.get('你好吗?'), u('nihaoma?'))
+        self.assertEqual(pinyin.get('你好'),
+                         pinyin.get('你好', format="diacritical"))
+        self.assertEqual(pinyin.get(u('你好'), format="strip"), u('nihao'))
+        self.assertEqual(pinyin.get(u('你好'), format="numerical"), u('ni3hao3'))
+        self.assertEqual(pinyin.get(u('你好'), format="diacritical"), u('nǐhǎo'))
+        self.assertEqual(pinyin.get('你好吗?'), u('nǐhǎoma?'))
+        self.assertEqual(pinyin.get('你好吗?'), u('nǐhǎoma?'))
 
-        self.assertEqual(pinyin.get('你好'), 'nihao')
-        self.assertEqual(pinyin.get('叶'), 'ye')
+        self.assertEqual(pinyin.get('你好'), u('nǐhǎo'))
+        self.assertEqual(pinyin.get('叶'), u('yè'))
 
     def test_get_with_delimiter(self):
-        self.assertEqual(pinyin.get('你好', " "), 'ni hao')
-        self.assertEqual(pinyin.get('你好吗?', " "), 'ni hao ma ?')
-        self.assertEqual(pinyin.get('你好吗?', " "), u('ni hao ma ?'))
+        self.assertEqual(pinyin.get('你好', " "), u('nǐ hǎo'))
+        self.assertEqual(pinyin.get('你好吗?', " "), u('nǐ hǎo ma ?'))
+        self.assertEqual(pinyin.get('你好吗?', " "), u('nǐ hǎo ma ?'))
 
     def test_get_initial_with_delimiter(self):
-        self.assertEqual(pinyin.get_initial('你好', "-"), 'n-h')
-        self.assertEqual(pinyin.get_initial('你好吗?', "-"), 'n-h-m-?')
+        self.assertEqual(pinyin.get_initial('你好', "-"), u('n-h'))
+        self.assertEqual(pinyin.get_initial('你好吗?', "-"), u('n-h-m-?'))
         self.assertEqual(pinyin.get_initial('你好吗?', "-"), u('n-h-m-?'))
 
     def test_get_initial(self):
-        self.assertEqual(pinyin.get_initial('你好'), 'n h')
-        self.assertEqual(pinyin.get_initial('你好吗?'), 'n h m ?')
+        self.assertEqual(pinyin.get_initial('你好'), u('n h'))
+        self.assertEqual(pinyin.get_initial('你好吗?'), u('n h m ?'))
         self.assertEqual(pinyin.get_initial('你好吗?'), u('n h m ?'))
 
         self.assertEqual(pinyin.get_initial('你好'), 'n h')
 
     def test_mixed_chinese_english_input(self):
-        self.assertEqual(pinyin.get('hi你好'), 'hinihao')
+        self.assertEqual(pinyin.get('hi你好'), u('hinǐhǎo'))
 
 
 if __name__ == '__main__':