Skip to content

Commit

Permalink
Adds diacritical marks to pinyin
Browse files Browse the repository at this point in the history
  • Loading branch information
pmitros committed Jan 2, 2016
1 parent b956dd1 commit f8f8618
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 29 deletions.
15 changes: 11 additions & 4 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,14 @@ Usage
.. code:: python
>>> import pinyin
>>> pinyin.get('你好')
'nihao'
>>> pinyin.get_initial('你好')
'n h'
>>> print pinyin.get('你 好')
nǐ hǎo
>>> print pinyin.get('你好', format="strip", delimiter=" ")
ni hao
>>> print pinyin.get('你好', format="numerical")
ni3hao3
>>> print pinyin.get_initial('你好')
n h
2 changes: 2 additions & 0 deletions authors.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Lx Yu
Piotr Mitros
41 changes: 31 additions & 10 deletions pinyin/pinyin.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,53 @@
__all__ = ['get', 'get_pinyin', 'get_initial']
# -*- coding: utf-8 -*-

import os
import unicodedata
from ._compat import u

__all__ = ['get', 'get_pinyin', 'get_initial']

tonemarks = ["", u("̄"), u("́"), u("̌"), u("̀"), ""]


# Build the lookup tables once, at import time, from the bundled data file.
# Each line of Mandarin.dat is "<key>\t<readings>".  Only the first
# space-separated reading is kept; its last character is the tone digit and
# everything before that is the toneless syllable.  Keys are looked up by
# _pinyin_generator as the upper-case hex code point of a character.
pinyin_dict = {}
pinyin_tone = {}
dat = os.path.join(os.path.dirname(__file__), "Mandarin.dat")
with open(dat) as f:
    for line in f:
        k, v = line.strip().split('\t')
        _first_reading = v.lower().split(" ")[0]
        pinyin_dict[k] = u(_first_reading[:-1])
        pinyin_tone[k] = int(_first_reading[-1])


# Formats accepted by _pinyin_generator / get.
_FORMATS = ("numerical", "diacritical", "strip")

# Vowels that only carry the tone mark when no "a", "e" or "ou" is present.
# The u-umlaut is spelled as decoded UTF-8 bytes so that this line yields a
# unicode character on both Python 2 and Python 3.
# NOTE(review): assumes the data file spells u-umlaut as the real character
# (not "v" or "u:") -- confirm against Mandarin.dat.
_LATE_VOWELS = "iou" + b"\xc3\xbc".decode("utf-8")


def _tone_mark_position(syllable):
    """Return the index at which the combining tone mark must be inserted.

    The returned index is the position just *after* the letter that carries
    the mark, following the standard pinyin placement rules:

    1. "a" or "e" takes the mark if present ("hao", "xie", "lüe");
    2. otherwise, in "ou" the "o" takes it ("gou");
    3. otherwise the last vowel takes it ("guo", "liu", "gui");
    4. a vowel-less syllable ("m", "n", "ng") marks its first "m"/"n".
    """
    for letter in "ae":
        found = syllable.find(letter)
        if found != -1:
            return found + 1
    found = syllable.find("ou")
    if found != -1:
        return found + 1
    for index in range(len(syllable) - 1, -1, -1):
        if syllable[index] in _LATE_VOWELS:
            return index + 1
    # No vowel at all: a syllabic nasal.  Never raise here -- an exception
    # escaping a generator body would abort the whole conversion.
    for index, letter in enumerate(syllable):
        if letter in "mn":
            return index + 1
    return len(syllable)


def _pinyin_generator(chars, format):
    """Yield the pinyin of each character in chars, one item per character.

    A character with no entry in the lookup table is yielded unchanged.

    chars  -- iterable of unicode characters.
    format -- "strip" (no tone), "numerical" (tone digit appended) or
              "diacritical" (combining tone mark inserted, then composed).

    Every yielded item is NFC-normalized, so a letter followed by its
    combining mark comes out as a single precomposed character where one
    exists.

    Raises ValueError if format is not one of the three accepted values.
    The check runs when iteration starts, whatever the input contains.
    """
    if format not in _FORMATS:
        error = "Format must be one of: numerical/diacritical/strip"
        raise ValueError(error)

    for char in chars:
        # The tables are keyed by the upper-case hex code point.
        key = "%X" % ord(char)
        pinyin = pinyin_dict.get(key, char)
        # Tone 0 means "not in the table": leave the character untouched.
        tone = pinyin_tone.get(key, 0)
        if tone:
            if format == "numerical":
                pinyin = pinyin + str(tone)
            elif format == "diacritical":
                at = _tone_mark_position(pinyin)
                pinyin = pinyin[:at] + tonemarks[tone] + pinyin[at:]
        yield unicodedata.normalize('NFC', pinyin)


def get(s, delimiter='', format="diacritical"):
    """Return the pinyin of the unicode string s.

    Each character is converted on its own and the results are joined with
    delimiter.  format is passed through to the per-character converter and
    selects "diacritical", "numerical" or "strip" output.
    """
    syllables = _pinyin_generator(u(s), format=format)
    return delimiter.join(syllables)


def get_pinyin(s):
Expand All @@ -41,4 +61,5 @@ def get_pinyin(s):
def get_initial(s, delimiter=' '):
    """Return the first letter of each character's pinyin, joined by delimiter.

    The string must be unicode.  Tones are stripped before the first letter
    is taken, so the result never contains tone marks or digits.
    """
    toneless = _pinyin_generator(u(s), format="strip")
    return delimiter.join([syllable[0] for syllable in toneless])
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

setup(
name='pinyin',
version='0.2.5',
version='0.3',
description='Translate chinese chars to pinyin based on Mandarin.dat',
author='Lx Yu',
author_email='[email protected]',
Expand Down
31 changes: 17 additions & 14 deletions test_pinyin.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,33 +10,36 @@ class BasicTestSuite(unittest.TestCase):
"""Basic test cases."""

def test_get(self):
    """get() defaults to diacritical output and honours each format."""
    # The default format is "diacritical".
    self.assertEqual(pinyin.get('你好'),
                     pinyin.get('你好', format="diacritical"))
    self.assertEqual(pinyin.get(u('你好'), format="strip"), u('nihao'))
    self.assertEqual(pinyin.get(u('你好'), format="numerical"), u('ni3hao3'))
    self.assertEqual(pinyin.get(u('你好'), format="diacritical"), u('nǐhǎo'))
    # Non-Chinese characters pass through unchanged.
    self.assertEqual(pinyin.get('你好吗?'), u('nǐhǎoma?'))

    self.assertEqual(pinyin.get('你好'), u('nǐhǎo'))
    self.assertEqual(pinyin.get('叶'), u('yè'))

def test_get_with_delimiter(self):
    """The delimiter is placed between every converted character."""
    self.assertEqual(pinyin.get('你好', " "), u('nǐ hǎo'))
    self.assertEqual(pinyin.get('你好吗?', " "), u('nǐ hǎo ma ?'))

def test_get_initial_with_delimiter(self):
    """get_initial() joins the first letters with the given delimiter."""
    expectations = (
        ('你好', u('n-h')),
        ('你好吗?', u('n-h-m-?')),
    )
    for text, initials in expectations:
        self.assertEqual(pinyin.get_initial(text, "-"), initials)

def test_get_initial(self):
    """get_initial() separates the first letters with a space by default."""
    expectations = (
        ('你好', u('n h')),
        ('你好吗?', u('n h m ?')),
    )
    for text, initials in expectations:
        self.assertEqual(pinyin.get_initial(text), initials)

def test_mixed_chinese_english_input(self):
    """Latin letters pass through; Chinese characters are converted."""
    self.assertEqual(pinyin.get('hi你好'), u('hinǐhǎo'))


if __name__ == '__main__':
Expand Down

0 comments on commit f8f8618

Please sign in to comment.