import six

# Imports the Google Cloud client library
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

from NLP_Constants import *


def get_client():
    return language.LanguageServiceClient()


def syntax_text(text):
    """Detects syntax in the text and builds a gloss list from its tokens."""
    client = get_client()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Instantiates a plain text document.
    document = types.Document(
        content=text,
        language='en',
        type=enums.Document.Type.PLAIN_TEXT)

    # Detects syntax in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    response = client.analyze_syntax(document, encoding_type=enums.EncodingType.UTF8)
    sentences = response.sentences
    tokens = response.tokens

    printHeaders()
    gloss = []
    index = 0
    for sentence in sentences:
        content = sentence.text.content
        sentence_begin = sentence.text.begin_offset
        sentence_end = sentence_begin + len(content) - 1

        # Consume every token whose begin offset falls inside this sentence.
        while index < len(tokens) and tokens[index].text.begin_offset <= sentence_end:
            token = tokens[index]
            # Dispatch on the token's POS tag to build the gloss.
            switch(pos_tag[token.part_of_speech.tag], token, gloss)
            printToken(token)
            index += 1

    print(gloss)
    return gloss
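

# Illustrative usage sketch (not part of the original file): with Google Cloud
# credentials configured and the NLP_Constants mappings available, a call such as
#     syntax_text("She reads books")
# prints the token table and is expected to return a gloss roughly like
#     ['she', 'read', 'book']
# with punctuation dropped; the exact output depends on the API response.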


def processVerb(token, gloss):
    # Placeholder for verb-specific handling; verbs currently fall through to default().
    pass


def processNoun(token, gloss):
    if pos_proper[token.part_of_speech.proper] == 'PROPER':
        # Proper noun: it needs fingerspelling, so split it into individual letters.
        fingerspell = ['Alphabet.' + s.upper() for s in token.text.content]
        gloss.extend(fingerspell)
    else:
        # Common noun: use its lemma.
        gloss.append(token.lemma.lower())


def doNothing(token, gloss):
    pass


def default(token, gloss):
    # Appends the base (lemma) form of the token to the gloss.
    gloss.append(token.lemma.lower())
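

# Illustrative example (assumed behaviour, not in the original source): for a
# proper-noun token whose text is "Bob", processNoun extends the gloss with
# ['Alphabet.B', 'Alphabet.O', 'Alphabet.B'], while for a common-noun token
# "dogs" it appends the lemma 'dog'.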


# A switch function that dispatches each token to a handler based on its POS tag.
def switch(tag, token, gloss):
    switcher = {
        'ADJ': default,
        'ADP': default,
        'ADV': default,
        'CONJ': default,
        'DET': default,
        'NOUN': processNoun,
        'NUM': default,
        'PRON': default,
        'PRT': default,
        'VERB': default,
        'AFFIX': default,
        'PUNCT': doNothing,
        'X': doNothing,
        'UNKNOWN': default
    }
    fun = switcher.get(tag, default)
    # Call the handler for this tag.
    fun(token, gloss)
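

# For example (illustrative): switch('NOUN', token, gloss) routes the token to
# processNoun, switch('PUNCT', token, gloss) discards it via doNothing, and any
# tag missing from the table falls back to default.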


# Utility functions for printing data obtained from the NLP API ----------------------
def printHeaders():
    print(u'{:>15} {:>15} {:>15} {:>15} {:>15} {:>15} {:>15} {:>15} {:>15} {:>15}'.format(
        "Text", "Lemma", "Dependency",
        "Head_index", "POS_TAG",
        "Number", "Person", "Gender",
        "Tense", "Proper"))


def printToken(token):
    print(u'{:>15} {:>15} {:>15} {:>15} {:>15} {:>15} {:>15} {:>15} {:>15} {:>15}'.format(
        token.text.content,
        token.lemma,
        dependency_label[token.dependency_edge.label],
        token.dependency_edge.head_token_index,
        pos_tag[token.part_of_speech.tag],
        pos_number[token.part_of_speech.number],
        pos_person[token.part_of_speech.person],
        pos_gender[token.part_of_speech.gender],
        pos_tense[token.part_of_speech.tense],
        # pos_aspect[token.part_of_speech.aspect],
        # pos_mood[token.part_of_speech.mood],
        # pos_voice[token.part_of_speech.voice],
        pos_proper[token.part_of_speech.proper]
        # pos_reciprocity[token.part_of_speech.reciprocity]
    ))


# For testing.
if __name__ == "__main__":
    print("Enter text.")
    text = input()
    gloss = syntax_text(text)