MyLexicalChainer.py

import json

from nltk import pos_tag, sent_tokenize, word_tokenize
from nltk.corpus import wordnet as wn

from lexicalchain import LexGraph, GalleyMcKeownChainer


class LexicalChain:
    def __init__(self):
        """
        Initialise the class variables here.
        """
        self.np_list = []

    def GetCandidateWords(self, sentences):
        """
        Simple nouns (NN) are selected as candidate words.
        """
        for sentence in sentences:
            for word in sentence:
                if word[1] == "NN":
                    self.np_list.append([word[0], sentence])
        # print(self.np_list)

    def Preprocessing(self, data):
        """
        Sentence-split and POS-tag the content of every article in data.
        """
        tagged_sentences = []
        for news in data:
            sentences = sent_tokenize(news["content"])
            for sentence in sentences:
                try:
                    words = word_tokenize(sentence)
                    tagged_words = pos_tag(words)
                    tagged_sentences.append(tagged_words)
                except (NameError, UnicodeEncodeError):
                    continue
        return tagged_sentences

    def buildLexicalChain(self):
        for word in self.np_list:
            print(wn.synsets(word[0], pos='n'))

    def Processing(self, text):
        """
        Tokenise, POS-tag and chain the input text with the
        Galley & McKeown chainer.
        """
        text = text.replace("-\n", "")
        sentences = sent_tokenize(text)
        # One document: a list of sentences, each a list of (word, tag) pairs.
        tagged = [[pos_tag(word_tokenize(sent)) for sent in sentences]]
        mc = GalleyMcKeownChainer(data=tagged)
        chains = mc.computeChains()
        return chains


def preProcessing(data, article_ids):
    """
    Concatenate the content of every article whose id is in article_ids.
    """
    content = ""
    for news in data:
        if news["id"] not in article_ids:
            continue
        content += news["content"]
    return content


def getSentenceId(lexical_chains):
    """
    Take the first adjacent node of each metachain and collect its sentence
    positions, until more than five positions have been gathered.
    """
    sentence_ids = set()
    for chain in lexical_chains:
        metachain = chain[0]
        lex_nodes_list = metachain.getAdjacentNodes()
        i = 0
        for lex_node_tuple in lex_nodes_list:
            if i == 0 and len(sentence_ids) <= 5:
                sentence_ids.update(lex_node_tuple[0].getPos())
            i += 1
    return sentence_ids


def extractSentence(data, sentence_ids):
    data = data.replace("-\n", "")
    data = sent_tokenize(data)
    extracted_sentences = []
    for sid in sentence_ids:
        extracted_sentences.append(data[sid - 1])
    return extracted_sentences


def generateSummary(data, article_ids):
    data = preProcessing(data, article_ids)
    # data = json.loads(open('DataCorpus.json').read())['root'][10]["content"]
    l = LexicalChain()
    lexical_chains = l.Processing(data)
    sentence_ids = getSentenceId(lexical_chains)
    print(sentence_ids)
    content = " ".join(extractSentence(data, sentence_ids))
    return content


def generateSummaryForClusters(data, clusterings):
    # Summarise at most 20 clusters; the num/flag pair breaks out of both loops.
    num = 0
    flag = False
    summaries = []
    for i in clusterings.keys():
        for j in clusterings[i].keys():
            num += 1
            article_ids = clusterings[i][j]
            summary = generateSummary(data, article_ids)
            summaries.append(summary)
            if num >= 20:
                flag = True
                break
        if flag:
            break
    return summaries
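

# The nested shape of `clusterings` assumed by generateSummaryForClusters,
# inferred from the loops above (the keys and article ids here are only
# illustrative placeholders):
#
#   clusterings = {
#       "topic_0": {"cluster_0": [1, 5, 9], "cluster_1": [2, 7]},
#       "topic_1": {"cluster_0": [3, 4]},
#   }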


def summarize(json_file, clusterings):
    with open(json_file) as f:
        data = json.loads(f.read())['root']
    summaries = generateSummaryForClusters(data, clusterings)
    # print(type(summaries[0]))
    return summaries


'''
if __name__ == "__main__":
    json_file = r"DataCorpus2.json"
    data = json.loads(open(json_file).read())['root'][30]["content"]
    # print(data)
    print("\n Lexical chains generated : \n")
    l = LexicalChain()
    lexical_chains = l.Processing(data)
    sentence_ids = getSentenceId(lexical_chains)
    content = extractSentence(data, sentence_ids)
    print(content)
'''
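

# Example usage (a minimal sketch, assuming the JSON corpus has the
# {"root": [{"id": ..., "content": ...}, ...]} layout read by summarize();
# the cluster keys and article ids below are placeholders):
#
#   clusterings = {"topic_0": {"cluster_0": [1, 2, 3]}}
#   summaries = summarize("DataCorpus2.json", clusterings)
#   for summary in summaries:
#       print(summary)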