-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathabsSummary.py
90 lines (81 loc) · 2.88 KB
/
absSummary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import json
import nltk
import re
# Tokens dropped from every sentence (compared case-insensitively): discourse
# connectives plus a few stray symbols.
# NOTE(review): "&", "#", "13", ";" look like the pieces of a tokenized
# "&#13;" HTML entity -- confirm against the data in summary.json.
_NOT_WORDS = frozenset([
    "also", "hence", "furthermore", "moreover", "therefore", "so",
    "however", "but", "further", "u", "&", "13", "#", ";",
])

# Clitic tokens that are expanded to full words (matched case-insensitively).
_CONTRACTIONS = {"n't": "not", "'s": "is"}

# One or more ". " followed by a run of ',', '!' or '?' and a space; the whole
# match is collapsed to a single ". ".
_STRAY_PUNCT_AFTER_PERIOD = re.compile(r"(\. )+[,!?]+ ")

# Opening quote followed by a stray punctuation token at the very start of a
# summary.  Both the "``" and the plain '"' spelling of the quote are handled.
_QUOTE_PUNCT_PREFIXES = ('`` ,', '`` ?', '`` !', '" ,', '" ?', '" !')


def _clean_sentence(sentence):
    """Tokenize one sentence, drop the stop tokens, expand clitics, re-join."""
    cleaned = []
    for word in nltk.word_tokenize(sentence):
        lowered = word.lower()
        if lowered in _NOT_WORDS:
            continue
        cleaned.append(_CONTRACTIONS.get(lowered, word))
    return " ".join(cleaned).rstrip()


def _fix_leading_punctuation(summary):
    """Remove stray punctuation left at the very beginning of a summary."""
    if summary[0:2] in (" ,", " !", " ?"):
        # NOTE(review): this drops four characters although only two were
        # matched (behavior kept from the original) -- confirm it is intended.
        return summary[4:]
    for prefix in _QUOTE_PUNCT_PREFIXES:
        if summary.startswith(prefix):
            # Keep a plain opening quote; skip the punctuation token and the
            # single separator character that follows it.  The original
            # compared a 4-character slice against the 3-character '"'
            # prefixes, so those could never match a longer summary.
            return '"' + summary[len(prefix) + 1:]
    return summary


def genSummary():
    """Clean the summaries in ``summary.json`` and write ``modified_summary.json``.

    Reads a JSON object whose ``"root"`` entry is a list of objects with
    ``"key"`` and ``"summary"`` fields.  Each summary is split into sentences
    with NLTK, the tokens listed in ``_NOT_WORDS`` are removed, ``n't`` / ``'s``
    tokens are replaced by ``not`` / ``is``, and leftover punctuation is tidied.
    The result is written, in the same ``{"root": [{"key", "summary"}, ...]}``
    layout, to ``modified_summary.json`` (indented by 4 spaces).

    Both file names are relative to the current working directory.  A banner
    is printed to stdout before processing and blank lines after writing.

    Raises:
        IOError/OSError: if either file cannot be opened.
        KeyError: if ``"root"``, ``"key"`` or ``"summary"`` is missing.
    """
    # Context managers guarantee both files are closed, even on error.
    with open("summary.json", "r") as in_file:
        roots = json.load(in_file)["root"]

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("\n\n\n\n\n\n\n\n")
    print("From abs summary :- \n\n")

    mode_root_node = {"root": []}
    for root in roots:
        sentences = nltk.sent_tokenize(root["summary"])
        finalSummary = " ".join(_clean_sentence(s) for s in sentences)

        finalSummary = _STRAY_PUNCT_AFTER_PERIOD.sub(". ", finalSummary)
        # NOTE(review): the original pattern here was a raw line-break
        # character embedded in the string literal; it is written as an
        # explicit "\n" -- confirm which character was meant.
        finalSummary = finalSummary.replace("\n", "")
        finalSummary = _fix_leading_punctuation(finalSummary)

        mode_root_node["root"].append(
            {"key": root["key"], "summary": finalSummary}
        )

    with open("modified_summary.json", "w") as out_file:
        json.dump(mode_root_node, out_file, indent=4)

    print("\n\n\n\n\n\n\n\n")
#genSummary()