mytask2.py
""" Groups and Topics
The objective of this task is to explore the structure of the deals.txt file.
Building on task 1, we now want to start to understand the relationships to help us understand:
1. What groups exist within the deals?
2. What topics exist within the deals?
"""
###########################################
# Solution to Q2:
# Use the term-extraction and tagging tools from topia.termextract to pull
# candidate terms from each deal, keep only terms tagged as nouns (NN or NNS),
# and rank them by aggregated occurrence count. The top-ranked terms answer
# questions 1 and 2. Other part-of-speech tagging methods could be used
# instead (see the NLTK-based sketch at the end of the file).
###########################################
# NOTE: topia.termextract is a Python 2-era library, so this script targets Python 2.
from topia.termextract import extract
from topia.termextract import tag

# set up the POS tagger and the term extractor;
# permissiveFilter keeps every extracted term, regardless of how often it occurs
tagger = tag.Tagger()
tagger.initialize()
extractor = extract.TermExtractor()
extractor.filter = extract.permissiveFilter
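###########################################
# A note on the topia API used below (per the topia.termextract documentation;
# the exact shapes are worth double-checking against the installed version):
#   extractor(text) returns a list of (term, occurrences, strength) tuples,
#     e.g. ('free shipping', 3, 2); term[1] below is the occurrence count.
#   tagger(text) returns one [word, POS-tag, normalized-word] entry per token,
#     e.g. ['deals', 'NNS', 'deal']; tagged[0][1] below is the first token's tag.
###########################################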
# load data (forward slashes keep the path portable across platforms)
openfile = open('../data/deals.txt', 'r')
# dictionary mapping term -> aggregated occurrence count
d = {}
# extract terms line by line
for line in openfile:
    terms = extractor(line)
    # skip lines with no extracted terms
    if not terms:
        continue
    # process each extracted term
    for term in terms:
        # 'tagged' avoids shadowing the imported topia 'tag' module
        tagged = tagger(term[0])
        # keep only terms whose first token is tagged as a noun (NN or NNS)
        if tagged[0][1] not in ['NN', 'NNS']:
            continue
        # aggregate the occurrence count for this term
        if term[0] not in d:
            d[term[0]] = 0
        d[term[0]] += term[1]
openfile.close()
# rank terms by aggregated count, highest first
term_tuples = sorted(d.items(), key=lambda term: term[1], reverse=True)
# print the top 10 terms: these are the most popular groups/topics in the deals
for k, v in term_tuples[:10]:
    print k, v
print "done!"