TF_matrix.py
# CALCULATION OF TF VALUES
# TF = (number of times a term appears in a sentence) / (number of unique terms in that sentence)
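# A minimal worked example (illustrative numbers, not from the corpus): for
# the sentence "The resort was calm and the beach was clean", the filtered,
# lowercased terms are ['resort', 'calm', 'beach', 'clean']; each appears
# once and there are 4 unique terms, so every term gets TF = 1/4 = 0.25.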
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Raw strings avoid invalid escape sequences in the Windows path and the regex
corpus_root = r'C:\MyData\PythonPractice\Mycorpus'
wordlists = PlaintextCorpusReader(corpus_root, r'resort.*\.txt')
print('\nFile ids in this corpus:\n')
print(wordlists.fileids())
print('\nNumber of sentences in the file:')
sencount = len(wordlists.sents(fileids=['resort.txt']))
print(sencount)
print('\nSentences:\n')
sentences = wordlists.sents(fileids='resort.txt')
print(sentences)
sample = wordlists.raw('resort.txt')
# Naive sentence split on '.'; assumed to line up with the corpus
# reader's sentence count for this file
s = sample.split('.')
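# A hedged alternative (assumes the NLTK 'punkt' sentence models are
# downloaded): nltk.sent_tokenize splits on real sentence boundaries and
# handles abbreviations and decimals better than a raw '.' split.
# s = nltk.sent_tokenize(sample)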
# NUMBER OF TIMES A TERM APPEARS IN EACH SENTENCE
# NUMBER OF UNIQUE TERMS IN EACH SENTENCE
term_freq = []        # per-sentence tuples of term counts
terms_count_doc = []  # per-sentence unique-term counts
stop_words = set(stopwords.words('english'))  # build once, not per sentence
for i in range(sencount):
    print('\n Sentence ' + str(i + 1))
    print(s[i])
    # Tokenize, keep alphabetic tokens only, and lowercase them
    word_tokens = word_tokenize(s[i])
    word_tokens = [word.lower() for word in word_tokens if word.isalpha()]
    # Remove stopwords
    filtered_sentence = [w for w in word_tokens if w not in stop_words]
    # Count each token within this sentence only; resetting wordfreq per
    # sentence keeps the (word, frequency) pairing below aligned
    wordfreq = [filtered_sentence.count(w) for w in filtered_sentence]
    print('\n Pairwise (word, frequency) for Sentence ' + str(i + 1) + '\n')
    print(list(zip(filtered_sentence, wordfreq)))
    wf = tuple(wordfreq)
    print('\n Number of times each term appears in sentence ' + str(i + 1) + '\n')
    print(wf)
    term_freq.append(wf)
    # Unique terms, preserving first-seen order
    unique_tokens = []
    for x in filtered_sentence:
        if x not in unique_tokens:
            unique_tokens.append(x)
    print('\n Number of unique tokens: ' + str(len(unique_tokens)))
    print(unique_tokens)
    terms_count_doc.append(len(unique_tokens))
print("\n Numbers of terms in each sentence is :")
print(terms_count_doc)
print("\n Numbers of times a term appear in all sentences : COUNT MATRIX \n")
print(term_freq)
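# A more idiomatic sketch of the counting step above, using
# collections.Counter (an alternative, not part of the original logic):
# Counter tallies every unique term in one pass instead of calling
# list.count() once per token.
from collections import Counter

def sentence_term_counts(tokens):
    # Return per-token counts, aligned with the token order
    counts = Counter(tokens)
    return tuple(counts[t] for t in tokens)

# e.g. sentence_term_counts(['resort', 'calm', 'resort']) -> (2, 1, 2)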
# TF for each term: count within the sentence divided by the
# sentence's unique-term count
for i in range(sencount):
    TF = []
    for j in range(len(term_freq[i])):
        x = term_freq[i][j] / terms_count_doc[i]
        TF.append(x)
    print('\n TF values for Sentence: ' + str(i + 1))
    print(TF)
# COMBINED TF MATRIX (flattened into a single list, sentence by sentence)
TFM = []
for i in range(sencount):
    for j in range(len(term_freq[i])):
        TFM.append(term_freq[i][j] / terms_count_doc[i])
print('\n COMBINED REPRESENTATION of TF values:\n')
print(TFM)
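# For comparison, a hedged sketch of the same count matrix built with
# scikit-learn's CountVectorizer (assumes scikit-learn is installed; an
# alternative library, not part of the original script). It yields a proper
# sentence-by-term matrix over a shared vocabulary, whereas the loops above
# keep per-sentence tuples.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words='english')
count_matrix = vectorizer.fit_transform([t for t in s if t.strip()])
print('\n CountVectorizer vocabulary:')
print(vectorizer.get_feature_names_out())
print('\n CountVectorizer matrix (rows = sentences, cols = terms):')
print(count_matrix.toarray())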