# main.py (forked from isdapro/SpanishFlu)
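# Scans the JSON article files under DATA_PATH, keeps sentences that match the
# configured keywords, writes them to a citation file, identifies each document's
# dominant language with a pretrained fastText model, and collects per-month and
# per-language statistics.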
import fasttext
import json
import os
import re
from codecs import encode
from tqdm import tqdm
from ast import literal_eval
from config import citationfile, years, DEBUG, keywords, DATA_PATH, PRETRAINED_MODEL_PATH, languages
import chardet
from language_extracter import language_extractions
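# config.py is expected to provide the names imported above. Illustrative values
# (assumptions for documentation only, not taken from the original repository):
#   DATA_PATH             = "data/"             # directory of per-article JSON files
#   citationfile          = "citations.txt"     # output file for matched citations
#   years                 = r"191[89]-.*"       # regex matched against each article's 'date'
#   keywords              = ["influenza", "grippe"]   # patterns searched in every sentence
#   PRETRAINED_MODEL_PATH = "lid.176.bin"       # fastText language-identification model
#   languages             = ["en", "de", "fr"]  # forwarded to language_extractions
#   DEBUG                 = False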
model = fasttext.load_model(PRETRAINED_MODEL_PATH)

overall_stats = {}  # documents per identified language
sentdict = {}       # sentence -> source file name, used to deduplicate sentences
monthstats = {}     # publications per month (extracted from the 'date' field)
if not DEBUG:
    citations = open(citationfile, "w+")

for root, dirs, files in os.walk(DATA_PATH, topdown=True):
    for name in tqdm(files):
        with open(os.path.join(DATA_PATH, name), mode="r+") as f:
            try:
                data = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError):
                print("File %s has errors" % os.path.join(DATA_PATH, name))
                continue
            # Default is to process all years; set a narrower `years` filter in config.py otherwise.
            if re.match(years, data['date']):
                if DEBUG:
                    print("Publication date: %s" % data['date'])
                # Collect statistics on publication months.
                g = re.search(r'\d+-(\d+)-\d+', data['date'])
                pubmonth = g.group(1)
                if pubmonth in monthstats:
                    monthstats[pubmonth] = monthstats[pubmonth] + 1
                else:
                    monthstats[pubmonth] = 1
                # The 'text' field holds a Python bytes literal; recover and decode it.
                x = literal_eval(data['text'])
                temp = x.decode('utf-8').split('\r\n')
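                # Split the decoded text into lines and keep only those matching a keyword.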
                all_sentences = []
                for i in temp:
                    all_sentences.extend(i.split('\n'))
                sentences = []
                for item in all_sentences:
                    for keyword in keywords:
                        if re.search(keyword, item, re.IGNORECASE):
                            if item not in sentdict:
                                sentences.append(item)
                                sentdict[item] = name
                                if DEBUG:
                                    print("[%s] %s" % (data['date'], item))
                                else:
                                    try:
                                        citation = "\"%s\"\n\tSource: %s, %s, %s" % (item, data['alternative'], data['date'], data['url'])
                                    except KeyError:
                                        print("File name: {} has insufficient attributes, adding just the source URL".format(os.path.join(DATA_PATH, name)))
                                        citation = "\"%s\"\n\tSource: %s" % (item, data['url'])
                                    citations.write("%s\n\n" % citation)
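                # Language identification: count confident (probability > 0.7) fastText
                # predictions per language label across the matched sentences.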
                predictions = model.predict(sentences)
                doc_lang = {}
                for i in range(len(sentences)):
                    if predictions[1][i][0] > 0.7:
                        key = predictions[0][i][0].replace('__label__', '')
                        if key in doc_lang:
                            doc_lang[key] += 1
                        else:
                            doc_lang[key] = 1
                # Sort languages by frequency; the most frequent label wins.
                doc_lang = {k: v for k, v in sorted(doc_lang.items(), key=lambda item: item[1], reverse=True)}
                if len(doc_lang) == 0:
                    # print(name)
                    identified_lang = 'failed_to_identify'
                else:
                    identified_lang = list(doc_lang)[0]
                data['lang_iso'] = identified_lang
                if identified_lang in overall_stats:
                    overall_stats[identified_lang] += 1
                else:
                    overall_stats[identified_lang] = 1
                # Write the updated record (now carrying 'lang_iso') back to the same file.
                f.seek(0)
                f.truncate()
                json.dump(data, f)
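# Final reporting: per-month publication counts and per-language document counts.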
print(monthstats)
if not DEBUG:
    citations.close()
print(overall_stats)

# Run the separate language extraction to check the English sentences.
language_extractions(languages, model, keywords)