preprocess.py
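"""Preprocessing utilities for the labeled extraction data.

Loads sentences, triples, and labels from a labeled-extractions file,
normalizes the text, builds word and label vocabularies, and maps
tokens to integer IDs.
"""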
import os
import re
import sys
import unicodedata
import pprint
from collections import defaultdict
from random import shuffle

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

def unicodeToAscii(s):
    """Strip accents by decomposing to NFD and dropping combining marks."""
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalizeString(s):
    """Lower-case, strip accents, and reduce the text to a restricted character set."""
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"\"", r"'", s)   # turn double quotes into apostrophes
    s = re.sub(r"/", r"", s)     # drop slashes
    # replace any run of characters outside the allowed set with a single space
    s = re.sub(r"[^a-zA-Z0-9.,'!?$\[\]\"<>-]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()   # collapse repeated whitespace
    return s

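# Illustrative example (hypothetical input) of what normalizeString produces:
#   normalizeString('Él dijo: "Hola!"')  ->  "el dijo 'hola!'"
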
def load_data(path):
    """
    Read the labeled-extraction file.

    Plain lines are sentences; lines starting with a digit hold a triple and its
    label, and each valid triple is paired with the most recent sentence.
    """
    with open(path, "r", encoding='iso-8859-15') as f:
        triples = []
        labels = []
        sentences = []
        sentence = ""
        for line in f:
            if line[0].isdigit():
                triple, label = split(line)
                if triple and label:
                    triples.append(triple)
                    labels.append(label)
                    sentences.append(normalizeString(sentence))
            else:
                sentence = line[:-1]   # strip the trailing newline
    return sentences, triples, labels

def split(line):
    """Split a digit-prefixed line into a normalized triple string and its label."""
    triple, label = '', ''
    fields = line[2:-1].split('\t')   # drop the leading index and trailing newline
    for i in fields[:-1]:
        triple += i[1:-1] + " "       # strip the surrounding quotes from each field
    label = fields[-1]
    return normalizeString(triple), label

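# Assumed layout of the input file (hypothetical sample), inferred from load_data/split:
# a plain sentence line, followed by digit-prefixed, tab-separated lines whose quoted
# fields form the triple and whose last field is the label, e.g.
#   the cat sat on the mat
#   1<TAB>"the cat"<TAB>"sat on"<TAB>"the mat"<TAB>1
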
def build_word_dict(sentences):
    """Count word frequencies over whitespace-tokenized sentences."""
    word_dict = defaultdict(int)
    for s in sentences:
        for word in s.split(' '):
            word_dict[word] += 1
    return word_dict

def build_label_dict(labels):
    """Count label frequencies; each label string is iterated character by character."""
    label_dict = defaultdict(int)
    for s in labels:
        for c in s:
            label_dict[c] += 1
    return label_dict

def create_mapping_with_unk(dico):
    """
    Create word/ID mappings ordered by decreasing frequency,
    reserving ID 0 for "<pad>" and ID 1 for "<unk>".
    """
    sorted_items = sorted(dico.items(), key=lambda x: (-x[1], x[0]))
    id_to_word = {index + 2: w[0] for (index, w) in enumerate(sorted_items)}
    word_to_id = {v: k for k, v in id_to_word.items()}
    id_to_word[0] = "<pad>"
    word_to_id["<pad>"] = 0
    id_to_word[1] = "<unk>"
    word_to_id["<unk>"] = 1
    return word_to_id, id_to_word

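# Toy example (illustrative counts only) of the resulting mapping:
#   word_to_id, id_to_word = create_mapping_with_unk({"cat": 3, "sat": 1})
#   word_to_id  ->  {"cat": 2, "sat": 3, "<pad>": 0, "<unk>": 1}
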
def create_mapping(dico):
    """
    Create a mapping (item to ID / ID to item) from a dictionary.
    Items are ordered by decreasing frequency; ID 0 is reserved for "O".
    """
    sorted_items = sorted(dico.items(), key=lambda x: (-x[1], x[0]))
    # keep "O" out of the enumeration so no other item collides with it at ID 0
    id_to_item = {i + 1: v[0] for i, v in enumerate(item for item in sorted_items if item[0] != "O")}
    item_to_id = {v: k for k, v in id_to_item.items()}
    id_to_item[0] = "O"
    item_to_id["O"] = 0
    return item_to_id, id_to_item

def len_argsort(seq):
    """
    Sort the indices of a list of lists by the length of each inner list.
    Argument: seq (list of lists, e.g. [[1, 2, 3], [4, 5], ['c', 'd', 'e', 'g', 'f', 'k']])
    Output: the sorted indices (e.g. [1, 0, 2])
    """
    return sorted(range(len(seq)), key=lambda x: len(seq[x]))

def filter_long_index(triples, lim):
    """Return a boolean mask marking triples shorter than `lim` tokens."""
    return [len(triple) < lim for triple in triples]

def random_array(sent, triple, label, sword_to_id, label_to_id):
    """Map words/labels to IDs and return the three arrays in one shared random order."""
    unk = sword_to_id["<unk>"]
    # dtype=object because the sequences have different lengths;
    # out-of-vocabulary words fall back to the "<unk>" ID
    sents = np.array([[sword_to_id.get(w, unk) for w in s.split(" ")] for s in sent], dtype=object)
    triples = np.array([[sword_to_id.get(w, unk) for w in s.split(" ")] for s in triple], dtype=object)
    labels = np.array([[label_to_id[c] for c in s] for s in label], dtype=object)
    idx = np.random.permutation(len(sents))
    return sents[idx], triples[idx], labels[idx]

def write_txt(path, lines):
    """Append each string in `lines` to the file at `path`, one per line."""
    with open(path, "a") as f:
        for i in lines:
            f.write(i + '\n')

class sent_triple_dataset(Dataset):
    """Placeholder for a torch Dataset over sentences, triples, and labels (not implemented yet)."""
    pass

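# A minimal sketch (an assumption, not part of the original design) of how the
# dataset class above could wrap the id arrays produced by random_array(), so a
# torch DataLoader can iterate over them:
class _SentTripleDatasetSketch(Dataset):
    def __init__(self, sents, triples, labels):
        self.sents = sents
        self.triples = triples
        self.labels = labels

    def __len__(self):
        return len(self.sents)

    def __getitem__(self, idx):
        # each item is a (sentence ids, triple ids, label ids) tuple of Python lists
        return self.sents[idx], self.triples[idx], self.labels[idx]
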
def test():
    pwd = os.getcwd()
    path = os.path.join(pwd, "data/extractions-all-labeled.txt")
    sent, triple, label = load_data(path)
    sent_path = os.path.join(pwd, "data/sents.txt")
    triple_path = os.path.join(pwd, "data/triples.txt")
    label_path = os.path.join(pwd, "data/labels.txt")
    word_dict = build_word_dict(sent + triple)
    label_dict = build_label_dict(label)
    print("vocabulary size", len(word_dict))
    sword_to_id, id_to_sword = create_mapping_with_unk(word_dict)
    label_to_id, id_to_label = create_mapping(label_dict)
    triples = np.array([[sword_to_id[w] for w in s.split(" ")] for s in triple], dtype=object)
    labels = np.array([[label_to_id[c] for c in s] for s in label], dtype=object)
    index = filter_long_index(triples, 20)
    idx = np.array(index)
    ll = labels[idx]
    print("sum", sum(index))
    print("sum l", sum(len(l) for l in labels))
    print("sum ll", sum(len(l) for l in ll))
    # peek at the first few raw lines of the input file
    with open(path, encoding='iso-8859-15') as f:
        files = f.readlines()
    print(files[:10])

if __name__ == '__main__':
    test()