-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDataHandler.py
393 lines (310 loc) · 14.8 KB
/
DataHandler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
import os
import matplotlib.image as mpimg
import pandas as pd
from utils import unique
from tqdm import tqdm
import numpy as np
import cv2
import zipfile
import shutil
import copy
import matplotlib.pyplot as plt
import sys
class DataHandler:
    """
    Load the NIH chest X-ray sample dataset and expose it as numpy arrays.

    On construction the dataset is downloaded (if needed), every PNG of the image folder is read, resampled and
    flattened, and the label csv is converted to a binary matrix whose rows follow the same order as the images.
    Three views of the data are built: all labels, sick/healthy flag, and sick patients only.
    """

    # Name of the csv label that marks a healthy patient.
    NO_FINDING_LABEL = 'No Finding'

    def __init__(self, label_full_path, image_path, resampled_width=64, resampled_height=64):
        """
        Download the dataset if it is missing, then load and format it.
        :arg
            label_full_path (str): path of the csv file holding the labels
            image_path (str): path of the folder holding the png images
            resampled_width (int): width every image is resampled to
            resampled_height (int): height every image is resampled to
        """
        self._download()
        self.label_ = None
        self.image_list_ = None
        self.label_full_path = label_full_path
        self.image_path = image_path
        self.resampled_width = resampled_width
        self.resampled_height = resampled_height
        self.all_data, self.sick_bool_data, self.only_sick_data = self._get_formated_data()

    def _download(self):
        """
        Function that install the dataset in the data directory. It downloads the dataset with the kaggle API and
        unzip the zip file downloaded (sample.zip). Nothing is done when ../data/sample already exists.
        :arg
            self (DataHandler): instance of the class
        :return
            None
        """
        check_dir = '../data/sample'
        if os.path.isdir(check_dir):
            print('Kaggle dataset already downloaded !')
            return None

        os.system('kaggle datasets download -d nih-chest-xrays/sample')

        save_directory = '../data'
        os.makedirs(save_directory, exist_ok=True)

        zip_file = 'sample.zip'
        try:
            print('Extracting zip file from the Kaggle dataset...')
            with zipfile.ZipFile(zip_file, 'r') as zip_obj:
                zip_obj.extractall(save_directory)
        except FileNotFoundError:
            # The archive is missing, which means the kaggle command above did not produce it.
            print('Could not download dataset. Please make sure that you Kaggle API token (kaggle.json) is in '
                  'the directory ~/.kaggle')
            print('If your token is in the directory, create a new one.')
            # Non-zero status so that callers/shells can detect the failure.
            sys.exit(1)
        else:
            os.remove(zip_file)

        # The extracted tree may contain a nested sample/sample folder; remove it only when it is there.
        nested_directory = '../data/sample/sample'
        if os.path.isdir(nested_directory):
            shutil.rmtree(nested_directory)
        print('Kaggle dataset extracted extracted')
        return None

    def _import_png_folder(self):
        """
        Function that load the images of the images folder and places them in a list.
        :arg
            self (DataHandler): instance of the class
        :return
            image_list (list): List of resampled images. The format of images are 2D array
            id_list (list): List of string corresponding to the file name of each image, in the same order
        """
        # List the folder once and sort it so that the image order is reproducible on every file system.
        png_files = sorted(name for name in os.listdir(self.image_path) if name.endswith(".png"))

        image_list = []
        id_list = []
        print('Downloading images into the memory')
        for file_name in tqdm(png_files):
            image = mpimg.imread(os.path.join(self.image_path, file_name))
            if image.ndim > 2:
                # Multi-channel image: keep the first channel only.
                image = image[:, :, 0]
            image_list.append(self.resample_image(image, self.resampled_width, self.resampled_height))
            id_list.append(file_name)

        self.image_list_ = image_list
        return image_list, id_list

    def _import_csv(self):
        """
        Function that read the csv in which the target of each image is written.
        :arg
            self (DataHandler): instance of the class
        :return
            df (pandas dataframe): first column is the image identifier (first column of the csv), every other
                                   column is a binary indicator for one distinct value found in the
                                   'Finding Labels' column. Each row correspond to an image.
        """
        df = pd.read_csv(self.label_full_path)
        df = df.iloc[:, [0, 1]].copy()

        findings = df['Finding Labels'].str.split('|')
        unique_list = unique([label for labels in findings for label in labels])

        for value in unique_list:
            # Exact membership in the split list; a substring/regex match could confuse overlapping names.
            df[value] = findings.apply(lambda labels, value=value: value in labels).astype(int)

        return df.drop(labels=['Finding Labels'], axis=1)

    @staticmethod
    def _align_labels(labels, id_list):
        """
        Function that keep the label rows whose identifier is in id_list and order them like id_list.
        :arg
            labels (pandas dataframe): dataframe whose first column is the image identifier
            id_list (list): identifiers in the wanted order
        :return
            aligned (pandas dataframe): matching rows of labels, ordered like id_list
        """
        id_column = labels.columns[0]
        matched = labels[labels[id_column].isin(id_list)]
        rank_of = {image_id: rank for rank, image_id in enumerate(id_list)}
        ranks = matched[id_column].map(rank_of).to_numpy()
        return matched.iloc[np.argsort(ranks, kind='stable')]

    @staticmethod
    def _split_datasets(image_flatten_array, labels):
        """
        Function that build the three (images, labels) tuples from aligned images and labels.
        :arg
            image_flatten_array (numpy array): 2D numpy array, one flattened image per row
            labels (pandas dataframe): binary dataframe, one column per label, rows aligned with the images
        :return
            (image_flatten_array, all_labels), (image_flatten_array, bool_sick_labels),
            (only_sick_image, only_sick_labels): see _get_formated_data
        """
        all_labels = np.array(labels)

        columns = list(labels.columns)
        if DataHandler.NO_FINDING_LABEL in columns:
            # Locate the healthy column by name instead of relying on its position.
            no_finding_idx = columns.index(DataHandler.NO_FINDING_LABEL)
            bool_sick_labels = 1 - all_labels[:, no_finding_idx]
        else:
            # No image is flagged healthy, so every image is a sick one.
            no_finding_idx = None
            bool_sick_labels = np.ones(all_labels.shape[0], dtype=int)

        sick_rows = np.nonzero(bool_sick_labels)[0]
        only_sick_labels = all_labels[sick_rows, :]
        if no_finding_idx is not None:
            only_sick_labels = np.delete(only_sick_labels, no_finding_idx, axis=1)
        only_sick_image = image_flatten_array[sick_rows, :]

        return (image_flatten_array, all_labels),\
               (image_flatten_array, bool_sick_labels),\
               (only_sick_image, only_sick_labels)

    def _get_formated_data(self):
        """
        Function that format the data in numpy array
        :arg
            self (DataHandler): instance of the class
        :return
            (image_flatten_array, all_labels) (tuple of numpy array):
                image_flatten_array (numpy array): 2D numpy array, each column is a pixel value and each row
                                                   correspond to a new image
                all_labels (numpy array): 2D binary numpy array, each column correspond to a pathology including
                                          the No Finding as a pathology and each row correspond to a new image
            (image_flatten_array, bool_sick_labels) (tuple of numpy array):
                image_flatten_array (numpy array): same array as above
                bool_sick_labels (numpy array): 1D binary numpy array. 1 means that the image at this index is
                                                the image of a sick patient. 0 means a healthy patient
            (only_sick_image, only_sick_labels) (tuple of numpy array):
                only_sick_image (numpy array): rows of image_flatten_array of the sick patients only
                only_sick_labels (numpy array): 2D binary numpy array, each column correspond to a pathology. No
                                                Finding is not included as a pathology. Only the rows of sick
                                                patients are kept.
        """
        image_list_resample, id_list = self._import_png_folder()

        # Put the label rows in the same order as the images (the csv order is unrelated to the folder order).
        labels = self._align_labels(self._import_csv(), id_list)

        # Drop images that have no row in the csv, otherwise images and labels would not line up.
        labelled_ids = set(labels.iloc[:, 0])
        image_list_resample = [image for image, image_id in zip(image_list_resample, id_list)
                               if image_id in labelled_ids]

        labels = labels.iloc[:, 1:]
        image_flatten_array = np.asarray(self.flatten(image_list_resample))

        self.label_ = labels
        return self._split_datasets(image_flatten_array, labels)

    def get_all_data(self):
        """
        Function that return the instance variable self.all_data
        :arg
            self (DataHandler): instance of the class
        :return
            self.all_data (tuple of numpy array):
                self.all_data[0] (numpy array): 2D numpy array, each column is a pixel value and each row
                                                correspond to a new image
                self.all_data[1] (numpy array): 2D binary numpy array, each column correspond to a pathology
                                                including the No Finding as a pathology and each row correspond
                                                to a new image
        """
        return self.all_data

    def get_sick_bool_data(self):
        """
        Function that return the instance variable self.sick_bool_data
        :arg
            self (DataHandler): instance of the class
        :return
            self.sick_bool_data (tuple of numpy array):
                self.sick_bool_data[0] (numpy array): 2D numpy array, each column is a pixel value and each row
                                                      correspond to a new image
                self.sick_bool_data[1] (numpy array): 1D binary numpy array. 1 means that the image at this
                                                      index is the image of a sick patient. 0 means a healthy
                                                      patient
        """
        return self.sick_bool_data

    def get_only_sick_data(self):
        """
        Function that return the instance variable self.only_sick_data
        :arg
            self (DataHandler): instance of the class
        :return
            self.only_sick_data (tuple of numpy array):
                self.only_sick_data[0] (numpy array): 2D numpy array, each column is a pixel value and each row
                                                      correspond to a new image. Only the sick patient are
                                                      represented
                self.only_sick_data[1] (numpy array): 2D binary numpy array, each column correspond to a
                                                      pathology. No Finding is not included as a pathology.
                                                      Only the rows of sick patients are kept.
        """
        return self.only_sick_data

    def flatten(self, image: list):
        """
        Function that flatten a list of 2D numpy array
        :arg
            self (DataHandler): instance of the class
            image (list): list of 2D numpy array to be flatten
        :return
            image (list): list of 1D numpy array (row-major order)
        """
        return [x.flatten(order='C') for x in image]

    def resample_image(self, image, width: int, heigth: int):
        """
        Function that resample a 2D numpy array with nearest neighbour interpolation
        :arg
            self (DataHandler): instance of the class
            image (numpy array): 2D numpy array
            width (int): width that the image will be resample
            heigth (int): heigth that the image will be resample
        :return
            resampled (numpy array): resampled 2D numpy array
        """
        return cv2.resize(image, dsize=(width, heigth), interpolation=cv2.INTER_NEAREST)

    @staticmethod
    def _plot_horizontal_bars(values, names, title, colors=None, centered=False):
        """
        Function that draw one annotated horizontal bar plot in a new figure (non blocking).
        :arg
            values (sequence): length of each bar
            names (sequence): tick label of each bar
            title (str): title of the plot
            colors (list): optional colors of the bars
            centered (bool): if True the count is written at the middle of the bar instead of at its end
        :return
            None
        """
        y = np.arange(len(names))
        fig, ax = plt.subplots()
        rects = ax.barh(y, values, color=colors)
        ax.set_xlabel('Number of people')
        ax.set_title(title)
        ax.set_yticks(y)
        ax.set_yticklabels(names)
        for rect in rects:
            width = rect.get_width()
            if centered:
                ax.annotate('{}'.format(width),
                            xy=(rect.get_x() + width / 2, rect.get_y()),
                            xytext=(0, 50),
                            textcoords="offset points",
                            ha='center', va='center')
            else:
                ax.annotate('{}'.format(width),
                            xy=(rect.get_x() + width, rect.get_y()),
                            xytext=(15, 2),
                            textcoords="offset points",
                            ha='center', va='bottom')
        plt.show(block=False)

    def plot_data(self):
        """
        Function that plot the bar plot of the pathology amongst patients including No Finding,
                               the bar plot excluding No Finding,
                               the bar plot of sick vs not sick
        :arg
            self (DataHandler): instance of the class
        :return
            value_hist_sick (tuple): number of images per pathology (No Finding excluded), in increasing order
        """
        array_labels = np.asarray(self.label_)
        counts = np.sum(array_labels, axis=0).tolist()
        names = self.label_.columns.values.tolist()
        # Ascending by count, ties broken by label name.
        ranked = sorted(zip(counts, names))

        # Dataset histogram
        value_hist = tuple(count for count, _ in ranked)
        label_names = tuple(name for _, name in ranked)
        self._plot_horizontal_bars(value_hist, label_names, 'Distribution of the dataset')

        # Sick people histogram: the healthy label is removed by name, wherever it ranks.
        sick_ranked = [(count, name) for count, name in ranked if name != self.NO_FINDING_LABEL]
        value_hist_sick = tuple(count for count, _ in sick_ranked)
        label_names_sick = tuple(name for _, name in sick_ranked)
        self._plot_horizontal_bars(value_hist_sick, label_names_sick, 'Distribution of sick people')

        # Sick vs not sick histogram
        healthy_count = sum(count for count, name in ranked if name == self.NO_FINDING_LABEL)
        sick_vs_not = [healthy_count, array_labels.shape[0] - healthy_count]
        self._plot_horizontal_bars(sick_vs_not, ['Healthy', 'Sick'], 'Sick vs Healthy people',
                                   colors=['g', 'r'], centered=True)

        return value_hist_sick

    def show_samples(self):
        """
        Function that plot the first images (at most 16) of the dataset on a 4 x 4 grid
        :arg
            self (DataHandler): instance of the class
        :return
            None
        """
        # Size this figure directly; changing rcParams after its creation would not affect it.
        plt.figure(figsize=(10.0, 10.0))
        plt.subplots_adjust(wspace=0, hspace=0)
        plt.suptitle('Samples of dataset')
        for idx, image in enumerate(self.image_list_[:16]):
            im = self.resample_image(image, 128, 128)
            plt.subplot(4, 4, idx + 1)
            # Stored images are single channel, so they are displayed with a gray colormap.
            # NOTE(review): vmin/vmax assume pixel values in [0, 1] as returned by mpimg.imread for png files.
            plt.imshow(im, cmap='gray', vmin=0, vmax=1)
            plt.axis('off')
        plt.show(block=False)