-
Notifications
You must be signed in to change notification settings - Fork 2
/
baseline_utils.py
123 lines (115 loc) · 4.77 KB
/
baseline_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import os.path
import csv
import numpy as np
import pandas as pd
import PIL.Image
from sklearn.externals import joblib
from config import *
""" Melanie Bernhardt - Laura Manduchi - Melanie Gaillochet.
This file is a helper file for the preprocessing and feature matrix building
for the baselines presented in the project.
"""
def csv_to_dict(csv_path):
    """Transform the score/label csv file into a dictionary.

    The first line of the file is treated as a header and skipped. Blank
    lines are ignored. Keys and values are kept exactly as the strings read
    from the file (no numeric conversion is done here).

    Args:
        csv_path: the path to the file to transform.
    Returns:
        d: the corresponding dictionary, mapping the first column of each row
           to its second column. Empty if the file has no data rows.
    Raises:
        ValueError: if a non-blank row does not have exactly two columns.
    """
    d = {}
    with open(csv_path, 'r') as fp:
        csv_fp = csv.reader(fp)
        # Skip the header. The default keeps an empty file from leaking
        # StopIteration out of this function.
        next(csv_fp, None)
        for row in csv_fp:
            if not row:
                # Blank line in the file.
                continue
            if len(row) != 2:
                raise ValueError(
                    "{}: line {}: expected 2 columns, got {}".format(
                        csv_path, csv_fp.line_num, len(row)))
            key, value = row
            # On duplicate keys the last row wins.
            d[key] = value
    return d
def extract_feats(img_arr, bins, value_range=None):
    """Extract simple histogram features from one image.

    Args:
        img_arr: one input image to process (np.histogram flattens it, so
                 the array layout does not affect the result).
        bins: the number of bins for the histogram features.
        value_range: optional (lower, upper) bounds of the histogram. With the
                     default None the bin edges span the min and max of
                     img_arr itself, so they differ from image to image;
                     pass a fixed range to get bins that are comparable
                     across images.
    Returns:
        hist: histogram of the input image used as features for the baseline regressors.
    """
    hist, _ = np.histogram(img_arr, bins=bins, range=value_range)
    return hist
def _baseline_cache_paths(train_ratio, feat_size):
    """Return the cache file paths for (train_mat, train_y, test_mat, test_y)."""
    suffix = '_' + str(train_ratio) + '_' + str(feat_size) + '.npy'
    # NOTE: the prefix is glued on by plain string concatenation (not
    # os.path.join), so the files only land inside data_folder when it ends
    # with a path separator.
    return tuple(data_folder + name + suffix
                 for name in ('train_mat', 'train_y', 'test_mat', 'test_y'))


def preprocessing_baseline_train_test(train_ratio, feat_size):
    """This is the main preprocessing function for feature matrix building.

    The four arrays are cached on disk as .npy files keyed by train_ratio and
    feat_size. When all four can be loaded they are returned directly;
    otherwise they are recomputed from the scored images and saved.

    The split is made on a permutation drawn after np.random.seed(10) (this
    reseeds NumPy's global generator): the first n_test shuffled images go to
    the dev split, the remaining ones to the training split.

    Args:
        train_ratio(float between 0 and 1): train/dev split ratio to use
        feat_size(int): number of features to use for the feature matrix
    Returns:
        (train_mat, train_y, test_mat, test_y): train_mat is the feature matrix to use for training
                                                train_y is a vector with the scores for the training split
                                                test_mat is the feature matrix to use for validation
                                                test_y is a vector with the scores for the dev split
    """
    cache_paths = _baseline_cache_paths(train_ratio, feat_size)
    try:
        # Get preprocessed data if already available
        train_mat, train_y, test_mat, test_y = [np.load(path) for path in cache_paths]
    except (IOError, OSError, ValueError, EOFError):
        # Missing or unreadable cache files: compute the data and save it
        # (because it takes time to compute). Anything else (e.g. Ctrl-C)
        # is deliberately not swallowed.
        print("Couldn't find matrices; preprocessing data")
    else:
        return (train_mat, train_y, test_mat, test_y)

    ### DATA LOADING ###
    # Print parameters used
    print('feature_size: {}'.format(feat_size))
    # Default paths to the scored training data
    scored_path = os.path.join(data_folder, "scored")
    score_file = os.path.join(data_folder, "scored.csv")

    # Randomly shuffle data - seed to ensure that we use the same training data as our final model.
    np.random.seed(10)

    # Create a dictionary -> Image number: score (original order not kept)
    score_dict = csv_to_dict(score_file)
    # Create a dataframe with Id and score (original order kept)
    score_df = pd.read_csv(score_file)
    # Pull the Id column out once instead of building a row Series per image.
    image_ids = score_df['Id'].values
    num_images = score_df.shape[0]
    print("num images: {}".format(num_images))
    shuffled_indices = np.random.permutation(num_images)
    n_train = int(train_ratio * num_images)
    n_test = num_images - n_train
    print("n train: {}".format(n_train))
    print("n test: {}".format(n_test))

    # Initialize the feature matrices
    train_mat = np.zeros((n_train, feat_size))
    print("train mat shape: {}".format(train_mat.shape))
    train_y = np.zeros(n_train)
    test_mat = np.zeros((n_test, feat_size))
    print("test mat shape: {}".format(test_mat.shape))
    test_y = np.zeros(n_test)

    ### FEATURE EXTRACTION ###
    # Assemble train/test feature matrices / score vectors
    for counter, idx in enumerate(shuffled_indices):
        if counter % 500 == 0:
            print("Image: {}/{}".format(counter, num_images))
        img_index = int(image_ids[idx])
        # loading the image; the with-block closes the file handle right away
        # instead of leaving one open per image
        img_file = os.path.join(scored_path, "{}.png".format(img_index))
        with PIL.Image.open(img_file) as raw_image:
            # PIL reports size as (width, height); NumPy wants (rows, cols).
            width, height = raw_image.size
            img_arr = np.array(raw_image.getdata()).reshape(height, width).astype(np.uint8)
        # extracting features
        img_feats = extract_feats(img_arr=img_arr, bins=feat_size)
        # extracting score
        score = float(score_dict[str(img_index)])
        if counter < n_test:
            # the first n_test shuffled images form the dev set
            test_mat[counter, :] = img_feats
            test_y[counter] = score
        else:
            # everything after that goes to the training set
            train_row = counter - n_test
            train_mat[train_row, :] = img_feats
            train_y[train_row] = score

    # Saving features/scores to disk
    print("Saving feature matrices...")
    for path, arr in zip(cache_paths, (train_mat, train_y, test_mat, test_y)):
        np.save(path, arr)
    return (train_mat, train_y, test_mat, test_y)