-
Notifications
You must be signed in to change notification settings - Fork 1
/
utils.py
executable file
·106 lines (70 loc) · 3.12 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import numpy as np
import librosa as lbr
import tensorflow as tf
import os
# Module-level shorthand for TensorFlow's contrib "slim" package.
# NOTE(review): `tf.contrib` is only shipped with TensorFlow 1.x - confirm the
# pinned TensorFlow version before upgrading.
slim = tf.contrib.slim
# Genre names known to this module.
# NOTE(review): presumably the list index is the integer label stored under
# the 'genres' TFRecord key - confirm against the code that writes the records.
GENRES = [
    'blues',
    'classical',
    'country',
    'disco',
    'hiphop',
    'jazz',
    'metal',
    'pop',
    'reggae',
    'rock',
]

# STFT framing: window length in samples, with the hop set to half a window.
WINDOW_SIZE = 2048
WINDOW_STRIDE = WINDOW_SIZE // 2

# Number of mel bands requested from the spectrogram.
N_MELS = 128

# Keyword arguments unpacked into librosa's melspectrogram call.
MEL_KWARGS = dict(
    n_fft=WINDOW_SIZE,
    hop_length=WINDOW_STRIDE,
    n_mels=N_MELS,
)
def get_default_shape(dataset_path):
    """Return the feature-matrix shape of a fixed reference track.

    The reference track is ``blues/blues.00000.au`` below *dataset_path*. It
    is loaded through ``load_track`` without shape enforcement, and the shape
    of the resulting feature array is returned.

    Args:
        dataset_path: Directory that contains the ``blues`` sub-directory.

    Returns:
        The ``.shape`` tuple of the reference track's feature array.
    """
    reference_file = os.path.join(dataset_path, 'blues/blues.00000.au')
    reference_features, _duration = load_track(reference_file)
    return reference_features.shape
def load_track(filename, enforce_shape=None):
    """Load an audio file and return its log-mel-spectrogram and duration.

    Args:
        filename: Path of the audio file; forwarded to ``librosa.load`` with
            ``mono=True``.
        enforce_shape: Optional ``(rows, cols)`` pair. When given, the
            transposed spectrogram is zero-padded or truncated along axis 0
            so that it has exactly ``rows`` rows. ``cols`` is only used as
            the width of the padding block and must therefore match the
            spectrogram's second dimension.

    Returns:
        A tuple ``(log_features, duration)``: the natural logarithm of the
        transposed mel-spectrogram, and the number of loaded samples divided
        by the sample rate.
    """
    new_input, sample_rate = lbr.load(filename, mono=True)

    # Pass the signal and sample rate by keyword: recent librosa releases
    # made these arguments keyword-only, and the keyword form is also
    # accepted by older releases.
    features = lbr.feature.melspectrogram(
        y=new_input, sr=sample_rate, **MEL_KWARGS).T

    if enforce_shape is not None:
        if features.shape[0] < enforce_shape[0]:
            # Too short: append rows of zeros at the end.
            delta_shape = (enforce_shape[0] - features.shape[0],
                           enforce_shape[1])
            features = np.append(features, np.zeros(delta_shape), axis=0)
        elif features.shape[0] > enforce_shape[0]:
            # Too long: keep only the leading rows.
            features = features[: enforce_shape[0], :]

    # Replace exact zeros (including any padding) so that log() stays finite.
    features[features == 0] = 1e-6
    return (np.log(features), float(new_input.shape[0]) / sample_rate)
def read_and_decode(filename, dataset_path='GTZAN/genres'):
    """Build the ops that read and decode one example from a TFRecord file.

    Uses the TensorFlow 1.x queue-based input pipeline: a filename queue
    feeds a ``TFRecordReader``, and each serialized example is parsed into a
    genre label and a flattened feature vector, which is reshaped back into
    a matrix with a trailing singleton axis.

    Args:
        filename: Path of the TFRecord file to read.
        dataset_path: Dataset directory handed to ``get_default_shape`` to
            obtain the shape the flattened features are reshaped to.
            Defaults to ``'GTZAN/genres'``, the location that was previously
            hard-coded.

    Returns:
        A tuple ``(au, labels)``: ``au`` is the feature tensor reshaped to
        the default shape with an extra axis appended at position 2, and
        ``labels`` is the ``'genres'`` feature cast to ``tf.int32``.
    """
    # Queue holding the single input file name.
    filename_queue = tf.train.string_input_producer([filename])
    reader = tf.TFRecordReader()
    # reader.read yields (key, serialized record); the key is not needed.
    _, serialized_example = reader.read(filename_queue)

    features = tf.parse_single_example(
        serialized_example,
        features={
            'genres': tf.FixedLenFeature((1,), tf.int64),
            'au_flattern': tf.VarLenFeature(tf.float32),
        })

    labels = tf.cast(features['genres'], tf.int32)

    # VarLenFeature parses to a sparse tensor; densify it before reshaping.
    au_flattern = tf.sparse_tensor_to_dense(features['au_flattern'])
    au = tf.reshape(au_flattern, get_default_shape(dataset_path))
    au = tf.expand_dims(au, axis=2)
    return au, labels
def audio_augmention(data, sr):
    """Return a randomly augmented version of an audio signal.

    Three augmentations are applied in sequence:

    1. Additive white noise: standard-normal samples scaled by 0.005.
    2. Pitch shift by a random integer number of steps drawn from
       ``np.random.randint(-10, 10)`` (upper bound exclusive).
    3. Volume change by a random gain drawn uniformly from ``[0.5, 2)``.

    Args:
        data: One-dimensional audio signal (its ``len`` sets the noise size).
        sr: Sample rate of *data*, forwarded to ``librosa.effects.pitch_shift``.

    Returns:
        The augmented signal as returned by the pitch shifter, scaled by the
        random gain. The input array itself is not modified.
    """
    # Adding white noise (creates a new array; `data` is left untouched).
    wn = np.random.randn(len(data))
    data_wn = data + 0.005 * wn

    # Shifting the pitch. `sr` is passed by keyword because recent librosa
    # releases made it keyword-only; older releases accept the keyword too.
    steps = np.random.randint(-10, 10)
    data_sf = lbr.effects.pitch_shift(data_wn, sr=sr, n_steps=steps)

    # Changing volume.
    volume = np.random.uniform(.5, 2)
    data_sf *= volume
    return data_sf
def load_track_with_aug(filename, enforce_shape=None):
    """Load an audio file, augment it, and return log-mel features and duration.

    Same pipeline as the plain track loader, except that the loaded signal
    is passed through ``audio_augmention`` before the mel-spectrogram is
    computed.

    Args:
        filename: Path of the audio file; forwarded to ``librosa.load`` with
            ``mono=True``.
        enforce_shape: Optional ``(rows, cols)`` pair. When given, the
            transposed spectrogram is zero-padded or truncated along axis 0
            so that it has exactly ``rows`` rows. ``cols`` is only used as
            the width of the padding block and must therefore match the
            spectrogram's second dimension.

    Returns:
        A tuple ``(log_features, duration)``: the natural logarithm of the
        transposed mel-spectrogram of the augmented signal, and the number
        of samples in the un-augmented signal divided by the sample rate.
    """
    new_input, sample_rate = lbr.load(filename, mono=True)
    new_input_with_aug = audio_augmention(new_input, sample_rate)

    # Pass the signal and sample rate by keyword: recent librosa releases
    # made these arguments keyword-only, and the keyword form is also
    # accepted by older releases.
    features = lbr.feature.melspectrogram(
        y=new_input_with_aug, sr=sample_rate, **MEL_KWARGS).T

    if enforce_shape is not None:
        if features.shape[0] < enforce_shape[0]:
            # Too short: append rows of zeros at the end.
            delta_shape = (enforce_shape[0] - features.shape[0],
                           enforce_shape[1])
            features = np.append(features, np.zeros(delta_shape), axis=0)
        elif features.shape[0] > enforce_shape[0]:
            # Too long: keep only the leading rows.
            features = features[: enforce_shape[0], :]

    # Replace exact zeros (including any padding) so that log() stays finite.
    features[features == 0] = 1e-6
    return (np.log(features), float(new_input.shape[0]) / sample_rate)