I have built a tf.data pipeline for speech recognition with the following code snippets:
Python
x
37
37
1
def get_waveform_and_label(file_path):
    """Load a WAV file and derive its label from the file path.

    Args:
        file_path: Path of the WAV file to read.

    Returns:
        A ``(waveform, label)`` pair: the decoded audio with its last axis
        squeezed away, and the second-to-last component of ``file_path``
        (the name of the directory that contains the file).
    """
    # Read and decode in one step; the sample rate returned by decode_wav
    # is not needed here.
    decoded, _ = tf.audio.decode_wav(tf.io.read_file(file_path))

    # The label is the path component just before the file name.
    path_parts = tf.strings.split(file_path, os.path.sep)

    return tf.squeeze(decoded, axis=-1), path_parts[-2]
9
10
def get_spectrogram(waveform):
    """Convert a waveform into a fixed-size magnitude spectrogram.

    The waveform is cropped or zero-padded to exactly 16000 samples so that
    every clip yields a spectrogram of the same shape.

    Args:
        waveform: Audio samples; expected to be a 1-D tensor (the padding
            shape below is computed from ``tf.shape(waveform)``).

    Returns:
        The absolute value of the short-time Fourier transform of the
        fixed-length waveform.
    """
    target_length = 16000

    # Crop clips longer than the target length. Without this, the padding
    # length computed below is negative for long clips and tf.zeros fails
    # with "Dimension ... must be >= 0".
    waveform = waveform[:target_length]

    # Padding for files with less than 16000 samples:
    # generate as many zeros as the waveform lacks (none if already full).
    zero_padding = tf.zeros([target_length] - tf.shape(waveform), dtype=tf.float32)

    # Concatenate audio with padding so that all audio clips will be of the
    # same length.
    waveform = tf.cast(waveform, tf.float32)
    waveform = tf.concat([waveform, zero_padding], 0)

    spectrogram = tf.signal.stft(waveform, frame_length=255, frame_step=128)
    # Keep only the magnitude of the complex STFT output.
    spectrogram = tf.abs(spectrogram)

    return spectrogram
23
24
def get_spectrogram_and_label_id(audio, label):
    """Turn a ``(waveform, label)`` pair into ``(spectrogram, one-hot label)``.

    Args:
        audio: Waveform passed on to ``get_spectrogram``.
        label: Label compared against the entries of the module-level
            ``labels`` collection.

    Returns:
        The spectrogram with an extra trailing axis, and a one-hot vector of
        length ``len(labels)`` for the label.
    """
    # Add a trailing axis of size 1 to the spectrogram.
    features = tf.expand_dims(get_spectrogram(audio), -1)

    # Locate the label among the known labels, then one-hot encode its index.
    matches = label == np.array(labels)
    class_index = tf.argmax(matches)
    target = tf.one_hot(class_index, len(labels))

    return features, target
32
33
# Dataset of file paths.
files_ds = tf.data.Dataset.from_tensor_slices(files)

# Decode every file into a (waveform, label) pair.
waveform_ds = files_ds.map(
    get_waveform_and_label,
    num_parallel_calls=tf.data.AUTOTUNE,
)

# Convert every pair into (spectrogram, one-hot label).
spectrogram_ds = waveform_ds.map(
    get_spectrogram_and_label_id,
    num_parallel_calls=tf.data.AUTOTUNE,
)
36
37
These snippets are borrowed from https://www.tensorflow.org/tutorials/audio/simple_audio#build_and_train_the_model.
And my model is defined as below:
Python
1
31
31
1
import tensorflow as tf
2
3
# Input followed by batch normalisation of the raw features.
inputs = tf.keras.layers.Input(shape=input_shape)
x = tf.keras.layers.BatchNormalization()(inputs)

# Stage 1: 8 filters, 13x13 kernel.
x = tf.keras.layers.Conv2D(
    filters=8, kernel_size=13, strides=1, padding='same', activation='relu'
)(x)
x = tf.keras.layers.MaxPooling2D(pool_size=3)(x)
x = tf.keras.layers.Dropout(rate=0.4)(x)
x = tf.keras.layers.BatchNormalization()(x)

# Stage 2: 32 filters, 11x11 kernel.
x = tf.keras.layers.Conv2D(
    filters=32, kernel_size=11, strides=1, padding='same', activation='relu'
)(x)
x = tf.keras.layers.MaxPooling2D(pool_size=3)(x)
x = tf.keras.layers.Dropout(rate=0.4)(x)
x = tf.keras.layers.BatchNormalization()(x)

# Stage 3: 256 filters, 9x9 kernel.
x = tf.keras.layers.Conv2D(
    filters=256, kernel_size=9, strides=1, padding='same', activation='relu'
)(x)
x = tf.keras.layers.MaxPooling2D(pool_size=3)(x)
x = tf.keras.layers.Dropout(rate=0.4)(x)
x = tf.keras.layers.BatchNormalization()(x)

# Classifier head: one softmax output per label.
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(units=512, activation='relu')(x)
outputs = tf.keras.layers.Dense(units=len(labels), activation="softmax")(x)

model = tf.keras.models.Model(inputs=inputs, outputs=outputs)

# Categorical cross-entropy matches the one-hot labels produced by the
# input pipeline.
model.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'],
)
model.summary()
31
When I start the training process, this error appears after a few iterations:
JavaScript
1
13
13
1
> InvalidArgumentError: 2 root error(s) found.
2
3
> (0) Invalid argument:
4
> Dimension -972891 must be >= 0 [[{{node zeros}}]]
5
> [[IteratorGetNext]]
6
> [[categorical_crossentropy/softmax_cross_entropy_with_logits/Shape_2/_6]]
7
8
> (1) Invalid argument: Dimension -972891 must be >= 0 [[{{node
9
> zeros}}]] [[IteratorGetNext]] 0 successful operations. 0 derived
10
> errors ignored. [Op:__inference_train_function_6412]
11
>
12
> Function call stack: train_function -> train_function
13
Advertisement
Answer
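I found that the issue occurs in the padding step shown below. When a clip has more than 16000 samples, `[16000] - tf.shape(waveform)` is negative, and `tf.zeros` rejects a negative dimension: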
Python
1
4
1
# Padding step: appends zeros so the waveform reaches 16000 samples.
# The zeros shape is 16000 minus the waveform length, so it becomes negative
# (and tf.zeros raises "Dimension ... must be >= 0") for longer waveforms.
zero_padding = tf.zeros([16000] - tf.shape(waveform), dtype=tf.float32)
2
waveform = tf.cast(waveform, tf.float32)
3
waveform = tf.concat([waveform, zero_padding], 0)
4
I replaced the padding step with tf.signal.frame, and the issue was resolved.