Data Pipeline

目录

<!DOCTYPE html>

Coding_Tutorial
In [2]:
import tensorflow as tf
print(tf.__version__)
2.0.0

Data Pipeline


Keras datasets

For a list of Keras datasets and documentation on recommended usage, see this link.

In [2]:
import numpy as np
import matplotlib.pyplot as plt

Load the CIFAR-100 Dataset

In [3]:
from tensorflow.keras.datasets import cifar100
In [11]:
# Load the CIFAR-100 dataset

(train_images, train_labels), (test_images, test_labels) = cifar100.load_data(label_mode="fine")
In [12]:
# Confirm that reloading the dataset does not require a download
(train_images, train_labels), (test_images, test_labels) = cifar100.load_data(label_mode="fine")

Examine the Dataset

In [13]:
# Examine the shape of the data.
print(train_images.shape)
print(train_labels.shape)
(50000, 32, 32, 3)
(50000, 1)
In [17]:
# Examine one of the images and its corresponding label
plt.imshow(train_images[500])
print(train_labels[500])
[41]
In [18]:
# Load the list of labels from a JSON file

import json

with open('data/cifar100_fine_labels.json', 'r') as fine_labels:
    cifar100_fine_labels = json.load(fine_labels)

The list of labels for the CIFAR-100 dataset are available here.

In [19]:
# Print a few of the labels
print(cifar100_fine_labels[:10])
['apple', 'aquarium_fish', 'baby', 'bear', 'beaver', 'bed', 'bee', 'beetle', 'bicycle', 'bottle']
In [20]:
# Print the corresponding label for the example above

cifar100_fine_labels[41]
Out[20]:
'lawn_mower'

Load the data using different label modes

In [21]:
# Display a few examples from category 87 (index 86) and the list of labels

examples = train_images[(train_labels.T == 86)[0]][:3]
fig, ax = plt.subplots(1,3)
ax[0].imshow(examples[0])
ax[1].imshow(examples[1])
ax[2].imshow(examples[2])
Out[21]:
<matplotlib.image.AxesImage at 0x7fb3826254a8>
In [35]:
cifar100_fine_labels[86]
Out[35]:
'telephone'
In [22]:
# Reload the data using the 'coarse' label mode

(train_images, train_labels), (test_images, test_labels) = cifar100.load_data(label_mode="coarse")
In [32]:
# Display three images from the dataset with the label 6 (index 5)

examples = train_images[(train_labels.T == 5)[0]][:3]
fig, ax = plt.subplots(1,3)
ax[0].imshow(examples[0])
ax[1].imshow(examples[1])
ax[2].imshow(examples[2])
Out[32]:
<matplotlib.image.AxesImage at 0x7fb3826b2b00>
In [33]:
# Load the list of coarse labels from a JSON file

with open('data/cifar100_coarse_labels.json', 'r') as coarse_labels:
    cifar100_coarse_labels = json.load(coarse_labels)
In [34]:
# Print a few of the labels
print(cifar100_coarse_labels[:10])
['aquatic mammals', 'fish', 'flowers', 'food containers', 'fruit and vegetables', 'household electrical device', 'household furniture', 'insects', 'large carnivores', 'large man-made outdoor things']
In [36]:
# Print the corresponding label for the example above
print(cifar100_coarse_labels[5])
household electrical device

Load the IMDB Dataset

In [37]:
from tensorflow.keras.datasets import imdb
In [38]:
# Load the IMDB dataset

(train_data, train_labels,), (test_data, test_labels) = imdb.load_data()
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
17465344/17464789 [==============================] - 6s 0us/step
In [39]:
# Print an example from the training dataset, along with its corresponding label

print(train_data[0])
print(train_labels[0])
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
1
In [41]:
# Get the lengths of the input sequences

sequence_lengths = [len(seq) for seq in train_data]
In [42]:
# Determine the maximum and minimum sequence length
print(np.max(sequence_lengths))
print(np.min(sequence_lengths))
2494
11

Using Keyword Arguments

In [43]:
# Load the data ignoring the 50 most frequent words, use oov_char=2 (this is the default)
(train_data, train_labels,), (test_data, test_labels) = imdb.load_data(skip_top=50, oov_char=2)
In [44]:
# Get the lengths of the input sequences

sequence_lengths = [len(seq) for seq in train_data]
In [59]:
# Determine the maximum and minimum sequence length
print(np.max(sequence_lengths))
print(np.min(sequence_lengths))
1648
3
In [60]:
# Define functions for filtering the sequences

def remove_oov_char(element):
    ''' Filter function for removing the oov_char. '''
    return [word for word in element if word!=2]

def filter_list(lst):
    ''' Run remove_oov_char on elements in a list. '''
    return [remove_oov_char(element) for element in lst]
In [61]:
# Remove the oov_char from the sequences using the filter_list function

train_data = filter_list(train_data)
In [62]:
# Get the lengths of the input sequences

sequence_lengths = [len(seq) for seq in train_data]
In [63]:
# Determine the maximum and minimum sequence length

print(np.max(sequence_lengths))
print(np.min(sequence_lengths))
1648
3

Dataset generators

In [83]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

Load the UCI Fertility Dataset

We will be using a dataset available at https://archive.ics.uci.edu/ml/datasets/Fertility from UC Irvine.

In [84]:
# Load the fertility dataset

headers = ['Season', 'Age', 'Diseases', 'Trauma', 'Surgery', 'Fever', 'Alcohol', 'Smoking', 'Sitting', 'Output']
fertility = pd.read_csv('data/fertility_diagnosis.txt', delimiter=',', header=None, names=headers)
In [86]:
# Print the shape of the DataFrame

print(fertility.shape)
(100, 10)
In [87]:
# Show the head of the DataFrame
fertility.head()
Out[87]:
Season Age Diseases Trauma Surgery Fever Alcohol Smoking Sitting Output
0 -0.33 0.69 0 1 1 0 0.8 0 0.88 N
1 -0.33 0.94 1 0 1 0 0.8 1 0.31 O
2 -0.33 0.50 1 0 0 0 1.0 -1 0.50 N
3 -0.33 0.75 0 1 1 0 1.0 -1 0.38 N
4 -0.33 0.67 1 1 0 0 0.8 -1 0.50 O

Process the data

In [88]:
# Map the 'Output' feature from 'N' to 0 and from 'O' to 1

fertility['Output'] = fertility['Output'].map(lambda x : 0.0 if x=='N' else 1.0)
In [91]:
# Show the head of the DataFrame

fertility.head()
Out[91]:
Season Age Diseases Trauma Surgery Fever Alcohol Smoking Sitting Output
0 -0.33 0.69 0.0 1.0 1.0 0.0 0.8 0.0 0.88 0.0
1 -0.33 0.94 1.0 0.0 1.0 0.0 0.8 1.0 0.31 1.0
2 -0.33 0.50 1.0 0.0 0.0 0.0 1.0 -1.0 0.50 0.0
3 -0.33 0.75 0.0 1.0 1.0 0.0 1.0 -1.0 0.38 0.0
4 -0.33 0.67 1.0 1.0 0.0 0.0 0.8 -1.0 0.50 1.0
In [90]:
# Convert the DataFrame so that the features are mapped to floats

fertility = fertility.astype('float32')
In [92]:
# Shuffle the DataFrame

fertility = fertility.sample(frac=1).reset_index(drop=True)
In [93]:
# Show the head of the DataFrame

fertility.head()
Out[93]:
Season Age Diseases Trauma Surgery Fever Alcohol Smoking Sitting Output
0 -0.33 0.56 0.0 0.0 1.0 0.0 1.0 -1.0 0.56 0.0
1 1.00 0.61 1.0 0.0 1.0 0.0 1.0 -1.0 0.63 0.0
2 0.33 0.78 1.0 0.0 0.0 0.0 1.0 1.0 0.06 0.0
3 1.00 0.67 0.0 0.0 1.0 0.0 0.8 -1.0 0.25 0.0
4 -0.33 0.50 1.0 0.0 0.0 0.0 1.0 -1.0 0.50 0.0
In [94]:
# Convert the field Season to a one-hot encoded vector

fertility = pd.get_dummies(fertility, prefix='Season', columns=['Season'])
In [95]:
# Show the head of the DataFrame

fertility.head()
Out[95]:
Age Diseases Trauma Surgery Fever Alcohol Smoking Sitting Output Season_-1.0 Season_-0.33000001311302185 Season_0.33000001311302185 Season_1.0
0 0.56 0.0 0.0 1.0 0.0 1.0 -1.0 0.56 0.0 0 1 0 0
1 0.61 1.0 0.0 1.0 0.0 1.0 -1.0 0.63 0.0 0 0 0 1
2 0.78 1.0 0.0 0.0 0.0 1.0 1.0 0.06 0.0 0 0 1 0
3 0.67 0.0 0.0 1.0 0.0 0.8 -1.0 0.25 0.0 0 0 0 1
4 0.50 1.0 0.0 0.0 0.0 1.0 -1.0 0.50 0.0 0 1 0 0
In [96]:
# Move the Output column such that it is the last column in the DataFrame

fertility.columns = [col for col in fertility.columns if col != 'Output'] + ['Output']
In [99]:
# Show the head of the DataFrame

fertility.head()
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-99-01ea85e37fe1> in <module>
      1 # Show the head of the DataFrame
      2 
----> 3 fertility.head()

AttributeError: 'numpy.ndarray' object has no attribute 'head'
In [98]:
# Convert the DataFrame to a numpy array.

fertility = fertility.to_numpy()

Split the Data

In [100]:
# Split the dataset into training and validation set

training = fertility[0:70]
validation = fertility[70:100]
In [101]:
# Verify the shape of the training data

training.shape
Out[101]:
(70, 13)
In [102]:
# Separate the features and labels for the validation and training data

training_features = training[:,0:-1]
training_labels = training[:,-1]
validation_features = validation[:,0:-1]
validation_labels = validation[:,-1]

Create the Generator

In [103]:
# Create a function that returns a generator producing inputs and labels

def get_generator(features, labels, batch_size=1):
    for n in range(int(len(features)/batch_size)):
        yield (features[n*batch_size: (n+1)*batch_size], labels[n*batch_size: (n+1)*batch_size])
In [169]:
# Apply the function to our training features and labels with a batch size of 10

train_generator = get_generator(training_features, training_labels, batch_size=10)
In [177]:
# Test the generator using the next() function
next(train_generator)
---------------------------------------------------------------------------
StopIteration                             Traceback (most recent call last)
<ipython-input-177-ffc102bd325c> in <module>
      1 # Test the generator using the next() function
----> 2 next(train_generator)

StopIteration: 

Build the model

In [113]:
# Create a model using Keras with 3 layers

from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Input, BatchNormalization

input_shape = (12,)
output_shape = (1,)

model_input = Input(input_shape)
batch_1 = BatchNormalization(momentum=0.8)(model_input)
dense_1 = Dense(100, activation='relu')(batch_1)
batch_2 = BatchNormalization(momentum=0.8)(dense_1)
output = Dense(1, activation='sigmoid')(batch_2)

model = Model([model_input], output)
In [114]:
# Display the model summary to show the resultant structure

model.summary()
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_1 (InputLayer)         [(None, 12)]              0         
_________________________________________________________________
batch_normalization (BatchNo (None, 12)                48        
_________________________________________________________________
dense (Dense)                (None, 100)               1300      
_________________________________________________________________
batch_normalization_1 (Batch (None, 100)               400       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
=================================================================
Total params: 1,849
Trainable params: 1,625
Non-trainable params: 224
_________________________________________________________________

Compile the model

In [115]:
# Create the optimizer object

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-2)
In [116]:
# Compile the model with loss function and metric

model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

Train and evaluate the model using the generator

In [117]:
# Calculate the number of training steps per epoch for the given batch size.

batch_size = 5
train_steps = len(training) // batch_size
In [138]:
# Set the epochs to 3

epochs = 3
In [139]:
# Train the model
for epoch in range(epochs):
    train_generator = get_generator(training_features, training_labels, batch_size=batch_size)
    validation_generator = get_generator(validation_features, validation_labels, batch_size=30)
    model.fit_generator(train_generator, steps_per_epoch=train_steps, 
                         validation_data = validation_generator,
                         validation_steps=1)
14/14 [==============================] - 0s 33ms/step - loss: 0.0033 - accuracy: 1.0000 - val_loss: 0.0133 - val_accuracy: 1.0000
14/14 [==============================] - 0s 31ms/step - loss: 0.0029 - accuracy: 1.0000 - val_loss: 0.0129 - val_accuracy: 1.0000
14/14 [==============================] - 0s 30ms/step - loss: 0.0026 - accuracy: 1.0000 - val_loss: 0.0126 - val_accuracy: 1.0000
In [137]:
# Try to run the fit_generator function once more; observe what happens

model.fit_generator(train_generator, steps_per_epoch=train_steps)
WARNING:tensorflow:Your dataset iterator ran out of data; interrupting training. Make sure that your iterator can generate at least `steps_per_epoch * epochs` batches (in this case, 14 batches). You may need touse the repeat() function when building your dataset.
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-137-6009e73e3e9e> in <module>
      1 # Try to run the fit_generator function once more; observe what happens
      2 
----> 3 model.fit_generator(train_generator, steps_per_epoch=train_steps)

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
   1295         shuffle=shuffle,
   1296         initial_epoch=initial_epoch,
-> 1297         steps_name='steps_per_epoch')
   1298 
   1299   def evaluate_generator(self,

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_generator.py in model_iteration(model, data, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch, mode, batch_size, steps_name, **kwargs)
    299         break
    300 
--> 301     aggregator.finalize()
    302     results = aggregator.results
    303     epoch_logs = cbks.make_logs(model, epoch_logs, results, mode)

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_utils.py in finalize(self)
    138   def finalize(self):
    139     if not self.results:
--> 140       raise ValueError('Empty training data.')
    141     self.results[0] /= (self.num_samples or self.steps)
    142 

ValueError: Empty training data.

Make an infinitely looping generator

In [140]:
# Create a function that returns an infinitely looping generator

def get_generator_cyclic(features, labels, batch_size=1):
    while True:
        for n in range(int(len(features)/batch_size)):
            yield (features[n*batch_size: (n+1)*batch_size], labels[n*batch_size: (n+1)*batch_size])
        permuted = np.random.permutation(len(features))
        features = features[permuted]
        labels = labels[permuted]
In [141]:
# Create a generator using this function.

train_generator_cyclic = get_generator_cyclic(training_features, training_labels, batch_size=batch_size)
In [142]:
# Assert that the new cyclic generator does not raise a StopIteration

for i in range(2*train_steps):
    next(train_generator_cyclic)
In [143]:
# Generate a cyclic validation generator

validation_generator_cyclic = get_generator_cyclic(validation_features, validation_labels, batch_size=batch_size)
In [147]:
# Train the model

model.fit_generator(train_generator_cyclic, steps_per_epoch=train_steps,
                    validation_data= validation_generator_cyclic, validation_steps=1, epochs=3, verbose=1)
Epoch 1/3
14/14 [==============================] - 0s 31ms/step - loss: 0.0979 - accuracy: 0.9571 - val_loss: 3.2068e-06 - val_accuracy: 1.0000
Epoch 2/3
14/14 [==============================] - 0s 28ms/step - loss: 0.0849 - accuracy: 0.9857 - val_loss: 5.2992e-05 - val_accuracy: 1.0000
Epoch 3/3
14/14 [==============================] - 0s 28ms/step - loss: 0.0051 - accuracy: 1.0000 - val_loss: 9.0131e-05 - val_accuracy: 1.0000
Out[147]:
<tensorflow.python.keras.callbacks.History at 0x7fb36bca3240>

Evaluate the model and get predictions

In [200]:
# Let's obtain a validation data generator.

validation_generator = get_generator(validation_features, validation_labels, batch_size=30)
In [201]:
# Get predictions on the validation data

prediction = model.predict_generator(validation_generator, steps=1)
print(np.round(prediction.T[0]))
[0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1.
 1. 0. 1. 0. 0. 0.]
In [202]:
# Print the corresponding validation labels

print(validation_labels)
[0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1.
 1. 0. 1. 0. 0. 0.]
In [208]:
# Obtain a validation data generator

validation_generator = get_generator_cyclic(validation_features, validation_labels, batch_size=30)
In [211]:
# Evaluate the model

model.evaluate_generator(validation_generator, steps=1)
Out[211]:
[0.00015975440328475088, 1.0]

Keras image data augmentation

In [212]:
import matplotlib.pyplot as plt
import numpy as np

Load the CIFAR-10 Dataset

In [213]:
from tensorflow.keras.datasets import cifar10
In [214]:
# Load the CIFAR-10 dataset

(training_features, training_labels), (test_features, test_labels) = cifar10.load_data()
Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
170500096/170498071 [==============================] - 3s 0us/step
In [215]:
# Convert the labels to a one-hot encoding

num_classes = 10

training_labels = tf.keras.utils.to_categorical(training_labels, num_classes)
test_labels = tf.keras.utils.to_categorical(test_labels, num_classes)

Create a generator function

In [216]:
# Create a function that returns a data generator

def get_generator(features, labels, batch_size=1):
    for n in range(int(len(features)/batch_size)):
        yield (features[n*batch_size:(n+1)*batch_size], labels[n*batch_size:(n+1)*batch_size])
In [217]:
# Use the function we created to get a training data generator with a batch size of 1

training_generator = get_generator(training_features, training_labels)
In [218]:
# Assess the shape of the items generated by training_generator using the `next` function to yield an item.

image, label = next(training_generator)
print(image.shape)
print(label.shape)
(1, 32, 32, 3)
(1, 10)
In [219]:
# Test the training generator by obtaining an image using the `next` generator function, and then using imshow to plot it.
# Print the corresponding label

from matplotlib.pyplot import imshow

image, label = next(training_generator)
image_unbatched = image[0,:,:,:]
imshow(image_unbatched)
print(label)
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
In [220]:
# Reset the generator by re-running the `get_generator` function.

train_generator = get_generator(training_features, training_labels)

Create a data augmention generator

In [221]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
In [222]:
# Create a function to convert an image to monochrome

def monochrome(x):
    def func_bw(a):
        average_colour = np.mean(a)
        return [average_colour, average_colour, average_colour]
    x = np.apply_along_axis(func_bw, -1, x)
    return x
In [223]:
# Create an ImageDataGenerator object

image_generator = ImageDataGenerator(preprocessing_function=monochrome,
                                    rotation_range=180, 
                                    rescale=(1/255.0))
image_generator.fit(training_features)

Check the documentation for the full list of image data augmentation options.

In [224]:
# Create an iterable generator using the `flow` function

image_generator_iterable = image_generator.flow(training_features, training_labels, batch_size=1, shuffle=False)
In [230]:
# Show a sample from the generator and compare with the original

image, label = next(image_generator_iterable)
image_orig, label_orig = next(train_generator)
figs, axes = plt.subplots(1,2)
axes[0].imshow(image[0,:,:,:])
axes[0].set_title('Transformed')
axes[1].imshow(image_orig[0,:,:,:])
axes[1].set_title('Original')
plt.show()

Flow from directory

In [226]:
# Inspect the directory structure

train_path = 'data/flowers-recognition-split/train'
val_path = 'data/flowers-recognition-split/val'
In [231]:
# Create an ImageDataGenerator object

datagenerator = ImageDataGenerator(rescale=(1/255.0))
In [232]:
classes = ['daisy', 'dandelion', 'rose', 'sunflower', 'tulip']
In [233]:
# Create a training data generator
train_generator = datagenerator.flow_from_directory(train_path, batch_size=64, classes=classes, target_size=(16,16))
Found 3027 images belonging to 5 classes.
In [242]:
# Create a validation data generator

val_generator = datagenerator.flow_from_directory(val_path, batch_size=64, classes=classes, target_size=(16,16))
Found 1296 images belonging to 5 classes.
In [235]:
# Get and display an image and label from the training generator

x = next(train_generator)
imshow(x[0][4])
print(x[1][4])
[0. 1. 0. 0. 0.]
In [236]:
# Reset the training generator

train_generator = datagenerator.flow_from_directory(train_path, batch_size=64, classes=classes, target_size=(16,16))
Found 3027 images belonging to 5 classes.

Create a model to train

In [237]:
# Build a CNN model

from tensorflow.keras.layers import Conv2D, MaxPooling2D, Input, Flatten, Dense

model = tf.keras.Sequential()
model.add(Input((16,16,3)))
model.add(Conv2D(8, (8, 8), padding='same', activation='relu'))
model.add(MaxPooling2D((4,4)))
model.add(Conv2D(8, (8, 8), padding='same', activation='relu'))
model.add(MaxPooling2D((2,2)))
model.add(Conv2D(4, (4, 4), padding='same', activation='relu'))
model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(5, activation='softmax'))
In [238]:
# Create an optimizer object

optimizer = tf.keras.optimizers.Adam(1e-3)
In [239]:
# Compile the model

model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
In [240]:
# Print the model summary

model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv2d (Conv2D)              (None, 16, 16, 8)         1544      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 4, 4, 8)           0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 4, 4, 8)           4104      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 2, 2, 8)           0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 2, 2, 4)           516       
_________________________________________________________________
flatten (Flatten)            (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_3 (Dense)              (None, 8)                 136       
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 45        
=================================================================
Total params: 6,617
Trainable params: 6,617
Non-trainable params: 0
_________________________________________________________________

Train the model

In [243]:
# Calculate the training generator and test generator steps per epoch

train_steps_per_epoch = train_generator.n // train_generator.batch_size
val_steps = val_generator.n // val_generator.batch_size
print(train_steps_per_epoch, val_steps)
47 20
In [244]:
# Fit the model

model.fit_generator(train_generator, steps_per_epoch=train_steps_per_epoch, epochs=5)
Epoch 1/5
47/47 [==============================] - 44s 943ms/step - loss: 1.6033 - accuracy: 0.2616
Epoch 2/5
47/47 [==============================] - 28s 598ms/step - loss: 1.5292 - accuracy: 0.3032
Epoch 3/5
47/47 [==============================] - 28s 585ms/step - loss: 1.4261 - accuracy: 0.3317
Epoch 4/5
47/47 [==============================] - 28s 594ms/step - loss: 1.3963 - accuracy: 0.3861
Epoch 5/5
47/47 [==============================] - 28s 585ms/step - loss: 1.3308 - accuracy: 0.4256
Out[244]:
<tensorflow.python.keras.callbacks.History at 0x7fb36e985048>

Evaluate the model

In [245]:
# Evaluate the model
model.evaluate_generator(val_generator, steps=val_steps)
Out[245]:
[1.3088390469551086, 0.44296876]

Predict using the generator

In [246]:
# Predict labels with the model
predictions = model.predict_generator(val_generator, steps=1)
print(np.round(predictions, 2))
[[0.1  0.08 0.41 0.21 0.2 ]
 [0.23 0.13 0.18 0.1  0.36]
 [0.1  0.29 0.11 0.45 0.05]
 [0.04 0.01 0.51 0.02 0.42]
 [0.26 0.25 0.12 0.15 0.23]
 [0.27 0.27 0.1  0.16 0.2 ]
 [0.26 0.36 0.05 0.23 0.1 ]
 [0.02 0.   0.52 0.   0.46]
 [0.16 0.07 0.28 0.08 0.4 ]
 [0.23 0.32 0.08 0.24 0.12]
 [0.06 0.31 0.05 0.56 0.02]
 [0.09 0.34 0.03 0.53 0.01]
 [0.25 0.26 0.11 0.19 0.19]
 [0.13 0.35 0.05 0.44 0.03]
 [0.14 0.34 0.07 0.4  0.05]
 [0.08 0.33 0.04 0.54 0.02]
 [0.21 0.16 0.19 0.16 0.28]
 [0.2  0.08 0.21 0.07 0.45]
 [0.11 0.26 0.12 0.45 0.06]
 [0.29 0.27 0.09 0.15 0.21]
 [0.13 0.06 0.34 0.08 0.38]
 [0.22 0.32 0.1  0.23 0.13]
 [0.16 0.32 0.09 0.37 0.07]
 [0.26 0.32 0.08 0.19 0.15]
 [0.22 0.34 0.08 0.26 0.1 ]
 [0.15 0.34 0.09 0.35 0.06]
 [0.16 0.35 0.08 0.37 0.06]
 [0.12 0.35 0.05 0.45 0.03]
 [0.28 0.25 0.1  0.14 0.23]
 [0.08 0.31 0.07 0.52 0.02]
 [0.3  0.22 0.09 0.12 0.26]
 [0.19 0.28 0.13 0.28 0.13]
 [0.3  0.28 0.07 0.15 0.2 ]
 [0.02 0.   0.57 0.01 0.4 ]
 [0.11 0.03 0.32 0.04 0.51]
 [0.27 0.19 0.13 0.14 0.27]
 [0.11 0.03 0.32 0.04 0.49]
 [0.26 0.36 0.06 0.22 0.11]
 [0.03 0.   0.44 0.   0.53]
 [0.01 0.   0.49 0.   0.5 ]
 [0.12 0.34 0.06 0.45 0.03]
 [0.02 0.   0.6  0.   0.38]
 [0.23 0.1  0.17 0.07 0.42]
 [0.13 0.19 0.24 0.31 0.13]
 [0.05 0.01 0.41 0.01 0.52]
 [0.22 0.11 0.19 0.08 0.39]
 [0.25 0.33 0.08 0.21 0.13]
 [0.19 0.36 0.06 0.34 0.05]
 [0.23 0.3  0.11 0.21 0.16]
 [0.25 0.27 0.11 0.18 0.19]
 [0.25 0.33 0.08 0.2  0.14]
 [0.09 0.33 0.05 0.5  0.02]
 [0.16 0.04 0.22 0.04 0.55]
 [0.2  0.1  0.22 0.08 0.4 ]
 [0.24 0.23 0.14 0.17 0.23]
 [0.28 0.19 0.12 0.12 0.29]
 [0.02 0.   0.58 0.   0.4 ]
 [0.02 0.   0.5  0.   0.47]
 [0.27 0.32 0.07 0.19 0.15]
 [0.08 0.02 0.39 0.03 0.48]
 [0.02 0.   0.38 0.   0.6 ]
 [0.28 0.31 0.07 0.17 0.17]
 [0.   0.   0.7  0.   0.3 ]
 [0.17 0.06 0.25 0.06 0.47]]

The Dataset Class

In [8]:
import matplotlib.pyplot as plt
import numpy as np
import os

Create a simple dataset

In [9]:
x = np.zeros((100,10,2,2))
In [10]:
# Create a dataset from the tensor x

dataset1 = tf.data.Dataset.from_tensor_slices(x)
In [11]:
# Inspect the Dataset object

print(dataset1)
print(dataset1.element_spec)
<TensorSliceDataset shapes: (10, 2, 2), types: tf.float64>
TensorSpec(shape=(10, 2, 2), dtype=tf.float64, name=None)
In [12]:
x2 = [np.zeros((10,2,2)), np.zeros((5,2,2))]
In [13]:
# Try creating a dataset from the tensor x2

dataset2 = tf.data.Dataset.from_tensor_slices(x2)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-13-e4370cf142de> in <module>
      1 # Try creating a dataset from the tensor x2
      2 
----> 3 dataset2 = tf.data.Dataset.from_tensor_slices(x2)

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/data/ops/dataset_ops.py in from_tensor_slices(tensors)
    433       Dataset: A `Dataset`.
    434     """
--> 435     return TensorSliceDataset(tensors)
    436 
    437   class _GeneratorState(object):

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/data/ops/dataset_ops.py in __init__(self, element)
   2352   def __init__(self, element):
   2353     """See `Dataset.from_tensor_slices()` for details."""
-> 2354     element = structure.normalize_element(element)
   2355     batched_spec = structure.type_spec_from_value(element)
   2356     self._tensors = structure.to_batched_tensor_list(batched_spec, element)

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/data/util/structure.py in normalize_element(element)
    109         else:
    110           normalized_components.append(
--> 111               ops.convert_to_tensor(t, name="component_%d" % i))
    112   return nest.pack_sequence_as(element, normalized_components)
    113 

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py in convert_to_tensor(value, dtype, name, preferred_dtype, dtype_hint)
   1182   preferred_dtype = deprecation.deprecated_argument_lookup(
   1183       "dtype_hint", dtype_hint, "preferred_dtype", preferred_dtype)
-> 1184   return convert_to_tensor_v2(value, dtype, preferred_dtype, name)
   1185 
   1186 

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py in convert_to_tensor_v2(value, dtype, dtype_hint, name)
   1240       name=name,
   1241       preferred_dtype=dtype_hint,
-> 1242       as_ref=False)
   1243 
   1244 

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py in internal_convert_to_tensor(value, dtype, name, as_ref, preferred_dtype, ctx, accept_composite_tensors)
   1294 
   1295     if ret is None:
-> 1296       ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
   1297 
   1298     if ret is NotImplemented:

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/framework/constant_op.py in _constant_tensor_conversion_function(v, dtype, name, as_ref)
    284                                          as_ref=False):
    285   _ = as_ref
--> 286   return constant(v, dtype=dtype, name=name)
    287 
    288 

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/framework/constant_op.py in constant(value, dtype, shape, name)
    225   """
    226   return _constant_impl(value, dtype, shape, name, verify_shape=False,
--> 227                         allow_broadcast=True)
    228 
    229 

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/framework/constant_op.py in _constant_impl(value, dtype, shape, name, verify_shape, allow_broadcast)
    233   ctx = context.context()
    234   if ctx.executing_eagerly():
--> 235     t = convert_to_eager_tensor(value, ctx, dtype)
    236     if shape is None:
    237       return t

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/framework/constant_op.py in convert_to_eager_tensor(value, ctx, dtype)
     94       dtype = dtypes.as_dtype(dtype).as_datatype_enum
     95   ctx.ensure_initialized()
---> 96   return ops.EagerTensor(value, ctx.device_name, dtype)
     97 
     98 

ValueError: Can't convert non-rectangular Python sequence to Tensor.
In [18]:
x2 = [np.zeros((10,1)), np.zeros((10,1)), np.zeros((10,1))]
In [19]:
# Create another dataset from the new x2 and inspect the Dataset object

dataset2 = tf.data.Dataset.from_tensor_slices(x2)
In [20]:
# Print the element_spec

print(dataset2.element_spec)
TensorSpec(shape=(10, 1), dtype=tf.float64, name=None)

Create a zipped dataset

In [21]:
# Combine the two datasets into one larger dataset

dataset_zipped = tf.data.Dataset.zip((dataset1, dataset2))
In [22]:
# Print the element_spec

print(dataset_zipped.element_spec)
(TensorSpec(shape=(10, 2, 2), dtype=tf.float64, name=None), TensorSpec(shape=(10, 1), dtype=tf.float64, name=None))
In [23]:
# Define a function to find the number of batches in a dataset

def get_batches(dataset):
    iter_dataset = iter(dataset)
    i = 0
    try:
        while next(iter_dataset):
            i = i+1
    except:
        return i
In [24]:
# Find the number of batches in the zipped Dataset

get_batches(dataset_zipped)
Out[24]:
3

Create a dataset from numpy arrays

In [25]:
# Load the MNIST dataset

(train_features, train_labels), (test_features, test_labels) = tf.keras.datasets.mnist.load_data()

print(type(train_features), type(train_labels))
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
In [26]:
# Create a Dataset from the MNIST data

mnist_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))
In [27]:
# Inspect the Dataset object

print(mnist_dataset.element_spec)
(TensorSpec(shape=(28, 28), dtype=tf.uint8, name=None), TensorSpec(shape=(), dtype=tf.uint8, name=None))
In [28]:
# Inspect the length of an element using the take method

element = next(iter(mnist_dataset.take(1)))
len(element)
Out[28]:
2
In [29]:
# Examine the shapes of the data

print(element[0].shape)
print(element[1].shape)
(28, 28)
()

Create a dataset from text data

In [30]:
# Print the list of text files

text_files = sorted([f.path for f in os.scandir('data/shakespeare')])

print(text_files)
['data/shakespeare/tempest.1.1.txt', 'data/shakespeare/tempest.1.2.txt', 'data/shakespeare/tempest.2.1.txt', 'data/shakespeare/tempest.2.2.txt', 'data/shakespeare/tempest.3.1.txt', 'data/shakespeare/tempest.3.2.txt', 'data/shakespeare/tempest.3.3.txt', 'data/shakespeare/tempest.4.1.txt', 'data/shakespeare/tempest.5.1.txt']
In [31]:
# Load the first file using python and print the first 5 lines.

with open(text_files[0], 'r') as fil:
    contents = [fil.readline() for i in range(5)]
    for line in contents:
        print(line)
SCENE I. On a ship at sea: a tempestuous noise

of thunder and lightning heard.

Enter a Master and a Boatswain



Master

In [33]:
# Load the lines from the files into a dataset using TextLineDataset

shakespare_dataset = tf.data.TextLineDataset(text_files)
In [35]:
# Use the take method to get and print the first 5 lines of the dataset

first_5_lines_dataset = iter(shakespare_dataset.take(5))
lines = [line for line in first_5_lines_dataset]
for line in lines:
    print(line)
tf.Tensor(b'SCENE I. On a ship at sea: a tempestuous noise', shape=(), dtype=string)
tf.Tensor(b'of thunder and lightning heard.', shape=(), dtype=string)
tf.Tensor(b'Enter a Master and a Boatswain', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'Master', shape=(), dtype=string)
In [36]:
# Compute the number of lines in the first file

lines = []
with open(text_files[0], 'r') as fil:
    line = fil.readline()
    while line:
        lines.append(line)
        line = fil.readline()
    print(len(lines))
121
In [38]:
# Compute the number of lines in the shakespeare dataset we created

shakespeare_dataset_iterator = iter(shakespare_dataset)
lines = [line for line in shakespeare_dataset_iterator]
print(len(lines))
3134

Interleave lines from the text data files

In [39]:
# Create a dataset of the text file strings

text_files_dataset = tf.data.Dataset.from_tensor_slices(text_files)
files = [file for file in text_files_dataset]
for file in files:
    print(file)
tf.Tensor(b'data/shakespeare/tempest.1.1.txt', shape=(), dtype=string)
tf.Tensor(b'data/shakespeare/tempest.1.2.txt', shape=(), dtype=string)
tf.Tensor(b'data/shakespeare/tempest.2.1.txt', shape=(), dtype=string)
tf.Tensor(b'data/shakespeare/tempest.2.2.txt', shape=(), dtype=string)
tf.Tensor(b'data/shakespeare/tempest.3.1.txt', shape=(), dtype=string)
tf.Tensor(b'data/shakespeare/tempest.3.2.txt', shape=(), dtype=string)
tf.Tensor(b'data/shakespeare/tempest.3.3.txt', shape=(), dtype=string)
tf.Tensor(b'data/shakespeare/tempest.4.1.txt', shape=(), dtype=string)
tf.Tensor(b'data/shakespeare/tempest.5.1.txt', shape=(), dtype=string)
In [40]:
# Interleave the lines from the text files

interleaved_shakespeare_dataset = text_files_dataset.interleave(tf.data.TextLineDataset, cycle_length=9)
In [41]:
# Print the first 10 elements of the interleaved dataset

lines = [line for line in iter(interleaved_shakespeare_dataset.take(10))]
for line in lines:
    print(line)
tf.Tensor(b'SCENE I. On a ship at sea: a tempestuous noise', shape=(), dtype=string)
tf.Tensor(b"SCENE II. The island. Before PROSPERO'S cell.", shape=(), dtype=string)
tf.Tensor(b'SCENE I. Another part of the island.', shape=(), dtype=string)
tf.Tensor(b'SCENE II. Another part of the island.', shape=(), dtype=string)
tf.Tensor(b"SCENE I. Before PROSPERO'S Cell.", shape=(), dtype=string)
tf.Tensor(b'SCENE II. Another part of the island.', shape=(), dtype=string)
tf.Tensor(b'SCENE III. Another part of the island.', shape=(), dtype=string)
tf.Tensor(b"SCENE I. Before PROSPERO'S cell.", shape=(), dtype=string)
tf.Tensor(b"SCENE I. Before PROSPERO'S cell.", shape=(), dtype=string)
tf.Tensor(b'of thunder and lightning heard.', shape=(), dtype=string)

Training with Datasets

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

Load the UCI Bank Marketing Dataset

In [4]:
# Load the CSV file into a pandas DataFrame

bank_dataframe = pd.read_csv('data/bank/bank-full.csv', delimiter=';')
In [5]:
# Show the head of the DataFrame

bank_dataframe.head()
Out[5]:
age job marital education default balance housing loan contact day month duration campaign pdays previous poutcome y
0 58 management married tertiary no 2143 yes no unknown 5 may 261 1 -1 0 unknown no
1 44 technician single secondary no 29 yes no unknown 5 may 151 1 -1 0 unknown no
2 33 entrepreneur married secondary no 2 yes yes unknown 5 may 76 1 -1 0 unknown no
3 47 blue-collar married unknown no 1506 yes no unknown 5 may 92 1 -1 0 unknown no
4 33 unknown single unknown no 1 no no unknown 5 may 198 1 -1 0 unknown no
In [6]:
# Print the shape of the DataFrame

print(bank_dataframe.shape)
(45211, 17)
In [7]:
# Select features from the DataFrame

features = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
            'loan', 'contact', 'campaign', 'pdays', 'poutcome']
labels = ['y']

bank_dataframe = bank_dataframe.filter(features + labels)
In [8]:
# Show the head of the DataFrame

bank_dataframe.head()
Out[8]:
age job marital education default balance housing loan contact campaign pdays poutcome y
0 58 management married tertiary no 2143 yes no unknown 1 -1 unknown no
1 44 technician single secondary no 29 yes no unknown 1 -1 unknown no
2 33 entrepreneur married secondary no 2 yes yes unknown 1 -1 unknown no
3 47 blue-collar married unknown no 1506 yes no unknown 1 -1 unknown no
4 33 unknown single unknown no 1 no no unknown 1 -1 unknown no

Preprocess the data

In [9]:
# Convert the categorical features in the DataFrame to one-hot encodings

from sklearn.preprocessing import LabelBinarizer

encoder = LabelBinarizer()
categorical_features = ['default', 'housing', 'job', 'loan', 'education', 'contact', 'poutcome']

for feature in categorical_features:
    bank_dataframe[feature] = tuple(encoder.fit_transform(bank_dataframe[feature]))
In [10]:
# Show the head of the DataFrame

bank_dataframe.head()
Out[10]:
age job marital education default balance housing loan contact campaign pdays poutcome y
0 58 (0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0) married (0, 0, 1, 0) (0,) 2143 (1,) (0,) (0, 0, 1) 1 -1 (0, 0, 0, 1) no
1 44 (0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0) single (0, 1, 0, 0) (0,) 29 (1,) (0,) (0, 0, 1) 1 -1 (0, 0, 0, 1) no
2 33 (0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0) married (0, 1, 0, 0) (0,) 2 (1,) (1,) (0, 0, 1) 1 -1 (0, 0, 0, 1) no
3 47 (0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) married (0, 0, 0, 1) (0,) 1506 (1,) (0,) (0, 0, 1) 1 -1 (0, 0, 0, 1) no
4 33 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1) single (0, 0, 0, 1) (0,) 1 (0,) (0,) (0, 0, 1) 1 -1 (0, 0, 0, 1) no
In [11]:
# Shuffle the DataFrame

bank_dataframe = bank_dataframe.sample(frac=1).reset_index(drop=True)

Create the Dataset object

In [12]:
# Convert the DataFrame to a Dataset

bank_dataset = tf.data.Dataset.from_tensor_slices(dict(bank_dataframe))
In [13]:
# Inspect the Dataset object

bank_dataset.element_spec
Out[13]:
{'age': TensorSpec(shape=(), dtype=tf.int32, name=None),
 'job': TensorSpec(shape=(12,), dtype=tf.int32, name=None),
 'marital': TensorSpec(shape=(), dtype=tf.string, name=None),
 'education': TensorSpec(shape=(4,), dtype=tf.int32, name=None),
 'default': TensorSpec(shape=(1,), dtype=tf.int32, name=None),
 'balance': TensorSpec(shape=(), dtype=tf.int32, name=None),
 'housing': TensorSpec(shape=(1,), dtype=tf.int32, name=None),
 'loan': TensorSpec(shape=(1,), dtype=tf.int32, name=None),
 'contact': TensorSpec(shape=(3,), dtype=tf.int32, name=None),
 'campaign': TensorSpec(shape=(), dtype=tf.int32, name=None),
 'pdays': TensorSpec(shape=(), dtype=tf.int32, name=None),
 'poutcome': TensorSpec(shape=(4,), dtype=tf.int32, name=None),
 'y': TensorSpec(shape=(), dtype=tf.string, name=None)}

Filter the Dataset

In [14]:
# First check that there are records in the dataset for non-married individuals

def check_divorced():
    bank_dataset_iterable = iter(bank_dataset)
    for x in bank_dataset_iterable:
        if x['marital'] != 'divorced':
            print('Found a person with marital status: {}'.format(x['marital']))
            return
    print('No non-divorced people were found!')

check_divorced()
Found a person with marital status: b'married'
In [15]:
# Filter the Dataset to retain only entries with a 'divorced' marital status

bank_dataset = bank_dataset.filter(lambda x : tf.equal(x['marital'], tf.constant([b'divorced']))[0] )
In [16]:
# Check the records in the dataset again

check_divorced()
No non-divorced people were found!

Map a function over the dataset

In [17]:
# Convert the label ('y') to an integer instead of 'yes' or 'no'
def map_label(x):
    x['y'] = 0 if (x['y'] == tf.constant([b'no'], dtype=tf.string)) else 1
    return x
bank_dataset = bank_dataset.map(map_label)
In [18]:
# Inspect the Dataset object

bank_dataset.element_spec
Out[18]:
{'age': TensorSpec(shape=(), dtype=tf.int32, name=None),
 'job': TensorSpec(shape=(12,), dtype=tf.int32, name=None),
 'marital': TensorSpec(shape=(), dtype=tf.string, name=None),
 'education': TensorSpec(shape=(4,), dtype=tf.int32, name=None),
 'default': TensorSpec(shape=(1,), dtype=tf.int32, name=None),
 'balance': TensorSpec(shape=(), dtype=tf.int32, name=None),
 'housing': TensorSpec(shape=(1,), dtype=tf.int32, name=None),
 'loan': TensorSpec(shape=(1,), dtype=tf.int32, name=None),
 'contact': TensorSpec(shape=(3,), dtype=tf.int32, name=None),
 'campaign': TensorSpec(shape=(), dtype=tf.int32, name=None),
 'pdays': TensorSpec(shape=(), dtype=tf.int32, name=None),
 'poutcome': TensorSpec(shape=(4,), dtype=tf.int32, name=None),
 'y': TensorSpec(shape=(), dtype=tf.int32, name=None)}
In [19]:
# Remove the 'marital' column

bank_dataset = bank_dataset.map(lambda x : {key:val for key,val in x.items() if key != 'marital'})
In [20]:
# Inspect the Dataset object

bank_dataset.element_spec
Out[20]:
{'age': TensorSpec(shape=(), dtype=tf.int32, name=None),
 'job': TensorSpec(shape=(12,), dtype=tf.int32, name=None),
 'education': TensorSpec(shape=(4,), dtype=tf.int32, name=None),
 'default': TensorSpec(shape=(1,), dtype=tf.int32, name=None),
 'balance': TensorSpec(shape=(), dtype=tf.int32, name=None),
 'housing': TensorSpec(shape=(1,), dtype=tf.int32, name=None),
 'loan': TensorSpec(shape=(1,), dtype=tf.int32, name=None),
 'contact': TensorSpec(shape=(3,), dtype=tf.int32, name=None),
 'campaign': TensorSpec(shape=(), dtype=tf.int32, name=None),
 'pdays': TensorSpec(shape=(), dtype=tf.int32, name=None),
 'poutcome': TensorSpec(shape=(4,), dtype=tf.int32, name=None),
 'y': TensorSpec(shape=(), dtype=tf.int32, name=None)}

Create input and output data tuples

In [21]:
# Create an input and output tuple for the dataset

def map_feature_label(x):
    features = [[x['age']], [x['balance']], [x['campaign']], x['contact'], x['default'],
                x['education'], x['housing'], x['job'], x['loan'], [x['pdays']], x['poutcome']]
    return (tf.concat(features, axis=0), x['y'])
In [22]:
# Map this function over the dataset

bank_dataset = bank_dataset.map(map_feature_label)
In [23]:
# Inspect the Dataset object

bank_dataset.element_spec
Out[23]:
(TensorSpec(shape=(30,), dtype=tf.int32, name=None),
 TensorSpec(shape=(), dtype=tf.int32, name=None))

Split into a training and a validation set

In [24]:
# Determine the length of the Dataset

dataset_length = 0
for _ in bank_dataset:
    dataset_length += 1
print(dataset_length)
5207
In [25]:
# Make training and validation sets from the dataset
training_elements = int(dataset_length * 0.7)
train_dataset = bank_dataset.take(training_elements)
validation_dataset = bank_dataset.skip(training_elements)

Build a classification model

Now let's build a model to classify the features.

In [26]:
# Build a classifier model

from tensorflow.keras.layers import Dense, Input, Concatenate, BatchNormalization
from tensorflow.keras import Sequential

model = Sequential()
model.add(Input(shape=(30,)))
model.add(BatchNormalization(momentum=0.8))
model.add(Dense(400, activation='relu'))
model.add(BatchNormalization(momentum=0.8))
model.add(Dense(400, activation='relu'))
model.add(BatchNormalization(momentum=0.8))
model.add(Dense(1, activation='sigmoid'))
In [27]:
# Compile the model

optimizer = tf.keras.optimizers.Adam(1e-4)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
In [28]:
# Show the model summary

model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
batch_normalization (BatchNo (None, 30)                120       
_________________________________________________________________
dense (Dense)                (None, 400)               12400     
_________________________________________________________________
batch_normalization_1 (Batch (None, 400)               1600      
_________________________________________________________________
dense_1 (Dense)              (None, 400)               160400    
_________________________________________________________________
batch_normalization_2 (Batch (None, 400)               1600      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 401       
=================================================================
Total params: 176,521
Trainable params: 174,861
Non-trainable params: 1,660
_________________________________________________________________

Train the model

In [29]:
# Create batched training and validation datasets
train_dataset = train_dataset.batch(20, drop_remainder=True)
validation_dataset = validation_dataset.batch(100)
In [30]:
# Shuffle the training data

train_dataset = train_dataset.shuffle(1000)
In [31]:
# Fit the model
history = model.fit(train_dataset, validation_data=validation_dataset, epochs=5)
Epoch 1/5
182/182 [==============================] - 21s 115ms/step - loss: 0.6554 - accuracy: 0.6519 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00
Epoch 2/5
182/182 [==============================] - 18s 99ms/step - loss: 0.4834 - accuracy: 0.8283 - val_loss: 0.4942 - val_accuracy: 0.8273
Epoch 3/5
182/182 [==============================] - 18s 101ms/step - loss: 0.3796 - accuracy: 0.8942 - val_loss: 0.4522 - val_accuracy: 0.8375
Epoch 4/5
182/182 [==============================] - 19s 102ms/step - loss: 0.3082 - accuracy: 0.9187 - val_loss: 0.4339 - val_accuracy: 0.8688
Epoch 5/5
182/182 [==============================] - 18s 99ms/step - loss: 0.2550 - accuracy: 0.9319 - val_loss: 0.4098 - val_accuracy: 0.8644
In [33]:
# Plot the training and validation accuracy

plt.plot(history.epoch, history.history["accuracy"], label="training")
plt.plot(history.epoch, history.history["val_accuracy"], label="validation")
plt.legend()
plt.xlabel("Epoch")
Out[33]:
Text(0.5, 0, 'Epoch')
In [ ]: