In [2]:
import tensorflow as tf
print(tf.__version__)
Data Pipeline¶
In [2]:
import numpy as np
import matplotlib.pyplot as plt
Load the CIFAR-100 Dataset¶
In [3]:
from tensorflow.keras.datasets import cifar100
In [11]:
# Load the CIFAR-100 dataset
(train_images, train_labels), (test_images, test_labels) = cifar100.load_data(label_mode="fine")
In [12]:
# Confirm that reloading the dataset does not require a download
(train_images, train_labels), (test_images, test_labels) = cifar100.load_data(label_mode="fine")
Examine the Dataset¶
In [13]:
# Examine the shape of the data.
print(train_images.shape)
print(train_labels.shape)
In [17]:
# Examine one of the images and its corresponding label
plt.imshow(train_images[500])
print(train_labels[500])
In [18]:
# Load the list of labels from a JSON file
import json
with open('data/cifar100_fine_labels.json', 'r') as fine_labels:
    cifar100_fine_labels = json.load(fine_labels)
The list of labels for the CIFAR-100 dataset is available here.
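As a quick sanity check (a sketch only; it assumes the JSON file contains one entry per fine class), the loaded list should have exactly 100 entries:

# CIFAR-100 defines 100 fine-grained classes
assert len(cifar100_fine_labels) == 100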
In [19]:
# Print a few of the labels
print(cifar100_fine_labels[:10])
In [20]:
# Print the corresponding label for the example above
cifar100_fine_labels[41]
Out[20]:
Load the data using different label modes¶
In [21]:
# Display a few examples from category 87 (index 86) and the list of labels
examples = train_images[(train_labels.T == 86)[0]][:3]
fig, ax = plt.subplots(1,3)
ax[0].imshow(examples[0])
ax[1].imshow(examples[1])
ax[2].imshow(examples[2])
Out[21]:
In [35]:
cifar100_fine_labels[86]
Out[35]:
In [22]:
# Reload the data using the 'coarse' label mode
(train_images, train_labels), (test_images, test_labels) = cifar100.load_data(label_mode="coarse")
In [32]:
# Display three images from the dataset with the label 6 (index 5)
examples = train_images[(train_labels.T == 5)[0]][:3]
fig, ax = plt.subplots(1,3)
ax[0].imshow(examples[0])
ax[1].imshow(examples[1])
ax[2].imshow(examples[2])
Out[32]:
In [33]:
# Load the list of coarse labels from a JSON file
with open('data/cifar100_coarse_labels.json', 'r') as coarse_labels:
cifar100_coarse_labels = json.load(coarse_labels)
In [34]:
# Print a few of the labels
print(cifar100_coarse_labels[:10])
In [36]:
# Print the corresponding label for the example above
print(cifar100_coarse_labels[5])
Load the IMDB Dataset¶
In [37]:
from tensorflow.keras.datasets import imdb
In [38]:
# Load the IMDB dataset
(train_data, train_labels), (test_data, test_labels) = imdb.load_data()
In [39]:
# Print an example from the training dataset, along with its corresponding label
print(train_data[0])
print(train_labels[0])
In [41]:
# Get the lengths of the input sequences
sequence_lengths = [len(seq) for seq in train_data]
In [42]:
# Determine the maximum and minimum sequence length
print(np.max(sequence_lengths))
print(np.min(sequence_lengths))
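The reviews vary widely in length, so a common follow-up step (not part of this notebook's pipeline; shown only as a sketch, with maxlen=256 chosen arbitrarily) is to pad or truncate them to a fixed length with pad_sequences:

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad shorter reviews with zeros and truncate longer ones to 256 tokens (illustrative value)
padded_train_data = pad_sequences(train_data, maxlen=256, padding='post', truncating='post')
print(padded_train_data.shape)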
Using Keyword Arguments¶
In [43]:
# Load the data ignoring the 50 most frequent words, use oov_char=2 (this is the default)
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(skip_top=50, oov_char=2)
In [44]:
# Get the lengths of the input sequences
sequence_lengths = [len(seq) for seq in train_data]
In [59]:
# Determine the maximum and minimum sequence length
print(np.max(sequence_lengths))
print(np.min(sequence_lengths))
In [60]:
# Define functions for filtering the sequences
def remove_oov_char(element):
    ''' Filter function for removing the oov_char. '''
    return [word for word in element if word != 2]

def filter_list(lst):
    ''' Run remove_oov_char on elements in a list. '''
    return [remove_oov_char(element) for element in lst]
In [61]:
# Remove the oov_char from the sequences using the filter_list function
train_data = filter_list(train_data)
In [62]:
# Get the lengths of the input sequences
sequence_lengths = [len(seq) for seq in train_data]
In [63]:
# Determine the maximum and minimum sequence length
print(np.max(sequence_lengths))
print(np.min(sequence_lengths))
In [83]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
Load the UCI Fertility Dataset¶
We will be using the Fertility dataset from the UC Irvine Machine Learning Repository, available at https://archive.ics.uci.edu/ml/datasets/Fertility.
In [84]:
# Load the fertility dataset
headers = ['Season', 'Age', 'Diseases', 'Trauma', 'Surgery', 'Fever', 'Alcohol', 'Smoking', 'Sitting', 'Output']
fertility = pd.read_csv('data/fertility_diagnosis.txt', delimiter=',', header=None, names=headers)
In [86]:
# Print the shape of the DataFrame
print(fertility.shape)
In [87]:
# Show the head of the DataFrame
fertility.head()
Out[87]:
Process the data¶
In [88]:
# Map the 'Output' feature from 'N' to 0 and from 'O' to 1
fertility['Output'] = fertility['Output'].map(lambda x : 0.0 if x=='N' else 1.0)
In [91]:
# Show the head of the DataFrame
fertility.head()
Out[91]:
In [90]:
# Convert the DataFrame so that the features are mapped to floats
fertility = fertility.astype('float32')
In [92]:
# Shuffle the DataFrame
fertility = fertility.sample(frac=1).reset_index(drop=True)
In [93]:
# Show the head of the DataFrame
fertility.head()
Out[93]:
In [94]:
# Convert the field Season to a one-hot encoded vector
fertility = pd.get_dummies(fertility, prefix='Season', columns=['Season'])
In [95]:
# Show the head of the DataFrame
fertility.head()
Out[95]:
In [96]:
# Move the Output column such that it is the last column in the DataFrame
fertility = fertility[[col for col in fertility.columns if col != 'Output'] + ['Output']]
In [99]:
# Show the head of the DataFrame
fertility.head()
In [98]:
# Convert the DataFrame to a numpy array.
fertility = fertility.to_numpy()
Split the Data¶
In [100]:
# Split the dataset into training and validation set
training = fertility[0:70]
validation = fertility[70:100]
In [101]:
# Verify the shape of the training data
training.shape
Out[101]:
In [102]:
# Separate the features and labels for the validation and training data
training_features = training[:,0:-1]
training_labels = training[:,-1]
validation_features = validation[:,0:-1]
validation_labels = validation[:,-1]
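At this point each example should have 12 feature columns (the 8 remaining original features plus the 4 one-hot Season columns), which is why the model below uses an input shape of (12,). A quick check (a sketch):

# Expected shapes: (70, 12) for the training features and (70,) for the training labels
print(training_features.shape)
print(training_labels.shape)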
Create the Generator¶
In [103]:
# Create a function that returns a generator producing inputs and labels
def get_generator(features, labels, batch_size=1):
    for n in range(int(len(features)/batch_size)):
        yield (features[n*batch_size: (n+1)*batch_size], labels[n*batch_size: (n+1)*batch_size])
In [169]:
# Apply the function to our training features and labels with a batch size of 10
train_generator = get_generator(training_features, training_labels, batch_size=10)
In [177]:
# Test the generator using the next() function
next(train_generator)
Build the model¶
In [113]:
# Create a model using Keras with 3 layers
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Input, BatchNormalization
input_shape = (12,)
output_shape = (1,)
model_input = Input(input_shape)
batch_1 = BatchNormalization(momentum=0.8)(model_input)
dense_1 = Dense(100, activation='relu')(batch_1)
batch_2 = BatchNormalization(momentum=0.8)(dense_1)
output = Dense(1, activation='sigmoid')(batch_2)
model = Model([model_input], output)
In [114]:
# Display the model summary to show the resultant structure
model.summary()
Compile the model¶
In [115]:
# Create the optimizer object
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-2)
In [116]:
# Compile the model with loss function and metric
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
Train and evaluate the model using the generator¶
In [117]:
# Calculate the number of training steps per epoch for the given batch size.
batch_size = 5
train_steps = len(training) // batch_size
In [138]:
# Set the epochs to 3
epochs = 3
In [139]:
# Train the model
for epoch in range(epochs):
    train_generator = get_generator(training_features, training_labels, batch_size=batch_size)
    validation_generator = get_generator(validation_features, validation_labels, batch_size=30)
    model.fit_generator(train_generator, steps_per_epoch=train_steps,
                        validation_data=validation_generator,
                        validation_steps=1)
In [137]:
# Try to run the fit_generator function once more; observe what happens
model.fit_generator(train_generator, steps_per_epoch=train_steps)
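The generator passed in above was already consumed by the training loop in the previous cell, so this call should complain that the input ran out of data: a plain Python generator can only be iterated once. Re-creating the generator (a sketch) restores iteration:

# Re-create the exhausted generator before fitting again
train_generator = get_generator(training_features, training_labels, batch_size=batch_size)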
Make an infinitely looping generator¶
In [140]:
# Create a function that returns an infinitely looping generator
def get_generator_cyclic(features, labels, batch_size=1):
    while True:
        for n in range(int(len(features)/batch_size)):
            yield (features[n*batch_size: (n+1)*batch_size], labels[n*batch_size: (n+1)*batch_size])
        permuted = np.random.permutation(len(features))
        features = features[permuted]
        labels = labels[permuted]
In [141]:
# Create a generator using this function.
train_generator_cyclic = get_generator_cyclic(training_features, training_labels, batch_size=batch_size)
In [142]:
# Assert that the new cyclic generator does not raise a StopIteration
for i in range(2*train_steps):
    next(train_generator_cyclic)
In [143]:
# Generate a cyclic validation generator
validation_generator_cyclic = get_generator_cyclic(validation_features, validation_labels, batch_size=batch_size)
In [147]:
# Train the model
model.fit_generator(train_generator_cyclic, steps_per_epoch=train_steps,
                    validation_data=validation_generator_cyclic, validation_steps=1, epochs=3, verbose=1)
Out[147]:
Evaluate the model and get predictions¶
In [200]:
# Let's obtain a validation data generator.
validation_generator = get_generator(validation_features, validation_labels, batch_size=30)
In [201]:
# Get predictions on the validation data
prediction = model.predict_generator(validation_generator, steps=1)
print(np.round(prediction.T[0]))
In [202]:
# Print the corresponding validation labels
print(validation_labels)
In [208]:
# Obtain a validation data generator
validation_generator = get_generator_cyclic(validation_features, validation_labels, batch_size=30)
In [211]:
# Evaluate the model
model.evaluate_generator(validation_generator, steps=1)
Out[211]:
In [212]:
import matplotlib.pyplot as plt
import numpy as np
Load the CIFAR-10 Dataset¶
In [213]:
from tensorflow.keras.datasets import cifar10
In [214]:
# Load the CIFAR-10 dataset
(training_features, training_labels), (test_features, test_labels) = cifar10.load_data()
In [215]:
# Convert the labels to a one-hot encoding
num_classes = 10
training_labels = tf.keras.utils.to_categorical(training_labels, num_classes)
test_labels = tf.keras.utils.to_categorical(test_labels, num_classes)
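to_categorical replaces the integer class indices with one-hot rows; a quick shape check (a sketch):

# CIFAR-10 has 50000 training and 10000 test images, so the expected shapes
# are (50000, 10) and (10000, 10)
print(training_labels.shape)
print(test_labels.shape)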
Create a generator function¶
In [216]:
# Create a function that returns a data generator
def get_generator(features, labels, batch_size=1):
    for n in range(int(len(features)/batch_size)):
        yield (features[n*batch_size:(n+1)*batch_size], labels[n*batch_size:(n+1)*batch_size])
In [217]:
# Use the function we created to get a training data generator with a batch size of 1
training_generator = get_generator(training_features, training_labels)
In [218]:
# Assess the shape of the items generated by training_generator using the `next` function to yield an item.
image, label = next(training_generator)
print(image.shape)
print(label.shape)
In [219]:
# Test the training generator by obtaining an image using the `next` generator function, and then using imshow to plot it.
# Print the corresponding label
from matplotlib.pyplot import imshow
image, label = next(training_generator)
image_unbatched = image[0,:,:,:]
imshow(image_unbatched)
print(label)
In [220]:
# Reset the generator by re-running the `get_generator` function.
train_generator = get_generator(training_features, training_labels)
Create a data augmentation generator¶
In [221]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
In [222]:
# Create a function to convert an image to monochrome
def monochrome(x):
    def func_bw(a):
        average_colour = np.mean(a)
        return [average_colour, average_colour, average_colour]
    x = np.apply_along_axis(func_bw, -1, x)
    return x
In [223]:
# Create an ImageDataGenerator object
image_generator = ImageDataGenerator(preprocessing_function=monochrome,
                                     rotation_range=180,
                                     rescale=(1/255.0))
image_generator.fit(training_features)
Check the documentation for the full list of image data augmentation options.
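For instance, several other arguments can be combined in the same call (a sketch only; the values below are arbitrary and this generator is not used elsewhere in the notebook):

# Hypothetical augmentation settings, for illustration only
extra_augmentation = ImageDataGenerator(width_shift_range=0.1,
                                        height_shift_range=0.1,
                                        horizontal_flip=True,
                                        zoom_range=0.2,
                                        rescale=(1/255.0))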
In [224]:
# Create an iterable generator using the `flow` function
image_generator_iterable = image_generator.flow(training_features, training_labels, batch_size=1, shuffle=False)
In [230]:
# Show a sample from the generator and compare with the original
image, label = next(image_generator_iterable)
image_orig, label_orig = next(train_generator)
figs, axes = plt.subplots(1,2)
axes[0].imshow(image[0,:,:,:])
axes[0].set_title('Transformed')
axes[1].imshow(image_orig[0,:,:,:])
axes[1].set_title('Original')
plt.show()
Flow from directory¶
In [226]:
# Inspect the directory structure
train_path = 'data/flowers-recognition-split/train'
val_path = 'data/flowers-recognition-split/val'
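The cell above only defines the paths; one way to actually inspect the class subdirectories (a sketch, assuming the directories exist on disk) is:

import os

# Each flower class should appear as a subdirectory of the train and val folders
print(sorted(os.listdir(train_path)))
print(sorted(os.listdir(val_path)))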
In [231]:
# Create an ImageDataGenerator object
datagenerator = ImageDataGenerator(rescale=(1/255.0))
In [232]:
classes = ['daisy', 'dandelion', 'rose', 'sunflower', 'tulip']
In [233]:
# Create a training data generator
train_generator = datagenerator.flow_from_directory(train_path, batch_size=64, classes=classes, target_size=(16,16))
In [242]:
# Create a validation data generator
val_generator = datagenerator.flow_from_directory(val_path, batch_size=64, classes=classes, target_size=(16,16))
In [235]:
# Get and display an image and label from the training generator
x = next(train_generator)
imshow(x[0][4])
print(x[1][4])
In [236]:
# Reset the training generator
train_generator = datagenerator.flow_from_directory(train_path, batch_size=64, classes=classes, target_size=(16,16))
Create a model to train¶
In [237]:
# Build a CNN model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Input, Flatten, Dense
model = tf.keras.Sequential()
model.add(Input((16,16,3)))
model.add(Conv2D(8, (8, 8), padding='same', activation='relu'))
model.add(MaxPooling2D((4,4)))
model.add(Conv2D(8, (8, 8), padding='same', activation='relu'))
model.add(MaxPooling2D((2,2)))
model.add(Conv2D(4, (4, 4), padding='same', activation='relu'))
model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(5, activation='softmax'))
In [238]:
# Create an optimizer object
optimizer = tf.keras.optimizers.Adam(1e-3)
In [239]:
# Compile the model
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
In [240]:
# Print the model summary
model.summary()
Train the model¶
In [243]:
# Calculate the steps per epoch for the training and validation generators
train_steps_per_epoch = train_generator.n // train_generator.batch_size
val_steps = val_generator.n // val_generator.batch_size
print(train_steps_per_epoch, val_steps)
In [244]:
# Fit the model
model.fit_generator(train_generator, steps_per_epoch=train_steps_per_epoch, epochs=5)
Out[244]:
Evaluate the model¶
In [245]:
# Evaluate the model
model.evaluate_generator(val_generator, steps=val_steps)
Out[245]:
Predict using the generator¶
In [246]:
# Predict labels with the model
predictions = model.predict_generator(val_generator, steps=1)
print(np.round(predictions, 2))
In [8]:
import matplotlib.pyplot as plt
import numpy as np
import os
Create a simple dataset¶
In [9]:
x = np.zeros((100,10,2,2))
In [10]:
# Create a dataset from the tensor x
dataset1 = tf.data.Dataset.from_tensor_slices(x)
In [11]:
# Inspect the Dataset object
print(dataset1)
print(dataset1.element_spec)
In [12]:
x2 = [np.zeros((10,2,2)), np.zeros((5,2,2))]
In [13]:
# Try creating a dataset from the tensor x2 (this raises a ValueError, since the two arrays have different shapes and cannot be stacked into a single tensor)
dataset2 = tf.data.Dataset.from_tensor_slices(x2)
In [18]:
x2 = [np.zeros((10,1)), np.zeros((10,1)), np.zeros((10,1))]
In [19]:
# Create another dataset from the new x2 and inspect the Dataset object
dataset2 = tf.data.Dataset.from_tensor_slices(x2)
In [20]:
# Print the element_spec
print(dataset2.element_spec)
Create a zipped dataset¶
In [21]:
# Combine the two datasets into one larger dataset
dataset_zipped = tf.data.Dataset.zip((dataset1, dataset2))
In [22]:
# Print the element_spec
print(dataset_zipped.element_spec)
In [23]:
# Define a function to find the number of batches in a dataset
def get_batches(dataset):
    iter_dataset = iter(dataset)
    i = 0
    try:
        while next(iter_dataset) is not None:
            i = i + 1
    except StopIteration:
        return i
In [24]:
# Find the number of batches in the zipped Dataset
get_batches(dataset_zipped)
Out[24]:
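As with Python's built-in zip, tf.data.Dataset.zip stops at the end of the shorter dataset, so the count returned above equals the number of elements in the smaller of the two zipped datasets.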
Create a dataset from numpy arrays¶
In [25]:
# Load the MNIST dataset
(train_features, train_labels), (test_features, test_labels) = tf.keras.datasets.mnist.load_data()
print(type(train_features), type(train_labels))
In [26]:
# Create a Dataset from the MNIST data
mnist_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))
In [27]:
# Inspect the Dataset object
print(mnist_dataset.element_spec)
In [28]:
# Inspect the length of an element using the take method
element = next(iter(mnist_dataset.take(1)))
len(element)
Out[28]:
In [29]:
# Examine the shapes of the data
print(element[0].shape)
print(element[1].shape)
Create a dataset from text data¶
In [30]:
# Print the list of text files
text_files = sorted([f.path for f in os.scandir('data/shakespeare')])
print(text_files)
In [31]:
# Load the first file using Python and print the first 5 lines.
with open(text_files[0], 'r') as fil:
    contents = [fil.readline() for i in range(5)]
for line in contents:
    print(line)
In [33]:
# Load the lines from the files into a dataset using TextLineDataset
shakespeare_dataset = tf.data.TextLineDataset(text_files)
In [35]:
# Use the take method to get and print the first 5 lines of the dataset
first_5_lines_dataset = iter(shakespeare_dataset.take(5))
lines = [line for line in first_5_lines_dataset]
for line in lines:
    print(line)
In [36]:
# Compute the number of lines in the first file
lines = []
with open(text_files[0], 'r') as fil:
    line = fil.readline()
    while line:
        lines.append(line)
        line = fil.readline()
print(len(lines))
In [38]:
# Compute the number of lines in the Shakespeare dataset we created
shakespeare_dataset_iterator = iter(shakespeare_dataset)
lines = [line for line in shakespeare_dataset_iterator]
print(len(lines))
Interleave lines from the text data files¶
In [39]:
# Create a dataset of the text file strings
text_files_dataset = tf.data.Dataset.from_tensor_slices(text_files)
files = [file for file in text_files_dataset]
for file in files:
    print(file)
In [40]:
# Interleave the lines from the text files
interleaved_shakespeare_dataset = text_files_dataset.interleave(tf.data.TextLineDataset, cycle_length=9)
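Here cycle_length controls how many files are read from concurrently. A related argument, block_length, sets how many consecutive lines are taken from each file before moving on to the next; for example (a sketch, not used later in the notebook):

# Take 5 consecutive lines from each of 2 files at a time (illustrative values)
interleaved_in_blocks = text_files_dataset.interleave(tf.data.TextLineDataset,
                                                      cycle_length=2, block_length=5)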
In [41]:
# Print the first 10 elements of the interleaved dataset
lines = [line for line in iter(interleaved_shakespeare_dataset.take(10))]
for line in lines:
    print(line)
In [3]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
Load the UCI Bank Marketing Dataset¶
In [4]:
# Load the CSV file into a pandas DataFrame
bank_dataframe = pd.read_csv('data/bank/bank-full.csv', delimiter=';')
In [5]:
# Show the head of the DataFrame
bank_dataframe.head()
Out[5]:
In [6]:
# Print the shape of the DataFrame
print(bank_dataframe.shape)
In [7]:
# Select features from the DataFrame
features = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
            'loan', 'contact', 'campaign', 'pdays', 'poutcome']
labels = ['y']
bank_dataframe = bank_dataframe.filter(features + labels)
In [8]:
# Show the head of the DataFrame
bank_dataframe.head()
Out[8]:
Preprocess the data¶
In [9]:
# Convert the categorical features in the DataFrame to one-hot encodings
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
categorical_features = ['default', 'housing', 'job', 'loan', 'education', 'contact', 'poutcome']
for feature in categorical_features:
    bank_dataframe[feature] = tuple(encoder.fit_transform(bank_dataframe[feature]))
In [10]:
# Show the head of the DataFrame
bank_dataframe.head()
Out[10]:
In [11]:
# Shuffle the DataFrame
bank_dataframe = bank_dataframe.sample(frac=1).reset_index(drop=True)
Create the Dataset object¶
In [12]:
# Convert the DataFrame to a Dataset
bank_dataset = tf.data.Dataset.from_tensor_slices(dict(bank_dataframe))
In [13]:
# Inspect the Dataset object
bank_dataset.element_spec
Out[13]:
Filter the Dataset¶
In [14]:
# First check that there are records in the dataset for non-divorced individuals
def check_divorced():
    bank_dataset_iterable = iter(bank_dataset)
    for x in bank_dataset_iterable:
        if x['marital'] != 'divorced':
            print('Found a person with marital status: {}'.format(x['marital']))
            return
    print('No non-divorced people were found!')

check_divorced()
In [15]:
# Filter the Dataset to retain only entries with a 'divorced' marital status
bank_dataset = bank_dataset.filter(lambda x : tf.equal(x['marital'], tf.constant([b'divorced']))[0] )
In [16]:
# Check the records in the dataset again
check_divorced()
Map a function over the dataset¶
In [17]:
# Convert the label ('y') to an integer instead of 'yes' or 'no'
def map_label(x):
    x['y'] = 0 if (x['y'] == tf.constant([b'no'], dtype=tf.string)) else 1
    return x

bank_dataset = bank_dataset.map(map_label)
In [18]:
# Inspect the Dataset object
bank_dataset.element_spec
Out[18]:
In [19]:
# Remove the 'marital' column
bank_dataset = bank_dataset.map(lambda x : {key:val for key,val in x.items() if key != 'marital'})
In [20]:
# Inspect the Dataset object
bank_dataset.element_spec
Out[20]:
Create input and output data tuples¶
In [21]:
# Create an input and output tuple for the dataset
def map_feature_label(x):
    features = [[x['age']], [x['balance']], [x['campaign']], x['contact'], x['default'],
                x['education'], x['housing'], x['job'], x['loan'], [x['pdays']], x['poutcome']]
    return (tf.concat(features, axis=0), x['y'])
In [22]:
# Map this function over the dataset
bank_dataset = bank_dataset.map(map_feature_label)
In [23]:
# Inspect the Dataset object
bank_dataset.element_spec
Out[23]:
Split into a training and a validation set¶
In [24]:
# Determine the length of the Dataset
dataset_length = 0
for _ in bank_dataset:
    dataset_length += 1
print(dataset_length)
In [25]:
# Make training and validation sets from the dataset
training_elements = int(dataset_length * 0.7)
train_dataset = bank_dataset.take(training_elements)
validation_dataset = bank_dataset.skip(training_elements)
Build a classification model¶
Now let's build a model to classify the features.
In [26]:
# Build a classifier model
from tensorflow.keras.layers import Dense, Input, Concatenate, BatchNormalization
from tensorflow.keras import Sequential
model = Sequential()
model.add(Input(shape=(30,)))
model.add(BatchNormalization(momentum=0.8))
model.add(Dense(400, activation='relu'))
model.add(BatchNormalization(momentum=0.8))
model.add(Dense(400, activation='relu'))
model.add(BatchNormalization(momentum=0.8))
model.add(Dense(1, activation='sigmoid'))
In [27]:
# Compile the model
optimizer = tf.keras.optimizers.Adam(1e-4)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
In [28]:
# Show the model summary
model.summary()
Train the model¶
In [29]:
# Create batched training and validation datasets
train_dataset = train_dataset.batch(20, drop_remainder=True)
validation_dataset = validation_dataset.batch(100)
In [30]:
# Shuffle the training data
train_dataset = train_dataset.shuffle(1000)
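Note that shuffling after batching reorders whole batches rather than individual examples. If per-example shuffling is wanted, the shuffle would normally be applied before batch, e.g. (a sketch of the alternative ordering, not used here):

# Alternative: shuffle individual examples first, then batch
# train_dataset = bank_dataset.take(training_elements).shuffle(1000).batch(20, drop_remainder=True)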
In [31]:
# Fit the model
history = model.fit(train_dataset, validation_data=validation_dataset, epochs=5)
In [33]:
# Plot the training and validation accuracy
plt.plot(history.epoch, history.history["accuracy"], label="training")
plt.plot(history.epoch, history.history["val_accuracy"], label="validation")
plt.legend()
plt.xlabel("Epoch")
Out[33]: