In [ ]:
import tensorflow as tf
print(tf.__version__)
Load the IMDB review sentiment dataset¶
In [ ]:
# Import imdb
In [ ]:
# Download and assign the data set using load_data()
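A minimal sketch of these two cells, using the built-in Keras loader (the variable names x_train, y_train, x_test, y_test are one possible choice):
In [ ]:
from tensorflow.keras.datasets import imdb

# Download the dataset (cached locally by Keras) and unpack the train/test splits
(x_train, y_train), (x_test, y_test) = imdb.load_data()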
Inspect the dataset¶
In [ ]:
# Inspect the type of the data
In [ ]:
# Inspect the shape of the data
In [ ]:
# Display the first dataset element input
# Notice encoding
In [ ]:
# Display the first dataset element output
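The inspection cells above could be completed along these lines, using the names from the load step:
In [ ]:
print(type(x_train), type(y_train))  # numpy arrays
print(x_train.shape, y_train.shape)  # (25000,) (25000,): one entry per review
print(x_train[0])  # a review encoded as a list of word indices
print(y_train[0])  # its sentiment label: 0 (negative) or 1 (positive)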
Load dataset with different options¶
In [ ]:
# Load the dataset with defaults
# The downloaded file is cached under ~/.keras/datasets/
In [ ]:
# Limit the vocabulary to the top 500 words using num_words
In [ ]:
# Ignore the top 10 most frequent words using skip_top
In [ ]:
# Limit the sequence lengths to 500 using maxlen
In [ ]:
# Use '1' as the character that indicates the start of a sequence
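The four option cells each vary one keyword of imdb.load_data; combined into a single call (values taken from the prompts above), a sketch looks like:
In [ ]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(
    num_words=500,  # limit the vocabulary to the top 500 words
    skip_top=10,    # ignore the 10 most frequent words
    maxlen=500,     # discard reviews longer than 500 tokens
    start_char=1)   # character that indicates the start of a sequence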
Explore the dataset word index¶
In [ ]:
# Load the imdb word index using get_word_index()
In [ ]:
# View the word index as a dictionary,
# accounting for index_from.
In [ ]:
# Retrieve a specific word's index
In [ ]:
# View an input sentence
In [ ]:
# Get the sentiment value
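A possible completion of the word-index cells, assuming x_train and y_train from a default load_data() call. load_data offsets every word index by index_from (3 by default), so the raw indices must be shifted before decoding:
In [ ]:
imdb_word_index = imdb.get_word_index()

# Account for index_from: load_data shifts every word index up by 3 by default
word_index = {word: index + 3 for word, index in imdb_word_index.items()}

# Retrieve a specific word's index
print(word_index['great'])

# Invert the mapping and decode the first training review
inv_word_index = {index: word for word, index in word_index.items()}
print(' '.join(inv_word_index.get(i, '?') for i in x_train[0]))

# The corresponding sentiment value
print(y_train[0])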
In [ ]:
# Load the imdb data set
Preprocess the data with padding¶
In [ ]:
# Inspect the input data shape
In [ ]:
# Pad the inputs to the maximum length using maxlen
In [ ]:
# Inspect the output data shape
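A sketch of the padding step, assuming x_train and y_train from a default imdb.load_data() call; with maxlen=None, pad_sequences pads every review to the length of the longest one:
In [ ]:
print(x_train.shape)  # (25000,): reviews are still variable-length lists

padded_x_train = tf.keras.preprocessing.sequence.pad_sequences(
    x_train, maxlen=None, padding='pre', value=0)

print(padded_x_train.shape)  # (25000, max_sequence_length)
print(y_train.shape)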
Create a Masking layer¶
In [ ]:
# Import numpy
In [ ]:
# Masking expects to see (batch, sequence, features)
# Create a dummy feature dimension using expand_dims
In [ ]:
# Create a Masking layer
In [ ]:
# Pass tf_x_train to it
In [ ]:
# Look at the dataset
In [ ]:
# Look at the ._keras_mask for the dataset
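One way to complete the masking cells, assuming padded_x_train from the padding step:
In [ ]:
import numpy as np

# Masking expects (batch, sequence, features), so add a dummy feature dimension
tf_x_train = np.expand_dims(padded_x_train, -1)

# Create a Masking layer that masks out the zero padding
masking_layer = tf.keras.layers.Masking(mask_value=0.0)
masked_x_train = masking_layer(tf_x_train)

print(masked_x_train.shape)
print(masked_x_train._keras_mask)  # False wherever the input was 0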
Create and apply an Embedding layer¶
In [ ]:
# Create an embedding layer using layers.Embedding
# Specify input_dim, output_dim, input_length
In [ ]:
# Inspect an Embedding layer output for a fixed input
# Takes integer indices of shape (batch, sequence) and returns (batch, sequence, output_dim)
In [ ]:
# Inspect the Embedding layer weights using get_weights()
In [ ]:
# Get the embedding for the 14th index
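A sketch of the Embedding cells (the layer sizes are illustrative):
In [ ]:
from tensorflow.keras import layers

embedding_layer = layers.Embedding(input_dim=501, output_dim=16, input_length=None)

# Map a fixed batch of index sequences to dense vectors: (1, 4) -> (1, 4, 16)
sequence_of_indices = tf.constant([[0, 1, 5, 500]])
print(embedding_layer(sequence_of_indices))

# The layer's weights are a single (input_dim, output_dim) matrix
embedding_weights = embedding_layer.get_weights()[0]
print(embedding_weights.shape)
print(embedding_weights[14])  # the embedding for index 14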
Create and apply an Embedding layer that uses mask_zero=True¶
In [ ]:
# Create a layer that uses the mask_zero kwarg
In [ ]:
# Apply this layer to the sequence and see the _keras_mask property
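A possible completion, shown on a small zero-padded example:
In [ ]:
masking_embedding_layer = layers.Embedding(input_dim=501, output_dim=16, mask_zero=True)

# Zero inputs are treated as padding and recorded in the mask
test_input = tf.constant([[0, 0, 1, 5, 500]])
masked_output = masking_embedding_layer(test_input)
print(masked_output._keras_mask)  # [[False False True True True]]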
Load and preprocess the IMDB data¶
In [ ]:
# A function to load and preprocess the IMDB dataset
def get_and_pad_imdb_dataset(num_words=10000, maxlen=None, index_from=2):
    from tensorflow.keras.datasets import imdb
    # Load the reviews
    (x_train, y_train), (x_test, y_test) = imdb.load_data(path='imdb.npz',
                                                          num_words=num_words,
                                                          skip_top=0,
                                                          maxlen=maxlen,
                                                          start_char=1,
                                                          oov_char=2,
                                                          index_from=index_from)
    x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train,
                                                            maxlen=None,
                                                            padding='pre',
                                                            truncating='pre',
                                                            value=0)
    x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test,
                                                           maxlen=None,
                                                           padding='pre',
                                                           truncating='pre',
                                                           value=0)
    return (x_train, y_train), (x_test, y_test)
In [ ]:
# Load the dataset
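For example, using the helper defined above:
In [ ]:
(x_train, y_train), (x_test, y_test) = get_and_pad_imdb_dataset()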
In [ ]:
# A function to get the dataset word index
def get_imdb_word_index(num_words=10000, index_from=2):
    imdb_word_index = tf.keras.datasets.imdb.get_word_index(path='imdb_word_index.json')
    imdb_word_index = {key: value + index_from for key, value in imdb_word_index.items()
                       if value <= num_words - index_from}
    return imdb_word_index
In [ ]:
# Get the word index
In [ ]:
# Swap the keys and values of the word index
In [ ]:
# View the first dataset example sentence
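These three cells might look like the following (indices 0, 1, and 2 are the padding, start, and out-of-vocabulary characters):
In [ ]:
word_index = get_imdb_word_index()

# Swap the keys and values so indices map back to words
inv_word_index = {index: word for word, index in word_index.items()}

# Decode the first training example, skipping the special characters
print(' '.join(inv_word_index.get(i, '?') for i in x_train[0] if i > 2))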
Build an Embedding layer into a model¶
In [ ]:
# Get the maximum token value
In [ ]:
# Specify an embedding dimension
In [ ]:
# Build a model using Sequential:
# 1. Embedding layer
# 2. GlobalAveragePooling1D
# 3. Dense
In [ ]:
# Functional API refresher: use the Model to build the same model
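One sketch of both model cells (the embedding dimension is an illustrative choice); the functional version rebuilds the same architecture and is the model summarized below:
In [ ]:
max_index_value = max(word_index.values())
embedding_dim = 16

# Sequential version
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=max_index_value + 1, output_dim=embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Functional API version of the same model
inputs = tf.keras.Input(shape=(None,))
h = tf.keras.layers.Embedding(input_dim=max_index_value + 1, output_dim=embedding_dim)(inputs)
h = tf.keras.layers.GlobalAveragePooling1D()(h)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(h)
model = tf.keras.Model(inputs=inputs, outputs=outputs)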
In [ ]:
model.summary()
Compile, train, and evaluate the model¶
In [ ]:
# Compile the model with a binary cross-entropy loss
In [ ]:
# Train the model using .fit(), saving its history
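A minimal compile-and-train sketch (the optimizer, epoch count, and batch size are illustrative):
In [ ]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(x_train, y_train,
                    epochs=5, batch_size=32,
                    validation_data=(x_test, y_test))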
In [ ]:
# Plot the training and validation accuracy
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
history_dict = history.history
acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']
epochs = range(1, len(acc) + 1)
plt.figure(figsize=(14,5))
plt.plot(epochs, acc, marker='.', label='Training acc')
plt.plot(epochs, val_acc, marker='.', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Classification accuracy')
plt.legend(loc='lower right')
plt.ylim(0, 1);
In [ ]:
# Retrieve the embedding layer's weights from the trained model
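A sketch that works for either model version above by locating the Embedding layer by type rather than by position:
In [ ]:
embedding_layer = next(l for l in model.layers
                       if isinstance(l, tf.keras.layers.Embedding))
weights = embedding_layer.get_weights()[0]
print(weights.shape)  # (max_index_value + 1, embedding_dim)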
In [ ]:
# Save the word Embeddings to tsv files
# Two files:
# one contains the embedding labels (meta.tsv),
# one contains the embeddings (vecs.tsv)
import io
from os import path

out_v = io.open(path.join('data', 'vecs.tsv'), 'w', encoding='utf-8')
out_m = io.open(path.join('data', 'meta.tsv'), 'w', encoding='utf-8')

k = 0
for word, token in word_index.items():
    if k != 0:
        out_m.write('\n')
        out_v.write('\n')
    out_v.write('\t'.join([str(x) for x in weights[token]]))
    out_m.write(word)
    k += 1

out_v.close()
out_m.close()
# beware large collections of embeddings!
Initialize and pass an input to a SimpleRNN layer¶
In [ ]:
# Create a SimpleRNN layer and test it
In [ ]:
# Note that only the final cell output is returned
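A small runnable sketch (the unit count and input values are arbitrary):
In [ ]:
simple_rnn_layer = tf.keras.layers.SimpleRNN(units=16)

# SimpleRNN consumes (batch, timesteps, features); by default it returns
# only the output at the final timestep, with shape (batch, units)
sequence = tf.constant([[[1.0], [2.0], [3.0]]])  # 1 example, 3 timesteps, 1 feature
output = simple_rnn_layer(sequence)
print(output.shape)  # (1, 16)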
Load and transform the IMDB review sentiment dataset¶
In [ ]:
# A function to load and preprocess the IMDB dataset
def get_and_pad_imdb_dataset(num_words=10000, maxlen=None, index_from=2):
    from tensorflow.keras.datasets import imdb
    # Load the reviews
    (x_train, y_train), (x_test, y_test) = imdb.load_data(path='imdb.npz',
                                                          num_words=num_words,
                                                          skip_top=0,
                                                          maxlen=maxlen,
                                                          start_char=1,
                                                          oov_char=2,
                                                          index_from=index_from)
    x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train,
                                                            maxlen=None,
                                                            padding='pre',
                                                            truncating='pre',
                                                            value=0)
    x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test,
                                                           maxlen=None,
                                                           padding='pre',
                                                           truncating='pre',
                                                           value=0)
    return (x_train, y_train), (x_test, y_test)
In [ ]:
# Load the dataset
In [ ]:
# A function to get the dataset word index
def get_imdb_word_index(num_words=10000, index_from=2):
    imdb_word_index = tf.keras.datasets.imdb.get_word_index(path='imdb_word_index.json')
    imdb_word_index = {key: value + index_from for key, value in imdb_word_index.items()
                       if value <= num_words - index_from}
    return imdb_word_index
In [ ]:
# Get the word index using get_imdb_word_index()
Create a recurrent neural network model¶
In [ ]:
# Get the maximum index value
In [ ]:
# Using Sequential, build the model:
# 1. Embedding.
# 2. LSTM.
# 3. Dense.
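One possible model, with illustrative sizes, assuming imdb_word_index = get_imdb_word_index() from the cell above:
In [ ]:
max_index_value = max(imdb_word_index.values())
embedding_dim = 16

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=max_index_value + 1, output_dim=embedding_dim, mask_zero=True),
    tf.keras.layers.LSTM(units=16),
    tf.keras.layers.Dense(1, activation='sigmoid')
])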
Compile and fit the model¶
In [ ]:
# Compile the model with binary cross-entropy loss
In [ ]:
# Fit the model and save its training history
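For example (epochs and batch size are illustrative):
In [ ]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(x_train, y_train,
                    epochs=3, batch_size=32,
                    validation_split=0.2)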
Plot learning curves¶
In [ ]:
# Plot the training and validation accuracy
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
history_dict = history.history
acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']
epochs = range(1, len(acc) + 1)
plt.figure(figsize=(14,5))
plt.plot(epochs, acc, marker='.', label='Training acc')
plt.plot(epochs, val_acc, marker='.', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Classification accuracy')
plt.legend(loc='lower right')
plt.ylim(0, 1);
Make predictions with the model¶
In [ ]:
# View the first test data example sentence
# (invert the word index)
In [ ]:
# Get the model prediction using model.predict()
In [ ]:
# Get the corresponding label
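A sketch of the prediction cells, assuming the padded x_test/y_test and the word index from above:
In [ ]:
inv_imdb_word_index = {index: word for word, index in imdb_word_index.items()}
print(' '.join(inv_imdb_word_index.get(i, '?') for i in x_test[0] if i > 2))

# predict expects a batch, so keep a leading batch dimension
prediction = model.predict(x_test[None, 0])
print(prediction)  # estimated probability that the review is positive
print(y_test[0])   # the true label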
Load and transform the IMDB review sentiment dataset¶
In [ ]:
# A function to load and preprocess the IMDB dataset
def get_and_pad_imdb_dataset(num_words=10000, maxlen=None, index_from=2):
    from tensorflow.keras.datasets import imdb
    # Load the reviews
    (x_train, y_train), (x_test, y_test) = imdb.load_data(path='imdb.npz',
                                                          num_words=num_words,
                                                          skip_top=0,
                                                          maxlen=maxlen,
                                                          start_char=1,
                                                          oov_char=2,
                                                          index_from=index_from)
    x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train,
                                                            maxlen=None,
                                                            padding='pre',
                                                            truncating='pre',
                                                            value=0)
    x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test,
                                                           maxlen=None,
                                                           padding='pre',
                                                           truncating='pre',
                                                           value=0)
    return (x_train, y_train), (x_test, y_test)
In [ ]:
# Load the dataset
In [ ]:
# A function to get the dataset word index
def get_imdb_word_index(num_words=10000, index_from=2):
    imdb_word_index = tf.keras.datasets.imdb.get_word_index(path='imdb_word_index.json')
    imdb_word_index = {key: value + index_from for key, value in imdb_word_index.items()
                       if value <= num_words - index_from}
    return imdb_word_index
In [ ]:
# Get the word index using get_imdb_word_index()
Build stacked and bidirectional recurrent models¶
In [ ]:
# Get the maximum index value and specify an embedding dimension
In [ ]:
# Using Sequential, build a stacked LSTM model via return_sequences=True
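A possible stacked model (sizes illustrative), assuming imdb_word_index = get_imdb_word_index() from the cell above:
In [ ]:
max_index_value = max(imdb_word_index.values())
embedding_dim = 16

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=max_index_value + 1, output_dim=embedding_dim, mask_zero=True),
    tf.keras.layers.LSTM(units=16, return_sequences=True),  # pass every timestep's output to the next layer
    tf.keras.layers.LSTM(units=16),                         # the final recurrent layer returns only the last output
    tf.keras.layers.Dense(1, activation='sigmoid')
])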
In [ ]:
# Using Sequential, build a bidirectional RNN with merge_mode='sum'
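A bidirectional sketch; merge_mode='sum' adds the forward and backward outputs elementwise:
In [ ]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=max_index_value + 1, output_dim=embedding_dim, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=16), merge_mode='sum'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])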
In [ ]:
# Create a model featuring both stacked recurrent layers and a bidirectional layer
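One way to combine the two ideas (using a GRU for the second recurrent layer is an illustrative choice):
In [ ]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=max_index_value + 1, output_dim=embedding_dim, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=8, return_sequences=True), merge_mode='concat'),
    tf.keras.layers.GRU(units=8),
    tf.keras.layers.Dense(1, activation='sigmoid')
])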
Compile and fit the model¶
In [ ]:
# Compile the model
In [ ]:
# Train the model, saving its history
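As before, a minimal sketch:
In [ ]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(x_train, y_train, epochs=3, batch_size=32, validation_split=0.2)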
In [ ]:
# Plot the training and validation accuracy
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
history_dict = history.history
acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']
epochs = range(1, len(acc) + 1)
plt.figure(figsize=(14,5))
plt.plot(epochs, acc, marker='.', label='Training acc')
plt.plot(epochs, val_acc, marker='.', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Classification accuracy')
plt.legend(loc='lower right')
plt.ylim(0, 1);