Importing Packages

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

import os
plt.style.use('ggplot')

Extract the folder into a data folder and go ahead and load the data with Pandas:

filepath_dict = {'yelp':   '../data/sentiment_labelled_sentences/yelp_labelled.txt',
                 'amazon': '../data/sentiment_labelled_sentences/amazon_cells_labelled.txt',
                 'imdb':   '../data/sentiment_labelled_sentences/imdb_labelled.txt'}

df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source
    df_list.append(df)
len(df_list)
3
df_list
[                                              sentence  label source
 0                             Wow... Loved this place.      1   yelp
 1                                   Crust is not good.      0   yelp
 2            Not tasty and the texture was just nasty.      0   yelp
 3    Stopped by during the late May bank holiday of...      1   yelp
 4    The selection on the menu was great and so wer...      1   yelp
 ..                                                 ...    ...    ...
 995  I think food should have flavor and texture an...      0   yelp
 996                           Appetite instantly gone.      0   yelp
 997  Overall I was not impressed and would not go b...      0   yelp
 998  The whole experience was underwhelming, and I ...      0   yelp
 999  Then, as if I hadn't wasted enough of my life ...      0   yelp
 
 [1000 rows x 3 columns],
                                               sentence  label  source
 0    So there is no way for me to plug it in here i...      0  amazon
 1                          Good case, Excellent value.      1  amazon
 2                               Great for the jawbone.      1  amazon
 3    Tied to charger for conversations lasting more...      0  amazon
 4                                    The mic is great.      1  amazon
 ..                                                 ...    ...     ...
 995  The screen does get smudged easily because it ...      0  amazon
 996  What a piece of junk.. I lose more calls on th...      0  amazon
 997                       Item Does Not Match Picture.      0  amazon
 998  The only thing that disappoint me is the infra...      0  amazon
 999  You can not answer calls with the unit, never ...      0  amazon
 
 [1000 rows x 3 columns],
                                               sentence  label source
 0    A very, very, very slow-moving, aimless movie ...      0   imdb
 1    Not sure who was more lost - the flat characte...      0   imdb
 2    Attempting artiness with black & white and cle...      0   imdb
 3         Very little music or anything to speak of.        0   imdb
 4    The best scene in the movie was when Gerardo i...      1   imdb
 ..                                                 ...    ...    ...
 743  I just got bored watching Jessice Lange take h...      0   imdb
 744  Unfortunately, any virtue in this film's produ...      0   imdb
 745                   In a word, it is embarrassing.        0   imdb
 746                               Exceptionally bad!        0   imdb
 747  All in all its an insult to one's intelligence...      0   imdb
 
 [748 rows x 3 columns]]
df = pd.concat(df_list)
df.iloc[0]
sentence    Wow... Loved this place.
label                              1
source                          yelp
Name: 0, dtype: object
df.head()
                                            sentence  label source
0                           Wow... Loved this place.      1   yelp
1                                 Crust is not good.      0   yelp
2          Not tasty and the texture was just nasty.      0   yelp
3  Stopped by during the late May bank holiday of...      1   yelp
4  The selection on the menu was great and so wer...      1   yelp
df.tail()
                                              sentence  label source
743  I just got bored watching Jessice Lange take h...      0   imdb
744  Unfortunately, any virtue in this film's produ...      0   imdb
745                     In a word, it is embarrassing.      0   imdb
746                                 Exceptionally bad!      0   imdb
747  All in all its an insult to one's intelligence...      0   imdb

Now use the CountVectorizer provided by the scikit-learn library to vectorize sentences. It takes the words of each sentence and creates a vocabulary of all the unique words in the sentences. This vocabulary can then be used to create a feature vector of word counts:

sentences = ['Rashmi likes ice cream', 'Rashmi hates chocolate.']
vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(sentences)
vectorizer.vocabulary_
{'Rashmi': 0, 'likes': 5, 'ice': 4, 'cream': 2, 'hates': 3, 'chocolate': 1}
vectorizer.vocabulary_.get(u'ice')
4
d = vectorizer.vocabulary_
{key:d[key] for key in sorted(d.keys())}
{'Rashmi': 0, 'chocolate': 1, 'cream': 2, 'hates': 3, 'ice': 4, 'likes': 5}
vocabulary_list=[[key for key in sorted(d.keys())],[d[key] for key in sorted(d.keys())]]
vocabulary_list
[['Rashmi', 'chocolate', 'cream', 'hates', 'ice', 'likes'], [0, 1, 2, 3, 4, 5]]
vectorizer.transform(sentences).toarray()
array([[1, 0, 1, 0, 1, 1],
       [1, 1, 0, 1, 0, 0]])
vectorizer.transform(sentences).toarray()[1]
array([1, 1, 0, 1, 0, 0])
vectors = vectorizer.transform(sentences).toarray().tolist()
vectors
[[1, 0, 1, 0, 1, 1], [1, 1, 0, 1, 0, 0]]
data = [vocabulary_list[0],vectors[0],vectors[1]]
pd.DataFrame(data)
        0          1      2      3    4      5
0  Rashmi  chocolate  cream  hates  ice  likes
1       1          0      1      0    1      1
2       1          1      0      1    0      0

Extracting features from text files

In order to perform machine learning on text documents, we first need to turn the text content into numerical feature vectors.

Bags of words

The most intuitive way to do so is to use a bag-of-words representation:

Assign a fixed integer id to each word occurring in any document of the training set (for instance by building a dictionary from words to integer indices). Then, for each document, count the number of occurrences of each word and store that count as the value of the feature corresponding to that word's index.

Tokenizing text with scikit-learn

Text preprocessing, tokenizing and filtering of stopwords are all included in CountVectorizer, which builds a dictionary of features and transforms documents to feature vectors.

The bag-of-words representation implies that n_features is the number of distinct words in the corpus: this number is typically larger than 100,000.

If n_samples == 10000, storing X as a NumPy array of type float32 would require 10000 x 100000 x 4 bytes = 4 GB of RAM, which is barely manageable on today's computers.

Fortunately, most values in X will be zeros, since a given document uses no more than a few thousand distinct words. For this reason we say that bags of words are typically high-dimensional sparse datasets. We can save a lot of memory by storing only the non-zero parts of the feature vectors.

scipy.sparse matrices are data structures that do exactly this, and scikit-learn has built-in support for these structures.
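
As a quick illustrative sketch (not part of the original notebook), you can compare how much memory a dense array and its sparse equivalent need; the shape here is an arbitrary assumption for illustration:

from scipy.sparse import csr_matrix

dense = np.zeros((3, 100000), dtype=np.float32)  # three "documents", 100,000 features
dense[0, 5] = 1.0                                # only a single non-zero count
sparse = csr_matrix(dense)                       # stores only the non-zero entries
print(dense.nbytes)                              # 1,200,000 bytes for the dense array
print(sparse.data.nbytes)                        # 4 bytes of stored values in the sparse matrix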

From occurrences to frequencies

Occurrence count is a good start but there is an issue: longer documents will have higher average count values than shorter documents, even though they might talk about the same topics.

To avoid these potential discrepancies it suffices to divide the number of occurrences of each word in a document by the total number of words in the document: these new features are called tf for Term Frequencies.

Another refinement on top of tf is to downscale weights for words that occur in many documents in the corpus and are therefore less informative than those that occur only in a smaller portion of the corpus.

This downscaling is called tf–idf for “Term Frequency times Inverse Document Frequency”.

Both tf and tf–idf can be computed as follows using TfidfTransformer:
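
As a minimal sketch (not part of the original run), reusing the vectorizer and the two example sentences from above:

from sklearn.feature_extraction.text import TfidfTransformer

counts = vectorizer.transform(sentences)   # raw term counts from CountVectorizer
tfidf = TfidfTransformer()
X_tfidf = tfidf.fit_transform(counts)      # counts re-weighted by tf-idf
print(X_tfidf.toarray().round(2))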

Defining a Baseline Model

First, you are going to split the data into a training and a testing set, which will allow you to evaluate the accuracy and see whether your model generalizes well, that is, whether it performs well on data it has not seen before. It is also a way to see whether the model is overfitting.

Overfitting is when a model is trained too well on the training data. You want to avoid overfitting, as it would mean that the model has mostly just memorized the training data. This would account for a high accuracy on the training data but a low accuracy on the testing data.

We start with the Yelp data set, which we extract from the concatenated data set, and take its sentences and labels.

df_yelp = df[df['source'] == 'yelp']
sentences = df_yelp['sentence'].values
y = df_yelp['label'].values

sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)

Create the feature vectors for each sentence of the training and testing set:

vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)

X_train = vectorizer.transform(sentences_train)
X_test  = vectorizer.transform(sentences_test)
X_train
<750x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 7368 stored elements in Compressed Sparse Row format>

CountVectorizer performs the tokenization, which separates each sentence into a set of tokens. It additionally removes punctuation and special characters and can apply other preprocessing to each word. If you want, you can use a custom tokenizer from the NLTK library with CountVectorizer, or explore any number of other customizations to improve the performance of your model.
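
For instance, here is a hedged sketch of plugging an NLTK tokenizer into CountVectorizer (this assumes NLTK is installed and its 'punkt' tokenizer data has been downloaded; it is not part of the original run):

from nltk.tokenize import word_tokenize

nltk_vectorizer = CountVectorizer(tokenizer=word_tokenize)
nltk_vectorizer.fit(sentences_train)
X_train_nltk = nltk_vectorizer.transform(sentences_train)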

The classification model we are going to use is logistic regression, a simple yet powerful linear model that, mathematically speaking, produces a value between 0 and 1 based on the input feature vector, which can be read as the probability of the positive class. By specifying a cutoff value (by default 0.5), the regression model is used for classification.

classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)

print("Accuracy:", score)
Accuracy: 0.796
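
To make the cutoff explicit, here is a small illustrative check (not part of the original run) that thresholding the predicted probabilities at 0.5 reproduces classifier.predict:

probs = classifier.predict_proba(X_test)[:, 1]       # estimated P(label == 1)
preds = (probs >= 0.5).astype(int)                   # apply the default 0.5 cutoff
print((preds == classifier.predict(X_test)).all())   # expected to print True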

You can see that logistic regression reached an impressive 79.6%, but let's have a look at how this model performs on the other data sets. In this script, we run and evaluate the whole process for each data set:

for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values

    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    vectorizer = CountVectorizer()
    vectorizer.fit(sentences_train)
    X_train = vectorizer.transform(sentences_train)
    X_test  = vectorizer.transform(sentences_test)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))
Accuracy for yelp data: 0.7960
Accuracy for amazon data: 0.7960
Accuracy for imdb data: 0.7487

Great! You can see that this fairly simple model already achieves good accuracy.

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', LogisticRegression()),
])
for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values

    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)
    
    text_clf.fit(sentences_train, y_train)

    score = text_clf.score(sentences_test, y_test)
    print('Model {} Accuracy for {} data: {:.4f}'.format(LogisticRegression,source, score))
Model <class 'sklearn.linear_model._logistic.LogisticRegression'> Accuracy for yelp data: 0.7960
Model <class 'sklearn.linear_model._logistic.LogisticRegression'> Accuracy for amazon data: 0.7960
Model <class 'sklearn.linear_model._logistic.LogisticRegression'> Accuracy for imdb data: 0.7487

Now try a new feature engineering method: add a tf-idf step to the pipeline, first with logistic regression and then with a multinomial naive Bayes classifier.

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])
for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values

    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)
    
    text_clf.fit(sentences_train, y_train)

    score = text_clf.score(sentences_test, y_test)
    print('Model {} Accuracy for {} data: {:.4f}'.format(LogisticRegression,source, score))
Model <class 'sklearn.linear_model._logistic.LogisticRegression'> Accuracy for yelp data: 0.7680
Model <class 'sklearn.linear_model._logistic.LogisticRegression'> Accuracy for amazon data: 0.8000
Model <class 'sklearn.linear_model._logistic.LogisticRegression'> Accuracy for imdb data: 0.7380
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values

    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)
    
    text_clf.fit(sentences_train, y_train)

    score = text_clf.score(sentences_test, y_test)
    print('Model {} Accuracy for {} data: {:.4f}'.format(MultinomialNB,source, score))
Model <class 'sklearn.naive_bayes.MultinomialNB'> Accuracy for yelp data: 0.7680
Model <class 'sklearn.naive_bayes.MultinomialNB'> Accuracy for amazon data: 0.8000
Model <class 'sklearn.naive_bayes.MultinomialNB'> Accuracy for imdb data: 0.7914

Introduction to Deep Neural Networks

Neural networks, also called artificial neural networks (ANN) or feedforward neural networks, are computational networks vaguely inspired by the neural networks in the human brain. They consist of neurons (also called nodes) which are connected to each other in layers.

You start with a layer of input neurons where you feed in your feature vectors; the values are then fed forward to a hidden layer. At each connection the value is multiplied by a weight and a bias is added. This happens at every connection, and at the end you reach an output layer with one or more output nodes.

For binary classification you can use a single output node, but if you have multiple categories you should use one output node per category.
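
As a tiny illustrative sketch (not part of the original notebook), the forward pass through one dense layer is just a weighted sum plus a bias, followed by an activation; the sizes here are made up:

x = np.random.rand(4)               # feature vector with 4 inputs
W = np.random.rand(3, 4)            # one weight per connection: 3 hidden neurons x 4 inputs
b = np.random.rand(3)               # one bias per hidden neuron
hidden = np.maximum(0, W @ x + b)   # ReLU(Wx + b): the values fed forward to the next layer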

Introducing Keras

Keras is a deep learning and neural networks API by François Chollet, capable of running on top of TensorFlow (Google), Theano, or CNTK (Microsoft). To quote the wonderful book by François Chollet, Deep Learning with Python:

Keras is a model-level library, providing high-level building blocks for developing deep-learning models. It doesn’t handle low-level operations such as tensor manipulation and differentiation. Instead, it relies on a specialized, well-optimized tensor library to do so, serving as the backend engine of Keras (Source)

It is a great way to start experimenting with neural networks without having to implement every layer and piece on your own. For example, TensorFlow is a great machine learning library, but you have to write a lot of boilerplate code to get a model running.

First Keras Model

Keras supports two main types of models: the Sequential model API and the functional API, which can do everything the Sequential model can do but can also be used for advanced models with complex network architectures.

The Sequential model is a linear stack of layers, where you can use the large variety of layers available in Keras. The most common layer is the Dense layer, which is your regular densely connected neural network layer with all the weights and biases that you are already familiar with.

Before we build our model, we need to know the input dimension of our feature vectors. This only matters for the first layer, since the following layers can do automatic shape inference. To build the Sequential model, you add layers one by one:

input_dim = X_train.shape[1]  # Number of features

model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
model.summary()
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense (Dense)               (None, 10)                25060     
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
=================================================================
Total params: 25,071
Trainable params: 25,071
Non-trainable params: 0
_________________________________________________________________
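
For comparison, here is an illustrative sketch (not part of the original notebook) of how the same two-layer network could be written with the functional API mentioned earlier:

from tensorflow.keras import Model, Input

inputs = Input(shape=(input_dim,))                      # same number of features as above
hidden = layers.Dense(10, activation='relu')(inputs)
outputs = layers.Dense(1, activation='sigmoid')(hidden)
functional_model = Model(inputs, outputs)
functional_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Training below continues with the Sequential model defined above: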
history = model.fit(X_train, y_train,
                    epochs=100,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=10)
Epoch 1/100
57/57 [==============================] - 1s 8ms/step - loss: 0.6894 - accuracy: 0.5544 - val_loss: 0.6889 - val_accuracy: 0.5401
Epoch 2/100
57/57 [==============================] - 0s 2ms/step - loss: 0.6278 - accuracy: 0.7807 - val_loss: 0.6508 - val_accuracy: 0.6524
Epoch 3/100
57/57 [==============================] - 0s 1ms/step - loss: 0.5480 - accuracy: 0.8752 - val_loss: 0.6624 - val_accuracy: 0.7112
Epoch 4/100
57/57 [==============================] - 0s 1ms/step - loss: 0.4529 - accuracy: 0.9305 - val_loss: 0.5891 - val_accuracy: 0.7540
Epoch 5/100
57/57 [==============================] - 0s 1ms/step - loss: 0.3649 - accuracy: 0.9590 - val_loss: 0.5773 - val_accuracy: 0.7647
Epoch 6/100
57/57 [==============================] - 0s 2ms/step - loss: 0.2926 - accuracy: 0.9750 - val_loss: 0.5517 - val_accuracy: 0.7647
Epoch 7/100
57/57 [==============================] - 0s 2ms/step - loss: 0.2372 - accuracy: 0.9875 - val_loss: 0.5524 - val_accuracy: 0.7754
Epoch 8/100
57/57 [==============================] - 0s 4ms/step - loss: 0.1943 - accuracy: 0.9911 - val_loss: 0.5472 - val_accuracy: 0.7754
Epoch 9/100
57/57 [==============================] - 0s 3ms/step - loss: 0.1612 - accuracy: 0.9911 - val_loss: 0.5165 - val_accuracy: 0.7914
Epoch 10/100
57/57 [==============================] - 0s 2ms/step - loss: 0.1352 - accuracy: 0.9929 - val_loss: 0.5256 - val_accuracy: 0.7861
Epoch 11/100
57/57 [==============================] - 0s 2ms/step - loss: 0.1149 - accuracy: 0.9982 - val_loss: 0.5253 - val_accuracy: 0.7861
Epoch 12/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0987 - accuracy: 0.9982 - val_loss: 0.5190 - val_accuracy: 0.7914
Epoch 13/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0854 - accuracy: 0.9982 - val_loss: 0.5220 - val_accuracy: 0.7968
Epoch 14/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0744 - accuracy: 0.9982 - val_loss: 0.5348 - val_accuracy: 0.8021
Epoch 15/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0654 - accuracy: 0.9982 - val_loss: 0.5236 - val_accuracy: 0.8021
Epoch 16/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0579 - accuracy: 0.9982 - val_loss: 0.5420 - val_accuracy: 0.7914
Epoch 17/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0515 - accuracy: 0.9982 - val_loss: 0.5293 - val_accuracy: 0.7968
Epoch 18/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0460 - accuracy: 0.9982 - val_loss: 0.5447 - val_accuracy: 0.7968
Epoch 19/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0412 - accuracy: 0.9982 - val_loss: 0.5415 - val_accuracy: 0.8021
Epoch 20/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0373 - accuracy: 0.9982 - val_loss: 0.5560 - val_accuracy: 0.7968
Epoch 21/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0340 - accuracy: 0.9982 - val_loss: 0.5582 - val_accuracy: 0.7914
Epoch 22/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0308 - accuracy: 0.9982 - val_loss: 0.5587 - val_accuracy: 0.7968
Epoch 23/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0280 - accuracy: 0.9982 - val_loss: 0.5592 - val_accuracy: 0.8021
Epoch 24/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0256 - accuracy: 0.9982 - val_loss: 0.5739 - val_accuracy: 0.7968
Epoch 25/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0235 - accuracy: 0.9982 - val_loss: 0.5764 - val_accuracy: 0.7968
Epoch 26/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0216 - accuracy: 0.9982 - val_loss: 0.5821 - val_accuracy: 0.7968
Epoch 27/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0200 - accuracy: 0.9982 - val_loss: 0.5914 - val_accuracy: 0.7968
Epoch 28/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0185 - accuracy: 0.9982 - val_loss: 0.5985 - val_accuracy: 0.7968
Epoch 29/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0172 - accuracy: 0.9982 - val_loss: 0.6061 - val_accuracy: 0.7914
Epoch 30/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0159 - accuracy: 0.9982 - val_loss: 0.6023 - val_accuracy: 0.7914
Epoch 31/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0149 - accuracy: 0.9982 - val_loss: 0.6047 - val_accuracy: 0.7914
Epoch 32/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0138 - accuracy: 0.9982 - val_loss: 0.6179 - val_accuracy: 0.7861
Epoch 33/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0129 - accuracy: 1.0000 - val_loss: 0.6271 - val_accuracy: 0.7861
Epoch 34/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0120 - accuracy: 1.0000 - val_loss: 0.6367 - val_accuracy: 0.7861
Epoch 35/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0112 - accuracy: 1.0000 - val_loss: 0.6351 - val_accuracy: 0.7861
Epoch 36/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0105 - accuracy: 1.0000 - val_loss: 0.6384 - val_accuracy: 0.7861
Epoch 37/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0098 - accuracy: 1.0000 - val_loss: 0.6393 - val_accuracy: 0.7861
Epoch 38/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0093 - accuracy: 1.0000 - val_loss: 0.6434 - val_accuracy: 0.7861
Epoch 39/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0087 - accuracy: 1.0000 - val_loss: 0.6580 - val_accuracy: 0.7861
Epoch 40/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0082 - accuracy: 1.0000 - val_loss: 0.6609 - val_accuracy: 0.7861
Epoch 41/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0077 - accuracy: 1.0000 - val_loss: 0.6689 - val_accuracy: 0.7861
Epoch 42/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0073 - accuracy: 1.0000 - val_loss: 0.6639 - val_accuracy: 0.7861
Epoch 43/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0068 - accuracy: 1.0000 - val_loss: 0.6809 - val_accuracy: 0.7914
Epoch 44/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0065 - accuracy: 1.0000 - val_loss: 0.6665 - val_accuracy: 0.7914
Epoch 45/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0061 - accuracy: 1.0000 - val_loss: 0.6821 - val_accuracy: 0.7914
Epoch 46/100
57/57 [==============================] - 0s 3ms/step - loss: 0.0058 - accuracy: 1.0000 - val_loss: 0.6909 - val_accuracy: 0.7914
Epoch 47/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0055 - accuracy: 1.0000 - val_loss: 0.6952 - val_accuracy: 0.7914
Epoch 48/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0052 - accuracy: 1.0000 - val_loss: 0.6963 - val_accuracy: 0.7914
Epoch 49/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0049 - accuracy: 1.0000 - val_loss: 0.7048 - val_accuracy: 0.7914
Epoch 50/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0047 - accuracy: 1.0000 - val_loss: 0.7089 - val_accuracy: 0.7861
Epoch 51/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0044 - accuracy: 1.0000 - val_loss: 0.7178 - val_accuracy: 0.7861
Epoch 52/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0042 - accuracy: 1.0000 - val_loss: 0.7227 - val_accuracy: 0.7861
Epoch 53/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0040 - accuracy: 1.0000 - val_loss: 0.7294 - val_accuracy: 0.7861
Epoch 54/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0038 - accuracy: 1.0000 - val_loss: 0.7422 - val_accuracy: 0.7861
Epoch 55/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0036 - accuracy: 1.0000 - val_loss: 0.7392 - val_accuracy: 0.7861
Epoch 56/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0035 - accuracy: 1.0000 - val_loss: 0.7433 - val_accuracy: 0.7861
Epoch 57/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0033 - accuracy: 1.0000 - val_loss: 0.7461 - val_accuracy: 0.7861
Epoch 58/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0031 - accuracy: 1.0000 - val_loss: 0.7419 - val_accuracy: 0.7861
Epoch 59/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0030 - accuracy: 1.0000 - val_loss: 0.7671 - val_accuracy: 0.7807
Epoch 60/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0029 - accuracy: 1.0000 - val_loss: 0.7671 - val_accuracy: 0.7807
Epoch 61/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0027 - accuracy: 1.0000 - val_loss: 0.7679 - val_accuracy: 0.7807
Epoch 62/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0026 - accuracy: 1.0000 - val_loss: 0.7797 - val_accuracy: 0.7807
Epoch 63/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0025 - accuracy: 1.0000 - val_loss: 0.7966 - val_accuracy: 0.7807
Epoch 64/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0024 - accuracy: 1.0000 - val_loss: 0.8105 - val_accuracy: 0.7807
Epoch 65/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0023 - accuracy: 1.0000 - val_loss: 0.8128 - val_accuracy: 0.7807
Epoch 66/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0022 - accuracy: 1.0000 - val_loss: 0.8040 - val_accuracy: 0.7807
Epoch 67/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0021 - accuracy: 1.0000 - val_loss: 0.8085 - val_accuracy: 0.7807
Epoch 68/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0020 - accuracy: 1.0000 - val_loss: 0.8165 - val_accuracy: 0.7807
Epoch 69/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0019 - accuracy: 1.0000 - val_loss: 0.8230 - val_accuracy: 0.7807
Epoch 70/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0018 - accuracy: 1.0000 - val_loss: 0.8232 - val_accuracy: 0.7807
Epoch 71/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0018 - accuracy: 1.0000 - val_loss: 0.8224 - val_accuracy: 0.7807
Epoch 72/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0017 - accuracy: 1.0000 - val_loss: 0.8332 - val_accuracy: 0.7807
Epoch 73/100
57/57 [==============================] - 0s 5ms/step - loss: 0.0016 - accuracy: 1.0000 - val_loss: 0.8399 - val_accuracy: 0.7807
Epoch 74/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0016 - accuracy: 1.0000 - val_loss: 0.8415 - val_accuracy: 0.7807
Epoch 75/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0015 - accuracy: 1.0000 - val_loss: 0.8530 - val_accuracy: 0.7807
Epoch 76/100
57/57 [==============================] - 0s 3ms/step - loss: 0.0014 - accuracy: 1.0000 - val_loss: 0.8581 - val_accuracy: 0.7807
Epoch 77/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0014 - accuracy: 1.0000 - val_loss: 0.8545 - val_accuracy: 0.7807
Epoch 78/100
57/57 [==============================] - 0s 1ms/step - loss: 0.0013 - accuracy: 1.0000 - val_loss: 0.8605 - val_accuracy: 0.7807
Epoch 79/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0013 - accuracy: 1.0000 - val_loss: 0.8688 - val_accuracy: 0.7807
Epoch 80/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0012 - accuracy: 1.0000 - val_loss: 0.8766 - val_accuracy: 0.7807
Epoch 81/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0012 - accuracy: 1.0000 - val_loss: 0.8799 - val_accuracy: 0.7807
Epoch 82/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0011 - accuracy: 1.0000 - val_loss: 0.8800 - val_accuracy: 0.7807
Epoch 83/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0011 - accuracy: 1.0000 - val_loss: 0.8863 - val_accuracy: 0.7807
Epoch 84/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0010 - accuracy: 1.0000 - val_loss: 0.8866 - val_accuracy: 0.7807
Epoch 85/100
57/57 [==============================] - 0s 2ms/step - loss: 0.0010 - accuracy: 1.0000 - val_loss: 0.8925 - val_accuracy: 0.7807
Epoch 86/100
57/57 [==============================] - 0s 2ms/step - loss: 9.6562e-04 - accuracy: 1.0000 - val_loss: 0.9019 - val_accuracy: 0.7807
Epoch 87/100
57/57 [==============================] - 0s 1ms/step - loss: 9.2877e-04 - accuracy: 1.0000 - val_loss: 0.9003 - val_accuracy: 0.7807
Epoch 88/100
57/57 [==============================] - 0s 2ms/step - loss: 8.9533e-04 - accuracy: 1.0000 - val_loss: 0.9103 - val_accuracy: 0.7807
Epoch 89/100
57/57 [==============================] - 0s 1ms/step - loss: 8.6348e-04 - accuracy: 1.0000 - val_loss: 0.9125 - val_accuracy: 0.7807
Epoch 90/100
57/57 [==============================] - 0s 2ms/step - loss: 8.3003e-04 - accuracy: 1.0000 - val_loss: 0.9177 - val_accuracy: 0.7807
Epoch 91/100
57/57 [==============================] - 0s 2ms/step - loss: 7.9647e-04 - accuracy: 1.0000 - val_loss: 0.9177 - val_accuracy: 0.7807
Epoch 92/100
57/57 [==============================] - 0s 2ms/step - loss: 7.7104e-04 - accuracy: 1.0000 - val_loss: 0.9197 - val_accuracy: 0.7861
Epoch 93/100
57/57 [==============================] - 0s 2ms/step - loss: 7.4279e-04 - accuracy: 1.0000 - val_loss: 0.9226 - val_accuracy: 0.7861
Epoch 94/100
57/57 [==============================] - 0s 2ms/step - loss: 7.1501e-04 - accuracy: 1.0000 - val_loss: 0.9314 - val_accuracy: 0.7861
Epoch 95/100
57/57 [==============================] - 0s 1ms/step - loss: 6.9016e-04 - accuracy: 1.0000 - val_loss: 0.9365 - val_accuracy: 0.7807
Epoch 96/100
57/57 [==============================] - 0s 2ms/step - loss: 6.6338e-04 - accuracy: 1.0000 - val_loss: 0.9384 - val_accuracy: 0.7807
Epoch 97/100
57/57 [==============================] - 0s 2ms/step - loss: 6.4135e-04 - accuracy: 1.0000 - val_loss: 0.9403 - val_accuracy: 0.7861
Epoch 98/100
57/57 [==============================] - 0s 2ms/step - loss: 6.1557e-04 - accuracy: 1.0000 - val_loss: 0.9418 - val_accuracy: 0.7861
Epoch 99/100
57/57 [==============================] - 0s 2ms/step - loss: 5.9639e-04 - accuracy: 1.0000 - val_loss: 0.9459 - val_accuracy: 0.7861
Epoch 100/100
57/57 [==============================] - 0s 1ms/step - loss: 5.7576e-04 - accuracy: 1.0000 - val_loss: 0.9494 - val_accuracy: 0.7861
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
Training Accuracy: 1.0000
Testing Accuracy:  0.7861
history.history.keys()
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])
def plot_history(history):
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
plot_history(history)

What Is a Word Embedding?

Text is considered a form of sequence data, similar to the time series data that you would have in weather or financial data. Now you will see how to represent each word as a vector. There are various ways to vectorize text, such as:

  • Words: each word is represented as a vector
  • Characters: each character is represented as a vector
  • N-grams of words/characters represented as a vector (N-grams are overlapping groups of multiple consecutive words/characters in the text)

Here, you'll see how to represent words as vectors, which is the most common way to use text in neural networks. Two possible ways to represent a word as a vector are one-hot encoding and word embeddings.

One-Hot Encoding

The first way to represent a word as a vector is to create a so-called one-hot encoding, which is done by taking a vector whose length equals the size of the vocabulary, with one entry for each word in the corpus.

In this way, each word (given that it has a spot in the vocabulary) gets a vector that is zero everywhere except at the spot corresponding to that word, which is set to one.

cities = ['London', 'Berlin', 'Berlin', 'New York', 'London']
cities
['London', 'Berlin', 'Berlin', 'New York', 'London']

Use LabelEncoder to encode the list of cities into categorical integer values:

encoder = LabelEncoder()
city_labels = encoder.fit_transform(cities)
city_labels
array([1, 0, 0, 2, 1])

OneHotEncoder expects each categorical value to be in a separate row, so you’ll need to reshape the array, then you can apply the encoder:

encoder = OneHotEncoder(sparse=False)
city_labels = city_labels.reshape((5, 1))
encoder.fit_transform(city_labels)
array([[0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

Word Embeddings

This method represents words as dense word vectors (also called word embeddings) that are learned from data, unlike one-hot encodings, which are hardcoded. This means that word embeddings pack more information into fewer dimensions.

Note that the word embeddings do not understand the text as a human would, but they rather map the statistical structure of the language used in the corpus. Their aim is to map semantic meaning into a geometric space. This geometric space is then called the embedding space.
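
As a quick illustrative sketch (not part of the original notebook), a Keras Embedding layer simply maps integer word ids to trainable dense vectors; the sizes here are arbitrary:

emb = layers.Embedding(input_dim=10, output_dim=4)   # vocabulary of 10 ids, 4-dimensional vectors
print(emb(tf.constant([[1, 2, 3]])).shape)           # (1, 3, 4): one dense vector per word id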

Now you need to tokenize the data into a format that can be used by the word embeddings. Keras offers a couple of convenience methods for text preprocessing and sequence preprocessing which you can employ to prepare your text.

You can start by using the Tokenizer utility class, which can vectorize a text corpus into a list of integers. Each integer maps to a value in a dictionary that encodes the entire corpus, with the keys in the dictionary being the vocabulary terms themselves. You can add the parameter num_words, which is responsible for setting the size of the vocabulary; the num_words most common words will then be kept.

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)

X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

print(sentences_train[2])
print(X_train[2])
I am a fan of his ... This movie sucked really bad.  
[7, 150, 2, 932, 4, 49, 6, 11, 563, 45, 30]
for word in ['the', 'all','fan']:
    print('{}: {}'.format(word, tokenizer.word_index[word]))
the: 1
all: 27
fan: 932

The sequences have different lengths, so pad them with Keras to the same length:

maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

print(X_train[0, :])
[170 116 390  35   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0]

Keras Embedding Layer

Now you can use the Embedding Layer of Keras which takes the previously calculated integers and maps them to a dense vector of the embedding. You will need the following parameters:

  • input_dim: the size of the vocabulary
  • output_dim: the size of the dense vector
  • input_length: the length of the sequence

With the Embedding layer we now have a couple of options. One way would be to take the output of the embedding layer and plug it into a Dense layer. In order to do this you have to add a Flatten layer in between, which prepares the sequential output of the embedding for the Dense layer:

embedding_dim = 50

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding (Embedding)       (None, 100, 50)           128750    
                                                                 
 flatten (Flatten)           (None, 5000)              0         
                                                                 
 dense_2 (Dense)             (None, 10)                50010     
                                                                 
 dense_3 (Dense)             (None, 1)                 11        
                                                                 
=================================================================
Total params: 178,771
Trainable params: 178,771
Non-trainable params: 0
_________________________________________________________________
history = model.fit(X_train, y_train,
                    epochs=20,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history)
Epoch 1/20
57/57 [==============================] - 1s 4ms/step - loss: 0.6931 - accuracy: 0.5116 - val_loss: 0.7035 - val_accuracy: 0.4920
Epoch 2/20
57/57 [==============================] - 0s 5ms/step - loss: 0.6594 - accuracy: 0.6132 - val_loss: 0.6850 - val_accuracy: 0.5401
Epoch 3/20
57/57 [==============================] - 0s 3ms/step - loss: 0.5408 - accuracy: 0.8948 - val_loss: 0.6708 - val_accuracy: 0.5829
Epoch 4/20
57/57 [==============================] - 0s 4ms/step - loss: 0.3138 - accuracy: 0.9626 - val_loss: 0.6345 - val_accuracy: 0.6257
Epoch 5/20
57/57 [==============================] - 0s 3ms/step - loss: 0.1500 - accuracy: 0.9857 - val_loss: 0.6239 - val_accuracy: 0.6417
Epoch 6/20
57/57 [==============================] - 0s 3ms/step - loss: 0.0745 - accuracy: 0.9947 - val_loss: 0.6313 - val_accuracy: 0.6364
Epoch 7/20
57/57 [==============================] - 0s 3ms/step - loss: 0.0420 - accuracy: 0.9982 - val_loss: 0.6526 - val_accuracy: 0.6684
Epoch 8/20
57/57 [==============================] - 0s 3ms/step - loss: 0.0263 - accuracy: 1.0000 - val_loss: 0.7150 - val_accuracy: 0.6417
Epoch 9/20
57/57 [==============================] - 0s 3ms/step - loss: 0.0186 - accuracy: 1.0000 - val_loss: 0.6624 - val_accuracy: 0.6524
Epoch 10/20
57/57 [==============================] - 0s 3ms/step - loss: 0.0127 - accuracy: 1.0000 - val_loss: 0.6743 - val_accuracy: 0.6471
Epoch 11/20
57/57 [==============================] - 0s 3ms/step - loss: 0.0096 - accuracy: 1.0000 - val_loss: 0.6854 - val_accuracy: 0.6417
Epoch 12/20
57/57 [==============================] - 0s 3ms/step - loss: 0.0071 - accuracy: 1.0000 - val_loss: 0.6970 - val_accuracy: 0.6524
Epoch 13/20
57/57 [==============================] - 0s 7ms/step - loss: 0.0056 - accuracy: 1.0000 - val_loss: 0.7079 - val_accuracy: 0.6578
Epoch 14/20
57/57 [==============================] - 0s 4ms/step - loss: 0.0046 - accuracy: 1.0000 - val_loss: 0.7192 - val_accuracy: 0.6578
Epoch 15/20
57/57 [==============================] - 0s 3ms/step - loss: 0.0038 - accuracy: 1.0000 - val_loss: 0.7323 - val_accuracy: 0.6524
Epoch 16/20
57/57 [==============================] - 0s 3ms/step - loss: 0.0031 - accuracy: 1.0000 - val_loss: 0.7446 - val_accuracy: 0.6524
Epoch 17/20
57/57 [==============================] - 0s 3ms/step - loss: 0.0026 - accuracy: 1.0000 - val_loss: 0.7488 - val_accuracy: 0.6524
Epoch 18/20
57/57 [==============================] - 0s 3ms/step - loss: 0.0023 - accuracy: 1.0000 - val_loss: 0.7617 - val_accuracy: 0.6471
Epoch 19/20
57/57 [==============================] - 0s 3ms/step - loss: 0.0020 - accuracy: 1.0000 - val_loss: 0.7644 - val_accuracy: 0.6578
Epoch 20/20
57/57 [==============================] - 0s 3ms/step - loss: 0.0017 - accuracy: 1.0000 - val_loss: 0.7716 - val_accuracy: 0.6524
Training Accuracy: 1.0000
Testing Accuracy:  0.6524

Global max/average pooling takes the maximum/average over the whole sequence for each feature, whereas with regular (local) pooling you have to define a pool size. Keras again provides its own layer that you can add to the Sequential model.
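
A quick illustrative shape check (not part of the original notebook) makes the difference concrete; the sizes are assumptions based on the maxlen and embedding_dim used above:

x = tf.random.normal((1, 100, 50))                        # (batch, steps, features)
print(layers.MaxPooling1D(pool_size=2)(x).shape)          # (1, 50, 50): local pooling halves the steps
print(layers.GlobalMaxPooling1D()(x).shape)               # (1, 50): global pooling collapses all steps

Now add GlobalMaxPool1D to the model: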

embedding_dim = 50

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding_1 (Embedding)     (None, 100, 50)           128750    
                                                                 
 global_max_pooling1d (Globa  (None, 50)               0         
 lMaxPooling1D)                                                  
                                                                 
 dense_4 (Dense)             (None, 10)                510       
                                                                 
 dense_5 (Dense)             (None, 1)                 11        
                                                                 
=================================================================
Total params: 129,271
Trainable params: 129,271
Non-trainable params: 0
_________________________________________________________________
history = model.fit(X_train, y_train,
                    epochs=50,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history)
Training Accuracy: 1.0000
Testing Accuracy:  0.7594

Convolutional Neural Networks (CNN)

Convolutional neural networks, also called convnets, are one of the most exciting developments in machine learning in recent years.

They have revolutionized image classification and computer vision by being able to extract features from images and use them in neural networks. The properties that make them useful in image processing also make them handy for sequence processing. You can imagine a CNN as a specialized neural network that is able to detect specific patterns.

A CNN has hidden layers called convolutional layers. When you think of images, a computer has to deal with a two-dimensional matrix of numbers, and therefore you need some way to detect features in this matrix. Convolutional layers are able to detect edges, corners and other kinds of textures, which makes them such a special tool. A convolutional layer consists of multiple filters which are slid across the image and are able to detect specific features.
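
As an illustrative shape check (not part of the original notebook), a Conv1D layer with 128 filters and a kernel size of 5 slides over a 100-step embedded sequence and produces one output per window position:

x = tf.random.normal((1, 100, 100))                        # (batch, steps, embedding_dim)
print(layers.Conv1D(128, 5, activation='relu')(x).shape)   # (1, 96, 128): 100 - 5 + 1 window positions

The model below places exactly this layer between the embedding and a global pooling layer: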

embedding_dim = 100

model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()
Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding_2 (Embedding)     (None, 100, 100)          257500    
                                                                 
 conv1d (Conv1D)             (None, 96, 128)           64128     
                                                                 
 global_max_pooling1d_1 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_6 (Dense)             (None, 10)                1290      
                                                                 
 dense_7 (Dense)             (None, 1)                 11        
                                                                 
=================================================================
Total params: 322,929
Trainable params: 322,929
Non-trainable params: 0
_________________________________________________________________
history = model.fit(X_train, y_train,
                    epochs=10,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history)
Training Accuracy: 1.0000
Testing Accuracy:  0.7807

Hyperparameters Optimization

One crucial step of deep learning and working with neural networks is hyperparameter optimization.

As you saw in the models we have used so far, even the simpler ones have a large number of parameters to tweak and choose from. Those parameters are called hyperparameters. This is one of the most time-consuming parts of machine learning, and sadly there are no one-size-fits-all solutions.

One popular method for hyperparameter optimization is grid search: it takes lists of parameter values and runs the model with every combination it can form. It is the most thorough way but also the most computationally heavy. Another common method, random search, which you'll see in action here, simply samples random combinations of parameters.
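
A small illustrative sketch (not part of the original run) of how many configurations each method tries, using scikit-learn's helper classes:

from sklearn.model_selection import ParameterGrid, ParameterSampler

params = {'num_filters': [32, 64, 128], 'kernel_size': [3, 5, 7]}
print(len(list(ParameterGrid(params))))                               # grid search: 9 combinations
print(len(list(ParameterSampler(params, n_iter=5, random_state=0))))  # random search: 5 sampled combinations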

In order to apply random search with Keras, you will need to use KerasClassifier, which serves as a wrapper for the scikit-learn API. With this wrapper you are able to use the various tools available in scikit-learn, like cross-validation. The class that you need is RandomizedSearchCV, which implements random search with cross-validation. Cross-validation is a way to validate the model by splitting the whole data set into multiple training and testing sets.

There are various types of cross-validation. One type is k-fold cross-validation, in which the data set is partitioned into k equal-sized sets, where one set is used for testing and the rest are used for training. This lets you do k different runs, where each partition is used once as the testing set. The higher k is, the more reliable the model evaluation, but the smaller each testing set is.
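
As an illustrative sketch (not part of the original run), scikit-learn's KFold shows how the partitioning works for k=4:

from sklearn.model_selection import KFold

data = np.arange(8).reshape(-1, 1)   # 8 toy samples
for fold, (train_idx, test_idx) in enumerate(KFold(n_splits=4).split(data)):
    print(fold, train_idx, test_idx)  # each sample lands in exactly one test fold

In the search below, RandomizedSearchCV handles this splitting internally via its cv parameter.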

def create_model(num_filters, kernel_size, vocab_size, embedding_dim, maxlen):
    model = Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
    model.add(layers.Conv1D(num_filters, kernel_size, activation='relu'))
    model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model
param_grid = dict(num_filters=[32, 64, 128],
                  kernel_size=[3, 5, 7],
                  vocab_size=[5000], 
                  embedding_dim=[50],
                  maxlen=[100])
epochs = 20
embedding_dim = 50
maxlen = 100
output_file = 'output.txt'

# Run the random search for each source (yelp, amazon, imdb)
for source, frame in df.groupby('source'):
    print('Running grid search for data set :', source)
    sentences = frame['sentence'].values
    y = frame['label'].values

    # Train-test split
    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    # Tokenize words
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(sentences_train)
    X_train = tokenizer.texts_to_sequences(sentences_train)
    X_test = tokenizer.texts_to_sequences(sentences_test)

    # Adding 1 because of reserved 0 index
    vocab_size = len(tokenizer.word_index) + 1

    # Pad sequences with zeros
    X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
    X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

    # Parameter grid for the random search
    param_grid = dict(num_filters=[32, 64, 128],
                      kernel_size=[3, 5, 7],
                      vocab_size=[vocab_size],
                      embedding_dim=[embedding_dim],
                      maxlen=[maxlen])
    model = KerasClassifier(build_fn=create_model,
                            epochs=epochs, batch_size=10,
                            verbose=False)
    grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                              cv=4, verbose=1, n_iter=5)
    grid_result = grid.fit(X_train, y_train)

    # Evaluate testing set
    test_accuracy = grid.score(X_test, y_test)

    # Save and evaluate results
#     prompt = input(f'finished {source}; write to file and proceed? [y/n]')
#     if prompt.lower() not in {'y', 'true', 'yes'}:
#         break
#     with open(output_file, 'w+') as f:
    s = ('Running {} data set\nBest Accuracy : '
             '{:.4f}\n{}\nTest Accuracy : {:.4f}\n\n')
    output_string = s.format(
        source,
        grid_result.best_score_,
        grid_result.best_params_,
        test_accuracy)
    print(output_string)
#         f.write(output_string)
Running grid search for data set : amazon
Fitting 4 folds for each of 5 candidates, totalling 20 fits
/var/folders/lm/tqm129c507j4njpg3k8cmqf40000gp/T/ipykernel_98623/297795620.py:36: DeprecationWarning: KerasClassifier is deprecated, use Sci-Keras (https://github.com/adriangb/scikeras) instead. See https://www.adriangb.com/scikeras/stable/migration.html for help migrating.
  model = KerasClassifier(build_fn=create_model,
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  3.2min finished
Running amazon data set
Best Accuracy : 0.8113
{'vocab_size': 4603, 'num_filters': 32, 'maxlen': 100, 'kernel_size': 3, 'embedding_dim': 50}
Test Accuracy : 0.8399


Running grid search for data set : imdb
Fitting 4 folds for each of 5 candidates, totalling 20 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  3.2min finished
Running imdb data set
Best Accuracy : 0.8142
{'vocab_size': 4603, 'num_filters': 32, 'maxlen': 100, 'kernel_size': 5, 'embedding_dim': 50}
Test Accuracy : 0.8341


Running grid search for data set : yelp
Fitting 4 folds for each of 5 candidates, totalling 20 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  3.6min finished
Running yelp data set
Best Accuracy : 0.8142
{'vocab_size': 4603, 'num_filters': 32, 'maxlen': 100, 'kernel_size': 7, 'embedding_dim': 50}
Test Accuracy : 0.8297