Emotional Tweets using NLP (TensorFlow)

TensorFlow, Keras

Emotion Detection from Tweets using Natural Language Processing (NLP) and TensorFlow

This project focuses on building an intelligent system that analyzes Twitter posts and identifies the emotions expressed in them. Leveraging Natural Language Processing (NLP) techniques and the deep learning capabilities of TensorFlow, the goal is to train a model that classifies short texts into categories. As a stand-in labelled corpus, the walkthrough below trains on the built-in AG News dataset, which poses the same shape of problem: short texts mapped to one of four classes.

Step 1: Importing the required packages

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset  # the old 'nlp' package was renamed to 'datasets'
import random

Step 2: Defining the required helper functions up front

def show_history(h):
    # Plot training/validation accuracy and loss curves from a Keras History object
    epochs_trained = len(h.history['loss'])
    plt.figure(figsize=(16, 6))

    plt.subplot(1, 2, 1)
    plt.plot(range(0, epochs_trained), h.history.get('accuracy'), label='Training')
    plt.plot(range(0, epochs_trained), h.history.get('val_accuracy'), label='Validation')
    plt.ylim([0., 1.])
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(range(0, epochs_trained), h.history.get('loss'), label='Training')
    plt.plot(range(0, epochs_trained), h.history.get('val_loss'), label='Validation')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()


def show_confusion_matrix(y_true, y_pred, classes):
    # Display a row-normalized confusion matrix as a heat map
    from sklearn.metrics import confusion_matrix

    cm = confusion_matrix(y_true, y_pred, normalize='true')

    plt.figure(figsize=(8, 8))
    sp = plt.subplot(1, 1, 1)
    ctx = sp.matshow(cm)
    plt.xticks(list(range(len(classes))), labels=classes)
    plt.yticks(list(range(len(classes))), labels=classes)
    plt.colorbar(ctx)
    plt.show()


print('Using TensorFlow version', tf.__version__)

Step 3: Importing the dataset. The built-in AG News dataset is used for practising the NLP pipeline; its four labels are news topics rather than emotions, but the classification workflow is identical.

!pip install -U datasets
dataset = load_dataset('ag_news')
dataset
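Since the AG News labels are plain integers, it is worth checking what they stand for. A quick look, assuming the Hugging Face datasets API (each split exposes its labels as a ClassLabel feature):

print(dataset['train'].features['label'].names)
# ['World', 'Sports', 'Business', 'Sci/Tech']

This is useful to keep in mind later, when predictions print as bare integers.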

Step 4: Separating the training and test splits (a dedicated validation split is intentionally not used)

train = dataset['train']
test = dataset['test']

Step 5: Defining the get_tweet function, which extracts the raw text from a split, making the train and test sets easy to work with

def get_tweet(data):
  tweets = [x['text'] for x in data]
  return tweets
train_tweets = get_tweet(train)
test_tweets = get_tweet(test)
train_tweets[0]
Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.
test_tweets[0]
Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.

Step 6: Creating the tokenizer, which builds the vocabulary and maps each word to an integer index (an essential step for any NLP model)

from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=10000, oov_token='<UNK>')
tokenizer.fit_on_texts(train_tweets)
train_tweets[0]
Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
tokenizer.texts_to_sequences([train_tweets[0]])  # note the list: a bare string would be tokenized character by character
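A toy sanity check on the tokenizer (the sentence below is made up, not from the dataset): words seen during fit_on_texts map to their learned indices, while anything unseen falls back to the '<UNK>' token, which Keras reserves at index 1.

print(tokenizer.word_index['<UNK>'])   # 1, the reserved out-of-vocabulary index
print(tokenizer.texts_to_sequences(['the market fell on zzzxqy']))   # unseen 'zzzxqy' becomes 1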

Step 7: Padding the sequences. A maximum sequence length is chosen from the distribution of tweet lengths; shorter sequences are zero-padded up to it and longer ones are truncated.

lengths_train = [len(t.split()) for t in train_tweets]
plt.hist(lengths_train)
plt.show()
max_length = 80

from tensorflow.keras.preprocessing.sequence import pad_sequences
def get_sequences(tokenizer, tweets):
  sequences = tokenizer.texts_to_sequences(tweets)
  padded_sequences = pad_sequences(sequences, truncating='post', maxlen=max_length)
  return padded_sequences
padded_train_sequences = get_sequences(tokenizer, train_tweets)
padded_test_sequences = get_sequences(tokenizer, test_tweets)
padded_train_sequences[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    1, 1100,  878,
       1304, 4246,   22,   22,  921,  812,  353,    1,  100,  103,   23,
          4, 4522,    9,  509,  510,    1,    7,    1, 1521, 2177,    6,
          2,  531,  248,   23, 3937, 2294,   16, 6561,    8,  213,  369,
          5,    2,  129], dtype=int32)
padded_test_sequences[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    2,  481,   17,    8,  106,
        921,  117, 1098,  440, 1779,    9,  733,    1,  184,   90,  184,
         90,  747,  615,    4,  106,  117,    5,    1, 4030,    9,    2,
        408,  132,   78, 6145, 1340, 1079,    4, 3616,    9, 3374, 6112,
          1,  184,  802,   23, 2497,  148,    2,   40,  440, 1779,    9,
         16, 4416, 1174], dtype=int32)
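As an aside, the leading zeros above come from pad_sequences' default 'pre' padding, while truncating='post' trims over-long tweets from the end. A toy illustration (inputs invented, not from the dataset):

print(pad_sequences([[5, 6, 7]], maxlen=5))
# [[0 0 5 6 7]]  <- zero-padded at the front by default
print(pad_sequences([[1, 2, 3, 4, 5, 6]], maxlen=5, truncating='post'))
# [[1 2 3 4 5]]  <- excess tokens dropped from the end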

Step 8: Creating the labels. In AG News the labels are already the integers 0-3, so the mappings below turn out to be the identity; the same pattern generalizes to datasets with string class names.

train_classes = set(train['label'])
train_classes
{0, 1, 2, 3}

test_classes = set(test['label'])
test_classes
{0, 1, 2, 3}

plt.hist(train['label'], bins=20)
plt.show()
plt.hist(test['label'], bins=20)
plt.show()
class_to_index = {c: i for i, c in enumerate(train_classes)}
class_to_index
{0: 0, 1: 1, 2: 2, 3: 3}
index_to_class = {v: k for k, v in class_to_index.items()}
index_to_class
{0: 0, 1: 1, 2: 2, 3: 3}

names_to_ids = lambda labels: np.array([class_to_index.get(x) for x in labels])

train_labels = names_to_ids(train['label'])
test_labels = names_to_ids(test['label'])
train_labels[0]
test_labels[0]
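The mapping above is the identity only because AG News ships integer labels. With string labels, the exact same pattern does real work; a hypothetical sketch (emotion names invented for illustration):

emotions = ['joy', 'sadness', 'anger', 'fear']   # hypothetical class names
emo_to_index = {c: i for i, c in enumerate(emotions)}
print(np.array([emo_to_index.get(x) for x in ['anger', 'joy']]))   # [2 0]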

Step 9: Creating the model

model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(10000, 16),                     # vocabulary size matches the tokenizer's num_words
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),  # reads each sequence in both directions
    tf.keras.layers.Dense(4, activation='softmax')            # one output per class
])

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])
model.summary()
Model: "sequential_5"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ embedding_5 (Embedding)         │ ?                      │   0 (unbuilt) │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ lstm_9 (LSTM)                   │ ?                      │   0 (unbuilt) │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ bidirectional_4 (Bidirectional) │ ?                      │   0 (unbuilt) │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_5 (Dense)                 │ ?                      │   0 (unbuilt) │
└─────────────────────────────────┴────────────────────────┴───────────────┘
 Total params: 0 (0.00 B)
 Trainable params: 0 (0.00 B)
 Non-trainable params: 0 (0.00 B)
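The '0 (unbuilt)' rows appear because the model has not been given any input yet, so the layer shapes are still unknown. Passing one sample batch through it (fit would do the same) builds the layers and fills in the real shapes and parameter counts:

_ = model(padded_train_sequences[:1])   # build the layers with a sample batch
model.summary()                         # now reports actual shapes and param counts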

Step 9.1: Training the model. Note that the full training split is reused below as the 'validation' data, so the val_* metrics mirror training performance rather than true generalization.

val_tweets_train = get_tweet(dataset['train'])
val_sequences_train = get_sequences(tokenizer, val_tweets_train)
val_labels_train = names_to_ids(dataset['train']['label'])
val_tweets_train[0],val_labels_train[0]
("Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.",
 np.int64(2))
h = model.fit(
    padded_train_sequences, train_labels.reshape(-1, 1),
    validation_data=(val_sequences_train, val_labels_train.reshape(-1, 1)),
    epochs=10,
    callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=2)]
)
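As a side note, EarlyStopping can also roll the model back to its best-scoring epoch instead of keeping the weights from the final one; a small variant using the same monitor (a standard Keras option, not used in the run above):

early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=2,
    restore_best_weights=True   # restore the weights from the best epoch when stopping
)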

Step 10: Evaluating the model and plotting the confusion matrix

show_history(h)
test_tweets= get_tweet(test)
test_sequences = get_sequences(tokenizer, test_tweets)
test_labels = names_to_ids(test['label'])
_ = model.evaluate(test_sequences, test_labels.reshape(-1, 1))
i = random.randint(0, len(test_tweets) - 1)  # random index into the test set

print('Sentence:', test_tweets[i])
print('Emotion:', index_to_class[test_labels[i]])

Sentence: Vodafone hires Citi for Cesky bid (TheDeal.com) TheDeal.com - The U.K. mobile giant wants to find a way to disentagle the Czech wireless and fixed-line businesses.
Emotion: 3

# Reshape the single test sequence to have a batch dimension
single_test_sequence = np.expand_dims(test_sequences[i], axis=0)

# Get the prediction for the single sample
p = model.predict(single_test_sequence)[0]

pred_class = index_to_class[int(np.argmax(p))]
print('Sentence:', test_tweets[i])
print('Emotion:', index_to_class[test_labels[i]])
print('Predicted Emotion:', pred_class)

Sentence: Vodafone hires Citi for Cesky bid (TheDeal.com) TheDeal.com - The U.K. mobile giant wants to find a way to disentagle the Czech wireless and fixed-line businesses.
Emotion: 3
Predicted Emotion: 3
preds = model.predict(test_sequences)          # class probabilities for every test example
predicted_classes = np.argmax(preds, axis=1)   # highest-probability class per example
show_confusion_matrix(test_labels, predicted_classes, list(train_classes))
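Beyond the confusion matrix, a per-class precision/recall breakdown is often informative. A short sketch with scikit-learn (class names rendered as their integer ids, since that is what the labels are here):

from sklearn.metrics import classification_report
print(classification_report(test_labels, predicted_classes,
                            target_names=[str(c) for c in sorted(train_classes)]))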