Using natural language processing to build a spam filter for emails
from PIL import Image
Image.open('NLP.png')
Table of contents
1. Inspecting the dataset
2. Text preprocessing
3. Feature engineering
4. Training and evaluating a model
5. What terms are the top predictors of spam?
import wordcloud
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
import string
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix
import random as rd
import os
import json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('mode.chained_assignment', None)
1. Inspecting the dataset
# Read every email file from the Ham and Spam folders into a DataFrame,
# labelling each message with the name of the folder it came from
path = "Emails/"
labels = ['Ham', 'Spam']
Category = []
textdata = []
for l in labels:
    for file in os.listdir(path + l):
        Category.append(l)
        with open(os.path.join(path + l, file), encoding='utf8', errors="ignore") as f:
            data = f.read()
            textdata.append(data)
df = pd.DataFrame({"Text": textdata, "Category": Category})
df.info()
df.info()
We have a collection of text data known as a corpus. Specifically, there are 5857 emails written in English, serving as training examples. The Text column holds the body of each email as a string, and the Category column is the target variable containing the class labels, which tell us whether a message is spam or ham (i.e., not spam).
Since the target variable contains discrete values, this is a classification task. Let's start by placing the target variable in its own Series and checking how the two classes are distributed.
y = df.Category
y.value_counts()
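Spam corpora are usually imbalanced, with ham far outnumbering spam. When one class dominates, plain accuracy is misleading, which is one reason the F1 score is used for evaluation later on. Normalized counts (a quick optional check) make the proportions easier to read:
# Optional check: class proportions rather than raw counts
y.value_counts(normalize=True)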
2. Text preprocessing
There are many feature engineering strategies for transforming text data into features. Some involve assigning each unique word-like term to a feature and counting the number of occurrences per training example. However, if we were to perform this strategy right now, we'd end up with an absurd number of features, a result of the myriad possible terms. The classifier would take too long to train and likely overfit. As a result, each NLP problem requires a tailored approach to determine which terms are relevant and meaningful.
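To make that concrete, here is a rough illustration (a throwaway check, not part of the final pipeline) of what naive term counting on the raw, uncleaned text produces; the exact dimensions depend on the corpus, but the vocabulary typically runs to tens of thousands of columns.
# Rough illustration only: one feature per unique raw term.
# The resulting matrix is very wide and sparse.
raw_counts = CountVectorizer().fit_transform(df.Text)
raw_counts.shape  # (number of emails, number of unique raw terms)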
Normalization
Let's begin by taking a step back and examining the terms themselves. Non-letter characters and very short words rarely carry useful signal, so we strip out everything that isn't a letter, along with words of three or fewer letters.
# Clean non-letter characters and words containing three or fewer letters
df = df.replace("[^a-zA-Z]"," ", regex=True)
df = df.replace("\s(\w{1,3})\s",' ', regex=True)
df.head()
# Rename the columns, encode the labels numerically and shuffle the rows
df.rename(columns={'Category': 'label', 'Text': 'Email'}, inplace=True)
df['label'] = df['label'].map({'Ham': 0, 'Spam': 1})
df = df.sample(frac=1).reset_index(drop=True)
df.head()
# Lower-case the email text
processed = df.Email
processed = processed.str.lower()
Removing stop words
Some words in the English language, while necessary, don't contribute much to the meaning of a phrase. These words, such as "when", "had", "those" or "before", are called stop words and should be filtered out. The Natural Language Toolkit (NLTK), a popular Python library for NLP, provides common stop words.
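NLTK ships its stop word lists as a downloadable corpus, so it has to be fetched once before use. The snippet below (a one-off setup step, not part of the pipeline) grabs it and peeks at a few of the English entries.
# One-off setup: fetch the stop word corpus if it isn't already present,
# then preview a few of the English stop words
nltk.download('stopwords', quiet=True)
print(stopwords.words('english')[:10])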
stop_words = set(nltk.corpus.stopwords.words('english'))
processed = processed.apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_words)
)
Stemming
It's likely the corpus contains words with various suffixes such as "distribute", "distributing", "distributor" or "distribution". We can replace these four words with just "distribut" via a preprocessing step called stemming. There are numerous stemming strategies, some more aggressive than others.
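As a quick sanity check (a throwaway snippet, not part of the pipeline), we can run the Porter stemmer over a few related word forms and see how it collapses them:
# Quick check of how the Porter stemmer collapses related word forms
stemmer = nltk.PorterStemmer()
print([stemmer.stem(w) for w in ['distribute', 'distributing', 'distributor', 'distribution']])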
porter = nltk.PorterStemmer()
processed = processed.apply(lambda x: ' '.join(
    porter.stem(term) for term in x.split())
)
3. Feature engineering
Implementing the tf-idf statistic
The term frequency-inverse document frequency (tf-idf) statistic weights a term's count within a document by how rare that term is across the whole corpus, so words that appear everywhere contribute less than distinctive ones. Scikit-learn's TfidfVectorizer performs the counting and the weighting in a single step; with ngram_range=(1, 2) it extracts bigrams as well as single words.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_ngrams = vectorizer.fit_transform(processed)
X_ngrams.shape
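To see what the weighting does, here is a small throwaway example on a made-up three-document corpus (hypothetical text, not from the dataset); terms that appear in fewer documents receive a larger idf weight.
# Toy example on a hypothetical corpus to illustrate tf-idf weighting
toy_docs = ['free prize click now',
            'meeting moved to monday',
            'free prize inside click here']
toy_vec = TfidfVectorizer()
toy_tfidf = toy_vec.fit_transform(toy_docs)
# use get_feature_names_out() instead on newer scikit-learn versions
pd.DataFrame(toy_tfidf.toarray(), columns=toy_vec.get_feature_names())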
4. Training and evaluating a model
We hold out 20% of the examples as a stratified test set and train a linear support vector machine (LinearSVC with hinge loss), a strong baseline for high-dimensional, sparse text features. The model is evaluated with the F1 score, which balances precision and recall.
y_enc = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X_ngrams,
    y_enc,
    test_size=0.2,
    random_state=42,
    stratify=y_enc
)
clf = svm.LinearSVC(loss='hinge')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
metrics.f1_score(y_test, y_pred)
# Rows and columns follow the label encoding: 0 = ham, 1 = spam
pd.DataFrame(
    metrics.confusion_matrix(y_test, y_pred),
    index=[['actual', 'actual'], ['ham', 'spam']],
    columns=[['predicted', 'predicted'], ['ham', 'spam']]
)
# Plot the same confusion matrix as a heatmap
array = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(array, index=range(2), columns=range(2))
sns.set(font_scale=1.4)  # label size
sns.heatmap(df_cm, annot=True, annot_kws={"size": 16}, fmt="d")  # annotation font size
print(classification_report(y_test, y_pred))
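As a usage sketch (the message below is made up), the fitted vectorizer and classifier can score new text; for a faithful prediction, the new text should first go through the same cleaning, stop word removal and stemming as the training corpus.
# Minimal usage sketch with a hypothetical message. In practice, apply the
# same normalization, stop word removal and stemming before vectorizing.
new_message = ['click here to claim your free prize http']
clf.predict(vectorizer.transform(new_message))  # 1 = spam, 0 = ham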
Diagnosing the model with learning curves
Learning curves plot the training and validation scores against the number of training examples, which helps reveal whether the classifier suffers more from bias (both scores low) or from variance (a large gap between the two curves).
from sklearn.model_selection import learning_curve
from sklearn.model_selection import StratifiedShuffleSplit
sample_space = np.linspace(500, len(df.Email) * 0.8, 10, dtype='int')
train_sizes, train_scores, valid_scores = learning_curve(
    estimator=svm.LinearSVC(loss='hinge', C=1e10),
    X=X_ngrams,
    y=y_enc,
    train_sizes=sample_space,
    cv=StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=40),
    scoring='f1',
    n_jobs=-1
)
def make_tidy(sample_space, train_scores, valid_scores):
    messy_format = pd.DataFrame(
        np.stack((sample_space, train_scores.mean(axis=1),
                  valid_scores.mean(axis=1)), axis=1),
        columns=['# of training examples', 'Training set', 'Validation set']
    )
    return pd.melt(
        messy_format,
        id_vars='# of training examples',
        value_vars=['Training set', 'Validation set'],
        var_name='Scores',
        value_name='F1 score'
    )
g = sns.FacetGrid(
    make_tidy(sample_space, train_scores, valid_scores), hue='Scores', size=5
)
g.map(plt.scatter, '# of training examples', 'F1 score')
g.map(plt.plot, '# of training examples', 'F1 score').add_legend();
Notice the performance on the training set is near perfect regardless of dataset size, which makes sense because we're evaluating the classifier on the same data used to train it. At first, it looks like the classifier is suffering from high variance and is overfitting since the validation scores never reach the same level. However, taking a closer look at the scale of the y-axis makes me believe this issue isn't that pronounced.
Using nested cross-validation to minimize information leakage
In nested cross-validation, an inner loop (GridSearchCV) tunes the regularization parameter C while an outer loop (cross_val_score) estimates generalization performance, so the hyperparameter search never sees the data it is ultimately evaluated on.
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
param_grid = [{'C': np.logspace(-4, 4, 20)}]
grid_search = GridSearchCV(
    estimator=svm.LinearSVC(loss='hinge'),
    param_grid=param_grid,
    cv=StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42),
    scoring='f1',
    n_jobs=-1
)
scores = cross_val_score(
    estimator=grid_search,
    X=X_ngrams,
    y=y_enc,
    cv=StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=0),
    scoring='f1',
    n_jobs=-1
)
scores
scores.mean()
5. What terms are the top predictors of spam?
# Tune C on the full dataset, then refit a final classifier with the best value
grid_search.fit(X_ngrams, y_enc)
final_clf = svm.LinearSVC(loss='hinge', C=grid_search.best_params_['C'])
final_clf.fit(X_ngrams, y_enc);
# The ten largest positive coefficients correspond to the terms most indicative of spam
pd.Series(
    final_clf.coef_.T.ravel(),
    index=vectorizer.get_feature_names()
).sort_values(ascending=False)[:10].plot(kind='barh', figsize=(5, 10))
A few obvious ones appear at the top, such as the stems click, life, softwar and http.