Classifying messages as ham or spam is a common problem in information technology, particularly in email and messaging systems. Ham refers to legitimate, harmless messages, while spam refers to unwanted or illegitimate messages such as advertisements or phishing attempts.
The main motivation for solving the ham/spam problem is to avoid the annoyance and harm that spam can cause. A user who receives a lot of spam wastes time sorting through it, which reduces efficiency and productivity. Worse, spam messages can carry viruses or malware that damage systems or steal personal data, posing a genuine security threat.
By filtering spam automatically, we can reduce both the disruption and the risk it causes, improving the efficiency and security of email and messaging.
The SMS Spam Collection Data Set is obtained from the UCI Machine Learning Repository. The SMS Spam Collection is a set of SMS messages collected for SMS spam research: 5,574 messages in English, each tagged as ham (legitimate) or spam.
import string
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings("ignore")
pd.options.display.max_columns = 50
sns.set(style='darkgrid')
%matplotlib inline
email = pd.read_csv('../Dataset/email-spam.csv')
email.sample(5)
| | label | message |
|---|---|---|
| 1098 | ham | Don't fret. I'll buy the ovulation test strips... |
| 1198 | ham | He also knows about lunch menu only da. . I know |
| 3421 | spam | As a valued customer, I am pleased to advise y... |
| 2777 | ham | Send me your id and password |
| 4533 | ham | Ok both our days. So what are you making for d... |
email.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
The dataframe has two columns, "label" and "message"; both are string (object) dtype with no missing values.
email.groupby('label').describe()
| label | count | unique | top | freq |
|---|---|---|---|---|
| ham | 4825 | 4516 | Sorry, I'll call later | 30 |
| spam | 747 | 641 | Please call our customer service representativ... | 4 |
The target variable is either ham or spam: there are 4,825 ham messages and 747 spam messages. The split is easier to read as percentages, shown in the pie chart below.
fig = plt.figure(figsize=(4,3))
ax = fig.add_axes([0, 0, 1, 1])
ax.axis('equal')
# take the labels from the value_counts index so they always match the wedge order
email_val_count = email['label'].value_counts()
ax.pie(email_val_count, labels=email_val_count.index, autopct='%.0f%%')
plt.show()
# word count per message (note: words, not characters)
email['length'] = email['message'].apply(lambda x: len(x.split()))
email.head()
| | label | message | length |
|---|---|---|---|
| 0 | ham | Go until jurong point, crazy.. Available only ... | 20 |
| 1 | ham | Ok lar... Joking wif u oni... | 6 |
| 2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... | 28 |
| 3 | ham | U dun say so early hor... U c already then say... | 11 |
| 4 | ham | Nah I don't think he goes to usf, he lives aro... | 13 |
plt.figure(figsize=(6,3))
# histplot with a KDE replaces the deprecated sns.distplot
sns.histplot(email.length, kde=True)
plt.show()
The distribution has a long right tail: a few messages run to well over 100 words. I will use a box plot to discover these outliers.
plt.figure(figsize=(8,2))
sns.boxplot(x=email.length)
plt.show()
The box plot confirms the long tail; the most extreme messages have more than 125 words. What are they?
email[email['length'] > 125]['message']
1085    For me the love should start with attraction.i...
1863    The last thing i ever wanted to do was hurt yo...
Name: message, dtype: object
g = sns.FacetGrid(data=email, hue="label", height=4, aspect=2)
# histplot with a KDE replaces the deprecated sns.distplot
g.map(sns.histplot, 'length', bins=30, kde=True)
g.set(xticks=np.arange(0, 200, 10))
g.add_legend()
plt.show()
The average length of a ham message is about 8 words, while a spam message averages about 28 words. This is a large difference, so message length can be a good feature for classifying messages.
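This difference is easy to verify from the length column computed earlier; a quick sanity check:

```python
# Word-count statistics per class; ham messages are much shorter on average
print(email.groupby('label')['length'].describe())
```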
ham_wordcloud = WordCloud().generate(' '.join(email[email['label'] == 'ham']['message']))
spam_wordcloud = WordCloud().generate(' '.join(email[email['label'] == 'spam']['message']))
plt.imshow(ham_wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
plt.imshow(spam_wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
Text preprocessing is the step of processing raw text before further analysis. Typical stages include tokenization, case folding, stopword removal, and stemming.
These stages prepare the text for downstream tasks such as clustering, classification, or sentiment analysis. In addition, text preprocessing reduces noise in the text, making its meaning easier to extract.
PUNCT_TO_REMOVE = string.punctuation

def text_clean(text):
    text = text.lower()
    # remove all text inside square brackets
    text = re.sub(r'\[.*?\]', '', text)
    # remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # remove HTML tags
    text = re.sub(r'<.*?>+', '', text)
    # remove words that contain digits
    text = re.sub(r'\w*\d\w*', '', text)
    # replace newlines with spaces
    text = re.sub(r'\n', ' ', text)
    # remove punctuation
    text = text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))
    # collapse runs of whitespace into a single space
    text = re.sub(r'\s+', ' ', text)
    return text
email['message_clean'] = email['message'].apply(text_clean)
email[['message_clean']].head()
| | message_clean |
|---|---|
| 0 | go until jurong point crazy available only in ... |
| 1 | ok lar joking wif u oni |
| 2 | free entry in a wkly comp to win fa cup final ... |
| 3 | u dun say so early hor u c already then say |
| 4 | nah i dont think he goes to usf he lives aroun... |
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\iwanXone\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
True
LIST_STOPWORDS = set(stopwords.words('english'))

# stopword removal
def stopwords_removal(text):
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    filtered = [s for s in tokens if s not in LIST_STOPWORDS]
    return ' '.join(filtered)

# stemming with the Porter stemmer
def porter_stemmer(text):
    stemmer = nltk.porter.PorterStemmer()
    return " ".join([stemmer.stem(word) for word in text.split()])
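Before applying the pipeline to the whole dataframe, it helps to trace a single message through each stage (the index below is arbitrary, picked only for illustration):

```python
# Trace one message through cleaning, stopword removal, and stemming
sample = email['message'].iloc[2]
print('original :', sample)
print('cleaned  :', text_clean(sample))
print('stemmed  :', porter_stemmer(stopwords_removal(text_clean(sample))))
```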
email['stopword_rmv'] = email['message_clean'].apply(stopwords_removal)
email['stemmer'] = email['stopword_rmv'].apply(porter_stemmer)
# binarize the target: ham -> 0, spam -> 1
email['label'] = LabelBinarizer().fit_transform(email['label'])
email[['stemmer','label']].sample(5)
| | stemmer | label |
|---|---|---|
| 645 | allo brave buse taken train triumph mean b ham... | 0 |
| 1810 | aight ill ask roommat | 0 |
| 389 | half price orang line rental latest camera pho... | 1 |
| 5401 | babe think got ur brolli left english wil brin... | 0 |
| 2692 | hey tmr meet bugi | 0 |
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(email['stemmer'],
email['label'],
test_size=0.25,
random_state=45)
Feature extraction matters because it directly affects a machine learning model's ability to understand and predict the data. With well-chosen features, the model can capture the patterns and structure of the data more easily and make more accurate predictions.
TF-IDF measures the importance of a word by combining two factors:
Term Frequency (TF): how often a word occurs in a document. The more often the word appears in that document, the higher its weight.
Inverse Document Frequency (IDF): the inverse of the fraction of documents that contain the word. The fewer documents contain a word, the more distinctive, and therefore more important, it is.
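As a concrete illustration, the sketch below recomputes one IDF weight by hand using scikit-learn's default smoothed formula, idf(t) = ln((1 + n) / (1 + df(t))) + 1, where n is the number of documents and df(t) is the number of documents containing t. The toy corpus is made up purely for illustration:

```python
import math
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus, purely illustrative (not from the SMS dataset)
docs = ['free prize call now', 'call me later', 'free free entry']

vec = TfidfVectorizer()  # defaults: smooth_idf=True, norm='l2'
vec.fit(docs)

# scikit-learn's smoothed IDF: idf(t) = ln((1 + n) / (1 + df(t))) + 1
n = len(docs)
df_free = sum('free' in d.split() for d in docs)  # 'free' occurs in 2 of 3 docs
idf_free = math.log((1 + n) / (1 + df_free)) + 1

idx = list(vec.get_feature_names_out()).index('free')
print(idf_free, vec.idf_[idx])  # both print ~1.2877
# the final document vectors are tf * idf, then L2-normalized per document
```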
tfidf = TfidfVectorizer()
tfidf.fit(x_train.values)
train_tfidf = tfidf.transform(x_train.values)
test_tfidf = tfidf.transform(x_test.values)
# terms as rows, training documents as columns
df_tfidf = pd.DataFrame(train_tfidf.toarray().T,
                        index=tfidf.get_feature_names_out(),
                        columns=[f'D{i+1}' for i in range(train_tfidf.shape[0])])
df_tfidf.sample(5)
| | D1 | D2 | D3 | ... | D4177 | D4178 | D4179 |
|---|---|---|---|---|---|---|---|
| nowaday | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 |
| uncl | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 |
| hum | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 |
| spirit | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 |
| brisk | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 |

5 rows × 4179 columns
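Almost every entry is zero, since each message uses only a handful of the vocabulary terms. One way to see the non-zero structure is to inspect the highest-weighted terms of a single document, as in this sketch:

```python
# Top TF-IDF weights for the first training document (column D1);
# only the few terms that actually occur in it are non-zero
print(df_tfidf['D1'].nlargest(5))
```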
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
multinomial_nb = MultinomialNB()
multinomial_nb.fit(train_tfidf, y_train)
MultinomialNB()
multinomial_nb.score(train_tfidf, y_train), multinomial_nb.score(test_tfidf, y_test)
(0.9770279971284996, 0.9533381191672649)
pred = multinomial_nb.predict(test_tfidf)
print('Confusion matrix')
print(confusion_matrix(y_test, pred))
print('\n====================================\n')
print('Classification Report')
print(classification_report(y_test, pred, target_names=['ham','spam']))
Confusion matrix
[[1211    0]
 [  65  117]]

====================================

Classification Report
              precision    recall  f1-score   support

         ham       0.95      1.00      0.97      1211
        spam       1.00      0.64      0.78       182

    accuracy                           0.95      1393
   macro avg       0.97      0.82      0.88      1393
weighted avg       0.96      0.95      0.95      1393
Out of 182 spam messages in the test set, the model failed to catch 65 (false negatives), but it never misclassified a ham message as spam. The overall accuracy is 95%.
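For reference, these counts can be read directly from the confusion matrix; for binary labels scikit-learn orders it as [[tn, fp], [fn, tp]]:

```python
# Unpack the confusion matrix: row = true label, column = predicted label
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
print(f'missed spam (false negatives): {fn}')           # 65
print(f'ham flagged as spam (false positives): {fp}')   # 0
```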
random_forest = RandomForestClassifier()
random_forest.fit(train_tfidf, y_train)
RandomForestClassifier()
random_forest.score(train_tfidf, y_train), random_forest.score(test_tfidf, y_test)
(0.9997607083034219, 0.9626704953338119)
pred = random_forest.predict(test_tfidf)
print('Confusion matrix')
print(confusion_matrix(y_test, pred))
print('\n====================================\n')
print('Classification Report')
print(classification_report(y_test, pred, target_names=['ham','spam']))
Confusion matrix
[[1210    1]
 [  51  131]]

====================================

Classification Report
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1211
        spam       0.99      0.72      0.83       182

    accuracy                           0.96      1393
   macro avg       0.98      0.86      0.91      1393
weighted avg       0.96      0.96      0.96      1393
Out of 182 spam messages in the test set, the model failed to catch 51, and it misclassified only one ham message as spam. The overall accuracy is 96%.
Before applying machine learning to spam detection, I cleaned the dataset of 5,572 messages obtained from the UCI Machine Learning Repository by removing punctuation and stop words and applying stemming. I then performed feature extraction with TF-IDF and trained Naive Bayes and Random Forest classifiers, both of which detect spam messages well.
However, Naive Bayes was chosen as the best model, despite its slightly lower test accuracy (95% vs. 96% for Random Forest), because Random Forest's near-perfect training accuracy (99.98%) relative to its test accuracy suggests overfitting.
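As a follow-up check (not part of the analysis above), k-fold cross-validation on the training split gives a less optimistic estimate than training accuracy and can help confirm or rule out the suspected overfitting; a minimal sketch:

```python
from sklearn.model_selection import cross_val_score

# Compare 5-fold cross-validated accuracy for both models; a large gap
# between training accuracy and CV accuracy supports the overfitting suspicion
for name, model in [('MultinomialNB', MultinomialNB()),
                    ('RandomForestClassifier', RandomForestClassifier())]:
    scores = cross_val_score(model, train_tfidf, y_train, cv=5)
    print(f'{name}: mean CV accuracy = {scores.mean():.3f}')
```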