Spam Filter using Logistic Regression

2 minute read

This post builds a simple SMS spam filter: load the Kaggle SMS Spam Collection dataset, clean and label the messages, vectorize them with CountVectorizer, and train a logistic regression classifier.

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from math import log, sqrt
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import os
import nltk
nltk.download('punkt')
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\I330087\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
True
# Dataset can be downloaded from https://www.kaggle.com/uciml/sms-spam-collection-dataset/data

messages = pd.read_csv("Dataset/spam.csv",encoding="latin-1")
messages.head()
v1 v2 Unnamed: 2 Unnamed: 3 Unnamed: 4
0 ham Go until jurong point, crazy.. Available only ... NaN NaN NaN
1 ham Ok lar... Joking wif u oni... NaN NaN NaN
2 spam Free entry in 2 a wkly comp to win FA Cup fina... NaN NaN NaN
3 ham U dun say so early hor... U c already then say... NaN NaN NaN
4 ham Nah I don't think he goes to usf, he lives aro... NaN NaN NaN
messages.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)   # drop the three empty extra columns
messages.head()
v1 v2
0 ham Go until jurong point, crazy.. Available only ...
1 ham Ok lar... Joking wif u oni...
2 spam Free entry in 2 a wkly comp to win FA Cup fina...
3 ham U dun say so early hor... U c already then say...
4 ham Nah I don't think he goes to usf, he lives aro...
messages = messages.rename(columns={"v1":"label","v2":"text"})
messages.head()
label text
0 ham Go until jurong point, crazy.. Available only ...
1 ham Ok lar... Joking wif u oni...
2 spam Free entry in 2 a wkly comp to win FA Cup fina...
3 ham U dun say so early hor... U c already then say...
4 ham Nah I don't think he goes to usf, he lives aro...
messages['label'] = messages['label'].map({'ham':0, 'spam':1})   # encode ham as 0, spam as 1
messages.head()
label text
0 0 Go until jurong point, crazy.. Available only ...
1 0 Ok lar... Joking wif u oni...
2 1 Free entry in 2 a wkly comp to win FA Cup fina...
3 0 U dun say so early hor... U c already then say...
4 0 Nah I don't think he goes to usf, he lives aro...
messages['label'].value_counts()
0    4825
1     747
Name: label, dtype: int64
X_train, X_test, y_train, y_test = train_test_split(messages["text"], messages["label"], test_size=0.2, random_state=10)
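Spam is only about 13% of the dataset (747 of 5572 messages), so one optional variation, sketched below rather than taken from the original run, is to stratify the split so train and test keep the same ham/spam ratio:

X_train, X_test, y_train, y_test = train_test_split(
    messages["text"], messages["label"],
    test_size=0.2, random_state=10,
    stratify=messages["label"])   # optional: preserve the ham/spam ratio in both splits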

v = CountVectorizer()
v.fit(X_train)
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)
train_df = v.transform(X_train)
test_df = v.transform(X_test)
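As a quick sanity check on the fitted vectorizer, the size of the learned vocabulary should match the column count printed further down, and any new string can be mapped into the same feature space. The example message here is made up:

print(len(v.vocabulary_))                # number of distinct tokens learned from X_train
sample = v.transform(["Free entry to win a prize, text WIN now"])   # hypothetical message
print(sample.shape, sample.nnz)          # one sparse row, one non-zero entry per distinct known token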
# Collect every token from the ham and spam messages into two long strings for the word clouds
hamwords = ''
spamwords = ''
hamw = messages[messages['label']==0]['text']
spamw = messages[messages['label']==1]['text']
for row in hamw:
    for x in word_tokenize(row):
        hamwords += x + ' '
for row in spamw:
    for x in word_tokenize(row):
        spamwords += x + ' '
wc_spam = WordCloud().generate(spamwords)
wc_ham = WordCloud().generate(hamwords)
#Spam Word cloud
plt.figure( figsize=(10,8), facecolor='k')
plt.imshow(wc_spam)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
#Ham word cloud
plt.figure( figsize=(10,8), facecolor='k')
plt.imshow(wc_ham)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

[Word cloud of the most frequent spam words]

[Word cloud of the most frequent ham words]
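The stopwords import at the top is never actually used. One optional refinement, a sketch rather than part of the original post, is to pass NLTK's English stop-word list to WordCloud so filler words such as 'to' and 'the' do not dominate the clouds:

nltk.download('stopwords')                    # one-time download of the stop-word list
stop = set(stopwords.words('english'))
wc_spam = WordCloud(stopwords=stop).generate(spamwords)   # same clouds, minus common stop words
wc_ham = WordCloud(stopwords=stop).generate(hamwords)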

type(train_df)
scipy.sparse.csr.csr_matrix
print(train_df.shape, test_df.shape)
(4457, 7757) (1115, 7757)
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(train_df,y_train)
C:\Users\I330087\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
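The FutureWarning above only says that the default solver changes in scikit-learn 0.22; passing a solver explicitly silences it. The choice of 'liblinear' below is an assumption that suits this small sparse problem, not the original configuration:

model = LogisticRegression(solver='liblinear')   # explicit solver, no FutureWarning
model.fit(train_df, y_train)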
predictions = model.predict(train_df)
accuracy_score(y_train, predictions)   # accuracy on the training set
0.99798070450976
predictions = model.predict(test_df)
accuracy_score(y_test, predictions)    # accuracy on the held-out test set
0.9802690582959641
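Roughly 87% of the messages are ham (4825 of 5572), so a model that always predicts ham would already score about 0.87. The confusion_matrix and classification_report imported at the top give a fuller picture than accuracy alone, and the fitted vectorizer plus model can score any new text; the message below is made up:

print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

# End-to-end check on a hypothetical new message (1 = spam, 0 = ham)
new_msg = ["Congratulations! You have won a free ticket, call now"]
print(model.predict(v.transform(new_msg)))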