You are on page 1of 9

Logistic Regression to Amazon reviews

November 4, 2018

1 5] Exercise: Apply Logistic regression to Amazon reviews data set.


In [4]: %matplotlib inline
import pandas as pd
import sqlite3
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from string import punctuation
from sklearn import svm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from nltk import ngrams
from itertools import chain
import warnings
warnings.filterwarnings('ignore')

In [5]: con = sqlite3.connect('/home/rahul/Desktop/MLAAIC/database.sqlite')


df = pd.read_sql_query("""SELECT * FROM Reviews""",con)

1.1 Data Preprocessing


In [6]: df = df.dropna()
df['review'] = np.where(df['Score']>3, 'positive', 'negative')
#i will be using only 10% of available data
#this assignments
#https://stackoverflow.com/questions/38085547/random-sample-of-a-subset-of-a-dataframe-i
# sample = np.random.uniform(0,len(df),int(len(df)*0.10))
# df_sample = df.iloc[sample]
df_sample = df

1
#### removing duplicate copies
sorted_data=df_sample.sort_values('ProductId', axis=0, ascending=True, inplace=False, ki
final=sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='f
final=final[final.HelpfulnessNumerator<=final.HelpfulnessDenominator]
In [7]: DataPoints, Features = final.shape
l= final["review"].unique()
x,y= final["review"].value_counts()
print("Final Data \n")
# print(len(l))
print ("Total number of Datapoints -",DataPoints,'\n\n'
'Total Number Features or Independent Variable -',Features-1,'\n')
print('The Positive review has label \"{}\" and \"{}\" datapoint \n'.format(l[0],x))
print('The Negatve review has class label \"{}\" and \"{}\" datapoints'.format(l[1],y))
Final Data

Total number of Datapoints - 393931

Total Number Features or Independent Variable - 10

The Positive review has label "positive" and "307052" datapoint

The Negatve review has class label "negative" and "86879" datapoints

1.1.1 Text preprocessing


In [8]: import itertools

def cleanhtml(sentence): #function to clean the word of any html-tags


cleanr = re.compile('<.*?>')
cleantext = re.sub(cleanr, ' ', sentence)
return cleantext.lower()
final['CleanedText']=list(map(cleanhtml,final['Text']))
In [9]: positive_reviews = final[final.review=='positive']
negative_reviews = final[final.review=='negative']
# https://pandas.pydata.org/pandas-docs/version/0.17.0/generated/pandas.DataFrame.sample
negative_reviews = negative_reviews.sample(50000)
positive_reviews = positive_reviews.sample(50000)
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.concat.html
final_sample = pd.concat([positive_reviews, negative_reviews])
In [10]: # creating dataset by removing unwanted data
X_c = final_sample[['Time','review','CleanedText']]
X_c = X_c.sort_values(['Time'],ascending=True)
X = X_c['CleanedText']
Y = X_c['review']

2
In [11]: # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.h
# encoding label data
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

2 Grid Search
In [12]: #https://www.kaggle.com/laowingkin/amazon-fine-food-review-sentiment-analysis
vectorizer = CountVectorizer(stop_words = 'english')
BoW = vectorizer.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(BoW, Y, test_size=0.20)
X_train.shape

Out[12]: (80000, 60314)

In [13]: tfidf = TfidfVectorizer(stop_words='english')


tf_idf = tfidf.fit_transform(X)
Xtrain, Xtest, ytrain, ytest = train_test_split(tf_idf, Y, test_size=0.20)

Note after label encoding "positive" is "class-1" and "negative" is "class0"

In [17]: tuned_parameters = [{'penalty': ['l1'],'C': [10**-4, 10**-2, 10**0, 10**2, 10**4]},


{'penalty': ['l2'], 'C': [10**-4, 10**-2, 10**0, 10**2, 10**4]}]

from sklearn.model_selection import GridSearchCV


# from sklearn.linear_model import LogisticRegression

#BoW
modelBoW = GridSearchCV(LogisticRegression(), tuned_parameters,scoring = 'f1', cv=5)
modelBoW.fit(X_test, y_test)
#TFiDF
model_tf = GridSearchCV(LogisticRegression(), tuned_parameters,scoring = 'f1', cv=5)
model_tf.fit(Xtest, ytest)

Out[17]: GridSearchCV(cv=5, error_score='raise',


estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept
intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
verbose=0, warm_start=False),
fit_params=None, iid=True, n_jobs=1,
param_grid=[{'C': [0.0001, 0.01, 1, 100, 10000], 'penalty': ['l1']}, {'C': [0.00
pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
scoring='f1', verbose=0)

In [19]: BoW_Grid = pd.DataFrame.from_dict(modelBoW.cv_results_)


Tf_IDF_Grid = pd.DataFrame.from_dict(model_tf.cv_results_)

3
3 Random search
In [23]: from sklearn.model_selection import RandomizedSearchCV

#https://chrisalbon.com/machine_learning/model_selection/hyperparameter_tuning_using_ra
from scipy.stats import uniform
# Create regularization penalty space
penalty = ['l1', 'l2']

# Create regularization hyperparameter distribution using uniform distribution


C = uniform(loc=0, scale=4)

# Create hyperparameter options


hyperparameters = dict(C=C, penalty=penalty)

#BoW
modelBoWr=RandomizedSearchCV(LogisticRegression(), hyperparameters,scoring = 'f1', cv=
modelBoWr.fit(X_test, y_test)

#TFiDF
model_tfr = RandomizedSearchCV(LogisticRegression(), hyperparameters,scoring = 'f1', c
model_tfr.fit(Xtest, ytest)

Out[23]: RandomizedSearchCV(cv=5, error_score='raise',


estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_interc
intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
verbose=0, warm_start=False),
fit_params=None, iid=True, n_iter=10, n_jobs=1,
param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object
pre_dispatch='2*n_jobs', random_state=None, refit=True,
return_train_score='warn', scoring='f1', verbose=0)

In [24]: BoW_Rcv = pd.DataFrame.from_dict(modelBoWr.cv_results_)


Tf_IDF_Rcv = pd.DataFrame.from_dict(model_tfr.cv_results_)

3.1 L1 and L2 regularization


In [14]: from sklearn.metrics import accuracy_score
#BoW -penalty l1

clfBoW = LogisticRegression(penalty = 'l1')


clfBoW.fit( X_train,y_train)
predBoW=clfBoW.predict(X_test)
Accuracy=round(accuracy_score(y_test,predBoW,normalize=True),4)

#BoW -penalty l2

4
clfBoW2 = LogisticRegression(penalty = 'l2')
clfBoW2.fit( X_train,y_train)
predBoW2=clfBoW2.predict(X_test)
Accuracy2=round(accuracy_score(y_test,predBoW2,normalize=True),4)

#Tf-IDF -penalty l1

clftf = LogisticRegression(penalty = 'l1')


clftf.fit( Xtrain,ytrain)
predtf=clftf.predict(Xtest)
Accuracy3=round(accuracy_score(ytest,predtf,normalize=True),4)

#Tf-IDF -penalty l2

clftf1 = LogisticRegression(penalty = 'l1')


clftf1.fit( Xtrain,ytrain)
predtf1=clftf1.predict(Xtest)
Accuracy4=round(accuracy_score(ytest,predtf1,normalize=True),4)

3.2 labda vs Sparsity vs Error


3.2.1 http://scikit-learn.org/stable/auto_examples/linear_model/plot_logistic_l1_l2_sparsity.html#sphx-
glr-auto-examples-linear-model-plot-logistic-l1-l2-sparsity-py
In [39]: C=[0.001,0.005,0.01,0.05,0.1,0.5,1,10,25,50,35,100]
Score=[]#Error
Sparse=[]#Sparsity

for i in C:
clf = LogisticRegression(C=i, penalty='l1')
clf.fit(X_train, y_train)
pred=clf.predict(X_test)
accuracy=round(accuracy_score(y_test,pred,normalize=True),4)

S=1-round(accuracy,5)
Score.append(S)
Sp =np.count_nonzero(clf.coef_)
Sparse.append(Sp)

3.3 Multi colinearity and feature importance


In [73]: w = tfidf.get_feature_names()
coef = clftf.coef_.tolist()[0]
coeff_df = pd.DataFrame({'Word' : w, 'Coefficient' : coef})
coeff_df = coeff_df.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
print('')
print('-Top 20 positive-')

5
print(coeff_df.head(20).to_string(index=False))
print('')
print('-Top 20 negative-')
print(coeff_df.tail(20).to_string(index=False))

-Top 20 positive-
Coefficient Word
11.250287 highly
10.779505 hooked
9.960425 great
9.807265 best
9.719121 delicious
9.653650 perfect
8.752537 skeptical
8.719017 excellent
8.168333 wonderful
7.925378 pleasantly
7.855584 loves
7.799740 amazing
7.245107 yum
7.203459 glad
6.977704 thank
6.942655 love
6.685501 awesome
6.635842 pleased
6.539283 worry
6.310895 yummy

-Top 20 negative-
Coefficient Word
-6.755963 hopes
-6.844374 hoping
-7.077717 stale
-7.079282 deceptive
-7.250155 okay
-7.597882 return
-7.748032 horrible
-7.773034 disgusting
-7.919038 bland
-8.029840 threw
-8.100359 ok
-8.333270 refund
-8.527532 disappointed
-8.796234 mediocre
-9.760861 disappointment
-10.095175 awful
-10.241993 terrible

6
-10.337588 unfortunately
-13.506665 disappointing
-14.761232 worst

3.4 Results
In [52]: from prettytable import PrettyTable
x = PrettyTable()
x.field_names = ["Model", "Model Selection Method","penalty", "Optimal lambda"]

x.add_row(["BoW", "GridSearch", modelBoW.best_estimator_.get_params()['penalty'],


modelBoW.best_estimator_.get_params()['C']],)
x.add_row(["TF-IDF", "GridSearch", model_tf.best_estimator_.get_params()['penalty'],
model_tf.best_estimator_.get_params()['C']])
x.add_row(["BoW", "Rondomized CV", modelBoWr.best_estimator_.get_params()['penalty'],
round(modelBoWr.best_estimator_.get_params()['C'],4)])
x.add_row(["TF-IDF", "Rondomized CV", model_tfr.best_estimator_.get_params()['penalty']
round(model_tfr.best_estimator_.get_params()['C'],4)])

print('Table 1 - Optimal Value of Lambda, Train error and test error')


print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

# print(x.get_string(title="Australian cities"))
print(x)

Table 1 - Optimal Value of Lambda, Train error and test error


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--------+------------------------+---------+----------------+
| Model | Model Selection Method | penalty | Optimal lambda |
+--------+------------------------+---------+----------------+
| BoW | GridSearch | l2 | 1 |
| TF-IDF | GridSearch | l2 | 1 |
| BoW | Rondomized CV | l2 | 0.4468 |
| TF-IDF | Rondomized CV | l2 | 3.8947 |
+--------+------------------------+---------+----------------+

In [53]: y = PrettyTable()
y.field_names = ["Model","penalty", "Acuuracy"]

y.add_row(["BoW", "L1",round(Accuracy*100,2)])
y.add_row(["BoW", "L2",round(Accuracy2*100,2)])
y.add_row(["TfIDf", "L1",round(Accuracy3*100,2)])
y.add_row(["TfIDF", "L2",round(Accuracy4*100,2)])

print('Table 2 - L1 and L2 regularization')

7
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

# print(x.get_string(title="Australian cities"))
print(y)

Table 2 - L1 and L2 regularization


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-------+---------+----------+
| Model | penalty | Acuuracy |
+-------+---------+----------+
| BoW | L1 | 83.76 |
| BoW | L2 | 83.57 |
| TfIDf | L1 | 84.56 |
| TfIDF | L2 | 84.56 |
+-------+---------+----------+

In [44]: #https://stackoverflow.com/questions/22276066/how-to-plot-multiple-functions-on-the-sam
from sklearn import preprocessing
# [0.001,0.005,0.01,0.05,0.1,0.5,1]
#scaling to plot them
Sparse =np.array(Sparse)
Sparse.reshape(1,-1)
Score = np.array(Score)
Score.reshape(1,-1)

Sparse_normalized = (Sparse-min(Sparse))/(max(Sparse)-min(Sparse))
Score_normalized = (Score-min(Score))/(max(Score)-min(Score))

C=np.array(C)
lambd=1/C
#https://matplotlib.org/users/legend_guide.html
plt.loglog(lambd, Sparse_normalized, label='Sparse') # plotting t, a separately
plt.loglog(lambd, Score_normalized, label='Error') # plotting t, b separately
plt.xlabel('Lambda')
plt.ylabel('Normalized log Scale')
plt.title('Realtion between Lambda, Error and Sparsity')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()

8
9

You might also like