Logistic Regression To Amazon Reviews

Logistic Regression to Amazon reviews
November 4, 2018
1 5] Exercise: Apply Logistic regression to Amazon reviews data set.

In [4]: %matplotlib inline
import pandas as pd
import sqlite3
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from string import punctuation
from sklearn import svm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from nltk import ngrams
from itertools import chain
import warnings
warnings.filterwarnings('ignore')
In [5]: con = sqlite3.connect('/home/rahul/Desktop/MLAAIC/database.sqlite')

df = pd.read_sql_query("""SELECT * FROM Reviews""",con)
1.1 Data Preprocessing

In [6]: df = df.dropna()
df['review'] = np.where(df['Score']>3, 'positive', 'negative')
#i will be using only 10% of available data
#this assignments
#https://stackoverflow.com/questions/38085547/random-sample-of-a-subset-of-a-dataframe-i
# sample = np.random.uniform(0,len(df),int(len(df)*0.10))
# df_sample = df.iloc[sample]
df_sample = df
1
#### removing duplicate copies
sorted_data=df_sample.sort_values('ProductId', axis=0, ascending=True, inplace=False, ki
final=sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='f
final=final[final.HelpfulnessNumerator<=final.HelpfulnessDenominator]
In [7]: DataPoints, Features = final.shape
l= final["review"].unique()
x,y= final["review"].value_counts()
print("Final Data \n")
# print(len(l))
print ("Total number of Datapoints -",DataPoints,'\n\n'
'Total Number Features or Independent Variable -',Features-1,'\n')
print('The Positive review has label \"{}\" and \"{}\" datapoint \n'.format(l[0],x))
print('The Negatve review has class label \"{}\" and \"{}\" datapoints'.format(l[1],y))
Final Data
Total number of Datapoints - 393931
Total Number Features or Independent Variable - 10
The Positive review has label "positive" and "307052" datapoint
The Negatve review has class label "negative" and "86879" datapoints
1.1.1 Text preprocessing

In [8]: import itertools
def cleanhtml(sentence): #function to clean the word of any html-tags

cleanr = re.compile('<.*?>')
cleantext = re.sub(cleanr, ' ', sentence)
return cleantext.lower()
final['CleanedText']=list(map(cleanhtml,final['Text']))
In [9]: positive_reviews = final[final.review=='positive']
negative_reviews = final[final.review=='negative']
# https://pandas.pydata.org/pandas-docs/version/0.17.0/generated/pandas.DataFrame.sample
negative_reviews = negative_reviews.sample(50000)
positive_reviews = positive_reviews.sample(50000)
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.concat.html
final_sample = pd.concat([positive_reviews, negative_reviews])
In [10]: # creating dataset by removing unwanted data
X_c = final_sample[['Time','review','CleanedText']]
X_c = X_c.sort_values(['Time'],ascending=True)
X = X_c['CleanedText']
Y = X_c['review']
2
In [11]: # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.h
# encoding label data
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)
2 Grid Search
In [12]: #https://www.kaggle.com/laowingkin/amazon-fine-food-review-sentiment-analysis
vectorizer = CountVectorizer(stop_words = 'english')
BoW = vectorizer.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(BoW, Y, test_size=0.20)
X_train.shape
Out[12]: (80000, 60314)
In [13]: tfidf = TfidfVectorizer(stop_words='english')

tf_idf = tfidf.fit_transform(X)
Xtrain, Xtest, ytrain, ytest = train_test_split(tf_idf, Y, test_size=0.20)
Note after label encoding "positive" is "class-1" and "negative" is "class0"
In [17]: tuned_parameters = [{'penalty': ['l1'],'C': [10**-4, 10**-2, 10**0, 10**2, 10**4]},

{'penalty': ['l2'], 'C': [10**-4, 10**-2, 10**0, 10**2, 10**4]}]
from sklearn.model_selection import GridSearchCV

# from sklearn.linear_model import LogisticRegression
#BoW
modelBoW = GridSearchCV(LogisticRegression(), tuned_parameters,scoring = 'f1', cv=5)
modelBoW.fit(X_test, y_test)
#TFiDF
model_tf = GridSearchCV(LogisticRegression(), tuned_parameters,scoring = 'f1', cv=5)
model_tf.fit(Xtest, ytest)
Out[17]: GridSearchCV(cv=5, error_score='raise',

estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept
intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
verbose=0, warm_start=False),
fit_params=None, iid=True, n_jobs=1,
param_grid=[{'C': [0.0001, 0.01, 1, 100, 10000], 'penalty': ['l1']}, {'C': [0.00
pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
scoring='f1', verbose=0)
In [19]: BoW_Grid = pd.DataFrame.from_dict(modelBoW.cv_results_)

Tf_IDF_Grid = pd.DataFrame.from_dict(model_tf.cv_results_)
3
3 Random search
In [23]: from sklearn.model_selection import RandomizedSearchCV
#https://chrisalbon.com/machine_learning/model_selection/hyperparameter_tuning_using_ra
from scipy.stats import uniform
# Create regularization penalty space
penalty = ['l1', 'l2']
# Create regularization hyperparameter distribution using uniform distribution

C = uniform(loc=0, scale=4)
# Create hyperparameter options

hyperparameters = dict(C=C, penalty=penalty)
#BoW
modelBoWr=RandomizedSearchCV(LogisticRegression(), hyperparameters,scoring = 'f1', cv=
modelBoWr.fit(X_test, y_test)
#TFiDF
model_tfr = RandomizedSearchCV(LogisticRegression(), hyperparameters,scoring = 'f1', c
model_tfr.fit(Xtest, ytest)
Out[23]: RandomizedSearchCV(cv=5, error_score='raise',

estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_interc
intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
verbose=0, warm_start=False),
fit_params=None, iid=True, n_iter=10, n_jobs=1,
param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object
pre_dispatch='2*n_jobs', random_state=None, refit=True,
return_train_score='warn', scoring='f1', verbose=0)
In [24]: BoW_Rcv = pd.DataFrame.from_dict(modelBoWr.cv_results_)

Tf_IDF_Rcv = pd.DataFrame.from_dict(model_tfr.cv_results_)
3.1 L1 and L2 regularization

In [14]: from sklearn.metrics import accuracy_score
#BoW -penalty l1
clfBoW = LogisticRegression(penalty = 'l1')

clfBoW.fit( X_train,y_train)
predBoW=clfBoW.predict(X_test)
Accuracy=round(accuracy_score(y_test,predBoW,normalize=True),4)
#BoW -penalty l2
4
clfBoW2 = LogisticRegression(penalty = 'l2')
clfBoW2.fit( X_train,y_train)
predBoW2=clfBoW2.predict(X_test)
Accuracy2=round(accuracy_score(y_test,predBoW2,normalize=True),4)
#Tf-IDF -penalty l1
clftf = LogisticRegression(penalty = 'l1')

clftf.fit( Xtrain,ytrain)
predtf=clftf.predict(Xtest)
Accuracy3=round(accuracy_score(ytest,predtf,normalize=True),4)
#Tf-IDF -penalty l2
clftf1 = LogisticRegression(penalty = 'l1')

clftf1.fit( Xtrain,ytrain)
predtf1=clftf1.predict(Xtest)
Accuracy4=round(accuracy_score(ytest,predtf1,normalize=True),4)
3.2 labda vs Sparsity vs Error

3.2.1 http://scikit-learn.org/stable/auto_examples/linear_model/plot_logistic_l1_l2_sparsity.html#sphx-
glr-auto-examples-linear-model-plot-logistic-l1-l2-sparsity-py
In [39]: C=[0.001,0.005,0.01,0.05,0.1,0.5,1,10,25,50,35,100]
Score=[]#Error
Sparse=[]#Sparsity
for i in C:
clf = LogisticRegression(C=i, penalty='l1')
clf.fit(X_train, y_train)
pred=clf.predict(X_test)
accuracy=round(accuracy_score(y_test,pred,normalize=True),4)
S=1-round(accuracy,5)
Score.append(S)
Sp =np.count_nonzero(clf.coef_)
Sparse.append(Sp)
3.3 Multi colinearity and feature importance

In [73]: w = tfidf.get_feature_names()
coef = clftf.coef_.tolist()[0]
coeff_df = pd.DataFrame({'Word' : w, 'Coefficient' : coef})
coeff_df = coeff_df.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
print('')
print('-Top 20 positive-')
5
print(coeff_df.head(20).to_string(index=False))
print('')
print('-Top 20 negative-')
print(coeff_df.tail(20).to_string(index=False))
-Top 20 positive-
Coefficient Word
11.250287 highly
10.779505 hooked
9.960425 great
9.807265 best
9.719121 delicious
9.653650 perfect
8.752537 skeptical
8.719017 excellent
8.168333 wonderful
7.925378 pleasantly
7.855584 loves
7.799740 amazing
7.245107 yum
7.203459 glad
6.977704 thank
6.942655 love
6.685501 awesome
6.635842 pleased
6.539283 worry
6.310895 yummy
-Top 20 negative-
Coefficient Word
-6.755963 hopes
-6.844374 hoping
-7.077717 stale
-7.079282 deceptive
-7.250155 okay
-7.597882 return
-7.748032 horrible
-7.773034 disgusting
-7.919038 bland
-8.029840 threw
-8.100359 ok
-8.333270 refund
-8.527532 disappointed
-8.796234 mediocre
-9.760861 disappointment
-10.095175 awful
-10.241993 terrible
6
-10.337588 unfortunately
-13.506665 disappointing
-14.761232 worst
3.4 Results
In [52]: from prettytable import PrettyTable
x = PrettyTable()
x.field_names = ["Model", "Model Selection Method","penalty", "Optimal lambda"]
x.add_row(["BoW", "GridSearch", modelBoW.best_estimator_.get_params()['penalty'],

modelBoW.best_estimator_.get_params()['C']],)
x.add_row(["TF-IDF", "GridSearch", model_tf.best_estimator_.get_params()['penalty'],
model_tf.best_estimator_.get_params()['C']])
x.add_row(["BoW", "Rondomized CV", modelBoWr.best_estimator_.get_params()['penalty'],
round(modelBoWr.best_estimator_.get_params()['C'],4)])
x.add_row(["TF-IDF", "Rondomized CV", model_tfr.best_estimator_.get_params()['penalty']
round(model_tfr.best_estimator_.get_params()['C'],4)])
print('Table 1 - Optimal Value of Lambda, Train error and test error')

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
# print(x.get_string(title="Australian cities"))
print(x)
Table 1 - Optimal Value of Lambda, Train error and test error

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--------+------------------------+---------+----------------+
| Model | Model Selection Method | penalty | Optimal lambda |
+--------+------------------------+---------+----------------+
| BoW | GridSearch | l2 | 1 |
| TF-IDF | GridSearch | l2 | 1 |
| BoW | Rondomized CV | l2 | 0.4468 |
| TF-IDF | Rondomized CV | l2 | 3.8947 |
+--------+------------------------+---------+----------------+
In [53]: y = PrettyTable()
y.field_names = ["Model","penalty", "Acuuracy"]
y.add_row(["BoW", "L1",round(Accuracy*100,2)])
y.add_row(["BoW", "L2",round(Accuracy2*100,2)])
y.add_row(["TfIDf", "L1",round(Accuracy3*100,2)])
y.add_row(["TfIDF", "L2",round(Accuracy4*100,2)])
print('Table 2 - L1 and L2 regularization')
7
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
# print(x.get_string(title="Australian cities"))
print(y)
Table 2 - L1 and L2 regularization

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-------+---------+----------+
| Model | penalty | Acuuracy |
+-------+---------+----------+
| BoW | L1 | 83.76 |
| BoW | L2 | 83.57 |
| TfIDf | L1 | 84.56 |
| TfIDF | L2 | 84.56 |
+-------+---------+----------+
In [44]: #https://stackoverflow.com/questions/22276066/how-to-plot-multiple-functions-on-the-sam
from sklearn import preprocessing
# [0.001,0.005,0.01,0.05,0.1,0.5,1]
#scaling to plot them
Sparse =np.array(Sparse)
Sparse.reshape(1,-1)
Score = np.array(Score)
Score.reshape(1,-1)
Sparse_normalized = (Sparse-min(Sparse))/(max(Sparse)-min(Sparse))
Score_normalized = (Score-min(Score))/(max(Score)-min(Score))
C=np.array(C)
lambd=1/C
#https://matplotlib.org/users/legend_guide.html
plt.loglog(lambd, Sparse_normalized, label='Sparse') # plotting t, a separately
plt.loglog(lambd, Score_normalized, label='Error') # plotting t, b separately
plt.xlabel('Lambda')
plt.ylabel('Normalized log Scale')
plt.title('Realtion between Lambda, Error and Sparsity')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
8
9

Logistic Regression To Amazon Reviews

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Logistic Regression To Amazon Reviews

Uploaded by

Copyright:

Available Formats

Logistic Regression to Amazon reviews

1 5] Exercise: Apply Logistic regression to Amazon reviews data set.

In [5]: con = sqlite3.connect('/home/rahul/Desktop/MLAAIC/database.sqlite')

1.1 Data Preprocessing

Total number of Datapoints - 393931

Total Number Features or Independent Variable - 10

The Positive review has label "positive" and "307052" datapoint

1.1.1 Text preprocessing

def cleanhtml(sentence): #function to clean the word of any html-tags

Out[12]: (80000, 60314)

In [13]: tfidf = TfidfVectorizer(stop_words='english')

Note after label encoding "positive" is "class-1" and "negative" is "class0"

In [17]: tuned_parameters = [{'penalty': ['l1'],'C': [10**-4, 10**-2, 10**0, 10**2, 10**4]},

from sklearn.model_selection import GridSearchCV

Out[17]: GridSearchCV(cv=5, error_score='raise',

In [19]: BoW_Grid = pd.DataFrame.from_dict(modelBoW.cv_results_)

# Create regularization hyperparameter distribution using uniform distribution

# Create hyperparameter options

Out[23]: RandomizedSearchCV(cv=5, error_score='raise',

In [24]: BoW_Rcv = pd.DataFrame.from_dict(modelBoWr.cv_results_)

3.1 L1 and L2 regularization

clfBoW = LogisticRegression(penalty = 'l1')

clftf = LogisticRegression(penalty = 'l1')

clftf1 = LogisticRegression(penalty = 'l1')

3.2 labda vs Sparsity vs Error

3.3 Multi colinearity and feature importance

x.add_row(["BoW", "GridSearch", modelBoW.best_estimator_.get_params()['penalty'],

print('Table 1 - Optimal Value of Lambda, Train error and test error')

Table 1 - Optimal Value of Lambda, Train error and test error

print('Table 2 - L1 and L2 regularization')

Table 2 - L1 and L2 regularization

You might also like

In [17]: tuned_parameters = [{'penalty': ['l1'],'C': [10-4, 10-2, 100, 102, 10**4]},