NavieBayes - Revised

NavieBayes- Revised
October 10, 2018
1 Assigment - 4
1.1 Apply Naive Bayes to Amazon reviews
Note: Used 5000 review 2500 positive and 2500 negative
In [1]: %matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sn
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score
# from sklearn import cross_validation
from sklearn.model_selection import cross_val_score
#from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score,precision_score,recall_score
import warnings
warnings.filterwarnings('ignore')
In [2]: df=pd.read_pickle('./Amazon_5000.pkl')
#sorting values TBS
df = df.sort_values(['Time'],ascending=True)
In [3]: #Creating Train and test data (80-20 split, Since i ahve small dataset i need more data
# therefore converted in 80-20 split instead of 70-30)
## 80% of sorted data will = Total number of rows (sorted) *0.8
X_train = df.iloc[:int(len(df)*.8),1]
Y_train = df.iloc[:int(len(df)*.8),-1]
# len(Y_train)== len(X_train)
X_test = df.iloc[int(len(df)*.8)+1:,1]
Y_test = df.iloc[int(len(df)*.8)+1:,-1]
# len(Y_test)== len(X_test)
1
In [4]: # Label encoding
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Ytrain = le.fit_transform(Y_train)
Ytest = le.transform(Y_test)
2 BoW
In [5]: # #BoW without bigram
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
Xtrain1 = vectorizer.fit_transform(X_train.values)
Xtest1 = vectorizer.transform(X_test.values)
# Xtest.shape
In [6]: score_1 = []
alpha = np.arange(0.001,10,0.1)
# alpha.shape
for Alpha in alpha:
clf=MultinomialNB(alpha=Alpha)
scores = cross_val_score(clf, Xtrain1, Ytrain, cv=10, scoring='accuracy')
# print('Aplha =',round(Alpha,5),'\t\t\tScore =',round(scores.mean(),5))
score_1.append(scores.mean())
In [17]: # changing to misclassification error

MSE_1 = [1 - x for x in score_1]
Training_Error1=round(1-max(score_1),4)
alpha_BoW = round(alpha[MSE_1.index(min(MSE_1))],3)
In [8]: clf1=MultinomialNB(alpha=alpha_BoW)
clf1.fit( Xtrain1, Ytrain)
pred_BoW=clf1.predict(Xtest1)
Accuracy1=round(accuracy_score(Ytest,pred_BoW,normalize=True),4)
testError1 = 1-Accuracy1
In [11]: #confusion matrix

cm1=confusion_matrix(Ytest, pred_BoW)
p1=precision_score(Ytest, pred_BoW, average=None)
r1=recall_score(Ytest, pred_BoW, average=None)
f1_1= (f1_score(Ytest, pred_BoW, average=None))

df_cm1 = pd.DataFrame(cm1,columns=[le.inverse_transform(0),le.inverse_transform(1)]
,index=[le.inverse_transform(0),le.inverse_transform(1)])
In [12]: #BoW Feature importance

features = vectorizer.get_feature_names()
feature_count = clf1.feature_count_
2
class_count = clf1.class_count_
pos_points_prob_sort = clf1.feature_log_prob_[1, :].argsort()
neg_points_prob_sort = clf1.feature_log_prob_[0, :].argsort()
log_prob = clf1.feature_log_prob_
feature_prob= pd.DataFrame(log_prob, columns=features).T
feature_prob.columns = [le.inverse_transform(0),le.inverse_transform(1)]
top_positive_BoW = feature_prob['positive'].sort_values(ascending=False)[:10]
top_negative_BoW = feature_prob['negative'].sort_values(ascending=False)[:10]
positive_BoW= pd.Series.to_frame(top_positive_BoW)
positive_BoW.reset_index(level=0, inplace=True)
positive_BoW.columns = ['BoW_Words_Pos','BoW_Prob']
negative_BoW= pd.Series.to_frame(top_negative_BoW)
negative_BoW.reset_index(level=0, inplace=True)
negative_BoW.columns = ['BoW_Words_neg','BoW_Prob']
BoW_FI= pd.concat([positive_BoW,negative_BoW], axis=1)
print('Feature importance - BoW')
BoW_FI
Feature importance - BoW
Out[12]: BoW_Words_Pos BoW_Prob BoW_Words_neg BoW_Prob

0 like -4.821223 tast -4.583463
1 tast -4.936946 like -4.628531
2 flavor -5.010554 product -4.847117
3 good -5.031487 one -5.157214
4 one -5.109834 tri -5.158664
5 love -5.115570 flavor -5.174748
6 great -5.130054 would -5.201639
7 coffe -5.191737 coffe -5.295225
8 use -5.199527 food -5.311992
9 tea -5.282650 good -5.459290
3 Tf-IDF
In [13]: from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_vect = TfidfVectorizer()
Xtrain_tf_idf = tf_idf_vect.fit_transform(X_train.values)
Xtest_tf_idf = tf_idf_vect.transform(X_test.values)
In [15]: score_2 = []
for Alpha in alpha:
scores = cross_val_score(clf, Xtrain_tf_idf, Ytrain, cv=10, scoring='accuracy')
3
alpha_tfidf = round(alpha[MSE_2.index(min(MSE_2))],3)
In [24]: clf2=MultinomialNB(alpha=alpha_tfidf)
clf2.fit( Xtrain_tf_idf, Ytrain)
pred_tfidf=clf2.predict(Xtest_tf_idf)
Accuracy2=round(accuracy_score(Ytest,pred_tfidf,normalize=True),4)

cm2=confusion_matrix(Ytest, pred_tfidf)
p2=precision_score(Ytest, pred_tfidf, average=None)
r2=recall_score(Ytest, pred_tfidf, average=None)
f1_2= (f1_score(Ytest, pred_tfidf, average=None))

In [34]: #BoW Feature importance

features = tf_idf_vect.get_feature_names()
feature_count = clf2.feature_count_
class_count = clf2.class_count_
pos_points_prob_sort = clf2.feature_log_prob_[1, :].argsort()
neg_points_prob_sort = clf2.feature_log_prob_[0, :].argsort()
log_prob = clf2.feature_log_prob_
feature_prob= pd.DataFrame(log_prob, columns=features).T
feature_prob.columns = [le.inverse_transform(0),le.inverse_transform(1)]
top_positive_tfidf = feature_prob['positive'].sort_values(ascending=False)[:10]
top_negative_tfidf = feature_prob['negative'].sort_values(ascending=False)[:10]
positive= pd.Series.to_frame(top_positive_tfidf)
positive.reset_index(level=0, inplace=True)
positive.columns = ['Tf-IDF_Words_Pos','Tf-IDF_Prob']
negative= pd.Series.to_frame(top_negative_tfidf)
negative.reset_index(level=0, inplace=True)
negative.columns = ['TFIDF_Words_neg','TFIDF_Prob']
TfIDF_FI= pd.concat([positbive,negative], axis=1)
print('Feature importance - TF-IDF')
TfIDF_FI
Feature importance - TF-IDF
Out[34]: Tf-IDF_Words_Pos Tf-IDF_Prob TFIDF_Words_neg TFIDF_Prob

0 great -6.242255 tast -6.107194
1 love -6.315906 like -6.216595
4
2 coffe -6.365689 product -6.316280
3 good -6.376578 coffe -6.465138
4 flavor -6.430549 would -6.574775
5 like -6.436484 flavor -6.585915
6 tea -6.450763 tri -6.613772
7 tast -6.501234 one -6.643812
8 one -6.600750 dog -6.728935
9 use -6.628035 buy -6.749567
4 Avg W2V
In [35]: from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=
In [36]: def list_of_sent(df):

i=0
list_sent=[]
for sent in df.values:
filtered_sentence=[]
for w in sent.split():
for cleaned_words in w.split():
if(cleaned_words.isalpha()):
filtered_sentence.append(cleaned_words.lower())
else:
continue
list_sent.append(filtered_sentence)
return list_sent
In [56]: W2VTraining_list = list_of_sent(X_train)

W2VTest_list = list_of_sent(X_test)
# W2VTraining_list
list_all_words = list_of_sent(df['WoutStopWords'])
w2v_model = Word2Vec(list_all_words,min_count=5,size=50, workers=4)
words = list(w2v_model.wv.vocab)
# print(len(words))
Traing_vectors = []# the avg-w2v for each sentence/review is stored in this list
for sent in W2VTraining_list: # for each review/sentence
sent_vec = np.zeros(50) # as word vectors are of zero length
cnt_words =0 # num of words with a valid vector in the sentence/review
for word in sent: # for each word in a review/sentence
try:
vec = w2v_model.wv[word]
sent_vec += vec
5
cnt_words += 1
except:
pass
sent_vec /= cnt_words
Traing_vectors.append(sent_vec)
# scaling training vector

xtrain=np.array(Traing_vectors)+2
# x.min()
In [51]: score_3 = []
for Alpha in alpha:
scores = cross_val_score(clf, xtrain, Ytrain, cv=10, scoring='accuracy')

alpha_W2V = round(alpha[MSE_3.index(min(MSE_3))],3)
In [55]: Test_vectors = []# the avg-w2v for each sentence/review is stored in this list
for sent in W2VTest_list: # for each review/sentence
i=0
cnt_words =1 # num of words with a valid vector in the sentence/review
try:
sent_vec += vec
cnt_words += 1
except:
pass
# i+=1
sent_vec /= cnt_words
# print(i,'\t',cnt_words)
Test_vectors.append(sent_vec)
xtest=np.array(Test_vectors)+2
# xtest.min()
Out[55]: 0.8316740879887028
In [63]: clf3=MultinomialNB(alpha=alpha_W2V)
clf3.fit( xtrain, Ytrain)
pred_W2V=clf3.predict(xtest)
Accuracy3=round(accuracy_score(Ytest,pred_W2V,normalize=True),4)
6
cm3=confusion_matrix(Ytest, pred_W2V)
p3=precision_score(Ytest, pred_W2V, average=None)
r3=recall_score(Ytest, pred_W2V, average=None)
f1_3= (f1_score(Ytest, pred_W2V, average=None))

5 TF-IDF W2V
In [67]: tfidf_feat = tf_idf_vect.get_feature_names()
# tfidf_feat = tf_idf_vect.get_feature_names() # tfidf words/col-names
# final_tf_idf is the sparse matrix with row= sentence, col=word and cell_val = tfidf
tfidf_train_vectors = []; # the tfidf-w2v for each sentence/review is stored in this li

row=0;
for sent in W2VTraining_list: # for each review/sentence
weight_sum =0; # num of words with a valid vector in the sentence/review
try:
# print(vec)
# obtain the tf_idfidf of a word in a sentence/review
tfidf = Xtrain_tf_idf[row, tfidf_feat.index(word)]
# print(tfidf)
sent_vec += (vec * tfidf)
# print(sent_vec)
weight_sum += tfidf
# print(weight_sum)
except:
pass
sent_vec /= weight_sum
# print(sent_vec)
tfidf_train_vectors.append(sent_vec)
row += 1
Xtrain =np.array(tfidf_train_vectors)+2
In [68]: score_4 = []
for Alpha in alpha:
scores = cross_val_score(clf, Xtrain, Ytrain, cv=10, scoring='accuracy')
7
alpha_tfW2V = round(alpha[MSE_4.index(min(MSE_4))],3)
In [81]: tfidf_test_vectors = []; # the tfidf-w2v for each sentence/review is stored in this lis
row=0;
for sent in W2VTest_list: # for each review/sentence
weight_sum =1; # num of words with a valid vector in the sentence/review
try:
# print(vec)
# obtain the tf_idfidf of a word in a sentence/review
tfidf = Xtrain_tf[row, tfidf_feat.index(word)]
# print(tfidf)
sent_vec += (vec * tfidf)
# print(sent_vec)
weight_sum += tfidf
# print(weight_sum)
except:
pass
sent_vec /= weight_sum
# print(sent_vec)
tfidf_test_vectors.append(sent_vec)
row += 1
Xtest=np.array(tfidf_test_vectors)+2
In [82]: clf4=MultinomialNB(alpha=alpha_tfW2V)
clf4.fit( Xtrain, Ytrain)
pred_tfW2V=clf4.predict(Xtest)
Accuracy4=round(accuracy_score(Ytest,pred_tfW2V,normalize=True),4)
testError4 = round(1-Accuracy4,4)

cm4=confusion_matrix(Ytest, pred_tfW2V)
p4=precision_score(Ytest, pred_tfW2V, average=None)
r4=recall_score(Ytest, pred_tfW2V, average=None)
f1_4= (f1_score(Ytest, pred_tfW2V, average=None))

6 Conclusion Results
In [84]: print('\tTable 1 - Comparing Value of alpha, Train error and test error')
print('\t~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
8
print('\n')
print("\t\t BoW\t\t TF-ID\t\t Avg W2V\t TF-ID W2V")
print("\t\t ---\t\t -----\t\t -------\t ---------")
print("Optimal Alpha\t",alpha_BoW,"\t\t",alpha_tfidf,"\t\t",alpha_W2V,"\t\t",alpha_tfW2
print("Training Error\t",Training_Error1,"\t",Training_Error2,"\t\t",Training_Error3,"\
print("Test Error\t",testError1,"\t",testError2,"\t",testError3,"\t",testError4)
Table 1 - Comparing Value of alpha, Train error and test error

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BoW TF-ID Avg W2V TF-ID W2V

--- ----- ------- ---------
Optimal Alpha 4.401 3.101 6.001 0.001
Training Error 0.1732 0.166 0.4615 0.4619
Test Error 0.1622 0.1602 0.4334 0.4955
In [86]: print('\t\t\tTable 2 - Perfomance Measurements')

print('\t\t\t~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print('\n')
print("\t\t\t BoW\t\t TF-ID\t\t Avg W2V\t TF-ID W2V")
print("\t\t\t ---\t\t -----\t\t -------\t ---------")
print("Accuracy\t\t ",Accuracy1*100,"\t",Accuracy2*100,"\t\t",Accuracy3*100,"\t\t",Accu
print('\n')
print("f1 Score-a)Positive\t ",round(f1_1[1],4),"\t",round(f1_2[1],4),'\t',round(f1_3[1
print(" b)Negative\t ",round(f1_1[0],4),"\t",round(f1_2[0],4),'\t',round(f1_3[0
print('\n')
print("PrecisionScore-a)Positive",round(p1[1],4),"\t",round(p2[1],4),'\t',round(p3[1],4
print(" b)Negative",round(p1[0],4),"\t",round(p2[0],4),'\t',round(p3[0],4
print('\n')
print("RecallScore-a)Positive\t ",round(r1[1],4),"\t",round(r2[1],4),'\t',round(r3[1],4
print(" b)Negative\t ",round(r1[0],4),"\t",round(r2[0],4),'\t',round(r3[0],4
print('\n')
Table 2 - Perfomance Measurements

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BoW TF-ID Avg W2V TF-ID W2V

--- ----- ------- ---------
Accuracy 83.78 83.98 56.66 83.78
f1 Score-a)Positive 0.8396 0.8447 0.5595 0.0

b)Negative 0.836 0.8347 0.5734 0.6707
9
PrecisionScore-a)Positive 0.8233 0.8131 0.5635 0.0
b)Negative 0.8533 0.8707 0.5695 0.5045
RecallScore-a)Positive 0.8566 0.8788 0.5556 0.0

b)Negative 0.8194 0.8016 0.5774 1.0
In [80]: # Confusion Matrix

# fig, axs = plt.subplots(2,2, figsize=(8,8), sharey=True)
fig = plt.figure(figsize=(15,14))
ax1 = fig.add_subplot(221)
sn.set(font_scale=1.4)#for label size

sn.heatmap(df_cm1, annot=True,annot_kws={"size": 12},ax=ax1)# font size
ax1.set_title('BoW - Confusion Matrix')

ax2.set_title('TF-IDF - Confusion Matrix')

ax3.set_title('Avg W2V - Confusion Matrix')

ax4.set_title('Tf-IDF - Confusion Matrix')
Out[80]: Text(0.5,1,'Tf-IDF - Confusion Matrix')
10
11

NavieBayes - Revised

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

NavieBayes - Revised

Uploaded by

Copyright:

Available Formats

NavieBayes- Revised

October 10, 2018

In [1]: %matplotlib inline

import matplotlib.pyplot as plt

from sklearn.naive_bayes import MultinomialNB

In [17]: # changing to misclassification error

In [11]: #confusion matrix

f1_1= (f1_score(Ytest, pred_BoW, average=None))

In [12]: #BoW Feature importance

Feature importance - BoW

Out[12]: BoW_Words_Pos BoW_Prob BoW_Words_neg BoW_Prob

In [27]: #confusion matrix

f1_2= (f1_score(Ytest, pred_tfidf, average=None))

In [34]: #BoW Feature importance

Feature importance - TF-IDF

Out[34]: Tf-IDF_Words_Pos Tf-IDF_Prob TFIDF_Words_neg TFIDF_Prob

model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=

In [36]: def list_of_sent(df):

In [56]: W2VTraining_list = list_of_sent(X_train)

# scaling training vector

In [52]: # changing to misclassification error

f1_3= (f1_score(Ytest, pred_W2V, average=None))

tfidf_train_vectors = []; # the tfidf-w2v for each sentence/review is stored in this li

In [70]: # changing to misclassification error

In [83]: #confusion matrix

f1_4= (f1_score(Ytest, pred_tfW2V, average=None))

Table 1 - Comparing Value of alpha, Train error and test error

BoW TF-ID Avg W2V TF-ID W2V

In [86]: print('\t\t\tTable 2 - Perfomance Measurements')

Table 2 - Perfomance Measurements

BoW TF-ID Avg W2V TF-ID W2V

f1 Score-a)Positive 0.8396 0.8447 0.5595 0.0

RecallScore-a)Positive 0.8566 0.8788 0.5556 0.0

In [80]: # Confusion Matrix

sn.set(font_scale=1.4)#for label size

sn.set(font_scale=1.4)#for label size

sn.set(font_scale=1.4)#for label size

sn.set(font_scale=1.4)#for label size

Out[80]: Text(0.5,1,'Tf-IDF - Confusion Matrix')

You might also like