Professional Documents
Culture Documents
1 Assigment - 4
1.1 Apply Naive Bayes to Amazon reviews
Note: Used 5000 review 2500 positive and 2500 negative
import warnings
warnings.filterwarnings('ignore')
In [2]: df=pd.read_pickle('./Amazon_5000.pkl')
#sorting values TBS
df = df.sort_values(['Time'],ascending=True)
In [3]: #Creating Train and test data (80-20 split, Since i ahve small dataset i need more data
# therefore converted in 80-20 split instead of 70-30)
## 80% of sorted data will = Total number of rows (sorted) *0.8
X_train = df.iloc[:int(len(df)*.8),1]
Y_train = df.iloc[:int(len(df)*.8),-1]
# len(Y_train)== len(X_train)
X_test = df.iloc[int(len(df)*.8)+1:,1]
Y_test = df.iloc[int(len(df)*.8)+1:,-1]
# len(Y_test)== len(X_test)
1
In [4]: # Label encoding
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Ytrain = le.fit_transform(Y_train)
Ytest = le.transform(Y_test)
2 BoW
In [5]: # #BoW without bigram
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
Xtrain1 = vectorizer.fit_transform(X_train.values)
Xtest1 = vectorizer.transform(X_test.values)
# Xtest.shape
In [6]: score_1 = []
alpha = np.arange(0.001,10,0.1)
# alpha.shape
for Alpha in alpha:
clf=MultinomialNB(alpha=Alpha)
scores = cross_val_score(clf, Xtrain1, Ytrain, cv=10, scoring='accuracy')
# print('Aplha =',round(Alpha,5),'\t\t\tScore =',round(scores.mean(),5))
score_1.append(scores.mean())
In [8]: clf1=MultinomialNB(alpha=alpha_BoW)
clf1.fit( Xtrain1, Ytrain)
pred_BoW=clf1.predict(Xtest1)
Accuracy1=round(accuracy_score(Ytest,pred_BoW,normalize=True),4)
testError1 = 1-Accuracy1
2
class_count = clf1.class_count_
pos_points_prob_sort = clf1.feature_log_prob_[1, :].argsort()
neg_points_prob_sort = clf1.feature_log_prob_[0, :].argsort()
log_prob = clf1.feature_log_prob_
feature_prob= pd.DataFrame(log_prob, columns=features).T
feature_prob.columns = [le.inverse_transform(0),le.inverse_transform(1)]
top_positive_BoW = feature_prob['positive'].sort_values(ascending=False)[:10]
top_negative_BoW = feature_prob['negative'].sort_values(ascending=False)[:10]
positive_BoW= pd.Series.to_frame(top_positive_BoW)
positive_BoW.reset_index(level=0, inplace=True)
positive_BoW.columns = ['BoW_Words_Pos','BoW_Prob']
negative_BoW= pd.Series.to_frame(top_negative_BoW)
negative_BoW.reset_index(level=0, inplace=True)
negative_BoW.columns = ['BoW_Words_neg','BoW_Prob']
BoW_FI= pd.concat([positive_BoW,negative_BoW], axis=1)
print('Feature importance - BoW')
BoW_FI
3 Tf-IDF
In [13]: from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_vect = TfidfVectorizer()
Xtrain_tf_idf = tf_idf_vect.fit_transform(X_train.values)
Xtest_tf_idf = tf_idf_vect.transform(X_test.values)
In [15]: score_2 = []
for Alpha in alpha:
clf=MultinomialNB(alpha=Alpha)
scores = cross_val_score(clf, Xtrain_tf_idf, Ytrain, cv=10, scoring='accuracy')
# print('Aplha =',round(Alpha,5),'\t\t\tScore =',round(scores.mean(),5))
score_2.append(scores.mean())
3
In [18]: # changing to misclassification error
MSE_2 = [1 - x for x in score_2]
Training_Error2=round(1-max(score_2),4)
alpha_tfidf = round(alpha[MSE_2.index(min(MSE_2))],3)
In [24]: clf2=MultinomialNB(alpha=alpha_tfidf)
clf2.fit( Xtrain_tf_idf, Ytrain)
pred_tfidf=clf2.predict(Xtest_tf_idf)
Accuracy2=round(accuracy_score(Ytest,pred_tfidf,normalize=True),4)
testError2 = 1-Accuracy2
positive= pd.Series.to_frame(top_positive_tfidf)
positive.reset_index(level=0, inplace=True)
positive.columns = ['Tf-IDF_Words_Pos','Tf-IDF_Prob']
negative= pd.Series.to_frame(top_negative_tfidf)
negative.reset_index(level=0, inplace=True)
negative.columns = ['TFIDF_Words_neg','TFIDF_Prob']
TfIDF_FI= pd.concat([positbive,negative], axis=1)
print('Feature importance - TF-IDF')
TfIDF_FI
4
2 coffe -6.365689 product -6.316280
3 good -6.376578 coffe -6.465138
4 flavor -6.430549 would -6.574775
5 like -6.436484 flavor -6.585915
6 tea -6.450763 tri -6.613772
7 tast -6.501234 one -6.643812
8 one -6.600750 dog -6.728935
9 use -6.628035 buy -6.749567
4 Avg W2V
In [35]: from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle
list_sent.append(filtered_sentence)
return list_sent
5
cnt_words += 1
except:
pass
sent_vec /= cnt_words
Traing_vectors.append(sent_vec)
In [51]: score_3 = []
for Alpha in alpha:
clf=MultinomialNB(alpha=Alpha)
scores = cross_val_score(clf, xtrain, Ytrain, cv=10, scoring='accuracy')
# print('Aplha =',round(Alpha,5),'\t\t\tScore =',round(scores.mean(),5))
score_3.append(scores.mean())
In [55]: Test_vectors = []# the avg-w2v for each sentence/review is stored in this list
for sent in W2VTest_list: # for each review/sentence
i=0
sent_vec = np.zeros(50) # as word vectors are of zero length
cnt_words =1 # num of words with a valid vector in the sentence/review
for word in sent: # for each word in a review/sentence
try:
vec = w2v_model.wv[word]
sent_vec += vec
cnt_words += 1
except:
pass
# i+=1
sent_vec /= cnt_words
# print(i,'\t',cnt_words)
Test_vectors.append(sent_vec)
xtest=np.array(Test_vectors)+2
# xtest.min()
Out[55]: 0.8316740879887028
In [63]: clf3=MultinomialNB(alpha=alpha_W2V)
clf3.fit( xtrain, Ytrain)
pred_W2V=clf3.predict(xtest)
Accuracy3=round(accuracy_score(Ytest,pred_W2V,normalize=True),4)
testError3 = 1-Accuracy3
6
In [65]: #confusion matrix
cm3=confusion_matrix(Ytest, pred_W2V)
p3=precision_score(Ytest, pred_W2V, average=None)
r3=recall_score(Ytest, pred_W2V, average=None)
5 TF-IDF W2V
In [67]: tfidf_feat = tf_idf_vect.get_feature_names()
# tfidf_feat = tf_idf_vect.get_feature_names() # tfidf words/col-names
# final_tf_idf is the sparse matrix with row= sentence, col=word and cell_val = tfidf
weight_sum += tfidf
# print(weight_sum)
except:
pass
sent_vec /= weight_sum
# print(sent_vec)
tfidf_train_vectors.append(sent_vec)
row += 1
Xtrain =np.array(tfidf_train_vectors)+2
In [68]: score_4 = []
for Alpha in alpha:
clf=MultinomialNB(alpha=Alpha)
scores = cross_val_score(clf, Xtrain, Ytrain, cv=10, scoring='accuracy')
# print('Aplha =',round(Alpha,5),'\t\t\tScore =',round(scores.mean(),5))
score_4.append(scores.mean())
7
MSE_4 = [1 - x for x in score_4]
Training_Error4=round(1-max(score_4),4)
alpha_tfW2V = round(alpha[MSE_4.index(min(MSE_4))],3)
In [81]: tfidf_test_vectors = []; # the tfidf-w2v for each sentence/review is stored in this lis
row=0;
for sent in W2VTest_list: # for each review/sentence
sent_vec = np.zeros(50) # as word vectors are of zero length
weight_sum =1; # num of words with a valid vector in the sentence/review
for word in sent: # for each word in a review/sentence
try:
vec = w2v_model.wv[word]
# print(vec)
# obtain the tf_idfidf of a word in a sentence/review
tfidf = Xtrain_tf[row, tfidf_feat.index(word)]
# print(tfidf)
sent_vec += (vec * tfidf)
# print(sent_vec)
weight_sum += tfidf
# print(weight_sum)
except:
pass
sent_vec /= weight_sum
# print(sent_vec)
tfidf_test_vectors.append(sent_vec)
row += 1
Xtest=np.array(tfidf_test_vectors)+2
In [82]: clf4=MultinomialNB(alpha=alpha_tfW2V)
clf4.fit( Xtrain, Ytrain)
pred_tfW2V=clf4.predict(Xtest)
Accuracy4=round(accuracy_score(Ytest,pred_tfW2V,normalize=True),4)
testError4 = round(1-Accuracy4,4)
6 Conclusion Results
In [84]: print('\tTable 1 - Comparing Value of alpha, Train error and test error')
print('\t~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
8
print('\n')
print("\t\t BoW\t\t TF-ID\t\t Avg W2V\t TF-ID W2V")
print("\t\t ---\t\t -----\t\t -------\t ---------")
print("Optimal Alpha\t",alpha_BoW,"\t\t",alpha_tfidf,"\t\t",alpha_W2V,"\t\t",alpha_tfW2
print("Training Error\t",Training_Error1,"\t",Training_Error2,"\t\t",Training_Error3,"\
print("Test Error\t",testError1,"\t",testError2,"\t",testError3,"\t",testError4)
print("Accuracy\t\t ",Accuracy1*100,"\t",Accuracy2*100,"\t\t",Accuracy3*100,"\t\t",Accu
print('\n')
print("f1 Score-a)Positive\t ",round(f1_1[1],4),"\t",round(f1_2[1],4),'\t',round(f1_3[1
print(" b)Negative\t ",round(f1_1[0],4),"\t",round(f1_2[0],4),'\t',round(f1_3[0
print('\n')
print("PrecisionScore-a)Positive",round(p1[1],4),"\t",round(p2[1],4),'\t',round(p3[1],4
print(" b)Negative",round(p1[0],4),"\t",round(p2[0],4),'\t',round(p3[0],4
print('\n')
print("RecallScore-a)Positive\t ",round(r1[1],4),"\t",round(r2[1],4),'\t',round(r3[1],4
print(" b)Negative\t ",round(r1[0],4),"\t",round(r2[0],4),'\t',round(r3[0],4
print('\n')
9
PrecisionScore-a)Positive 0.8233 0.8131 0.5635 0.0
b)Negative 0.8533 0.8707 0.5695 0.5045
10
11