Professional Documents
Culture Documents
October 8, 2018
1 Assignment 2
1.1 t-SNE visualization of Amazon reviews with polarity based color-coding
In [1]: %matplotlib inline
import pickle
import numpy as np
import pandas as pd
import sqlite3
import nltk
from nltk.stem.porter import PorterStemmer
import re
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
In [3]: df = df.dropna()
# https://stackoverflow.com/questions/19913659/pandas-conditional-creation-of-a-series-d
df['review'] = np.where(df['Score']>3, 'positive', 'negative')
# df.review.value_counts()
1
In [7]: # final data set
DataPoints, Features = Amazon_5000.shape
l= Amazon_5000["review"].unique()
x,y= Amazon_5000["review"].value_counts()
print("Final Data \n")
# print(len(l))
print ("Total number of Datapoints -",DataPoints,'\n\n'
'Total Number Features or Independent Variable -',Features-1,'\n')
print('The Positive review has label \"{}\" and \"{}\" datapoint \n'.format(l[0],x))
print('The Negatve review has class label \"{}\" and \"{}\" datapoints'.format(l[1],y))
Final Data
The Negative review has class label "negative" and "2500" datapoints
def cleanpunc(sentence):
    """Clean a sentence of punctuation and special characters.

    The characters ? ! ' ; " # (and |) are deleted outright; the
    separator characters . , ( ) / are replaced with single spaces.
    Returns the cleaned string.
    """
    without_specials = re.sub(r'[?|!|\';|"|#]', r'', sentence)
    return re.sub(r'[.|,|)|(|\|/]', r' ', without_specials)
# # cleaned Text
# Apply cleanpunc first, then cleanhtml, to every raw review text.
Amazon_5000['CleanedText'] = [cleanhtml(cleanpunc(raw)) for raw in Amazon_5000['Text']]
2
s=(sno.stem(cleaned_words.lower()))#.encode('utf8') #remoiving utf give
word.append(s) #list of all words used to describe positive reviews
else:
continue
else:
continue
words=' '.join(str(e) for e in word)
return words
Amazon_5000['WoutStopWords']=list(map(list_of_words,Amazon_5000['CleanedText']))
In [14]: X = Amazon_5000[['Time',"WoutStopWords",'review']]
X = X.sort_values(['Time'],ascending=True)
#using Pickle to save dataset genrated for future refrences
#https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_pickle.html
X.to_pickle("./Amazon_5000.pkl")
2 Bag Of Words
In [36]: from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-Words: learn the vocabulary and build the term-count matrix.
count_vect = CountVectorizer() #in scikit-learn
final_count = count_vect.fit_transform(Amazon_5000['WoutStopWords'].values)
# NOTE(review): .toarray() densifies the sparse matrix -- memory heavy for
# large vocabularies; presumably required by the t-SNE step. Verify.
final_counts=final_count.toarray()
tsne_df.to_csv('Tsne_BoW.csv')
3
3 TF-IDF
In [44]: from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features over the stopword-free review texts.
tf_idf_vect = TfidfVectorizer()
final_tf_idf = tf_idf_vect.fit_transform(Amazon_5000['WoutStopWords'].values)
# Densified in place (sparse matrix is discarded); memory heavy for
# large vocabularies.
final_tf_idf = final_tf_idf.toarray()
tsne_dfifd.to_csv('Tsne_TFID.csv')
4
sns.FacetGrid(tsne_dfifd, hue="labels", size=6).map(plt.scatter, 'Dim_1', 'Dim_2').add_
plt.title("t-SNE TFID")
plt.show()
4 Avg W2V
In [13]: from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle
5
filtered_sentence=[]
for w in sent.split():
for cleaned_words in cleanpunc(w).split():
if(cleaned_words.isalpha()):
filtered_sentence.append(cleaned_words.lower())
else:
continue
list_sent.append(filtered_sentence)
return list_sent
3338
In [21]: Traing_vectors = []  # the avg-w2v vector for each sentence/review
# For each review, average the word2vec vectors of its in-vocabulary words.
for sent in list_all_words:  # for each review/sentence (list of tokens)
    sent_vec = np.zeros(50)  # the model uses 50-dimensional word vectors
    cnt_words = 0  # number of words with a valid vector in this review
    for word in sent:
        try:
            vec = w2v_model.wv[word]
        except KeyError:
            # Out-of-vocabulary word -- skip it. (Was a bare `except: pass`,
            # which also silently hid unrelated errors.)
            continue
        sent_vec += vec
        cnt_words += 1
    if cnt_words > 0:
        # Guard against 0/0 -> NaN vector when every word was out of
        # vocabulary; such reviews now keep the zero vector instead.
        sent_vec /= cnt_words
    Traing_vectors.append(sent_vec)
tsne_W2V.to_csv('Tsne_W2V.csv')
6
5 TF-IDF Weighted Avg W2V
In [76]: from sklearn.feature_extraction.text import TfidfVectorizer
# Refit TF-IDF on the time-sorted frame X for the TF-IDF-weighted
# average-W2V representation below.
tf_idf_vect = TfidfVectorizer()
tf_idf = tf_idf_vect.fit_transform(X['WoutStopWords'].values)
7
try:
vec = w2v_model.wv[word]
# print(vec)
# obtain the tf_idfidf of a word in a sentence/review
tfidf = tf_idf[row, tfidf_feat.index(word)]
# print(tfidf)
sent_vec += (vec * tfidf)
# print(sent_vec)
weight_sum += tfidf
# print(weight_sum)
except:
pass
sent_vec /= weight_sum
# print(sent_vec)
tfidf_train_vectors.append(sent_vec)
row += 1
tsne_W2V.to_csv('Tsne_TFIDW2V.csv')
8
9