import os
# Dataframe
import pandas as pd
# Array
import numpy as np
import itertools
# Visualizations
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
import matplotlib.colors as colors
%matplotlib inline
from wordcloud import WordCloud, STOPWORDS
# Datetime
from datetime import datetime
# text preprocessing
import string
import spacy
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize, regexp_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize.toktok import ToktokTokenizer
import gensim
import re
## Modeling
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import learning_curve
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from gensim.models import Word2Vec
from tqdm import tqdm
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras
from keras.models import Sequential, Input
from keras.layers import Dense
from keras.backend import eval
from keras.optimizers import Adam
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.layers import Dropout
from keras.layers.convolutional import Conv1D,MaxPooling1D
import keras.backend as K
import sys
# Print full numpy arrays instead of truncating them with "..." (handy for
# inspecting tokenized sequences later in the notebook).
np.set_printoptions(threshold=sys.maxsize)
# Download the Kaggle API credentials (kaggle.json) from Google Drive, install
# and configure the Kaggle CLI, then download and unzip the
# "Consumer Reviews of Amazon Products" dataset into the working directory.
# NOTE(review): body uses IPython "!" shell magics, so this only runs inside a
# notebook environment (e.g. Colab), not as plain Python.
def get_dataset():
# Google Drive large-file dance: the first wget captures the confirmation
# token into a cookie jar, the outer wget replays it to fetch kaggle.json
# (the Kaggle API credentials file), then the cookie jar is removed.
!wget --load-cookies /tmp/cookies.txt\
"https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt\
--keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1EzY1IfN_QGCVp9EUVxZ3dZhRF_EUtyJA' -O- \
| sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1EzY1IfN_QGCVp9EUVxZ3dZhRF_EUtyJA" -O kaggle.json && rm -rf /tmp/cookies.txt
# Install the Kaggle CLI and place the credentials where it expects them,
# with the 600 permissions the CLI requires.
!pip install kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
# Fetch and extract the reviews dataset (produces 1429_1.csv among others).
!kaggle datasets download -d datafiniti/consumer-reviews-of-amazon-products
!unzip 'consumer-reviews-of-amazon-products.zip'
def text_process(text, as_string = False, stop_words = None):
    '''
    Cleans a raw review string for bag-of-words / sequence models.

    Performs the following on the input text:
    1- lowercases and removes punctuation characters
    2- removes stopwords
    3- returns the remaining words

    Parameters
    ----------
    text : str
        Raw review text.
    as_string : bool
        If True, return the kept words joined by single spaces;
        otherwise return them as a list (default).
    stop_words : collection of str, optional
        Words to drop. Defaults to the module-level `stopwords_en`
        (NLTK English stopwords with 'not' retained), preserving the
        original behavior while making the function testable in isolation.

    Returns
    -------
    list[str] or str
        The cleaned tokens, as a list or a space-joined string.
    '''
    if stop_words is None:
        stop_words = stopwords_en  # module-level list built from NLTK
    # Drop punctuation and lowercase in a single pass over the characters.
    nopunc = ''.join(char.lower() for char in text if char not in string.punctuation)
    # Whitespace-tokenize, then filter out stopwords.
    kept = [word for word in nopunc.split() if word not in stop_words]
    return ' '.join(kept) if as_string else kept
%%capture
# (%%capture cell magic above suppresses the very verbose wget/pip/unzip output.)
get_dataset()
# Alternative snapshots of the same Datafiniti dataset, kept for reference:
#df = pd.read_csv("Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv")
#df = pd.read_csv("Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv")
df = pd.read_csv("1429_1.csv")
# Notebook display expressions: preview rows, row/column counts, column names.
df.head()
print(df.shape)
df.columns
# Keep only the columns needed for sentiment modelling: the recommendation
# flag, helpfulness votes, star rating, and the review text/title.
relevant_features = ['reviews.doRecommend', 'reviews.numHelpful', 'reviews.rating', 'reviews.text', 'reviews.title']
relevant_df = df[relevant_features]
relevant_df.head()
relevant_df.shape
# Count missing values per column, drop incomplete rows, then verify none remain.
relevant_df.isnull().sum()
relevant_df = relevant_df.dropna()
relevant_df.isnull().sum()
# Binarize star ratings into sentiment labels: <= 3 stars -> 0 (negative),
# > 3 stars -> 1 (positive).
threshold_rating = 3
# NOTE(review): .values may alias the DataFrame's underlying buffer, so the
# in-place assignments below can mutate relevant_df directly; the explicit
# write-back a few lines down makes the result correct either way.
ratings = relevant_df['reviews.rating'].values
np.unique(ratings)
# Order matters here: map <= threshold to 0 first, then > threshold to 1
# (reversing the two lines would collapse everything to a single class).
ratings[ratings <= threshold_rating] = 0
ratings[ratings > threshold_rating] = 1
np.unique(ratings)
relevant_df['reviews.rating'] = ratings
# Convert the boolean doRecommend flag to 0/1 and store it as float64.
do_recommend = relevant_df['reviews.doRecommend'].values
do_recommend[do_recommend == False] = 0
do_recommend[do_recommend == True] = 1
relevant_df['reviews.doRecommend'] = do_recommend.astype(np.float64)
relevant_df.head()
relevant_df.info()
# English stopword list; keep 'not' because negation carries sentiment signal.
stopwords_en = stopwords.words('english')
stopwords_en.remove('not')
# Model inputs: raw review text (X) and the binarized sentiment label (Y).
X = relevant_df['reviews.text']
Y = relevant_df['reviews.rating']
# Sanity-check the cleaning pipeline on the first review.
print(X[0])
print(' '.join(text_process(X[0])))
# Bag-of-words: CountVectorizer delegates all tokenization/cleaning to
# text_process (passed as the analyzer callable).
bow_transformer = CountVectorizer(analyzer = text_process).fit(X)
XX = bow_transformer.transform(X)
type(bow_transformer.vocabulary_)
# BUGFIX: XX.toarray().shape densified the entire sparse document-term matrix
# just to display its shape, which can exhaust memory; the sparse matrix
# exposes the same (n_docs, vocab_size) tuple via .shape directly.
XX.shape
# random_state added so the split (and every downstream metric) is
# reproducible, consistent with the explicit np.random.seed used later.
x_train, x_test, y_train, y_test = train_test_split(XX, Y, test_size = 0.3, random_state = 42)
def _train_and_report(classifier):
    '''
    Fit `classifier` on the bag-of-words training split, print its confusion
    matrix and classification report on the held-out split, and return the
    test-set predictions.

    Factors out the four byte-identical train/evaluate cells; the printed
    output sequence is unchanged.
    '''
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)
    print("confusion matrix")
    print(confusion_matrix(y_test, predictions))
    print("___________________")
    print()
    print("classification report")
    print(classification_report(y_test, predictions))
    print("___________________")
    return predictions

# Baseline classifiers. The module-level names (naive_bayes, svm, knn,
# gb_classifier and their *_predictions) are kept because the comparison
# loop at the bottom of the notebook references them.
naive_bayes = MultinomialNB()
nb_predictions = _train_and_report(naive_bayes)
svm = SVC()
svm_predictions = _train_and_report(svm)
knn = KNeighborsClassifier(n_neighbors = 3)
knn_predictions = _train_and_report(knn)
gb_classifier = XGBClassifier()
gb_predictions = _train_and_report(gb_classifier)
# Prepare inputs for the Keras model: clean each review into a single
# space-joined string of kept tokens.
keras_x = X.values.tolist()
keras_y = Y.values.tolist()
sentences = [text_process(x, as_string = True) for x in keras_x]
# Fit a word->index tokenizer on the cleaned reviews and convert each review
# into a sequence of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
# Inspect the review-length distribution to choose a padding length.
sentences_lens = [len(sequence) for sequence in sequences]
plt.figure(figsize = (10,10))
limit = 30
plt.hist(sentences_lens, bins = np.arange(limit))
plt.xlim(0,limit)
plt.show()
# Pad/truncate every sequence to 15 tokens, cutting the END of long reviews
# ('post') and zero-padding the front of short ones (pad_sequences default).
max_len = 15
sequences = pad_sequences(sequences, maxlen = max_len, truncating = 'post')
# Shuffle sequences and labels with the same fixed-seed permutation.
# NOTE(review): train_test_split below shuffles again without a seed, so this
# explicit shuffle does not by itself make the split reproducible.
np.random.seed(1024)
perm = np.arange((len(sequences)))
np.random.shuffle(perm)
sequences = np.array(sequences)[perm]
labels = np.array(keras_y)[perm]
sequences.shape, labels.shape
train_x, test_x, train_y, test_y = train_test_split(sequences, labels, test_size = 0.3)
train_x.shape, train_y.shape, test_x.shape, test_y.shape
# Embedding-layer sizing derived from the fitted tokenizer.
VOCAB_SIZE = len(tokenizer.word_index) + 1 # +1 is for the reserved 0 for padding
OUTPUT_DIM = 20
INPUT_LENGTH = max_len
K.clear_session()
# BUGFIX: shape must be a tuple; (INPUT_LENGTH) is just the int 15, which
# Input() rejects (it tuple()s the shape) -- (INPUT_LENGTH,) is the intended
# one-element shape.
entry = Input(shape = (INPUT_LENGTH,))
# Embedding -> two stacked LSTMs -> small dense head with a sigmoid output
# for the binary (positive/negative) sentiment label.
model = Embedding(input_dim = VOCAB_SIZE, output_dim= OUTPUT_DIM, input_length = INPUT_LENGTH )(entry)
# return_sequences=True so the second LSTM receives the full timestep output.
model = LSTM(OUTPUT_DIM, return_sequences = True)(model)
model = LSTM(10)(model)
model = Dense(5, activation = 'relu')(model)
model = Dense(1, activation = 'sigmoid')(model)
model = keras.Model(inputs = entry, outputs = model)
print(model.summary())
# NOTE(review): `lr`/`decay` are the legacy argument names matching this
# notebook's (standalone) Keras version.
optimizer = Adam(lr=0.0001, decay=0.0001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.fit(train_x, train_y, batch_size=16, epochs=5, validation_data=(test_x, test_y))
# Qualitative check: take the first 10 negative and 10 positive reviews and
# see how each trained classifier labels them.
sample_negative_rates = relevant_df[relevant_df['reviews.rating']==0.0]['reviews.text'].values[:10]
sample_positive_rates = relevant_df[relevant_df['reviews.rating']==1.0]['reviews.text'].values[:10]
# Re-use the fitted bag-of-words transformer so features match training.
sample_negative_rates = bow_transformer.transform(sample_negative_rates)
sample_positive_rates = bow_transformer.transform(sample_positive_rates)
sample_negative_rates
classifiers = [naive_bayes, svm, knn, gb_classifier]
classifier_names = ['naive bayes', 'SVM', 'KNN', 'GradientBoostClassifier']
for classifier_name, classifier in zip(classifier_names, classifiers) :
    negative_preds = classifier.predict(sample_negative_rates)
    positive_preds = classifier.predict(sample_positive_rates)
    # BUGFIX: for the negative samples the correct label is 0, so accuracy is
    # the fraction predicted 0. The old code printed preds.sum()/len (the
    # fraction predicted 1, i.e. the error rate) while calling it "accuracy".
    negative_acc = (negative_preds == 0).mean()
    # For positive samples (preds == 1).mean() equals the old sum()/len on
    # 0/1 labels -- unchanged value, clearer intent.
    positive_acc = (positive_preds == 1).mean()
    print("accuracy of {} for negative rates is {} and for positive rates is {}".format(classifier_name, negative_acc, positive_acc))