In [1]:
import pandas as pd
import numpy as np

import requests 
from collections import OrderedDict

import boto3

read data

In [5]:
df = pd.read_csv("1429_1.csv") 
df.head()
/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3063: DtypeWarning: Columns (1,10) have mixed types.Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Out[5]:
id name asins brand categories keys manufacturer reviews.date reviews.dateAdded reviews.dateSeen ... reviews.doRecommend reviews.id reviews.numHelpful reviews.rating reviews.sourceURLs reviews.text reviews.title reviews.userCity reviews.userProvince reviews.username
0 AVqkIhwDv8e3D1O-lebb All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,... B01AHB9CN2 Amazon Electronics,iPad & Tablets,All Tablets,Fire Ta... 841667104676,amazon/53004484,amazon/b01ahb9cn2... Amazon 2017-01-13T00:00:00.000Z 2017-07-03T23:33:15Z 2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z ... True NaN 0.0 5.0 http://reviews.bestbuy.com/3545/5620406/review... This product so far has not disappointed. My c... Kindle NaN NaN Adapter
1 AVqkIhwDv8e3D1O-lebb All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,... B01AHB9CN2 Amazon Electronics,iPad & Tablets,All Tablets,Fire Ta... 841667104676,amazon/53004484,amazon/b01ahb9cn2... Amazon 2017-01-13T00:00:00.000Z 2017-07-03T23:33:15Z 2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z ... True NaN 0.0 5.0 http://reviews.bestbuy.com/3545/5620406/review... great for beginner or experienced person. Boug... very fast NaN NaN truman
2 AVqkIhwDv8e3D1O-lebb All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,... B01AHB9CN2 Amazon Electronics,iPad & Tablets,All Tablets,Fire Ta... 841667104676,amazon/53004484,amazon/b01ahb9cn2... Amazon 2017-01-13T00:00:00.000Z 2017-07-03T23:33:15Z 2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z ... True NaN 0.0 5.0 http://reviews.bestbuy.com/3545/5620406/review... Inexpensive tablet for him to use and learn on... Beginner tablet for our 9 year old son. NaN NaN DaveZ
3 AVqkIhwDv8e3D1O-lebb All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,... B01AHB9CN2 Amazon Electronics,iPad & Tablets,All Tablets,Fire Ta... 841667104676,amazon/53004484,amazon/b01ahb9cn2... Amazon 2017-01-13T00:00:00.000Z 2017-07-03T23:33:15Z 2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z ... True NaN 0.0 4.0 http://reviews.bestbuy.com/3545/5620406/review... I've had my Fire HD 8 two weeks now and I love... Good!!! NaN NaN Shacks
4 AVqkIhwDv8e3D1O-lebb All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,... B01AHB9CN2 Amazon Electronics,iPad & Tablets,All Tablets,Fire Ta... 841667104676,amazon/53004484,amazon/b01ahb9cn2... Amazon 2017-01-12T00:00:00.000Z 2017-07-03T23:33:15Z 2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z ... True NaN 0.0 5.0 http://reviews.bestbuy.com/3545/5620406/review... I bought this for my grand daughter when she c... Fantastic Tablet for kids NaN NaN explore42

5 rows × 21 columns

In [6]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34660 entries, 0 to 34659
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    34660 non-null  object 
 1   name                  27900 non-null  object 
 2   asins                 34658 non-null  object 
 3   brand                 34660 non-null  object 
 4   categories            34660 non-null  object 
 5   keys                  34660 non-null  object 
 6   manufacturer          34660 non-null  object 
 7   reviews.date          34621 non-null  object 
 8   reviews.dateAdded     24039 non-null  object 
 9   reviews.dateSeen      34660 non-null  object 
 10  reviews.didPurchase   1 non-null      object 
 11  reviews.doRecommend   34066 non-null  object 
 12  reviews.id            1 non-null      float64
 13  reviews.numHelpful    34131 non-null  float64
 14  reviews.rating        34627 non-null  float64
 15  reviews.sourceURLs    34660 non-null  object 
 16  reviews.text          34659 non-null  object 
 17  reviews.title         34655 non-null  object 
 18  reviews.userCity      0 non-null      float64
 19  reviews.userProvince  0 non-null      float64
 20  reviews.username      34658 non-null  object 
dtypes: float64(5), object(16)
memory usage: 5.6+ MB

extract relevant columns

In [7]:
df = df[["reviews.rating", "reviews.text"]]
In [8]:
df = df.dropna()
In [9]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 34626 entries, 0 to 34659
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   reviews.rating  34626 non-null  float64
 1   reviews.text    34626 non-null  object 
dtypes: float64(1), object(1)
memory usage: 811.5+ KB
In [22]:
text = df["reviews.text"].values.tolist()
labels = df["reviews.rating"].values.tolist()
labels = [1.0 if label >= 3.0 else 0.0 for label in labels] 
In [24]:
len(labels), np.sum(labels)
Out[24]:
(34626, 33814.0)

comprehend

In [11]:
comprehend = boto3.client('comprehend', region_name = 'us-east-1')
In [12]:
print("review:", text[0]) 
res = comprehend.detect_sentiment(Text = text[0], LanguageCode = 'en') 
print("label:", labels[0])
review: This product so far has not disappointed. My children love to use it and I like the ability to monitor control what content they see with ease.
label: 5.0
In [25]:
ground_truth = []
predictions = [] 
for i in range(1000):
    prediction = comprehend.detect_sentiment(Text = text[i], LanguageCode = 'en').get('Sentiment')
    predictions.append(prediction)
    ground_truth.append(labels[i])
In [26]:
len(ground_truth), len(predictions)
Out[26]:
(1000, 1000)
In [34]:
ground_truth = np.array(ground_truth) 
predictions = [1.0 if prediction=='POSITIVE' else 0.0 for prediction in predictions]
predictions = np.array(predictions) 
In [35]:
accuracy = np.array(ground_truth==predictions).sum() / len(ground_truth)
accuracy
Out[35]:
0.857