by Xuelin Hou xuelin.amy@gmail.com
In this analysis, I performed some basic exploratory data analysis on Amazon review data to reveal the distributions of, and correlations between, different factors.
In the second section, I introduced some basic concepts of text analysis and applied them to the customer reviews. In sentiment analysis, I validated that the sentiment scores matched well with the customers' ratings of the products. In topic modelling, I built an LDA model that clusters the reviews into 5 distinct topics for a better understanding of the reviews.
The dataset was retrieved from Kaggle at https://www.kaggle.com/datafiniti/consumer-reviews-of-amazon-products
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
%matplotlib inline
print('numpy version: ' + np.__version__)
print('pandas version: ' + pd.__version__)
print('seaborn version: ' + sns.__version__)
Let's take a look at the dataset retrieved from Kaggle by loading the data with the pandas read_csv function.
data = pd.read_csv('Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv',
low_memory=False)
data.head(2)
There are more columns than we need, so we simplify the data by keeping only the following columns:
name
primaryCategories
dateAdded
reviews.username
reviews.title
reviews.text
reviews.numHelpful
reviews.rating
data = data[['name','primaryCategories','dateAdded',
'reviews.username',
'reviews.title','reviews.text',
'reviews.numHelpful','reviews.rating']]
data.head(2)
Before summarizing the dataset, I added some additional columns:
reviews.len: length of the review text
hour, ym, dow: hour, year-month, and day-of-week of when the review was added
data['dateAdded'] = pd.to_datetime(data.dateAdded)
data['reviews.len'] = data['reviews.text'].map(len)
data['hour'] = data.dateAdded.dt.strftime('%H')
data['ym'] = data.dateAdded.dt.strftime('%Y-%m')
data['dow'] = data.dateAdded.dt.strftime('%a')
# summary of numeric columns
data.describe()
# summary of categorical columns
data.describe(include=['O'])
After examining the data summary, we can find some preliminary insights.
The univariate summary above may not be sufficient for us to understand the data, so we can add another dimension to partition on. This can be done simply with the groupby function from pandas, aggregating measures such as reviews.numHelpful and reviews.len within each category.
res = data.groupby('primaryCategories')\
.agg(num_product = pd.NamedAgg('name', pd.Series.nunique),
num_reviewer = pd.NamedAgg('reviews.username', pd.Series.nunique),
num_review = pd.NamedAgg('reviews.text', pd.Series.nunique),
avg_review_len = pd.NamedAgg('reviews.len', lambda i: np.round(np.mean(i),2)),
avg_rating = pd.NamedAgg('reviews.rating', lambda i: np.round(np.mean(i),2)),
avg_review_helpful = pd.NamedAgg('reviews.numHelpful', lambda i: np.round(np.mean(i),2))
)
res
fig, axs = plt.subplots(2,3, sharey=True)
fig.set_size_inches(14, 6)
sns.barplot(res['num_product'], res.index, ax = axs[0,0])
sns.barplot(res['num_reviewer'], res.index, ax = axs[0,1])
sns.barplot(res['num_review'], res.index, ax = axs[0,2])
sns.boxplot(data['reviews.len'], data.primaryCategories, ax=axs[1,0])
sns.boxplot(data['reviews.rating'], data.primaryCategories, ax=axs[1,1])
sns.boxplot(data['reviews.numHelpful'], data.primaryCategories, ax=axs[1,2])
plt.tight_layout()
plt.show()
We can see a peak in the number of reviews in 2017-03, spread over 12 different products. This may be due to some bias in data collection and does not necessarily reflect the real distribution of the reviews.
res = data.groupby('ym')\
.agg(num_product = pd.NamedAgg('name', pd.Series.nunique),
num_review = pd.NamedAgg('reviews.text', pd.Series.nunique),
avg_review_len = pd.NamedAgg('reviews.len', lambda i: np.round(np.mean(i),2)),
avg_rating = pd.NamedAgg('reviews.rating', lambda i: np.round(np.mean(i),2))
).reset_index()
res
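To make the spike in 2017-03 easier to spot, here is a minimal plotting sketch of the monthly review counts, using only the res table computed above and the libraries already imported:
# plot the number of distinct reviews per month (sketch based on `res` above)
fig, ax = plt.subplots(figsize=(12, 4))
sns.barplot(x='ym', y='num_review', data=res, color='steelblue', ax=ax)
ax.set_xlabel('year-month')
ax.set_ylabel('number of reviews')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()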
There is some mild correlation between reviews.len and reviews.numHelpful, which may suggest that longer reviews tend to be more helpful for other users.
# numpy, pandas, seaborn and matplotlib have already been imported above
sns.set(style="white")
# convert categorical columns to integers to estimate their correlations
fnames_categorical = ['hour','dow','ym','primaryCategories']
data_ = data.copy()
data_[fnames_categorical] = data_[fnames_categorical].apply(lambda i: pd.factorize(i)[0])
corr = data_.corr()
# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))  # np.bool is deprecated in recent numpy versions
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5})
Because of the sparsity of the data, there are a lot of reviews with a numHelpful of zero, which makes it difficult to see the pattern of the correlation. After removing the zero counts of numHelpful, we are able to see a correlation between review length and the number of helpful votes received.
sns.scatterplot('reviews.len', 'reviews.numHelpful', data = data.query('`reviews.numHelpful` > 0'))
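To put a rough number on the relationship visible in the scatterplot, here is a small sketch that computes Pearson and Spearman correlations on the non-zero subset; it assumes scipy is available, which is not imported elsewhere in this notebook:
# quantify the correlation between review length and helpful votes (sketch; scipy is an extra dependency)
from scipy import stats
nonzero = data.query('`reviews.numHelpful` > 0')
pearson_r, pearson_p = stats.pearsonr(nonzero['reviews.len'], nonzero['reviews.numHelpful'])
spearman_r, spearman_p = stats.spearmanr(nonzero['reviews.len'], nonzero['reviews.numHelpful'])
print('Pearson r = {:.3f} (p = {:.3g})'.format(pearson_r, pearson_p))
print('Spearman rho = {:.3f} (p = {:.3g})'.format(spearman_r, spearman_p))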
# check some random reviews
random_reviews = data.sample(3)
for i in range(len(random_reviews)):
    print('Review #{} ({} stars) {}'.format(i,
          random_reviews['reviews.rating'].iloc[i],
          random_reviews['dateAdded'].iloc[i]))
    print(random_reviews['reviews.title'].iloc[i])
    print(random_reviews['reviews.text'].iloc[i])
    print('-'* 50 + '\r')
Tokenization means breaking sentences into words / phrases.
The following is an example of tokenization that splits on any character that is not alphanumeric.
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer
# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
def process_text(x):
    x = x.lower()
    return tokenizer.tokenize(x)
raw_text = random_reviews['reviews.text'].iloc[0]
print('-'* 60)
print('raw review: \n' + raw_text)
print('-'* 60)
print('tokenized review: \n' + str(process_text(raw_text)))
print('-'* 60)
A word cloud is another interesting visualization that shows the distribution of tokens in the text.
from wordcloud import WordCloud
fig, axs = plt.subplots(2,2)
fig.set_size_inches(14,6)
for i, cate in enumerate(data.primaryCategories.unique()):
    text = '\n'.join(data.loc[data.primaryCategories == cate, 'reviews.text'].values)
    wordcloud = WordCloud(background_color='white').generate(text)
    axs[i // 2, i % 2].imshow(wordcloud, interpolation="bilinear")
    axs[i // 2, i % 2].set_title(cate)
    axs[i // 2, i % 2].axis('off')
fig.tight_layout()
plt.show()
Sentiment analysis scores the sentiment expressed in human text. It can be used to monitor brand awareness and perception, as well as customers' attitudes towards products, by analyzing their reviews.
How does a sentiment score work?
A lexicon-based sentiment model stores lists of positive / neutral / negative keywords and compares the tokens against those lists to compute the individual pos/neu/neg scores. Finally, a combined (compound) score is aggregated as the sentiment score for the sentence.
Following is an example of sentiment analysis of a random review.
analyzer = SentimentIntensityAnalyzer()
text = random_reviews['reviews.text'].iloc[0]
print(text)
analyzer.polarity_scores(text)
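The compound score ranges from -1 (most negative) to +1 (most positive). A common convention, suggested in the VADER documentation, is to treat compound >= 0.05 as positive and compound <= -0.05 as negative; the exact thresholds are a modelling choice. A small sketch of labelling a review this way:
# map a compound score to a coarse sentiment label (thresholds follow the common convention; adjust as needed)
def sentiment_label(text):
    compound = analyzer.polarity_scores(text)['compound']
    if compound >= 0.05:
        return 'positive'
    elif compound <= -0.05:
        return 'negative'
    return 'neutral'

print(sentiment_label(text))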
pos_reviews = data.loc[data['reviews.rating'] == 5, :].sample(2)
neg_reviews = data.loc[data['reviews.rating'] == 1, :].sample(2)
random_reviews = pd.concat([pos_reviews, neg_reviews])
scores = random_reviews['reviews.text'].map(lambda i: analyzer.polarity_scores(i)['compound'])
random_reviews['score'] = scores
Below are some more examples of positive and negative sentiment reviews.
for i in range(len(random_reviews)):
    print('Review #{} ({} stars) by {}'.format(i,
          random_reviews['reviews.rating'].iloc[i],
          random_reviews['reviews.username'].iloc[i]))
    print(random_reviews['reviews.title'].iloc[i])
    print('{} (sentiment score: {:0.2f})'.format(random_reviews['reviews.text'].iloc[i],
          random_reviews['score'].iloc[i]))
    print('-'*50 + '\r')
I correlated the sentiment score with the review ratings as a method of validation. Generally speaking, the sentiment score is well correlated with the review ratings. This also makes sense: angry customers write negative reviews and give ratings as low as possible.
scores = data['reviews.text'].map(lambda i: analyzer.polarity_scores(i)['compound'])
data['score'] = scores
# sentiment score distribution
fig, axs = plt.subplots(ncols=4, sharey=True, sharex=True)
fig.set_size_inches(12, 3)
for idx, cate in enumerate(data.primaryCategories.unique()):
sns.distplot(data.loc[data.primaryCategories == cate, 'score'].values, ax = axs[idx])
axs[idx].set_title(cate)
fig.tight_layout()
plt.show()
# relationship between score and rating
sns.violinplot(x='reviews.rating', y='score', data=data)
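To quantify how well the sentiment score tracks the star ratings, a quick sketch using pandas' built-in rank correlation:
# Spearman rank correlation between star rating and VADER compound score (sketch)
rho = data['reviews.rating'].corr(data['score'], method='spearman')
print('Spearman correlation between rating and sentiment score: {:.3f}'.format(rho))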
Topic modelling clusters a corpus of texts into several topics (groups), assuming that each document is a mixture of a small number of topics and that each topic is a distribution over words.
Topic modelling helps us understand the proximity of the review meanings. In the following section, I cluster the reviews into 5 topics, and an interactive visualization is generated to explore how each topic is made up of different words, so that we can use our domain knowledge to come up with a specific topic tag for those reviews.
On some modern e-commerce sites (e.g., Taobao), topic tags are added at the top of the review section so that customers can quickly filter reviews on certain types of topics.
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer
# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
def process_text(x):
    x = x.lower()
    return tokenizer.tokenize(x)
docs = data['reviews.text'].map(process_text)
# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]
# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]
# Remove stopwords (build the set once instead of calling stopwords.words for every token).
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
docs = [[token for token in doc if token not in stop_words] for doc in docs]
# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]
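Note: the stopword list and the WordNet data used above are not bundled with NLTK by default. If they are missing, a one-time download along these lines should fetch them:
# one-time download of the NLTK resources used above (sketch)
import nltk
nltk.download('stopwords')
nltk.download('wordnet')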
# Compute bigrams.
from gensim.models import Phrases
# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)
# Remove rare and common tokens.
from gensim.corpora import Dictionary
# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)
# Filter out words that occur in fewer than 20 documents, or in more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))
# Train LDA model.
from gensim.models import LdaModel
# Set training parameters.
num_topics = 5
chunksize = 2000
passes = 20
iterations = 200
eval_every = None # Don't evaluate model perplexity, takes too much time.
# Make an index-to-word dictionary.
temp = dictionary[0] # This is only to "load" the dictionary.
id2word = dictionary.id2token
model = LdaModel(
corpus=corpus,
id2word=id2word,
chunksize=chunksize,
alpha='auto',
eta='auto',
iterations=iterations,
num_topics=num_topics,
passes=passes,
eval_every=eval_every
)
top_topics = model.top_topics(corpus) #, num_words=20)
# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)
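Before the interactive view below, the trained model can also list the highest-weight words for each of the 5 topics in plain text; a quick sketch:
# print the top words per topic (sketch)
for topic_id, topic_words in model.print_topics(num_topics=num_topics, num_words=10):
    print('Topic {}: {}'.format(topic_id, topic_words))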
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(model, corpus, dictionary)
vis
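Finally, in the spirit of the Taobao-style topic tags mentioned earlier, each review can be tagged with its most probable topic. A minimal sketch using gensim's get_document_topics (the topic column name is just for illustration):
# assign each review its dominant topic (sketch)
def dominant_topic(bow):
    topic_probs = model.get_document_topics(bow)
    return max(topic_probs, key=lambda t: t[1])[0] if topic_probs else None

data['topic'] = [dominant_topic(bow) for bow in corpus]
data['topic'].value_counts()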