Data Visualization and Analysis (Goodreads Part III)

Published in

Connor Higgins

6 min readDec 11, 2019

This is a short followup to this post showing some of the simple analyses and visualizations you can do with the data collected by the webscraper. I’ll be using a fairly sizable dataset with 2891 works, from some of the most popular epic fantasy books on Goodreads.com.

Key statistical findings from Best Epic Fantasy:

User-submitted genre tags are not independent of whether the work receives or is nominated for an award
The genre also seems to be related to page lengths as well, with “Fantasy” works having the longest length.
Average rating is not significantly related to whether a work recieves an award…
…but the total number of ratings recieved is significantly related to awards.

Some other suggestions for uses not covered here:

Machine Learning: For example, using text classification to predict the genre tag given a title
Conduct analyses across several lists: One can scrape several lists and combine the results by the goodReadsID identifier column (e.g., for comparing favorite genres)

Visualizations

Character data

Wordcloud of the book titles

############# TITLE WORDCLOUD #############################################
# Create corpus
title.corpus<-corpus(as.character(bestEpicFantasy$title))# now creating a document-feature matrix using dfm()
T.plot<-dfm(title.corpus, tolower = TRUE, remove_punct = TRUE, remove_twitter = TRUE, remove_numbers = TRUE, remove=stopwords(source = "smart"))# Colors
T.col <- brewer.pal(10, "BrBG")  # Create Wordcloud
textplot_wordcloud(T.plot, min_count = 16, color = T.col)

Wordcloud of the book descriptions

############# DESCRIPTIONS WORDCLOUD #############################################
# Create corpus
descriptions.corpus<-corpus(as.character(bestEpicFantasy$book.descriptions))# now creating a document-feature matrix using dfm()
D.plot<-dfm(descriptions.corpus, tolower = TRUE, remove_punct = TRUE, remove_twitter = TRUE, remove_numbers = TRUE, remove=stopwords(source = "smart"))# Colors
D.col <- brewer.pal(10, "BrBG")  # Wordcloud
textplot_wordcloud(D.plot, min_count = 16, color = D.col)

Numerical data

Total Ratings: Histogram and Boxplot

# Note: Total Reviews were very highly skewed, therefore the below graph uses logarithms.
ggplot(data = bestEpicFantasy, aes(log(bestEpicFantasy$total_ratings)))+
  geom_histogram(bins = 25,fill="dodgerblue4")+
  labs(title="Histogram for Total # of Reviews")+
  labs(x="Log Total Reviews",y="Counts")## Warning: Removed 8 rows containing non-finite values (stat_bin).

ggplot(data = bestEpicFantasy, aes(y=bestEpicFantasy$total_ratings,x=""))+
  geom_boxplot(fill="dodgerblue4",width=0.1)+
  labs(title="Boxplot for Total Ratings")+
  labs(y="Total # of Ratings",x="")

#Outlier: It is Harry Potter book 1 (no surprise there)
bestEpicFantasy[which.max(bestEpicFantasy$total_ratings),c("title","total_ratings")]##                                                         title
## 2080 Harry Potter and the Sorcerer's Stone (Harry Potter, #1)
##      total_ratings
## 2080       5374966

Pages: Histogram and Boxplot

#################### PLOT Pages #########################################pageplt<-ggplot(data = bestEpicFantasy, aes(bestEpicFantasy$pageCounts))+
  geom_histogram(bins = 25,fill="dodgerblue4")+
  labs(title="Histogram for Page Lengths")+
  labs(x="Total Pages",y="Counts")
pageplt## Warning: Removed 472 rows containing non-finite values (stat_bin).

ggplot(data = bestEpicFantasy, aes(y=bestEpicFantasy$pageCounts,x=""))+
  geom_boxplot(fill="dodgerblue4",width=0.1)+
  labs(title="Boxplot for Page Lengths")+
  labs(y="Page Lengths",x="")## Warning: Removed 472 rows containing non-finite values (stat_boxplot).

Average Ratings: Histogram and Boxplot

################### PLOT Average Ratings ###################################avgplt<-ggplot(data = bestEpicFantasy, aes(bestEpicFantasy$average_rating))+
  geom_histogram(bins = 25,fill="dodgerblue4")+
  labs(title="Histogram for Average Ratings")+
  labs(x="Average Rating",y="Counts")
avgplt

ggplot(data = bestEpicFantasy, aes(y=bestEpicFantasy$average_rating,x=""))+
  geom_boxplot(fill="dodgerblue4",width=0.1)+
  labs(title="Boxplot for Average Ratings")+
  labs(y="Average Rating",x="")

Categorical data

Awards/Nominations: Barplot

#######################PLOT Award ########################################awardplt<-ggplot(data=bestEpicFantasy,aes(bestEpicFantasy$hasAward))+geom_bar(fill="dodgerblue4")+
  labs(x = "Award/Nomination")
awardplt

Genres: Barplot

Almost all books in this set are tagged Fantasy, with a large number of other genres with very few titles. Therefore I relabeled these rarer genres as “Other genres” for presentation and testing.

######################PLOT Genres ###########################################tp <- bestEpicFantasy
levels(tp$genreVoted)[levels(tp$genreVoted) != "Fantasy"] <- "Other Genres"genreplt<-ggplot(data = tp,aes(x = genreVoted))+geom_bar(fill="dodgerblue4")+
  labs(title = "Histogram for Genres")+
  labs(x = "Genre 'tagged' by vote on GoodReads.com", y = "Count")
genreplt

Crossing Variables and Statistical Testing

Genres and Awards: CHI-SQ

############# CROSS Genres With Awards###################################
# Outcome: SUCCESSFULLY reject Null Hypothesis
crossplt<-ggplot(data = tp,aes(x = genreVoted, fill = hasAward))+geom_bar(position = "dodge")+
  labs(title = "Genres crossed with Awards")+
  labs(x = "Genre", y = "Count")+
  scale_fill_discrete("Has Award/Nom")
crossplt

# Same plot but if we wanted to exclude NA cases
tpNoNA<-tp[complete.cases(tp),]
tbl<-table(tpNoNA$genreVoted, tpNoNA$hasAward)
chisq.test(tbl)## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  tbl
## X-squared = 11.002, df = 1, p-value = 0.0009101crossplt2<-ggplot(data = tpNoNA,aes(x = genreVoted, fill = hasAward))+geom_bar(position = "dodge")+
  labs(title = "Genres crossed with Awards (excluding NAs)")+
  labs(x = "Genre", y = "Count")+
  scale_fill_discrete("Has Award/Nom")
crossplt2

Average Rating with Genre: Kruskal-Wallis

############# CROSS Genres With Average Ratings###################################
# Outcome: Close... can reject the null hypothesis at 0.05 confidence threshold.avgGenrePlt<-ggplot(data = tp,aes(x = genreVoted, y = average_rating))+geom_boxplot(fill="dodgerblue4", width=0.1)+
  labs(title = "Genres By Ratings")+
  labs(x = "Genre", y = "Ratings")
avgGenrePlt

kruskal.test(average_rating~genreVoted,data = tp)## 
##  Kruskal-Wallis rank sum test
## 
## data:  average_rating by genreVoted
## Kruskal-Wallis chi-squared = 4.0544, df = 1, p-value = 0.04406tblResults<-with(tp,tapply(average_rating,genreVoted,median))
tblResults## Other Genres      Fantasy 
##         4.05         4.04

Page Lengths with Genre: Kruskal-Wallis

############# CROSS Genres With Page Counts###################################
# Outcome: SUCCESSFULLY reject Null hypothesispageGenrePlt<-ggplot(data = tp,aes(x = genreVoted, y = pageCounts))+geom_boxplot(fill="dodgerblue4",width=0.1)+
  labs(title = "Genres By Pages")+
  labs(x = "Genre", y = "Page Length")
pageGenrePlt## Warning: Removed 472 rows containing non-finite values (stat_boxplot).

kruskal.test(pageCounts~genreVoted,data = tp)## 
##  Kruskal-Wallis rank sum test
## 
## data:  pageCounts by genreVoted
## Kruskal-Wallis chi-squared = 33.987, df = 1, p-value = 5.548e-09

Average Rating with Award/Nomination: Kruskal-Wallis

############# CROSS Award With Average Ratings###################################
#Outcome: FAIL to reject nullavgGenrePlt<-ggplot(data = tp,aes(x = hasAward, y = average_rating))+geom_boxplot(fill="dodgerblue4", width=0.1)+
  labs(title = "Awards/Nominations By Ratings")+
  labs(x = "Award/Nominations", y = "Ratings")
avgGenrePlt

kruskal.test(average_rating~hasAward,data = tp)## 
##  Kruskal-Wallis rank sum test
## 
## data:  average_rating by hasAward
## Kruskal-Wallis chi-squared = 0.37614, df = 1, p-value = 0.5397

Total Ratings with Award/Nomination: Kruskal-Wallis

############# CROSS Awards With Total Numbers of Reviews ###################################
#Outcome: SUCCESSFULLY reject Null
pageGenrePlt<-ggplot(data = tp,aes(x = hasAward, y = total_ratings))+geom_boxplot(fill="dodgerblue4",wdith=0.1)+
  labs(title = "Award/Nomination By Total Ratings")+
  labs(x = "Award", y = "Total Ratings")## Warning: Ignoring unknown parameters: wdithpageGenrePlt

kruskal.test(total_ratings~hasAward,data = tp)## 
##  Kruskal-Wallis rank sum test
## 
## data:  total_ratings by hasAward
## Kruskal-Wallis chi-squared = 269.06, df = 1, p-value < 2.2e-16tblResults<-with(tp,tapply(total_ratings,hasAward,median))
tblResults## FALSE  TRUE 
##  1087 73265