# The autoreload extension automatically reloads edited modules, so changes
# to the scripts in this repo are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# This is needed to add the repo dir to the path so jupyter
# can load the modules in the scripts directory from the notebooks
import os
import sys
repo_dir = os.path.split(os.getcwd())[0]
print(repo_dir)
if repo_dir not in sys.path:
    sys.path.append(repo_dir)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import csv
data_dir = '../data/GoodReads'
books_10k_file = os.path.join(data_dir, 'goodreads_reviews-books_above_10k_lang_reviews.csv.gz')
reviewers_5k_file = os.path.join(data_dir, 'goodreads_reviews-reviewers_above_5k_reviews.csv.gz')
random_1M_file = os.path.join(data_dir, 'goodreads_reviews-random_sample_1M_non_zero.csv.gz')
author_file = os.path.join(data_dir, 'goodreads_book_authors.csv.gz') # author information
book_file = os.path.join(data_dir, 'goodreads_books.csv.gz') # basic book metadata
review_df = pd.read_csv(random_1M_file, sep='\t', compression='gzip')
print('Number of reviews in dataset:', len(review_df))
review_df = review_df[review_df.review_length > 0]
print('Number of non-zero-length reviews in dataset:', len(review_df))
import math
# bin review lengths on a quarter-log scale: take the natural log and round down to quarter steps
review_df['log_length'] = review_df.review_length.apply(lambda x: int(math.log(x)*4)/4)
# map each log bin back to a character length so the bin labels are readable
review_df['length_bin'] = review_df.log_length.apply(math.exp)
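To get a feel for these quarter-log bins, a minimal check (illustrative only, not part of the analysis): the bins are quarter steps on the natural log scale, so they get wider as reviews get longer.
# illustrative only: show the bin that a few example review lengths fall into
for length in [10, 100, 1000, 10000]:
    log_len = int(math.log(length) * 4) / 4
    print(f'length {length: >6} falls in the bin starting at ~{math.exp(log_len):.0f} characters')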
print('longest review:', review_df.review_length.max())
print('mean review length:', review_df.review_length.mean())
print('median review length:', np.median(review_df.review_length))
review_df.length_bin.value_counts().sort_index().plot(logx=True)
from dateutil.parser import parse
from dateutil import tz
utc = tz.gettz('UTC')
def parse_date(date_str):
    try:
        return parse(date_str).astimezone(utc)
    except TypeError:
        return None
review_df['date_added'] = review_df.date_added.apply(parse_date)
review_df['date_updated'] = review_df.date_updated.apply(parse_date)
review_df['read_at'] = review_df.read_at.apply(parse_date)
review_df['started_at'] = review_df.started_at.apply(parse_date)
review_df.columns
# get a list of book ids that are in the review dataset
review_book_ids = set(review_df.book_id.unique())
print('unique book ids:', len(review_book_ids))
# load basic book metadata (only the book, work and author ids and the book title)
bookmeta_df = pd.read_csv(book_file, sep='\t', compression='gzip', usecols=['book_id', 'work_id', 'author_id', 'title'])
print('book metadata read')
# filter the book metadata to only the book ids in the review dataset
bookmeta_df = bookmeta_df[bookmeta_df.book_id.isin(review_book_ids)]
print('book metadata filtered')
# load the author metadata to get author names
author_df = pd.read_csv(author_file, sep='\t', compression='gzip', usecols=['author_id', 'name'])
author_df = author_df.rename(columns={'name': 'author_name'})
print('author metadata read')
# merge the book and author metadata into a single dataframe,
# keeping only author names for books in the review dataset
metadata_df = pd.merge(bookmeta_df, author_df, how='left')
print('book and author metadata merged')
# merge the review dataset with the book metadata
review_df = pd.merge(review_df, metadata_df, on='book_id')
print('review and book metadata merged')
genre_file = os.path.join(data_dir, 'goodreads_book_genres_initial.csv.gz') # book genre information
genre_df = pd.read_csv(genre_file, sep='\t', compression='gzip')
genre_df = genre_df[genre_df.book_id.isin(review_book_ids)]
genre_df.genres.value_counts()
g = genre_df.groupby(['book_id', 'genres']).size()
print(len(g))
u_genres = g.unstack('genres').fillna(0)
u_genres = u_genres.reset_index()
print('number of books with genre information:', len(u_genres))
review_df = pd.merge(review_df, u_genres, on='book_id', how='left')
genres = list(u_genres.columns)[1:]
print(genres)
print('Number of reviews per genre:')
for genre in genres:
    print(f'{genre: <40}{len(review_df[review_df[genre] == 1]): >10}')
u_genres
# add number of reviews per author for popular author selection
review_df['author_freq'] = review_df.groupby(['author_name'])['review_id'].transform('count')
review_df['book_freq'] = review_df.groupby(['book_id'])['review_id'].transform('count')
review_df
print('number of distinct books:', review_df.book_id.nunique())
print('number of distinct works:', review_df.work_id.nunique())
The 1 million review dataset contains reviews for 397,482 distinct books and 298,169 distinct works. The difference between books and works has to do with different versions and editions of a work being different books, while all are expressions or manifestations of the same intellectual work (there are many nuances here; for more on the distinctions between the work, expression, manifestation and item levels, see the Functional Requirements for Bibliographic Records (FRBR)).
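To make the book/work distinction concrete, we can count how many distinct books (editions) each work has in the review dataset. A minimal sketch, using the columns merged in above:
# number of distinct editions (book ids) per work, works with most editions first
editions_per_work = review_df.groupby('work_id')['book_id'].nunique()
editions_per_work.sort_values(ascending=False).head(10)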
The most frequently reviewed books hold no surprises:
review_df.groupby(['book_id', 'author_name', 'title']).size().sort_values()
review_df.groupby(['work_id', 'author_name', 'title']).size().sort_values()
review_df.author_name.value_counts()
review_df.groupby(['author_name'])['book_id'].nunique().sort_values()
review_df.groupby(['author_name'])['work_id'].nunique().sort_values()
plt.rcParams['figure.figsize'] = [15, 5]
# group all reviews by year and month that they were published
g = review_df.groupby([review_df.date_updated.dt.year, review_df.date_updated.dt.month]).size()
# plot the number of reviews per month as a bar chart
ax = g.plot(kind='bar')
# update the ticks on the x-axis so that they remain readable...
ax.set_xticks(range(len(g)));
# ... with only a tick label for January of each year
ax.set_xticklabels(["%s-%02d" % item if item[1] == 1 else '' for item in g.index.tolist()], rotation=90);
plt.gcf().autofmt_xdate()
plt.xlabel('Review month')
plt.ylabel('Number of reviews')
plt.show()
review_df.date_added.dt.month.value_counts().sort_index().plot(kind='bar')
January, July and August are the months with the most reviews. The January peak is probably a consequence of the Christmas holidays in the USA, where most of the reviewers in the Goodreads dataset are from. The July and August reviews are possibly due to summer holidays.
review_df.rating.value_counts().sort_index().plot(kind='bar')
g = review_df.groupby([review_df.date_updated.dt.year, 'rating']).size()
u = g.unstack('date_updated')
for year in u.columns:
    u[year] = u[year] / sum(u[year])
g = u.stack()
u = g.unstack('rating')
u.plot(kind='bar')
u
Rating behaviour hasn't changed much over the years. The proportion of positive reviews (4 and 5 stars) has grown from around 60% (30% for each of 4 and 5 stars) to almost 70%, with small drops in proportion for the lower ratings.
Note that with this highly aggregated view (with 1 million reviews for over 400,000 distinct books), we cannot see how this relates to the rating behaviour for individual books, reviewers, authors or genres. For that we need to focus on those individual entities.
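As a sketch of what such a focused view looks like, we can zoom in on the single most-reviewed book in the dataset and plot its rating distribution (illustrative only; any individual book, reviewer or genre could be singled out in the same way):
# illustrative sketch: rating distribution for the most-reviewed book only
top_book_id = review_df.book_id.value_counts().idxmax()
top_book_df = review_df[review_df.book_id == top_book_id]
print('most-reviewed book:', top_book_df.title.iloc[0], 'with', len(top_book_df), 'reviews')
top_book_df.rating.value_counts().sort_index().plot(kind='bar')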
from scripts.text_tail_analysis import get_dataframe_review_texts
random_seed = 1205921
sample_df = review_df.sample(100, random_state=random_seed)
review_texts = [review_text for review_text in get_dataframe_review_texts(sample_df)]
sample_df.book_id.value_counts()
All reviews are for different books. That is, in this sample there are no books with more than one review.
from langdetect import detect
reviews_en = [text for text in review_texts if detect(text) == 'en']
print('number of reviews:', len(review_texts))
print('number of reviews in English:', len(reviews_en))
There are 91 reviews in English.
from collections import Counter
import re
tf = Counter()
for text in reviews_en:
    # split the texts on any non-word characters
    words = re.split(r'\W+', text.strip())
    # count the number of times each word occurs across the review texts
    tf.update(words)
tf.most_common(20)
import spacy
# load the large model for English
nlp = spacy.load('en_core_web_lg')
# use nlp to parse each text and store the parsed results as a list of docs
docs = [nlp(text) for text in reviews_en]
# iterate over the docs, then over the entities in each doc and count them
tf = Counter([entity.text for doc in docs for entity in doc.ents])
tf.most_common()
With just 100 reviews, all for different books, it is perhaps surprising that the most frequent entities are very specific names that appear many times ('Jessamin', 'Finn', 'Albion', 'Rakhi', 'Black Snow', 'Bulgakov').
But this becomes more understandable when we take into account that some reviews are long and repeat the same names of authors and characters multiple times. We can compensate for this by looking at the document frequency, i.e. the number of documents (each review is a document) in which a term occurs.
df_ent = Counter([entity for doc in docs for entity in set([ent.text for ent in doc.ents])])
df_ent.most_common()
The names that remain are mostly common terms in the book review domain ('Goodreads', 'Amazon') and common names of authors or characters ('Elizabeth'). Moreover, even the highest document frequencies of such names are much lower than their term frequencies.
The high term frequency of specific names is a consequence of the scale at which we are looking at the random sample of reviews. With only 100 reviews, the influence of a single long review can and probably will be significant. As the number of randomly sampled reviews grows, the proportional contribution of individual reviews goes down. As we saw with the content analysis of reviews for popular books, zooming out has the effect of bringing the commonalities across reviews into focus.
If all reviews are of the same book (that is, the focus is on the book), the commonalities will probably contain the names of the author and the characters and other plot-related aspects. If all reviews are of books in a specific genre (e.g. romance, or mystery, crime and thriller), the commonalities will probably include genre tropes as well as generic aspects of books like narrative (story, plot, characters) and writing style (phrasing, pace, tone and mood). If all reviews are of different books across a wide range of genres, the influence of individual books and genres is drowned out in the heterogeneity of the selection, and the focus will shift to what is common across book reviews in general, which are aspects of writing style and quality.
from collections import Counter
import re
from scripts.text_tail_analysis import get_dataframe_review_texts
from langdetect.lang_detect_exception import LangDetectException
def detect_lang(text):
    try:
        return detect(text)
    except LangDetectException:
        return None
random_seed = 1205921
sample_df = review_df.sample(10000, random_state=random_seed)
review_texts = [review_text for review_text in get_dataframe_review_texts(sample_df)]
reviews_en = [text for text in review_texts if len(text) > 0 and detect_lang(text) == 'en']
print('number of reviews in sample:', len(review_texts))
print('number of English reviews in sample:', len(reviews_en))
print('number of reviewed books in sample:', sample_df.book_id.nunique())
tf = Counter()
for text in reviews_en:
    # split the texts on any non-word characters
    words = re.split(r'\W+', text.strip())
    # count the number of times each word occurs across the review texts
    tf.update(words)
tf.most_common(20)
With 10,000 reviews, the top 20 terms start to look more like a common stopword list, with just the domain stopword 'book' in there as well. If we were to sample ever more reviews, the list of most frequent words would get ever closer to existing stopword lists.
Note that these reviews are for 8,866 different books, so the vast majority of books will have only a single review. In other words, to the extent that there is overlap between reviews, it mostly comes from commonalities across a large set of randomly selected books.
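We can verify this by counting how many books in the sample have exactly one, two or more reviews (a quick check):
# distribution of the number of reviews per book in the 10,000-review sample
reviews_per_book = sample_df.book_id.value_counts()
reviews_per_book.value_counts().sort_index()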
from scripts.text_tail_analysis import write_docs_to_bin, read_docs_from_bin
# Parsing 10,000 reviews with spaCy takes a while, so the parsed docs were
# serialized to disk once (commented-out lines) and are loaded from the cached file here.
#docs = [nlp(text) for text in reviews_en]
nlp_docs_file = f'../data/review_spacy_docs.random_1M.sample-10000.seed-{random_seed}.docbin'
#write_docs_to_bin(docs, nlp_docs_file)
docs = read_docs_from_bin(nlp_docs_file, nlp)
# iterate over the docs, then over the entities in each doc and count them
tf = Counter([entity.text for doc in docs for entity in doc.ents])
print('Total number of entities in the sample:', sum(tf.values()))
tf.most_common(50)
The most frequent entities are now very generic, including numbers, common names of persons and geographic locations, and very popular books ('Harry Potter'). But note that even the common person names 'Jack' and 'Sam' occur only 137 and 135 times respectively in 10,000 reviews and 54,675 entities.
If we shift from term frequency to document frequency, the person names drop even further:
df_ent = Counter([entity for doc in docs for entity in set([ent.text for ent in doc.ents])])
df_ent.most_common(50)
tf_word = Counter([token.text for doc in docs for token in doc if not token.is_stop and not token.is_punct])
print('Number of words:', sum(tf_word.values()))
print('Number of distinct words:', len(tf_word.keys()))
sizes = [10, 20, 100, 200]
for size in sizes:
    sum_top = sum([freq for term, freq in tf_word.most_common(size)])
    print(f'Sum frequency of top {size} terms: {sum_top} (fraction: {sum_top / sum(tf_word.values()): >.2f})')
The proportion of the top terms is lower in a heterogeneous set of reviews than in a set of reviews focused on a single book. In the content analysis of reviews for popular books we found 27,632 distinct words, versus 44,024 in these more heterogeneous reviews. Furthermore, the top 10 terms represent 16% of all occurring words in reviews for a single popular book, while in this selection it is 11%. For the top 200 terms, the numbers are 54% and 34% respectively. In other words, the wider focus of the random selection also leads to a larger vocabulary that is less skewed. (Less skewed, because the total number of words is not so different: the 10,000 single-book reviews contain 487,298 words, while the 10,000 randomly sampled reviews contain 514,791 words.)
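Using the counts reported above, the difference in skew can also be expressed as a simple type/token ratio, i.e. the number of distinct words relative to the total number of words (a back-of-the-envelope check using the numbers as given):
# type/token ratios based on the counts reported above
single_book_types, single_book_tokens = 27632, 487298
random_sample_types, random_sample_tokens = 44024, 514791
print(f'single-book reviews: {single_book_types / single_book_tokens:.3f} types per token')
print(f'random sample reviews: {random_sample_types / random_sample_tokens:.3f} types per token')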
from collections import defaultdict
from scripts.text_tail_analysis import show_pos_tail_distribution
tf_lemma_pos = Counter([(token.lemma_, token.pos_) for doc in docs for token in doc if not token.is_stop and not token.is_punct])
show_pos_tail_distribution(tf_lemma_pos)
We see now that in the tail, almost half of all single-occurrence terms are proper nouns. With 8,866 different books being reviewed, there are probably many different names of characters, places, events and other entities. It is also possible that some non-English reviews got misclassified as English; spaCy then does not recognize their words as English words and classifies many or most of them as proper nouns.
We should inspect the tail to see what is going on.
from scripts.text_tail_analysis import show_tail_lemmas
show_tail_lemmas(tf_lemma_pos, tf_threshold=1, pos='PROPN', num_lemmas=100)
There are indeed many names of places, organizations and persons or fictional characters. But there are also a lot of non-English terms in here, with the last 20 or so probably being Italian. As noted, it is possible that some reviews were misclassified in terms of language, but another possibility is that reviews contain quotes in different languages.
Let's look at one of the reviews with these Italian terms, e.g. the one containing the word 'dettaglio':
for doc in docs:
    for token in doc:
        if token.text == 'dettaglio':
            print(doc)
            # stop after the first match so a review is not printed once per occurrence
            break
We see another reason for the occurrence of non-English terms: reviews with versions of the text in multiple languages. As is typical with user-generated content on the web, at large scale, the variation in contributions is enormous [1]. With so many different people contributing, each in their own way, variation grows as the number of contributions grow.
[1] X. Ochoa, E. Duval, Quantitative analysis of user-generated content on the web, 2008.
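One way to check for such multilingual reviews would be to run language detection per paragraph instead of per review. A rough sketch, assuming paragraphs are separated by newlines (the 20-character minimum is an arbitrary threshold to avoid unreliable detection on short fragments):
# rough sketch: flag reviews whose paragraphs are detected as different languages
def detect_paragraph_languages(text):
    langs = set()
    for paragraph in text.split('\n'):
        if len(paragraph.strip()) > 20:  # skip fragments too short for reliable detection
            lang = detect_lang(paragraph)
            if lang:
                langs.add(lang)
    return langs

mixed = [text for text in reviews_en if len(detect_paragraph_languages(text)) > 1]
print('reviews with paragraphs in different languages:', len(mixed))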
Let's look at the contributing reviewers. We expect most reviewers to contribute only a single review, but no doubt there are a small number of highly prolific reviewers contributing hundreds of reviews.
review_df.user_id.value_counts()
In this random sample of 1 million reviews (which is only a fraction of the 15 million reviews that were crawled, which in turn is only a fraction of the over 90 million reviews on Goodreads [2]), there is a reviewer with a staggering 1,474 reviews.
[2] Goodreads - About us (Accessed 2020-08-04).
reviewer = 'a2d6dd1685e5aa0a72c9410f8f55e056'
reviewer_df = review_df[review_df.user_id == reviewer]
reviewer_df.rating.value_counts().sort_index()
This reviewer provides no ratings. Let's look at the temporal distribution of the reviews.
#reviewer_df.date_updated.dt.year.value_counts().sort_index().plot(kind='bar')
# group all reviews by year and month that they were published
g = reviewer_df.groupby([reviewer_df.date_updated.dt.year, reviewer_df.date_updated.dt.month]).size()
# plot the number of reviews per month as a bar chart
ax = g.plot(kind='bar')
# update the ticks on the x-axis so that they remain readable...
ax.set_xticks(range(len(g)));
# ... with only a tick label for January of each year
ax.set_xticklabels(["%s-%02d" % item if item[1] == 1 else '' for item in g.index.tolist()], rotation=90);
plt.gcf().autofmt_xdate()
plt.xlabel('Review month')
plt.ylabel('Number of reviews')
plt.show()
This reviewer has been contributing reviews since 2011, with many months where they contributed more than 20 reviews and a peak in 2016 with over 100 reviews.
Let's look at the length of these reviews.
reviewer_df.review_length.hist(bins=50)
Almost all reviews are only one or two characters in length. What is going on here?
reviewer_df[reviewer_df.review_length == 1].review_text.value_counts()
reviewer_df[reviewer_df.review_length == 2].review_text.value_counts()
These look like idiosyncratic codes that make sense to the reviewer, but not necessarily to many others. It may be that they reflect ratings. This demonstrates again the enormous variation that is typically found in user-generated content.
Let's look for prolific reviewers who write longer reviews:
review_df[review_df.review_length > 100].user_id.value_counts()
reviewer = '843a44e2499ba9362b47a089b0b0ce75'
reviewer_df = review_df[review_df.user_id == reviewer]
print('reviewer number of reviews:', len(reviewer_df))
print('reviewer ratings:')
reviewer_df.rating.value_counts().sort_index()
reviewer_df.length_bin.value_counts().sort_index().plot(logx=True)
reviewer_texts = [review_text for review_text in get_dataframe_review_texts(reviewer_df)]
reviewer_texts_en = [text for text in reviewer_texts if len(text) > 0 and detect_lang(text) == 'en']
reviewer_docs = [nlp(text) for text in reviewer_texts_en]
tf_word = Counter([token.text for doc in reviewer_docs for token in doc if not token.is_stop and not token.is_punct])
print('Number of total words (tokens):', sum(tf_word.values()))
print('Number of distinct words (types):', len(tf_word.keys()))
tf_word.most_common(20)
tf_ent = Counter([ent.text for doc in reviewer_docs for ent in doc.ents])
print('Number of total entities (tokens):', sum(tf_ent.values()))
print('Number of distinct entities (types):', len(tf_ent.keys()))
tf_ent.most_common()
The entity list contains a large number of first names of persons, many of them with a relatively high frequency. In the 34,719 words of all this reviewer's reviews, there are 4,310 named entities (some spanning multiple words). So it seems this reviewer has a tendency to name the characters in their reviews.
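To put that tendency in numbers, we can compare this reviewer's entity density with that of the random sample (a rough comparison; the token counts here include punctuation and stopwords):
# rough sketch: named entity density for this reviewer vs. the random sample
reviewer_tokens = sum(len(doc) for doc in reviewer_docs)
reviewer_ents = sum(len(doc.ents) for doc in reviewer_docs)
sample_tokens = sum(len(doc) for doc in docs)
sample_ents = sum(len(doc.ents) for doc in docs)
print(f'reviewer: {reviewer_ents / reviewer_tokens:.3f} entities per token')
print(f'random sample: {sample_ents / sample_tokens:.3f} entities per token')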
from collections import defaultdict
from scripts.text_tail_analysis import show_pos_tail_distribution
tf_lemma_pos = Counter([(token.lemma_, token.pos_) for doc in reviewer_docs for token in doc if not token.is_stop and not token.is_punct])
show_pos_tail_distribution(tf_lemma_pos)
from scripts.text_tail_analysis import show_tail_lemmas
show_tail_lemmas(tf_lemma_pos, tf_threshold=1, pos='NOUN', num_lemmas=100)
for genre in genres:
    print(f'{genre: <40}{reviewer_df[reviewer_df[genre] == 1][genre].count(): >5}')
Most of the reviewed books are romance (566 out of 666), with overlaps with 'mystery, thriller, crime' and 'fantasy, paranormal'.
reviewer_df.author_name.value_counts()