We will start with a relatively simple question, but with a difficult journey to get any answers:
We look at Goodreads reviews at different scales and with different selection criteria.
# IPython autoreload: automatically re-imports edited local modules while developing.
# NOTE(review): the original comment referred to the 'REPUBLIC hOCR parser', which
# looks copy-pasted from another project — this notebook analyzes Goodreads reviews.
# The magics can be removed once the local scripts are stable.
%reload_ext autoreload
%autoreload 2
# Make the repository root importable, so the notebook can load the
# modules in the scripts directory.
import os
import sys

repo_dir = os.path.dirname(os.getcwd())
print(repo_dir)
if repo_dir not in sys.path:
    sys.path.append(repo_dir)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import csv
import os
# Directory holding the Goodreads data files (relative to the notebooks dir).
data_dir = '../data/GoodReads'
# Review samples at different scales and with different selection criteria.
books_10k_file = os.path.join(data_dir, 'goodreads_reviews-books_above_10k_lang_reviews.csv.gz')
reviewers_5k_file = os.path.join(data_dir, 'goodreads_reviews-reviewers_above_5k_reviews.csv.gz')
random_1M_file = os.path.join(data_dir, 'goodreads_reviews-random_sample_1M.csv.gz')
author_file = os.path.join(data_dir, 'goodreads_book_authors.csv.gz') # author information
book_file = os.path.join(data_dir, 'goodreads_books.csv.gz') # basic book metadata
# The review dataframe: the random sample of 1M reviews,
# stored as tab-separated, gzip-compressed CSV.
review_df = pd.read_csv(random_1M_file, sep='\t', compression='gzip')
# Display the dataframe (notebook cell output).
review_df
from dateutil.parser import parse, tz
def parse_date(date_str):
    """Parse a date string into a timezone-aware datetime in UTC.

    Returns None for values that cannot be parsed: missing values
    (NaN floats make dateutil raise TypeError) and malformed date
    strings (ValueError; dateutil's ParserError subclasses it).
    Relies on the module-level `utc` timezone object.
    """
    try:
        return parse(date_str).astimezone(utc)
    except (TypeError, ValueError):
        # Previously only TypeError was caught; a single malformed
        # string would have aborted the whole .apply() over a column.
        return None
# All timestamp columns are normalized to UTC.
utc = tz.gettz('UTC')

date_columns = ['date_added', 'date_updated', 'read_at', 'started_at']
for column in date_columns:
    review_df[column] = review_df[column].apply(parse_date)

# Drop reviews without any text.
print('Number of empty reviews:', len(review_df[review_df.review_length == 0]))
review_df = review_df[review_df.review_length > 0]
# The set of book ids that occur in the review dataset.
review_book_ids = set(review_df.book_id.unique())

# Basic book metadata: only ids, title, page count and publication year.
metadata_columns = ['book_id', 'work_id', 'author_id', 'title', 'num_pages', 'publication_year']
bookmeta_df = pd.read_csv(book_file, usecols=metadata_columns, sep='\t', compression='gzip')
def get_first(row):
    """Return the earliest known publication year for the row's work.

    Relies on the module-level `first` mapping (work_id -> year);
    returns None when the work has no known publication year.
    """
    work_id = row['work_id']
    try:
        return first[work_id]
    except KeyError:
        return None
# Earliest known publication year per work (a work can have multiple
# book editions, each with its own publication year).
first = bookmeta_df[bookmeta_df.publication_year.notna()].groupby(['work_id']).publication_year.min()
# NOTE(review): row-wise apply is Python-level and slow;
# bookmeta_df.work_id.map(first) would be vectorized, but yields NaN
# instead of None for unknown works — kept as-is to preserve behavior.
bookmeta_df['first_publication_year'] = bookmeta_df.apply(get_first, axis=1)
# Restrict the book metadata to books that occur in the review dataset.
bookmeta_df = bookmeta_df[bookmeta_df.book_id.isin(review_book_ids)]

# Author metadata: id plus name, renamed so the column is unambiguous
# after merging.
author_df = pd.read_csv(author_file, sep='\t', compression='gzip', usecols=['author_id', 'name'])
author_df = author_df.rename(columns={'name': 'author_name'})

# Combine book and author metadata into one frame (left join keeps all
# books, even those without author info), then attach it to the reviews.
metadata_df = pd.merge(bookmeta_df, author_df, how='left')
review_df = pd.merge(review_df, metadata_df, on='book_id')
# Book genre labels.
genre_file = os.path.join(data_dir, 'goodreads_book_genres_initial.csv.gz')
genremeta_df = pd.read_csv(genre_file, sep='\t', compression='gzip')

# Pivot to one row per book with a 0/1 column per genre label,
# keeping only books that occur in the review dataset.
genre_df = (
    genremeta_df[genremeta_df.book_id.isin(review_book_ids)]
    .groupby(['book_id', 'genres']).size()
    .unstack('genres').fillna(0)
    .reset_index()
)
print('number of books with genre information:', len(genre_df))
review_df = pd.merge(review_df, genre_df, on='book_id', how='left')
# This step writes the current dataframe to file, so all the merging
# steps can be skipped in reruns of the notebook.
# Built with os.path.join(data_dir, ...) so the directory is spelled
# consistently ('GoodReads'); the original hard-coded '../data/Goodreads',
# which breaks on case-sensitive filesystems.
merged_data_file = os.path.join(data_dir, 'goodreads_reviews-random_sample_1M.genre_merged.csv.gzip')
#review_df.to_csv(merged_data_file, sep='\t', compression='gzip')
review_df = pd.read_csv(merged_data_file, sep='\t', compression='gzip')
# All genre labels (the first column of genre_df is book_id).
genres = genre_df.columns.tolist()[1:]
print(genres)
print('Number of reviews per genre:')
for genre in genres:
    genre_count = len(review_df[review_df[genre] == 1])
    print(f'{genre: <40}{genre_count: >10}')
genre_df
The genres have been added as separate columns to the review dataframe, so that it's easy to select reviews for books in a specific genre.
The poetry and comics, graphics genres are the smallest genres, with fewer than 100,000 reviews. The fiction genre is by far the largest, as it operates as an aggregate genre with various fiction sub-genres. That is, most of the other genres overlap with the fiction genre. We can check the co-occurrence matrix to see the overlap in genres in more detail.
temp_df = genre_df[genres].fillna(0)
# Raw co-occurrence counts: for each genre pair, the number of books
# that carry both labels.
cooc = temp_df.T.dot(temp_df)
# Normalize each column by the genre's own book count, turning counts
# into proportions (the diagonal becomes 1.0).
for genre in genres:
    cooc[genre] /= cooc.loc[genre, genre]
cooc
Above we see the co-occurrence of genres per book (a book can have multiple genre labels). The data in the co-occurrence matrix shows proportions. So the diagonal cells from top left to bottom right are always 1.0 (a genre necessarily co-occurs with itself).
The general fiction genre overlaps strongly with most other genres. That is, the other genres are sub-genres of fiction (we selected only books that had at least a fiction genre label).
The poetry and comics, graphic genres are very distinct from the other subgenres.
The genres fantasy, paranormal has a moderate overlap with children, mystery, thriller, crime, romance, young-adult.
The history, historical fiction, biography genre has more overlap with comics, graphic, poetry and especially non-fiction.
For genre comparison, we will choose three genres:
We'll start with a quantitative analysis of the metadata.
from itertools import combinations
# Same co-occurrence computation as above, but per review rather than
# per book, so popular books weigh more heavily.
review_genre_df = review_df[genres].fillna(0)
cooc = review_genre_df.T.dot(review_genre_df)
for genre in genres:
    cooc[genre] /= cooc.loc[genre, genre]
cooc
# The three genres compared throughout the rest of the notebook.
compare_genres = ['history, historical fiction, biography', 'mystery, thriller, crime', 'romance']
# NOTE(review): this rebinds genre_df from the per-book genre DataFrame
# to a dict of per-genre review DataFrames; later cells use the dict.
genre_df = {genre: review_df[review_df[genre] == 1] for genre in compare_genres}
We check the rating distribution to see if there are differences in rating behaviour between genres.
# Rating distribution per genre, as proportions, plotted side by side.
rating_series = []
for genre in compare_genres:
    counts = review_df.groupby(['rating', genre]).size().unstack(genre)
    counts = counts.rename(columns={1.0: genre})
    # Normalize the per-rating counts to proportions within the genre.
    counts[genre] = counts[genre] / sum(counts[genre])
    rating_series.append(counts[genre])
pd.concat(rating_series, axis=1).plot(kind='bar')
There are no big differences. The distributions look fairly similar in terms of rating behaviour. The romance genre has a slightly higher proportion of 5-star ratings and a lower proportion of 3- and 4-star ratings than the other two genres.
Some books and authors are very popular and are reviewed by many different readers, which leads to these books and authors having more influence on the overall picture we get for a genre than books that are relatively obscure. So understanding differences between genres in terms of the content of reviews is aided by looking at differences in the actors that influence what content is generated and how it is generated.
Another set of actors influencing this review generation process are the reviewers. Some reviewers write many reviews and have developed conventions for how to write them and what to include in them, others only write an occasional review and perhaps write whatever comes to mind. Some reviewers are very elaborate and discuss the story, writing style and reading experience of a book in detail, while others are succinct and focus on the most salient aspect. Some focus more on narrative, others on aesthetics and yet others on their own thoughts and feelings.
If there are differences in reviewers across genres, or in popularity of authors and books, these can help explain differences in content. We first look at total numbers of reviews, reviewers, authors and books across the three genres.
# Table of unique reviews, reviewers, authors and works per genre.
print('Genre\t\t\t\t\t\tReviews\t\tReviewers\tAuthors\t\tBooks')
stats_columns = ['review_id', 'user_id', 'author_id', 'work_id']
freq = {}
for genre in compare_genres:
    cells = []
    for column in stats_columns:
        freq[(genre, column)] = genre_df[genre][column].nunique()
        cells.append(f'{freq[(genre, column)]: >16}')
    print(f'{genre: <38}{"".join(cells)}')
print(f'{"Genre": <44}{"Reviewers": <16}{"Authors": <16}{"Books": <16}')
print(f'{"Genre": <44}{"mean median": <16}{"mean median": <16}{"mean median": <16}')
for genre in compare_genres:
stats_string = ''
for column in stats_columns[1:]:
prop = freq[(genre, 'review_id')] / freq[(genre, column)]
median = np.median(genre_df[genre][column].value_counts())
stats_string += f'{prop: >10.2f}{median: >6.0f}'
print(f'{genre: <38}{stats_string}')
Above we show the average number of reviews per reviewer, author and book (both the mean and the median). There are some significant differences between the three genres. In the romance genre, individual reviewers write more reviews, and there are more reviews per author, especially in comparison with history, historical fiction, biography. Reviewers tend to read (or at least review) more books, and also more books by the same author. This may have consequences for the comparison. If individual reviewers have personal characteristics that influence reviews (e.g. a tendency to write long or short reviews, to use certain vocabulary, to focus their reviews on certain book aspects), this has an influence on what the statistical commonalities and differences are.
With more reviews by the same reviewer, their idiosyncrasies have relatively high frequency. Also, the reviews of books by the same author may all mention the same author name, or if the books belong to a series, characters and places may recur, which also results in higher relative frequencies.
So, in the comparative analysis, one question is whether to compensate for these kinds of differences and if so, how. For instance, should we balance the selection to only include a single review per reviewer, book and author, or leave the natural imbalance intact?
If the goal of the comparison is to say something about reviews in a certain genre in general, we should perhaps let the different frequencies come through as characteristics of individual genres. If, on the other hand, the goal is to compare the reception of books in a genre, these imbalances should probably be compensated for, or at least be taken into account in interpreting the observed similarities and differences.
But there is a problem with the analysis above, which is signaled by the difference between the mean and median. We calculated the mean number of reviews per reviewer, author and book, but statistics like the mean are only meaningful if the data is roughly normally distributed (see the Analyzing Distributions notebook for an elaboration on the problem). When the data distribution is very skewed, such statistics are deceptive. When most items in a set have a low frequency, and there is a small number of outliers with a very high frequency, the outliers drive up the mean, such that the majority of the items are below average.
from collections import Counter
from scripts.helper import ecdf

for genre in compare_genres:
    # Frequency of frequencies: for each number of reviews written by a
    # single reviewer, count how many reviewers wrote that many reviews.
    dist = Counter(int(count) for count in genre_df[genre].user_id.value_counts())
    x = dist.keys()
    y = list(dist.values())
    total = sum(y)
    y_prob = [count / total for count in y]
    plt.scatter(x, y_prob, label=genre)
plt.xscale('log')
plt.yscale('log')
plt.ylabel('Proportion of reviewers')
plt.xlabel('Number of reviews per reviewer')
plt.legend()
plt.show()
Above we see the number of reviews per reviewer against the number of reviewers. At the top left we see that the vast majority of reviewers in all three genres write only one or a few reviews. The proportion is shown on a logarithmic scale, so the dots for a single review per reviewer are between 40% and 60% of all reviewers, while all dots for more than 10 reviews per reviewer quickly fall far below 1%. But the genres show a difference in their distributions at the higher end (i.e. above 10 reviews per reviewer). What this means is that there is a larger proportion of reviewers in the romance genre who write many reviews than in the other genres, but these represent only a few percent of all reviewers.
Next, we look at the number of reviews per author.
from collections import Counter
from scripts.helper import ecdf

for genre in compare_genres:
    # Frequency of frequencies: for each number of reviews an author
    # received, count how many authors received that many reviews.
    dist = Counter([int(count) for count in genre_df[genre].author_id.value_counts()])
    x = dist.keys()
    y = list(dist.values())
    y_prob = [y_point / sum(y) for y_point in y]
    plt.scatter(x, y_prob, label=genre)
plt.xscale('log')
plt.yscale('log')
plt.ylabel('Proportion of authors')
# Fixed axis-label typo: 'auhtor' -> 'author'.
plt.xlabel('Number of reviews per author')
plt.legend()
plt.show()
Above we see the log-log distribution of the number of reviews per author. The history, historical fiction, biography genre has relatively many authors with only a single review (the blue dot in the top left is higher than the top left green and orange dots), and therefore fewer authors with multiple reviews (more to the right, the blue dots tend to be below the green and orange dots). In other words, for most authors in this genre a comparison is difficult because there is only a single review for them. But for all three genres, the majority of authors have only one or a few reviews, so the much higher average for romance authors than for history, historical fiction, biography authors is caused by a relatively small set of outliers with a very high number of reviews.
A possible explanation is that there is a subset of authors writing romance novels or mystery, thriller, crime novels who are more prolific than authors of historical novels and biographies.
# Count the number of distinct works per author.
works_per_author = metadata_df.groupby(['author_id']).work_id.nunique()
# Turn the series into a dataframe with a descriptive column name.
works_per_author = works_per_author.reset_index().rename(columns={'work_id': 'author_works_num'})
# Merge with the metadata dataframe created at the top of this notebook.
meta_df = pd.merge(metadata_df, works_per_author, on='author_id')

# Pivot the genre labels: one row per book, one 0/1 column per genre.
genremeta_df = (
    genremeta_df.groupby(['book_id', 'genres']).size()
    .unstack('genres').fillna(0)
    .reset_index()
)
# Attach genre labels, so number of works per author can be connected
# to the genre labels.
meta_df = pd.merge(meta_df, genremeta_df, on='book_id')
def author_works_dist(genre):
    """Distribution of works per author for one genre: a dataframe with
    columns author_works_num (number of works) and num_authors (how many
    authors wrote that many works)."""
    genre_meta = meta_df[meta_df[genre] == 1]
    genre_meta = genre_meta[['author_id', 'author_works_num']].drop_duplicates()
    counts = genre_meta.author_works_num.value_counts().sort_index()
    dist = counts.reset_index()
    return dist.rename(columns={'index': 'author_works_num', 'author_works_num': 'num_authors'})


fig = plt.figure(tight_layout=True, figsize=(12, 4))

# Left panel: absolute number of authors per number of works.
plt.subplot(1, 2, 1)
for genre in compare_genres:
    u = author_works_dist(genre)
    plt.scatter(u.author_works_num, u.num_authors, label=genre)
plt.xlabel('Number of works per author')
plt.ylabel('Number of authors')
plt.xscale('log')
plt.yscale('log')
plt.legend()

# Right panel: proportion of authors per number of works.
plt.subplot(1, 2, 2)
for genre in compare_genres:
    u = author_works_dist(genre)
    plt.scatter(u.author_works_num, u.num_authors / sum(u.num_authors), label=genre)
    # .iloc[0] instead of int(<one-element Series>): int() on a Series is
    # deprecated/removed in modern pandas.
    num_single_work_authors = int(u[u.author_works_num == 1].num_authors.iloc[0])
    print(f'{genre: <40}\tauthors: {sum(u.num_authors): >6}\tworks: {sum(u.author_works_num * u.num_authors): >8}\tsingle work authors: {num_single_work_authors: >6}')
plt.xlabel('Number of works per author')
plt.ylabel('Proportion of authors')
plt.xscale('log')
plt.yscale('log')
plt.legend()
In the entire dataset, there are 186,360 different authors of history, historical fiction, biography books, which is the same as the number of different authors for romance and mystery, thriller, crime combined. So one thing to take into account is that the history genre has many more distinct authors than the other two.
Above are two distribution plots. The first shows the number of works per author set off against the number of authors (how many authors have written X books). The second shows the number of works per author set off against the proportion of authors (what proportion of authors have written X books).
First, there are many more authors in the history genre who write only one book than in the other genres. In the plot on the left, it is clear that for all three genres, the majority of authors write only one or a few books. But the first blue dots are above the other colored dots, which means that there are many more history authors with few books. At the higher end, the long tail of highly prolific authors, there are few differences, as the colored dots overlap so much that only the green dots are visible. In other words, it is not that there are fewer prolific authors in the history genre than in the other two genres, but that there are many more history authors, and therefore, proportionally many more history authors with few books. This is what the plot on the right shows. From 4 works per author and more, the blue dots are well below the others, showing that the proportion of authors with 4 works or more is lower in the history genre than in the other genres.
from collections import Counter
from scripts.helper import ecdf

for genre in compare_genres:
    # Frequency of frequencies: for each number of reviews a book
    # received, count how many books received that many reviews.
    dist = Counter(int(count) for count in genre_df[genre].work_id.value_counts())
    x = dist.keys()
    y = list(dist.values())
    total = sum(y)
    y_prob = [count / total for count in y]
    plt.scatter(x, y_prob, label=genre)
plt.xscale('log')
plt.yscale('log')
plt.ylabel('Proportion of books')
plt.xlabel('Number of reviews per book')
plt.legend()
plt.show()
Above we see the number of reviews per book set off against the proportion of books. The three distributions show no big differences. In other words, book popularity behaves no differently between the three genres.
From the analyses of these three aspects (number of reviews per reviewer, per author and per book), we have learned that in the romance genre, there is a longer tail of more prolific reviewers than for the other genres, while the history genre has a higher peak of authors with just a single work. In other words, there is less overlap in authors in the history reviews, and there is more overlap of reviewers in the romance reviews. The romance genre is therefore less heterogeneous than the mystery, thriller, crime genre and especially than the history, historical fiction, biography genre.
# Number of reviews per book, publication year and review year (the year
# of date_updated); displayed as cell output, not stored.
review_df.groupby(['book_id', 'publication_year', review_df.date_updated.dt.year]).size()
for genre in compare_genres:
    # Restrict to a plausible range of publication years.
    genre_reviews = genre_df[genre]
    in_range = (genre_reviews.publication_year > 1950) & (genre_reviews.publication_year <= 2020)
    temp_df = genre_reviews[in_range]
    year_min = temp_df.publication_year.min()
    year_max = temp_df.publication_year.max()
    print(genre, year_min, year_max)
    temp_df.publication_year.value_counts().sort_index().plot(kind='bar', label=genre)
plt.legend()
for genre in compare_genres:
    # Years between a book's first publication and the review
    # (using the year of the review's last update).
    review_lag = genre_df[genre].date_updated.dt.year - genre_df[genre].first_publication_year
    dist = Counter(int(count) for count in review_lag.value_counts())
    x = dist.keys()
    y = list(dist.values())
    total = sum(y)
    y_prob = [count / total for count in y]
    plt.scatter(x, y_prob, label=genre)
plt.xscale('log')
plt.yscale('log')
plt.ylabel('Proportion of reviews')
plt.xlabel('Years between book publication and review')
plt.legend()
plt.show()
# Count reviews for books published between 1951 and 2020 ...
temp_df = review_df[(review_df.publication_year > 1950) & (review_df.publication_year <= 2020)]
#temp_df.publication_year.value_counts().sort_index().plot(kind='bar')
print(len(temp_df))
# ... and for books outside that range. NOTE(review): rows with a missing
# publication_year fail both comparisons, so they appear in neither count.
temp_df = review_df[(review_df.publication_year <= 1950) | (review_df.publication_year > 2020)]
#temp_df.publication_year.value_counts().sort_index().plot(kind='bar')
print(len(temp_df))
# Fixed seed so the sample is reproducible across notebook reruns.
random_seed = 1205921
sample_df = review_df.sample(100, random_state=random_seed)

from scripts.text_tail_analysis import write_docs_to_bin, read_docs_from_bin

#docs = [nlp(text) for text in reviews_en]
# The replace() arguments use double quotes: reusing single quotes inside
# a single-quoted f-string is a SyntaxError on Python < 3.12.
# NOTE(review): `genre` here is left over from an earlier loop, and `nlp`
# (a spaCy pipeline) is not defined in this notebook — confirm both are
# set as intended before running this cell.
nlp_docs_file = f'../data/review_spacy_docs.random_1M.genre-{genre.replace(" ", "_")}.sample-10000.seed-{random_seed}.docbin'
#write_docs_to_bin(docs, nlp_docs_file)
docs = read_docs_from_bin(nlp_docs_file, nlp)
# Count the surface form of every named entity across the sampled docs.
tf = Counter(entity.text for doc in docs for entity in doc.ents)
print('Total number of entities in the sample:', sum(tf.values()))
tf.most_common(50)