In [141]:
# The autoreload extension is only used while developing the helper scripts
# and can be removed once those modules are stable.
%reload_ext autoreload
%autoreload 2

# This is needed to add the repo dir to the path so jupyter
# can load the modules in the scripts directory from the notebooks
import os
import sys
repo_dir = os.path.split(os.getcwd())[0]
print(repo_dir)
if repo_dir not in sys.path:
    sys.path.append(repo_dir)
    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import csv

data_dir = '../data/GoodReads'

books_10k_file = os.path.join(data_dir, 'goodreads_reviews-books_above_10k_lang_reviews.csv.gz')
reviewers_5k_file = os.path.join(data_dir, 'goodreads_reviews-reviewers_above_5k_reviews.csv.gz')
random_1M_file = os.path.join(data_dir, 'goodreads_reviews-random_sample_1M_non_zero.csv.gz')
author_file = os.path.join(data_dir, 'goodreads_book_authors.csv.gz') # author information
book_file = os.path.join(data_dir, 'goodreads_books.csv.gz') # basic book metadata
/Users/marijnkoolen/Code/Huygens/scale
In [142]:
review_df = pd.read_csv(random_1M_file, sep='\t', compression='gzip')
print('Number of reviews in dataset:', len(review_df))
review_df = review_df[review_df.review_length > 0]
print('Number of non-zero-length reviews in dataset:', len(review_df))
Number of reviews in dataset: 1007600
In [143]:
import math

review_df['log_length'] = review_df.review_length.apply(lambda x: int(math.log(x)*4)/4)
review_df['length_bin'] = review_df.log_length.apply(math.exp)
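
To make this binning concrete, here is a small worked example (a sketch using the same quarter-log formula) of where a few review lengths end up: each length is log-transformed, rounded down to quarter-log steps, and mapped back to a character scale with exp.

In [ ]:
# worked example of the quarter-log binning used above (illustration only)
import math

for length in [100, 338, 1000, 20111]:
    log_length = int(math.log(length) * 4) / 4  # round down to quarter-log steps
    length_bin = math.exp(log_length)           # map the bin edge back to a character scale
    print(f'review length {length: >6} -> log_length {log_length: >5} -> length_bin {length_bin:.1f}')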
In [144]:
print('longest review:', review_df.review_length.max())
print('review length standard deviation:', review_df.review_length.std())
print('median review length:', np.median(review_df.review_length))

review_df.length_bin.value_counts().sort_index().plot(logx=True)
longest review: 20111
review length standard deviation: 1017.6698168086382
median review length: 338.0
Out[144]:
<AxesSubplot:>
In [145]:
from dateutil import tz
from dateutil.parser import parse

def parse_date(date_str):
    try:
        return parse(date_str).astimezone(utc)
    except TypeError:
        return None

utc = tz.gettz('UTC')

review_df['date_added'] = review_df.date_added.apply(parse_date)
review_df['date_updated'] = review_df.date_updated.apply(parse_date)
review_df['read_at'] = review_df.read_at.apply(parse_date)
review_df['started_at'] = review_df.started_at.apply(parse_date)
In [152]:
review_df.columns
Out[152]:
Index(['Unnamed: 0', 'user_id', 'book_id', 'review_id', 'rating', 'date_added',
       'date_updated', 'read_at', 'started_at', 'n_votes', 'n_comments',
       'review_length', 'review_text', 'log_length', 'length_bin', 'author_id',
       'work_id', 'title', 'author_name', 'children', 'comics, graphic',
       'fantasy, paranormal', 'fiction',
       'history, historical fiction, biography', 'mystery, thriller, crime',
       'non-fiction', 'poetry', 'romance', 'young-adult', 'author_freq',
       'book_freq'],
      dtype='object')
In [147]:
# get a list of book ids that are in the review dataset
review_book_ids = set(review_df.book_id.unique())
print('unique book ids:', len(review_book_ids))

# load basic book metadata (only book and author id and book title)
bookmeta_df = pd.read_csv(book_file, sep='\t', compression='gzip', usecols=['book_id', 'work_id', 'author_id', 'title'])
print('book metadata read')

# filter the book metadata to only the book ids in the review dataset
bookmeta_df = bookmeta_df[bookmeta_df.book_id.isin(review_book_ids)]
print('book metadata filtered')

# load the author metadata to get author names 
author_df = pd.read_csv(author_file, sep='\t', compression='gzip', usecols=['author_id', 'name'])
author_df = author_df.rename(columns={'name': 'author_name'})
print('author metadata read')

# merge the book and author metadata into a single dataframe, 
# keeping only author names for books in the review dataset
metadata_df = pd.merge(bookmeta_df, author_df, how='left')
print('book and author metadata merged')

# merge the review dataset with the book metadata
review_df = pd.merge(review_df, metadata_df, on='book_id')
print('review and book metadata merged')
unique book ids: 404059
book metadata read
book metadata filtered
author metadata read
book and author metadata merged
review and book metadata merged
In [148]:
genre_file = os.path.join(data_dir, 'goodreads_book_genres_initial.csv.gz') # book genre information

# load the per-book genre labels and keep only books that occur in the review dataset
genre_df = pd.read_csv(genre_file, sep='\t', compression='gzip')
genre_df = genre_df[genre_df.book_id.isin(review_book_ids)]
genre_df.genres.value_counts()

g = genre_df.groupby(['book_id', 'genres']).size()
print(len(g))
u_genres = g.unstack('genres').fillna(0)
u_genres = u_genres.reset_index()
print('number of books with genre information:', len(u_genres))

review_df = pd.merge(review_df, u_genres, on='book_id', how='left')

genres = list(u_genres.columns)[1:]
print(genres)

print('Number of reviews per genre:')
for genre in genres:
    print(f'{genre: <40}{len(review_df[review_df[genre] == 1]): >10}')

u_genres
1136144
number of books with genre information: 394888
['children', 'comics, graphic', 'fantasy, paranormal', 'fiction', 'history, historical fiction, biography', 'mystery, thriller, crime', 'non-fiction', 'poetry', 'romance', 'young-adult']
Number of reviews per genre:
children                                    127656
comics, graphic                              90404
fantasy, paranormal                         446156
fiction                                     918241
history, historical fiction, biography      354004
mystery, thriller, crime                    409302
non-fiction                                 199731
poetry                                       34223
romance                                     573330
young-adult                                 408106
Out[148]:
genres book_id children comics, graphic fantasy, paranormal fiction history, historical fiction, biography mystery, thriller, crime non-fiction poetry romance young-adult
0 1 1.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 1.0 1.0
1 2 1.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 1.0 1.0
2 3 1.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0
3 4 1.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0
4 5 1.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0
... ... ... ... ... ... ... ... ... ... ... ...
394883 36488099 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
394884 36494299 1.0 1.0 1.0 1.0 0.0 1.0 0.0 0.0 1.0 1.0
394885 36498328 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0
394886 36508486 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0
394887 36514196 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0

394888 rows × 11 columns

In [149]:
# add number of reviews per author for popular author selection
review_df['author_freq'] = review_df.groupby(['author_name'])['review_id'].transform('count')
review_df['book_freq'] = review_df.groupby(['book_id'])['review_id'].transform('count')
review_df
Out[149]:
Unnamed: 0 user_id book_id review_id rating date_added date_updated read_at started_at n_votes ... fantasy, paranormal fiction history, historical fiction, biography mystery, thriller, crime non-fiction poetry romance young-adult author_freq book_freq
0 0 8842281e1d1347389f2ab93d60773d4d 16981 a5d2c3628987712d0e05c4f90798eb67 3 2016-12-05 18:46:44+00:00 2017-03-22 18:37:04+00:00 None None 1 ... 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 47.0 39
1 8146 37d3651e895e35cd2818eb36d87c4367 16981 989d104e066141f8261d33d0ce64d7b5 4 2013-12-19 14:42:58+00:00 2013-12-26 17:16:02+00:00 2013-12-26 17:16:02+00:00 2013-12-19 08:00:00+00:00 0 ... 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 47.0 39
2 50004 a76cc6d83a5206d931a0064e2de99ae3 16981 61598fc0327e7ccf7099c1249128994f 5 2012-08-03 15:43:41+00:00 2012-11-24 23:36:41+00:00 2012-11-24 23:36:41+00:00 2012-11-20 08:00:00+00:00 0 ... 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 47.0 39
3 61923 2d712f5dba10518b7374de3cf946b993 16981 5924e2aaadfaf8fccb7dcaee289de15d 5 2012-08-21 02:32:11+00:00 2012-08-21 02:37:24+00:00 None None 0 ... 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 47.0 39
4 62258 c5287ace96293b661475e65d40bf00d3 16981 95e2bd7a5349dc18832250aad0896533 4 2014-01-13 00:36:30+00:00 2017-05-11 06:29:51+00:00 2017-05-12 06:29:51+00:00 2017-04-29 07:00:00+00:00 1 ... 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 47.0 39
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1107085 1008009 e223be160b89f218dbee70b5fbdccf76 20369388 7c1395ba0a319423707d8ffff79aeafc 5 2014-06-07 05:32:23+00:00 2014-08-26 21:07:08+00:00 2014-08-10 23:25:37+00:00 2013-12-25 08:00:00+00:00 0 ... NaN NaN NaN NaN NaN NaN NaN NaN 4.0 3
1107086 1008009 e223be160b89f218dbee70b5fbdccf76 20369388 7c1395ba0a319423707d8ffff79aeafc 5 2014-06-07 05:32:23+00:00 2014-08-26 21:07:08+00:00 2014-08-10 23:25:37+00:00 2013-12-25 08:00:00+00:00 0 ... NaN NaN NaN NaN NaN NaN NaN NaN 3.0 3
1107087 1008009 e223be160b89f218dbee70b5fbdccf76 20369388 7c1395ba0a319423707d8ffff79aeafc 5 2014-06-07 05:32:23+00:00 2014-08-26 21:07:08+00:00 2014-08-10 23:25:37+00:00 2013-12-25 08:00:00+00:00 0 ... NaN NaN NaN NaN NaN NaN NaN NaN 4.0 3
1107088 1008010 e223be160b89f218dbee70b5fbdccf76 18518801 d2ed77d013ca33fe0eaa9a4013b352c7 5 2013-09-19 09:49:29+00:00 2014-08-26 22:00:58+00:00 2014-08-13 06:38:26+00:00 2013-08-10 07:00:00+00:00 0 ... NaN NaN NaN NaN NaN NaN NaN NaN 4.0 2
1107089 1008010 e223be160b89f218dbee70b5fbdccf76 18518801 d2ed77d013ca33fe0eaa9a4013b352c7 5 2013-09-19 09:49:29+00:00 2014-08-26 22:00:58+00:00 2014-08-13 06:38:26+00:00 2013-08-10 07:00:00+00:00 0 ... NaN NaN NaN NaN NaN NaN NaN NaN 4.0 2

1107090 rows × 31 columns

In [155]:
print('number of distinct books:', review_df.book_id.nunique())
print('number of distinct works:', review_df.work_id.nunique())
number of distinct books: 397482
number of distinct works: 298169

The 1 million review dataset contains reviews for 397,482 distinct books and 298,169 distinct works. The difference between books and works is that different versions and editions of a work count as different books, while all of them are expressions or manifestations of the same intellectual work (there are many nuances here; for more on the distinctions between the work, expression, manifestation and item levels, see the Functional Requirements for Bibliographic Records (FRBR)).
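
As a quick illustration of the book/work distinction (a sketch using the merged review_df from above), we can count how many distinct book ids, i.e. editions, occur per work id:

In [ ]:
# count the number of distinct book editions per work in the review dataset
books_per_work = review_df.groupby('work_id')['book_id'].nunique().sort_values(ascending=False)
print('works with more than one reviewed edition:', (books_per_work > 1).sum())
books_per_work.head(10)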

The most frequently reviewed books come as no surprise:

In [151]:
review_df.groupby(['book_id', 'author_name', 'title']).size().sort_values()
Out[151]:
book_id   author_name           title                                  
14743435  Rebecca    Alexander  Dangerous Assignment                          1
18338982  Stephan Attia         The Children of Paradise                      1
18338980  Bohumil Hrabal        نظارت دقیق قطارها                             1
18338935  Matthew Quick         Forgive Me, Leonard Peacock                   1
18338904  Olivia Cunning        Double Time (Sinners on Tour, #5)             1
                                                                           ... 
5470      George Orwell         1984                                        794
7260188   Suzanne Collins       Mockingjay (The Hunger Games, #3)           829
22557272  Paula Hawkins         The Girl on the Train                       887
2767052   Suzanne Collins       The Hunger Games (The Hunger Games, #1)    1137
11870085  John Green            The Fault in Our Stars                     1363
Length: 457777, dtype: int64
In [156]:
review_df.groupby(['work_id', 'author_name', 'title']).size().sort_values()
Out[156]:
work_id   author_name      title                                  
40        Vance Packard    Hidden Persuaders                             1
24066494  Nicole  Banks    Into Pieces (Shattered Hearts, #2)            1
24066448  N.K. Smith       Are You Mine?                                 1
24066387  Tenaya Jayne     Forest Fire (The Legends of Regia, #2)        1
24066275  Matt Shaw        Consumed                                      1
                                                                      ... 
41107568  Paula Hawkins    The Girl on the Train                      1037
13306276  Gillian Flynn    Gone Girl                                  1133
13155899  Veronica Roth    Divergent (Divergent, #1)                  1152
2792775   Suzanne Collins  The Hunger Games (The Hunger Games, #1)    1307
16827462  John Green       The Fault in Our Stars                     1502
Length: 396887, dtype: int64
In [157]:
review_df.author_name.value_counts()
Out[157]:
J.K. Rowling          5921
Stephen King          5213
Cassandra Clare       4665
Neil Gaiman           3764
Sarah J. Maas         3436
                      ... 
Gloria Kamen             1
Heoin Bru                1
S.V. Shorts              1
Jeffrey O.G. Ogbar       1
Nerys Purchon            1
Name: author_name, Length: 145850, dtype: int64
In [139]:
review_df.groupby(['author_name'])['book_id'].nunique().sort_values()
Out[139]:
author_name
"Big" John McCarthy       1
M.S. Tarot                1
M.S. Spencer              1
M.S. Reese                1
M.S. Kapitsa              1
                       ... 
J.K. Rowling            517
James Patterson         593
Neil Gaiman             596
Agatha Christie         730
Stephen King           1013
Name: book_id, Length: 145850, dtype: int64
In [158]:
review_df.groupby(['author_name'])['work_id'].nunique().sort_values()
Out[158]:
author_name
"Big" John McCarthy      1
Marc Hofmann             1
Marc Hirsch              1
Marc Hillel              1
Marc Hillefeld           1
                      ... 
Stephen King           209
R.L. Stine             212
James Patterson        223
Anonymous              228
Nora Roberts           242
Name: work_id, Length: 145850, dtype: int64
In [37]:
plt.rcParams['figure.figsize'] = [15, 5]

# group all reviews by year and month that they were published
g = review_df.groupby([review_df.date_updated.dt.year, review_df.date_updated.dt.month]).size()
# plot the number of reviews per month as a bar chart
ax = g.plot(kind='bar')
# update the ticks on the x-axis so that they remain readable...
ax.set_xticks(range(len(g)));
# ... with only a tick label for January of each year
ax.set_xticklabels(["%s-%02d" % item if item[1] == 1 else '' for item in g.index.tolist()], rotation=90);
plt.gcf().autofmt_xdate()
plt.xlabel('Review month')
plt.ylabel('Number of reviews')
plt.show()
In [54]:
review_df.date_added.dt.month.value_counts().sort_index().plot(kind='bar')
Out[54]:
<AxesSubplot:>

January, July and August are the months with the most reviews. The January peak is probably a consequence of the Christmas holidays in the USA, where most of the reviewers in the Goodreads dataset are from. The July and August peaks are possibly due to summer holidays.

In [113]:
review_df.rating.value_counts().sort_index().plot(kind='bar')
Out[113]:
<AxesSubplot:>
In [125]:
g = review_df.groupby([review_df.date_updated.dt.year, 'rating']).size()

u = g.unstack('date_updated')
for year in u.columns:
    u[year] = u[year] / sum(u[year])
g = u.stack()
u = g.unstack('rating')
u.plot(kind='bar')

u
Out[125]:
rating 0 1 2 3 4 5
date_updated
2007 0.048246 0.045322 0.078034 0.220395 0.302449 0.305556
2008 0.042875 0.032043 0.082865 0.217261 0.313693 0.311263
2009 0.050650 0.035842 0.087796 0.229717 0.328518 0.267478
2010 0.050543 0.034725 0.083366 0.230796 0.333148 0.267422
2011 0.049351 0.028993 0.078967 0.220177 0.344099 0.278413
2012 0.038452 0.031554 0.074853 0.211019 0.341865 0.302257
2013 0.033521 0.028535 0.071468 0.201412 0.335722 0.329343
2014 0.029814 0.028692 0.069875 0.195384 0.331968 0.344267
2015 0.030089 0.027308 0.068839 0.193690 0.332502 0.347572
2016 0.038056 0.026552 0.066130 0.193052 0.329745 0.346465
2017 0.034391 0.026444 0.064822 0.188774 0.331966 0.353603

Rating behaviour hasn't changed much over the years. The proportion of positive reviews (4 and 5 stars) has grown from around 60% (30% for each of 4 and 5 stars) to almost 70%, with small drops in proportion for the lower ratings.

Note that with this highly aggregated view (with 1 million reviews for over 400,000 distinct books), we cannot see how this relates to the rating behaviour for individual books, reviewers, authors or genres. For that we need to focus on those individual entities.
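
As an illustration of what such a focus would look like, the sketch below (using review_df as above, with 'The Fault in Our Stars', book_id 11870085, purely as an example) compares the rating distribution of a single frequently reviewed book with the overall distribution:

In [ ]:
# compare the rating proportions of one frequently reviewed book with the overall proportions
book_ratings = review_df[review_df.book_id == 11870085].rating.value_counts(normalize=True).sort_index()
all_ratings = review_df.rating.value_counts(normalize=True).sort_index()
pd.DataFrame({'The Fault in Our Stars': book_ratings, 'all reviews': all_ratings}).fillna(0)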

In [159]:
from scripts.text_tail_analysis import get_dataframe_review_texts

random_seed = 1205921

sample_df = review_df.sample(100, random_state=random_seed)

review_texts = [review_text for review_text in get_dataframe_review_texts(sample_df)]
In [160]:
sample_df.book_id.value_counts()
Out[160]:
16237311    1
1140553     1
106550      1
4407        1
12471098    1
           ..
1618        1
18131118    1
10293725    1
59825       1
799232      1
Name: book_id, Length: 100, dtype: int64

All reviews are for different books. That is, in this sample there are no books with more than one review.

In [161]:
from langdetect import detect

reviews_en = [text for text in review_texts if detect(text) == 'en']
print('number of reviews:', len(review_texts))
print('number of reviews in English:', len(reviews_en))
number of reviews: 100
number of reviews in English: 91

There are 91 reviews in English.

In [162]:
from collections import Counter
import re

tf = Counter()
for text in reviews_en:
    # split the texts on any non-word characters
    words = re.split(r'\W+', text.strip())
    # count the number of times each word occurs across the review texts
    tf.update(words)

tf.most_common(20)
Out[162]:
[('the', 528),
 ('and', 338),
 ('I', 332),
 ('to', 271),
 ('a', 263),
 ('of', 263),
 ('in', 180),
 ('it', 168),
 ('is', 167),
 ('s', 136),
 ('that', 135),
 ('this', 133),
 ('was', 131),
 ('book', 120),
 ('with', 96),
 ('t', 89),
 ('for', 85),
 ('', 84),
 ('but', 74),
 ('The', 71)]
In [163]:
import spacy

# load the large model for English
nlp = spacy.load('en_core_web_lg')

# use nlp to parse each text and store the parsed results as a list of docs
docs = [nlp(text) for text in reviews_en]

# iterate over the docs, then over the entities in each doc and count them
tf = Counter([entity.text for doc in docs for entity in doc.ents])

tf.most_common()
Out[163]:
[('first', 26),
 ('one', 15),
 ('two', 15),
 ('Alice', 14),
 ('three', 10),
 ('Anne', 9),
 ('Baratte', 8),
 ('Louisa', 8),
 ('Iran', 6),
 ('second', 5),
 ('Mary Anne', 5),
 ("Mary Anne's", 5),
 ('5', 4),
 ('English', 4),
 ('third', 4),
 ('Asher', 4),
 ('Wiebe', 4),
 ('One', 4),
 ('these days', 4),
 ('Jean-Baptiste Baratte', 4),
 ('Les Innocents', 4),
 ('Wonderland', 4),
 ('Jamie', 4),
 ('Godwin', 4),
 ('Caleb', 4),
 ('Edward', 4),
 ('Marjane', 4),
 ('Dawn', 4),
 ('BSC', 4),
 ('half', 4),
 ('today', 3),
 ('Beck', 3),
 ('Anne Merchant', 3),
 ('Cania Christy', 3),
 ('Gaiman', 3),
 ('f**king', 3),
 ('iLove', 3),
 ('Simon', 3),
 ('John Grey', 3),
 ('John', 3),
 ('Bella', 3),
 ('32', 2),
 ('Scout', 2),
 ('Atticus', 2),
 ('Alabama', 2),
 ('fourth', 2),
 ('eight', 2),
 ('nine', 2),
 ('Katarina', 2),
 ('Allie', 2),
 ('COOPER', 2),
 ('Chelcie', 2),
 ('days', 2),
 ("Cania Christy's", 2),
 ('A Little Life', 2),
 ('Simenon', 2),
 ('12,000', 2),
 ('Yunior', 2),
 ('Diaz', 2),
 ('American', 2),
 ('Sammie', 2),
 ('Christian', 2),
 ('Sybill', 2),
 ('Versailles', 2),
 ('Walker', 2),
 ('Eve', 2),
 ('the 19th century', 2),
 ('Wes', 2),
 ('Elaine Fox', 2),
 ('Mary Kay McComas', 2),
 ('RC Ryan', 2),
 ('3', 2),
 ('2', 2),
 ('Grace', 2),
 ('NetGalley', 2),
 ('the Outlander series', 2),
 ('Claire', 2),
 ('America', 2),
 ('Outlander', 2),
 ('Killion', 2),
 ('Snyder', 2),
 ('Taylor', 2),
 ('Caleb Williams', 2),
 ('Falkland', 2),
 ('Kimberly', 2),
 ('Claudia', 2),
 ('Diana', 2),
 ('Ee', 2),
 ('raffe', 2),
 ('beliel', 2),
 ('penyrn', 2),
 ('Paige', 2),
 ('five', 2),
 ('Livingston Press', 2),
 ('Hal', 2),
 ('Lea Rachel', 1),
 ('34', 1),
 ("William Shakespeare's", 1),
 ('56', 1),
 ('58', 1),
 ('31', 1),
 ('ages 9-14', 1),
 ('7th', 1),
 ('Jem', 1),
 ('a couple of weeks', 1),
 ('Finish Short Trips', 1),
 ('Jac Rayner', 1),
 ('2003', 1),
 ('Terpsichore', 1),
 ('Teach Yourself Ballroom Dancing', 1),
 ('Robert Shearman', 1),
 ('Sixth', 1),
 ('Thalia', 1),
 ('The Brain of Socrates', 1),
 ('Gareth Roberts', 1),
 ('the Fourth Doctor and Leela', 1),
 ('Clio', 1),
 ('The Glass Princess', 1),
 ('Justin Richards', 1),
 ('Calliope', 1),
 ('Steve Lyons', 1),
 ('ARC', 1),
 ('Net Galley', 1),
 ('Crooked Lane Books', 1),
 ('Inspector Zhang', 1),
 ('Sherlock Holmes', 1),
 ('six', 1),
 ('3,5-4', 1),
 ('Ow Kanin', 1),
 ('Kanin', 1),
 ('3.5/5', 1),
 ('a 15 year old', 1),
 ('Corps Security', 1),
 ('the days', 1),
 ('Asher Cooper', 1),
 ('Chelcie Avery', 1),
 ('Sunshine', 1),
 ('Harper Sloan', 1),
 ('the Corps Security Family', 1),
 ('Tonje Glimmerdal', 1),
 ('this year', 1),
 ('Russian', 1),
 ('ITV Maigret', 1),
 ('the Three Widows Crossroads', 1),
 ('Maigret', 1),
 ('French', 1),
 ('ten', 1),
 ('Maigrets', 1),
 ('1931', 1),
 ('a few years', 1),
 ('World Book Club', 1),
 ('June 2013', 1),
 ('FYI', 1),
 ('Oscar Wao', 1),
 ('Dominican', 1),
 ('Rafa', 1),
 ('Spanish', 1),
 ('Lits', 1),
 ('Jane', 1),
 ('Addy', 1),
 ('the Selection series', 1),
 ('Jonathon', 1),
 ('Patty Smith', 1),
 ('Smith', 1),
 ('age 12', 1),
 ('2011', 1),
 ('Newbery', 1),
 ('Phillip', 1),
 ('Quinns', 1),
 ('Seth', 1),
 ('Christen', 1),
 ('Shawn', 1),
 ('1785', 1),
 ('France', 1),
 ('Voltaire', 1),
 ('Paris', 1),
 ('30', 1),
 ('Lecoeur', 1),
 ('day', 1),
 ('The year', 1),
 ('Adam', 1),
 ('Lazarus', 1),
 ('Andrew Miller', 1),
 ('Jennifer Cameron-Smith', 1),
 ('Just Another Judgement Day', 1),
 ('John Taylor', 1),
 ('Nightside', 1),
 ('Thorns', 1),
 ('Christianity', 1),
 ('The Book Pushers   ', 1),
 ('This year', 1),
 ('150th', 1),
 ('Geoffrey Maguire', 1),
 ('later this year', 1),
 ("the Mad Hatter's Tea Party", 1),
 ('Red Queen', 1),
 ('Tweedledum', 1),
 ('Tweedledee', 1),
 ('Dallas', 1),
 ('Roarke', 1),
 ('Wonderment in Death.', 1),
 ('41.5', 1),
 ('41', 1),
 ('1', 1),
 ('Naked', 1),
 ('Earth', 1),
 ('2050s', 1),
 ("Lewis Carroll's", 1),
 ('the Mad Hatter', 1),
 ("Mary Blayney's", 1),
 ('Blayney', 1),
 ('J.D. Robb', 1),
 ('Weston', 1),
 ('Alice Kemp', 1),
 ('their 21st century', 1),
 ('21st century', 1),
 ('the 21st', 1),
 ('Countess', 1),
 ('The 21st century', 1),
 ('4', 1),
 ('The Book of Bones', 1),
 ('Saturday', 1),
 ('evening', 1),
 ('320', 1),
 ('Harold', 1),
 ('Her, Fates and Furies', 1),
 ('the Secret Scripture', 1),
 ('LFL', 1),
 ('3 year old', 1),
 ('Victorian England', 1),
 ('Lucy Childs', 1),
 ('Wildthorn', 1),
 ('Eliza', 1),
 ('Beatrice', 1),
 ('Christmas', 1),
 ('sookie', 1),
 ('Shadows', 1),
 ('Fiona Skye', 1),
 ('Riley', 1),
 ('David', 1),
 ('Aine', 1),
 ('Orla', 1),
 ('Silver Shackles', 1),
 ('more than a week', 1),
 ('week', 1),
 ('Jamie Fraser', 1),
 ('The Scottish Prisoner', 1),
 ('German', 1),
 ('Europe', 1),
 ('London', 1),
 ('Germany', 1),
 ('British', 1),
 ('Tempting Mr. Wrong', 1),
 ('Lance', 1),
 ('komono', 1),
 ('Goodreads', 1),
 ('KonMari', 1),
 ('Shintoism', 1),
 ('Tom Killion', 1),
 ('PDA', 1),
 ('last summer', 1),
 ('Henry', 1),
 ('3.5', 1),
 ('One minute', 1),
 ('a few days', 1),
 ('FIRST', 1),
 ('HUUUGGEE', 1),
 ('Justice', 1),
 ('justice', 1),
 ('Hong Kong', 1),
 ('at least 2 years', 1),
 ('Dracula', 1),
 ('Twilight', 1),
 ('Isabella Swan', 1),
 ('Biology II', 1),
 ('Edward Cullen', 1),
 ("Stephenie Meyer's", 1),
 ('Pride', 1),
 ('Lizzy', 1),
 ('Darcy', 1),
 ('Canada', 1),
 ('July 2017', 1),
 ("Sherman Alexie's", 1),
 ('Marjane Strapi', 1),
 ('Iraq', 1),
 ('fourteen', 1),
 ('Austria', 1),
 ('Years', 1),
 ('1980s - 1990s', 1),
 ('Iranians', 1),
 ('Obraztsov', 1),
 ('Obraztsovs', 1),
 ('the Day', 1),
 ('Jenny Prezzioso', 1),
 ('Kristy', 1),
 ('Shillaber', 1),
 ('Stacey', 1),
 ('Cokie Mason', 1),
 ('Alma', 1),
 ('almost eleven years', 1),
 ('night', 1),
 ('Keep Out', 1),
 ('Pikes', 1),
 ('Catholic', 1),
 ('Jenny', 1),
 ('Shadow of the Night', 1),
 ('A Discovery of Witches', 1),
 ('Matthew', 1),
 ('Elizabethan', 1),
 ('the last couple of days', 1),
 ('tonight', 1),
 ('Richelle Mead', 1),
 ('Raffe', 1),
 ("penyrn's '", 1),
 ('Frankenstein', 1),
 ('Cheers', 1),
 ('Rise of Rome', 1),
 ('Anthony Everitt', 1),
 ('Rome', 1),
 ('another day', 1),
 ('Regency', 1),
 ('3.9', 1),
 ('The Book Description: An imprisoned child prince', 1),
 ('Roman', 1),
 ('Denny', 1),
 ('Flagstaff', 1),
 ('Arizona', 1),
 ('quarter-ton', 1),
 ('ten-year-old', 1),
 ('Yangchow', 1),
 ('Jin Nong', 1),
 ('1687-1763', 1),
 ('My Review', 1),
 ('Finish one', 1),
 ('a few seconds', 1),
 ("Lou Beach's", 1),
 ('420', 1),
 ('21 June', 1),
 ('UWA', 1),
 ('one dozen', 1),
 ('Station 22', 1),
 ('The University of West Alabama', 1),
 ('Livingston', 1),
 ('summer solstice', 1),
 ('June 21', 1),
 ('Kit', 1),
 ('BBC', 1),
 ('Zavac', 1),
 ('Lydia', 1),
 ('Stig', 1),
 ('The Strange and Beautiful Sorrows of Ava Lavender', 1),
 ('Polish', 1)]

With just 100 reviews, all for different books, it is perhaps surprising that the most frequent entities include very specific names that appear many times ('Alice', 'Anne', 'Baratte', 'Louisa').

But this becomes more understandable when we take into account that some reviews are long and repeat the same names of authors and characters many times. We can compensate for this by looking at the document frequency, i.e. the number of documents (each review is a document) in which a term occurs.

In [164]:
# count each entity at most once per review, i.e. document frequency instead of term frequency
df_ent = Counter([entity for doc in docs for entity in set([ent.text for ent in doc.ents])])

df_ent.most_common()
Out[164]:
[('first', 17),
 ('two', 12),
 ('one', 11),
 ('three', 8),
 ('second', 5),
 ('5', 4),
 ('third', 4),
 ('One', 4),
 ('these days', 4),
 ('today', 3),
 ('English', 3),
 ('fourth', 2),
 ('American', 2),
 ('3', 2),
 ('Louisa', 2),
 ('NetGalley', 2),
 ('America', 2),
 ('five', 2),
 ('half', 2),
 ('Lea Rachel', 1),
 ('56', 1),
 ('32', 1),
 ("William Shakespeare's", 1),
 ('31', 1),
 ('34', 1),
 ('58', 1),
 ('ages 9-14', 1),
 ('7th', 1),
 ('Jem', 1),
 ('a couple of weeks', 1),
 ('Alabama', 1),
 ('Scout', 1),
 ('Atticus', 1),
 ('eight', 1),
 ('Thalia', 1),
 ('The Glass Princess', 1),
 ('Sixth', 1),
 ('Justin Richards', 1),
 ('Katarina', 1),
 ('Calliope', 1),
 ('The Brain of Socrates', 1),
 ('the Fourth Doctor and Leela', 1),
 ('2003', 1),
 ('Gareth Roberts', 1),
 ('Robert Shearman', 1),
 ('Jac Rayner', 1),
 ('Clio', 1),
 ('Steve Lyons', 1),
 ('Teach Yourself Ballroom Dancing', 1),
 ('nine', 1),
 ('Finish Short Trips', 1),
 ('Terpsichore', 1),
 ('Net Galley', 1),
 ('Crooked Lane Books', 1),
 ('ARC', 1),
 ('Inspector Zhang', 1),
 ('Sherlock Holmes', 1),
 ('Allie', 1),
 ('Kanin', 1),
 ('3,5-4', 1),
 ('six', 1),
 ('Ow Kanin', 1),
 ('3.5/5', 1),
 ('a 15 year old', 1),
 ('Sunshine', 1),
 ('COOPER', 1),
 ('Asher Cooper', 1),
 ('Harper Sloan', 1),
 ('the days', 1),
 ('Chelcie Avery', 1),
 ('Chelcie', 1),
 ('the Corps Security Family', 1),
 ('Beck', 1),
 ('Corps Security', 1),
 ('Asher', 1),
 ('days', 1),
 ('this year', 1),
 ('Tonje Glimmerdal', 1),
 ('Russian', 1),
 ('Cania Christy', 1),
 ('Anne Merchant', 1),
 ("Cania Christy's", 1),
 ('Anne', 1),
 ('Wiebe', 1),
 ('A Little Life', 1),
 ('Simenon', 1),
 ('the Three Widows Crossroads', 1),
 ('1931', 1),
 ('Maigret', 1),
 ('ITV Maigret', 1),
 ('French', 1),
 ('ten', 1),
 ('a few years', 1),
 ('Maigrets', 1),
 ('June 2013', 1),
 ('World Book Club', 1),
 ('12,000', 1),
 ('Gaiman', 1),
 ('FYI', 1),
 ('Rafa', 1),
 ('Diaz', 1),
 ('Yunior', 1),
 ('Spanish', 1),
 ('Dominican', 1),
 ('Oscar Wao', 1),
 ('Lits', 1),
 ('Jane', 1),
 ('f**king', 1),
 ('Sammie', 1),
 ('Christian', 1),
 ('the Selection series', 1),
 ('Addy', 1),
 ('Jonathon', 1),
 ('Patty Smith', 1),
 ('Smith', 1),
 ('age 12', 1),
 ('2011', 1),
 ('Newbery', 1),
 ('Seth', 1),
 ('Sybill', 1),
 ('Phillip', 1),
 ('Quinns', 1),
 ('Shawn', 1),
 ('Christen', 1),
 ('Adam', 1),
 ('day', 1),
 ('Jean-Baptiste Baratte', 1),
 ('Paris', 1),
 ('The year', 1),
 ('Baratte', 1),
 ('1785', 1),
 ('France', 1),
 ('Versailles', 1),
 ('Les Innocents', 1),
 ('Voltaire', 1),
 ('Lazarus', 1),
 ('Andrew Miller', 1),
 ('Lecoeur', 1),
 ('Jennifer Cameron-Smith', 1),
 ('30', 1),
 ('Just Another Judgement Day', 1),
 ('Thorns', 1),
 ('Walker', 1),
 ('Christianity', 1),
 ('John Taylor', 1),
 ('Nightside', 1),
 ('1', 1),
 ("Lewis Carroll's", 1),
 ('Weston', 1),
 ('This year', 1),
 ('Earth', 1),
 ('J.D. Robb', 1),
 ('Mary Kay McComas', 1),
 ('Wes', 1),
 ('the 19th century', 1),
 ('Blayney', 1),
 ('iLove', 1),
 ('Red Queen', 1),
 ('the Mad Hatter', 1),
 ('The 21st century', 1),
 ('Tweedledum', 1),
 ("the Mad Hatter's Tea Party", 1),
 ('RC Ryan', 1),
 ("Mary Blayney's", 1),
 ('Elaine Fox', 1),
 ('Naked', 1),
 ('Wonderment in Death.', 1),
 ('Countess', 1),
 ('Alice', 1),
 ('41', 1),
 ('Eve', 1),
 ('Dallas', 1),
 ('Wonderland', 1),
 ('2050s', 1),
 ('the 21st', 1),
 ('The Book Pushers   ', 1),
 ('Geoffrey Maguire', 1),
 ('150th', 1),
 ('41.5', 1),
 ('Tweedledee', 1),
 ('21st century', 1),
 ('later this year', 1),
 ('Roarke', 1),
 ('their 21st century', 1),
 ('Alice Kemp', 1),
 ('2', 1),
 ('4', 1),
 ('The Book of Bones', 1),
 ('evening', 1),
 ('Simon', 1),
 ('Saturday', 1),
 ('320', 1),
 ('LFL', 1),
 ('Harold', 1),
 ('Her, Fates and Furies', 1),
 ('the Secret Scripture', 1),
 ('3 year old', 1),
 ('Victorian England', 1),
 ('Grace', 1),
 ('Wildthorn', 1),
 ('Eliza', 1),
 ('Lucy Childs', 1),
 ('Beatrice', 1),
 ('Christmas', 1),
 ('sookie', 1),
 ('Shadows', 1),
 ('Silver Shackles', 1),
 ('Aine', 1),
 ('Fiona Skye', 1),
 ('Riley', 1),
 ('Orla', 1),
 ('David', 1),
 ('more than a week', 1),
 ('Outlander', 1),
 ('London', 1),
 ('week', 1),
 ('British', 1),
 ('Jamie Fraser', 1),
 ('Europe', 1),
 ('The Scottish Prisoner', 1),
 ('Germany', 1),
 ('John', 1),
 ('the Outlander series', 1),
 ('German', 1),
 ('Jamie', 1),
 ('John Grey', 1),
 ('Claire', 1),
 ('Tempting Mr. Wrong', 1),
 ('Lance', 1),
 ('Shintoism', 1),
 ('Goodreads', 1),
 ('KonMari', 1),
 ('komono', 1),
 ('Snyder', 1),
 ('Tom Killion', 1),
 ('Killion', 1),
 ('Taylor', 1),
 ('Henry', 1),
 ('last summer', 1),
 ('PDA', 1),
 ('One minute', 1),
 ('FIRST', 1),
 ('HUUUGGEE', 1),
 ('3.5', 1),
 ('a few days', 1),
 ('justice', 1),
 ('Caleb Williams', 1),
 ('Godwin', 1),
 ('Justice', 1),
 ('Caleb', 1),
 ('Falkland', 1),
 ('Kimberly', 1),
 ('Hong Kong', 1),
 ('at least 2 years', 1),
 ('Bella', 1),
 ('Twilight', 1),
 ('Edward', 1),
 ('Edward Cullen', 1),
 ('Biology II', 1),
 ('Dracula', 1),
 ("Stephenie Meyer's", 1),
 ('Isabella Swan', 1),
 ('Canada', 1),
 ('Pride', 1),
 ('Darcy', 1),
 ('Lizzy', 1),
 ('July 2017', 1),
 ("Sherman Alexie's", 1),
 ('fourteen', 1),
 ('1980s - 1990s', 1),
 ('Iranians', 1),
 ('Years', 1),
 ('Marjane', 1),
 ('Marjane Strapi', 1),
 ('Austria', 1),
 ('Iran', 1),
 ('Iraq', 1),
 ('Obraztsovs', 1),
 ('Obraztsov', 1),
 ('almost eleven years', 1),
 ('the Day', 1),
 ('Shillaber', 1),
 ('Claudia', 1),
 ('night', 1),
 ('Mary Anne', 1),
 ('BSC', 1),
 ("Mary Anne's", 1),
 ('Cokie Mason', 1),
 ('Jenny', 1),
 ('Stacey', 1),
 ('Catholic', 1),
 ('Keep Out', 1),
 ('Alma', 1),
 ('Pikes', 1),
 ('Dawn', 1),
 ('Kristy', 1),
 ('Jenny Prezzioso', 1),
 ('Shadow of the Night', 1),
 ('tonight', 1),
 ('A Discovery of Witches', 1),
 ('Diana', 1),
 ('Elizabethan', 1),
 ('Matthew', 1),
 ('the last couple of days', 1),
 ('Richelle Mead', 1),
 ('Raffe', 1),
 ('Paige', 1),
 ('penyrn', 1),
 ('beliel', 1),
 ('Ee', 1),
 ('Frankenstein', 1),
 ('Cheers', 1),
 ('raffe', 1),
 ("penyrn's '", 1),
 ('Anthony Everitt', 1),
 ('Rise of Rome', 1),
 ('Rome', 1),
 ('another day', 1),
 ('Regency', 1),
 ('Arizona', 1),
 ('21 June', 1),
 ('quarter-ton', 1),
 ('Livingston Press', 1),
 ("Lou Beach's", 1),
 ('Station 22', 1),
 ('UWA', 1),
 ('1687-1763', 1),
 ('ten-year-old', 1),
 ('Yangchow', 1),
 ('The Book Description: An imprisoned child prince', 1),
 ('420', 1),
 ('Flagstaff', 1),
 ('June 21', 1),
 ('Denny', 1),
 ('Livingston', 1),
 ('one dozen', 1),
 ('a few seconds', 1),
 ('3.9', 1),
 ('My Review', 1),
 ('Roman', 1),
 ('Jin Nong', 1),
 ('The University of West Alabama', 1),
 ('Finish one', 1),
 ('summer solstice', 1),
 ('BBC', 1),
 ('Kit', 1),
 ('Stig', 1),
 ('Hal', 1),
 ('Lydia', 1),
 ('Zavac', 1),
 ('Polish', 1),
 ('The Strange and Beautiful Sorrows of Ava Lavender', 1)]

The names that remain near the top are mostly common terms in the book review domain ('NetGalley', 'ARC') and more common names of characters ('Louisa'). Moreover, even the highest document frequencies of such names are much lower than the corresponding term frequencies.

The high term frequency of specific names is a consequence of the scale at which we are looking at the random sample of reviews. With only 100 reviews, the influence of a single long review can and probably will be significant. As the number of randomly sampled reviews grows, the proportional contribution of individual reviews goes down. As we saw with the content analysis of reviews for popular books, zooming out has the effect of bringing the commonalities across reviews into focus.
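
To make this more concrete, the sketch below (using review_df and random_seed as defined above) shows how the share of the single longest review in the total number of characters shrinks as the sample grows:

In [ ]:
# the proportional contribution of the single longest review shrinks as the sample grows
for sample_size in [100, 1000, 10000]:
    sample = review_df.sample(sample_size, random_state=random_seed)
    share = sample.review_length.max() / sample.review_length.sum()
    print(f'sample of {sample_size: >6} reviews: longest review contributes {share:.1%} of all characters')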

If all reviews are of the same book (that is, the focus is on the book), the commonalities will probably contain names of the author and characters and other plot-related aspects. If all reviews are of books in a specific genre (e.g. romance, or mystery, crime and thriller), the commonalities will probably include genre tropes as well as generic aspects of books like narrative (story, plot, characters) and writing style (phrasing, pace, tone and mood). If all reviews are of different books across a wide range of genres, the influence of individual books and genres is drowned out in the heterogeneity of the selection, and the focus shifts to what is common across book reviews in general, which are aspects of writing style and quality.

In [187]:
from collections import Counter
import re

from scripts.text_tail_analysis import get_dataframe_review_texts
from langdetect.lang_detect_exception import LangDetectException

def detect_lang(text):
    try:
        return detect(text)
    except LangDetectException:
        return None

random_seed = 1205921

sample_df = review_df.sample(10000, random_state=random_seed)

review_texts = [review_text for review_text in get_dataframe_review_texts(sample_df)]
reviews_en = [text for text in review_texts if len(text) > 0 and detect_lang(text) == 'en']
print('number of reviews in sample:', len(review_texts))
print('number of English reviews in sample:', len(reviews_en))
print('number of reviewed books in sample:', sample_df.book_id.nunique())

tf = Counter()
for text in reviews_en:
    # split the texts on any non-word characters
    words = re.split(r'\W+', text.strip())
    # count the number of times each word occurs across the review texts
    tf.update(words)

tf.most_common(20)
number of reviews in sample: 10000
number of English reviews in sample: 8790
number of reviewed books in sample: 8866
Out[187]:
[('the', 50859),
 ('and', 35437),
 ('I', 31695),
 ('to', 29932),
 ('a', 29613),
 ('of', 27066),
 ('is', 16944),
 ('in', 16481),
 ('it', 15034),
 ('that', 14894),
 ('was', 12994),
 ('this', 12178),
 ('s', 11826),
 ('book', 11401),
 ('for', 9643),
 ('with', 8883),
 ('', 8264),
 ('t', 8251),
 ('her', 8099),
 ('but', 7980)]

With 10,000 reviews, the top 20 terms start to look more like a common stopword list, with just the domain stopword 'book' in there as well. If we sampled ever more reviews, the list of most frequent words would get ever closer to existing stopword lists.

Note that these reviews are for 8,866 different books, so the vast majority of books will have only a single review. In other words, to the extent that there is overlap between reviews, it mostly comes from commonalities across a large set of randomly selected books.
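
One way to check this (a sketch, using the tf counter from the cell above and spaCy's built-in English stopword list) is to count how many of the top terms are stopwords:

In [ ]:
# check how many of the most frequent terms are in spaCy's English stopword list
from spacy.lang.en.stop_words import STOP_WORDS

top_terms = [term for term, freq in tf.most_common(20)]
top_stopwords = [term for term in top_terms if term.lower() in STOP_WORDS]
print(f'{len(top_stopwords)} of the top 20 terms are stopwords:', top_stopwords)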

In [265]:
from scripts.text_tail_analysis import write_docs_to_bin, read_docs_from_bin
# parsing 10,000 reviews with spaCy takes a while, so the parsed docs were serialised
# to disk once (see the commented-out lines) and are reloaded from the DocBin file here
#docs = [nlp(text) for text in reviews_en]
nlp_docs_file = f'../data/review_spacy_docs.random_1M.sample-10000.seed-{random_seed}.docbin'

#write_docs_to_bin(docs, nlp_docs_file)

docs = read_docs_from_bin(nlp_docs_file, nlp)

# iterate over the docs, then over the entities in each doc and count them
tf = Counter([entity.text for doc in docs for entity in doc.ents])

print('Total number of entities in the sample:', sum(tf.values()))
tf.most_common(50)
Total number of entities in the sample: 54675
Out[265]:
[('first', 1853),
 ('one', 1561),
 ('two', 1156),
 ('second', 486),
 ('One', 339),
 ('three', 330),
 ('5', 299),
 ('3', 246),
 ('2', 227),
 ('4', 207),
 ('third', 172),
 ('3.5', 156),
 ('four', 154),
 ('Jack', 137),
 ('Sam', 135),
 ('five', 135),
 ('today', 135),
 ('half', 130),
 ('First', 129),
 ('American', 124),
 ('1', 119),
 ('Alex', 107),
 ('4.5', 105),
 ('Christmas', 101),
 ('English', 100),
 ('Kate', 87),
 ('America', 86),
 ('years', 84),
 ('French', 81),
 ('Anna', 80),
 ('Emma', 79),
 ('London', 77),
 ('POV', 73),
 ('ARC', 72),
 ('Rachel', 70),
 ('Christian', 69),
 ('Grace', 68),
 ('Ben', 67),
 ('Max', 66),
 ('Jake', 64),
 ('NetGalley', 62),
 ('Nick', 60),
 ('Harry', 60),
 ('Two', 58),
 ('Tom', 58),
 ('British', 57),
 ('Elizabeth', 55),
 ('Harry Potter', 55),
 ('England', 55),
 ('Jamie', 54)]

The most frequent entities are now very generic, including numbers, common person names, geographic locations and very popular books ('Harry Potter'). But note that even the common person names 'Jack' and 'Sam' occur only 137 and 135 times respectively, in 10,000 reviews and 54,675 entities.

If we shift from term frequency to document frequency, the person names drop even further:

In [175]:
df_ent = Counter([entity for doc in docs for entity in set([ent.text for ent in doc.ents])])

df_ent.most_common(50)
Out[175]:
[('first', 1382),
 ('one', 1192),
 ('two', 867),
 ('second', 429),
 ('One', 299),
 ('three', 277),
 ('5', 242),
 ('2', 205),
 ('3', 203),
 ('4', 180),
 ('third', 151),
 ('3.5', 147),
 ('four', 128),
 ('First', 124),
 ('today', 120),
 ('five', 118),
 ('half', 115),
 ('1', 108),
 ('American', 94),
 ('4.5', 94),
 ('English', 85),
 ('years', 81),
 ('ARC', 69),
 ('London', 67),
 ('POV', 65),
 ('Jack', 63),
 ('America', 61),
 ('Christmas', 59),
 ('NetGalley', 58),
 ('French', 58),
 ('Two', 53),
 ('the day', 52),
 ('Christian', 49),
 ('one day', 48),
 ('England', 48),
 ('British', 47),
 ('Sam', 46),
 ('2.5', 44),
 ('fourth', 44),
 ('Netgalley', 43),
 ('Harry Potter', 43),
 ('Goodreads', 42),
 ('HEA', 42),
 ('the years', 40),
 ('summer', 39),
 ('this year', 36),
 ('New York', 36),
 ('Three', 36),
 ('Alex', 36),
 ('Emma', 35)]
In [179]:
tf_word = Counter([token.text for doc in docs for token in doc if not token.is_stop and not token.is_punct])

print('Number of words:', sum(tf_word.values()))
print('Number of distinct words:', len(tf_word.keys()))
Number of words: 514791
Number of distinct words: 44024
In [180]:
sizes = [10, 20, 100, 200]
for size in sizes:
    sum_top = sum([freq for term, freq in tf_word.most_common(size)])
    print(f'Sum frequency of top {size} terms: {sum_top} (fraction: {sum_top / sum(tf_word.values()): >.2f})')
Sum frequency of top 10 terms: 55996 (fraction: 0.11)
Sum frequency of top 20 terms: 73309 (fraction: 0.14)
Sum frequency of top 100 terms: 137086 (fraction: 0.27)
Sum frequency of top 200 terms: 175572 (fraction: 0.34)

The proportion of the top terms is lower in a heterogeneous set of reviews than in a set of reviews focused on a single book. In the content analysis of reviews for popular books we found 27,632 distinct words, versus 44,024 in these more heterogeneous reviews. Furthermore, the top 10 terms represent 16% of all word occurrences in reviews for a single popular book, while in this selection it is 11%. For the top 200 terms, the numbers are 54% and 34% respectively. In other words, the wider focus of the random selection also leads to a larger vocabulary that is less skewed. (Less skewed because the total number of words is not so different: the 10,000 single-book reviews contain 487,298 words, while the 10,000 randomly sampled reviews contain 514,791 words.)

Long Tails and Classification

In [184]:
from collections import defaultdict
from scripts.text_tail_analysis import show_pos_tail_distribution

tf_lemma_pos = Counter([(token.lemma_, token.pos_) for doc in docs for token in doc if not token.is_stop and not token.is_punct])

show_pos_tail_distribution(tf_lemma_pos)
Word form	All TF (frac)	TF <= 5 (frac)	TF = 1 (frac)
------------------------------------------------------------
ADJ       	   78737  0.15	  6784  0.13	  2017  0.11
VERB      	  123434  0.24	  4939   0.1	  1384  0.08
NOUN      	  197692  0.38	 13313  0.26	  4213  0.23
PROPN     	   61135  0.12	 22795  0.44	  8893  0.49
ADV       	   23982  0.05	  1621  0.03	   573  0.03
SPACE     	   16429  0.03	     0   0.0	     0   0.0
INTJ      	    2703  0.01	   395  0.01	   172  0.01
NUM       	    4878  0.01	   856  0.02	   397  0.02
ADP       	     543  0.00	    85   0.0	    44   0.0
SCONJ     	    3220  0.01	     5   0.0	     0   0.0
PRON      	     197  0.00	    45   0.0	    32   0.0
X         	     957  0.00	   539  0.01	   377  0.02
CCONJ     	     205  0.00	    16   0.0	     9   0.0
PUNCT     	     329  0.00	   195   0.0	   132  0.01
PART      	     170  0.00	     4   0.0	     4   0.0
DET       	      99  0.00	    42   0.0	    27   0.0
SYM       	      81  0.00	     6   0.0	     3   0.0

We see now that in the tail, almost half of all single-occurrence terms are proper nouns. With 8,866 different books being reviewed, there are probably many different names of characters, places, events and other entities. It is also possible that some non-English reviews were misclassified as English; spaCy then does not recognize their words as English words and classifies many or most of them as proper nouns.

We should inspect the tail to see what is going on.

In [192]:
from scripts.text_tail_analysis import show_tail_lemmas

show_tail_lemmas(tf_lemma_pos, tf_threshold=1, pos='PROPN', num_lemmas=100)
night           Amis            Limbo           thy             eARCs           
Encyclopaedists Kovite          Bookworms       Balicki         Balicki(father  
Switzerland     .This           luke            Sidman          Summerstoke     
Merfield        Marsh           Isabell         Garnett         Semple          
Timby           LaMar           Poptropica      Bennig          Seymour         
Shubin          Thrillogy       Anansi          kakamochi       Punahou         
Lea             AngelWay        Rina            Arlington       Darkwing        
Blane           Mickelson       Matterhorn      R.A.            Mielikki        
Lah             hackeysack      Beijing         atmospherey     Hovercars       
Agricorps       Bandomreer      Bandomeer       Thibault        connected       
Shape           WW              eDiets          Heigl           Taboo           
Berris          Norris          coq             au              vin             
Timeless        Scarbrough      Louisville      Silvey          Kenny           
livingston      Seagull         Woofs           Tease           New Mexico      
Silber          Romania         Aubrey          Barak           Stiltskin       
Python          contro          mostro          Avevo           neppure         
fossi           procurato       iniziato        leggerlo        perche          
incuriosita     descrizione     realta          storia          narrata         
ben             poco            quarta          copertina       descrive        
solo            dettaglio       intrecciano     romanzo         minimo          

There are indeed many names of places, organizations and persons or fictional characters. But there are also a lot of non-English terms in here, with the last 20 or so probably being Italian. As noted above, it is possible that some reviews were misclassified in terms of language, but another possibility is that reviews contain quotes or passages in different languages.
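
One way to probe this (a sketch using langdetect's detect_langs, which returns a probability per candidate language) is to look at the language probabilities for the reviews that contain one of these Italian tail terms; a review mixing Italian and English can still come out as predominantly English:

In [ ]:
# show the per-language probabilities for reviews containing the Italian word 'dettaglio'
from langdetect import detect_langs

for doc in docs:
    if 'dettaglio' in doc.text:
        print(detect_langs(doc.text))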

Let's look at one of the reviews with these Italian terms, e.g. the one containing the word 'dettaglio':

In [263]:
for doc in docs:
    for token in doc:
        if token.text == 'dettaglio':
            print(doc)
Scroll down for the English version.   La lotta contro il mostro   Avevo da un po' di tempo questo libro, non so neppure come me lo fossi procurato, e ho iniziato a leggerlo perche incuriosita dalla descrizione.   In realta, e direi anche per fortuna, la storia narrata ha ben poco a che vedere con la quarta di copertina, che descrive solo un dettaglio di una delle storie che si intrecciano in questo bellissimo romanzo, ma soprattutto non fa il minimo accenno all'argomento principale che lo anima: l'alcolismo.   Un medico, un operaio, un sacerdote, la giovane rampolla di una famiglia facoltosa, persone diverse che si incontrano all'interno della storia proprio a causa della loro dipendenza dall'alcol.   L'autore entra nella mente dell'alcolista e riesce a mostrare al lettore quale filo di pensieri spinge il primo a tornare bere, anche dopo essere stato malissimo e aver giurato se stesso che avrebbe smesso, anche se questo significa trascurare le persone che ama e che l'amano, anche se puo portarlo a un passo dalla morte, anche se cio costringe lui e la sua famiglia alla poverta, anche se sa perfettamente il motivo di questa sua malattia, tanto da essere in grado di consigliare altri come lui.   L'alcolismo e il mostro che controlla i protagonisti di questo romanzo, che li accompagna nella loro discesa all'inferno. Alcuni non ce la fanno e vengono sconfitti, soprattutto se non hanno nessuno cui affidarsi. Altri, che hanno la fortuna di poter contare sui propri cari, trovano la forza, o almeno obbligano se stessi a trovarla, per combatterlo, e magari vincerlo.   La meravigliosa prosa di Konsalik scorre tra disperazione, ironia e speranza, tra una lacrima e un sorriso, finche arrivi alla fine con la sensazione di aver ricevuto un dono.   The fight against the monster   I've had this book for some time, I don't even know how I got it, and I started reading it because I was intrigued by the description.   In fact, and I would say fortunately, the narrated story has little to do with the back cover, which describes only a detail of one of the stories that intertwine in this beautiful novel, but mostly it does not make the slightest reference to the main theme: alcoholism.   A doctor, a worker, a priest, the young scion of a wealthy family, different people meet in the story precisely because of their addiction to alcohol.   The author enters the mind of the alcoholic and manages to show the reader what train of thought leads the former to drink again, even after feeling very bad and having sworn to themselves that they would stop, even if it means neglecting the people they love and that love them, although it may take them to the brink of death, even if it forces them and their family out of poverty, even though they perfectly know the reason for their illness, as to be able to advise others like them.   Alcoholism is the monster that controls the protagonists of this novel, accompanying them in their descent into hell. Some don't make it and are defeated, especially if they have no one to rely on. Others, who have the good fortune to be able to count on their loved ones, find the strength, or at least oblige themselves to find it, to fight the monster, and maybe win.   The wonderful prose of Konsalik flows between despair, humour, and hope, between a tear and a smile, until you reach the end with the feeling of having received a gift.

We see another reason for the occurrence of non-English terms: reviews that contain versions of the text in multiple languages. As is typical of user-generated content on the web, at large scale the variation in contributions is enormous [1]. With so many different people contributing, each in their own way, variation grows as the number of contributions grows.

[1] X. Ochoa, E. Duval, Quantitative analysis of user-generated content on the web, 2008.

Reviews of Prolific Reviewers

Let's look at the contributing reviewers. We expect most reviewers to contribute only a single review, but no doubt there are a small number of highly prolific reviewers contributing hundreds of reviews.

In [195]:
review_df.user_id.value_counts()
Out[195]:
a2d6dd1685e5aa0a72c9410f8f55e056    1474
459a6c4decf925aedd08e45045c0d8c6     716
4922591667fd3e8adc0c5e3d42cf557a     710
843a44e2499ba9362b47a089b0b0ce75     666
dd9785b14664103617304996541ed77a     610
                                    ... 
1bf135f904e8a5f40155668044764afb       1
b0fdb686313d58bd622a0852aad2c4a2       1
a4e803cd1c6677c75abd23ad70740a07       1
0e081be86e69d2a3ecc95a655d31cc5f       1
785e5b3ab68616ed228e02dd74363824       1
Name: user_id, Length: 205264, dtype: int64

In this random sample of 1 million reviews (which is only a fraction of the 15 million reviews that were crawled, which in turn is only a fraction of the over 90 million reviews on Goodreads [2]), there is a reviewer with a staggering 1,474 reviews.

[2] Goodreads - About us (Accessed 2020-08-04).

In [196]:
reviewer = 'a2d6dd1685e5aa0a72c9410f8f55e056'

reviewer_df = review_df[review_df.user_id == reviewer]
reviewer_df.rating.value_counts().sort_index()
Out[196]:
0    1474
Name: rating, dtype: int64

This reviewer provides no ratings: all 1,474 reviews have rating 0. Let's look at the temporal distribution of the reviews.

In [204]:
#reviewer_df.date_updated.dt.year.value_counts().sort_index().plot(kind='bar')

# group all reviews by year and month that they were published
g = reviewer_df.groupby([reviewer_df.date_updated.dt.year, reviewer_df.date_updated.dt.month]).size()
# plot the number of reviews per month as a bar chart
ax = g.plot(kind='bar')
# update the ticks on the x-axis so that they remain readable...
ax.set_xticks(range(len(g)));
# ... with only a tick label for January of each year
ax.set_xticklabels(["%s-%02d" % item if item[1] == 1 else '' for item in g.index.tolist()], rotation=90);
plt.gcf().autofmt_xdate()
plt.xlabel('Review month')
plt.ylabel('Number of reviews')
plt.show()

This reviewer has been contributing reviews since 2011, with many months where they contributed more than 20 reviews and a peak in 2016 with over 100 reviews.

Let's look at the length of these reviews.

In [213]:
reviewer_df.review_length.hist(bins=50)
Out[213]:
<AxesSubplot:>

Almost all reviews are only one or two characters in length. What is going on here?

In [215]:
reviewer_df[reviewer_df.review_length == 1].review_text.value_counts()
Out[215]:
O    305
A    301
E    236
F      1
Name: review_text, dtype: int64
In [216]:
reviewer_df[reviewer_df.review_length == 2].review_text.value_counts()
Out[216]:
SM    535
Name: review_text, dtype: int64

These look like idiosyncratic codes that make sense to the reviewer, but not necessarily to many others. It might be that they reflect ratings. This demonstrates again the enormous variation that is typically found in user-generated content.

Let's look for prolific reviewers who write longer reviews:

In [234]:
review_df[review_df.review_length > 100].user_id.value_counts()
Out[234]:
843a44e2499ba9362b47a089b0b0ce75    660
9003d274774f4c47e62f77600b08ac1d    520
dd9785b14664103617304996541ed77a    486
b7772313835ce6257a3fbe7ad2649a29    449
8bb031b637de69eba020a8a466d1110b    377
                                   ... 
725bd65f3662a4c5325ed47617f870fe      1
862ce18aeb9e74094084ae2480d1f464      1
0dc500dd128b0e21ceb78586149e24a2      1
bfecfc020966f12c52edf574d094509a      1
571381fc614f55e6dc0f50868a47fb38      1
Name: user_id, Length: 172242, dtype: int64
In [251]:
reviewer = '843a44e2499ba9362b47a089b0b0ce75'

reviewer_df = review_df[review_df.user_id == reviewer]

print('reviewer number of reviews:', len(reviewer_df))
print('reviewer ratings:')
reviewer_df.rating.value_counts().sort_index()
reviewer number of reviews: 666
reviewer ratings:
Out[251]:
1     62
2    132
3    344
4    114
5     14
Name: rating, dtype: int64
In [237]:
reviewer_df.length_bin.value_counts().sort_index().plot(logx=True)
Out[237]:
<AxesSubplot:>
In [238]:
reviewer_texts = [review_text for review_text in get_dataframe_review_texts(reviewer_df)]
reviewer_texts_en = [text for text in reviewer_texts if len(text) > 0 and detect_lang(text) == 'en']

reviewer_docs = [nlp(text) for text in reviewer_texts_en]
In [253]:
tf_word = Counter([token.text for doc in reviewer_docs for token in doc if not token.is_stop and not token.is_punct])


print('Number of total words (tokens):', sum(tf_word.values()))
print('Number of distinct words (types):', len(tf_word.keys()))

tf_word.most_common(20)
Number of total words (tokens): 34719
Number of distinct words (types): 5062
Out[253]:
[('  ', 1035),
 ('book', 753),
 ('like', 562),
 ('things', 455),
 ('story', 402),
 ('liked', 388),
 ('loved', 336),
 ('think', 255),
 ('way', 242),
 ('end', 238),
 ('time', 237),
 ('great', 214),
 ('sure', 207),
 ('know', 204),
 ('found', 196),
 ('going', 195),
 ('people', 188),
 ('bad', 182),
 ('series', 182),
 ('interesting', 179)]
In [254]:
tf_ent = Counter([ent.text for doc in reviewer_docs for ent in doc.ents])

print('Number of total entities (tokens):', sum(tf_ent.values()))
print('Number of distinct entities (types):', len(tf_ent.keys()))


tf_ent.most_common()
Number of total entities (tokens): 4310
Number of distinct entities (types): 1155
Out[254]:
[('first', 139),
 ('one', 118),
 ('Warren', 48),
 ('2', 46),
 ('Kyle', 40),
 ('two', 35),
 ('Rick', 34),
 ('Hardin', 32),
 ('Alex', 30),
 ('HEA', 26),
 ('Alice', 26),
 ('Lucas', 24),
 ('Sam', 23),
 ('half', 23),
 ('Kelley Armstrong', 23),
 ('Jill', 22),
 ('Ian', 21),
 ('Lily', 21),
 ('Claire', 21),
 ('Marr', 20),
 ('Liam', 19),
 ('Nick', 19),
 ('Anna', 19),
 ('Sara', 19),
 ('Helen', 19),
 ('Gabe', 19),
 ('Sookie', 19),
 ('3', 18),
 ('Lucy', 18),
 ('Vaughn', 17),
 ('second', 16),
 ('Cole', 16),
 ('Cam', 16),
 ('Charlaine Harris', 16),
 ('Carrie Vaughn', 16),
 ('MLN Hanover', 16),
 ('Patricia Briggs', 16),
 ('Harris', 16),
 ('Brigg', 16),
 ('Ben', 15),
 ('Logan', 15),
 ('Shane', 15),
 ('Jody', 15),
 ('Hannah', 14),
 ('Rafe', 14),
 ('Jack', 14),
 ('Max', 14),
 ('Mike', 14),
 ('Seth', 14),
 ('Eve', 13),
 ('Rory', 13),
 ('Emma', 13),
 ('Sean', 13),
 ('Eric', 13),
 ('Val', 13),
 ('Damien', 13),
 ('Maggie', 12),
 ('Molly', 12),
 ('Matt', 12),
 ('Becky', 12),
 ('George', 12),
 ('Amy', 11),
 ('Charlie', 11),
 ('Piper', 11),
 ('Six', 11),
 ('The Strong Silent Type (East Coast 8', 11),
 ('0.5', 11),
 ('Elle Kennedy', 11),
 ('Quinn', 11),
 ('Adam', 11),
 ('Rachel', 10),
 ('Gabriel', 10),
 ('4', 10),
 ('Vlad', 10),
 ('Amelia', 10),
 ('Dylan', 10),
 ('Sarah', 10),
 ('Desi', 10),
 ('Cal', 10),
 ('Moira', 10),
 ("Allison Brennan's", 10),
 ("Lori Armstrong's", 10),
 ("Julie Collin's", 10),
 ("Sylvia Day's", 10),
 ('Bared to You', 10),
 ("Lorelei James's", 10),
 ('Leo', 9),
 ('Chris', 9),
 ('Letty', 9),
 ('Avery', 9),
 ('One', 9),
 ('Lincoln', 9),
 ('Kade', 9),
 ('Zach', 9),
 ('Xander', 9),
 ('Kate', 9),
 ('Jake', 8),
 ('Kelsey', 8),
 ('Liz', 8),
 ('Amanda', 8),
 ('Stephanie', 8),
 ('Miranda', 8),
 ('Adeline', 8),
 ('Cade', 8),
 ('Bren', 8),
 ('Cara', 8),
 ('Luc', 8),
 ('Elena', 8),
 ('Eveline', 8),
 ('Millie', 8),
 ('Maddie', 8),
 ('Zoya', 8),
 ('Holly', 8),
 ('Lauren', 8),
 ('Mari', 8),
 ('Dean', 8),
 ('Mary', 8),
 ('Addie', 8),
 ('Aerin', 8),
 ('Teirra', 8),
 ('Aidan', 8),
 ('Anya', 8),
 ('Marissa', 8),
 ('Circe', 8),
 ('Deacon', 8),
 ('Reseph', 8),
 ('Lydia', 7),
 ('Jane', 7),
 ('Becca', 7),
 ('FBI', 7),
 ('Anita', 7),
 ('Josh', 7),
 ('Ren', 7),
 ('today', 7),
 ('Josie', 7),
 ('Tom', 7),
 ('Jules', 7),
 ('Nathan', 7),
 ('Ty', 7),
 ('Cyn', 7),
 ('Graeme', 7),
 ('Joe', 7),
 ('Valentin', 7),
 ('Layla', 7),
 ('Jeremy', 7),
 ('Kara', 7),
 ('Cheyenne', 7),
 ('Leila', 7),
 ('Jace', 7),
 ('Eliza', 7),
 ('Serena', 7),
 ('Sophia', 7),
 ('Darian', 7),
 ('Paul', 7),
 ('Maura', 7),
 ('Archie', 7),
 ('Ellie', 6),
 ('Lisbeth', 6),
 ('Alan', 6),
 ('Tyler', 6),
 ('Noah', 6),
 ('Jet', 6),
 ('Colt', 6),
 ('Parker', 6),
 ('Patrick', 6),
 ('Kylie', 6),
 ('Natalie', 6),
 ('Luke', 6),
 ('Tal', 6),
 ('Zack', 6),
 ('Joanna', 6),
 ('Elaina', 6),
 ('Violet', 6),
 ('Camryn', 6),
 ('Andrew', 6),
 ('Mick', 6),
 ('Jenna', 6),
 ('Christos', 6),
 ('First', 6),
 ('Peter', 6),
 ('Savannah', 6),
 ('Mac', 6),
 ('Meryn', 6),
 ('Caine', 6),
 ('Isabella', 6),
 ('Isla', 6),
 ('Eagle', 6),
 ('Xcor', 6),
 ('Clay', 6),
 ('Allison', 6),
 ('Khaki', 6),
 ('Jackson', 6),
 ('Charlotte', 6),
 ('Tristan', 6),
 ('Daisy', 6),
 ('Veronica', 6),
 ('Woody', 6),
 ('Georgia', 6),
 ('Julia Kagawa', 6),
 ('Thorne', 6),
 ('Nicole', 6),
 ('David', 6),
 ('Wiggs', 6),
 ('Mallery', 6),
 ('Woods', 6),
 ('Donatti', 6),
 ('Zeke', 5),
 ('Ream', 5),
 ('100', 5),
 ('Dee', 5),
 ('Dave', 5),
 ('Hill', 5),
 ('Alec', 5),
 ('Laura', 5),
 ('Eddie', 5),
 ('Mia', 5),
 ('Riley', 5),
 ('Giulia', 5),
 ('1', 5),
 ('Delilah', 5),
 ('Sophie', 5),
 ('Danny', 5),
 ('Ted', 5),
 ('Roman', 5),
 ('Gray', 5),
 ('Lexi', 5),
 ('Zoe', 5),
 ('Gavin', 5),
 ('Ryan', 5),
 ('Jay', 5),
 ('JD Robb', 5),
 ('BBQ', 5),
 ('Nika', 5),
 ('Maya', 5),
 ('Nadine', 5),
 ('Eva Mae', 5),
 ('Carmela', 5),
 ('Cain', 5),
 ('Kendrick', 5),
 ('Kai', 5),
 ('Janet', 5),
 ('10 years', 5),
 ('Davy', 5),
 ('Andre', 5),
 ('Bethany', 5),
 ('Bliss', 5),
 ('Mitch', 5),
 ('only one', 5),
 ('Heloise', 5),
 ('Risa', 5),
 ('Sage', 5),
 ('Derrick', 5),
 ('Melanie', 5),
 ('Jase', 5),
 ('Liv', 5),
 ('Marc', 5),
 ('Morgan', 5),
 ('Simon', 5),
 ('Jillian', 5),
 ('Jess Haines', 5),
 ('H&W Investigations', 5),
 ('Alyssa', 5),
 ('Britt', 5),
 ('40', 4),
 ('Kat', 4),
 ('Aiden', 4),
 ('Hunter', 4),
 ('Mae', 4),
 ('Ric', 4),
 ('Trella', 4),
 ('Deuce', 4),
 ('Jacqueline', 4),
 ('Julian', 4),
 ('Janie', 4),
 ('Jordan', 4),
 ('Ares', 4),
 ('one night', 4),
 ('Sarah Jo', 4),
 ('Dane', 4),
 ('Jon', 4),
 ('three', 4),
 ('MTV', 4),
 ('London', 4),
 ('Dare', 4),
 ('Hayley', 4),
 ('Ethan', 4),
 ('Adriel', 4),
 ('Lusty', 4),
 ('Caleb', 4),
 ('Tegan', 4),
 ('Addison', 4),
 ('Qhuinn', 4),
 ('Seamus', 4),
 ('Jared', 4),
 ('Justin', 4),
 ('Hugh', 4),
 ('Tierra', 4),
 ('Four', 4),
 ('EJ', 4),
 ('Teddy', 4),
 ('Tony', 4),
 ('Vikki', 4),
 ('Krista', 4),
 ('Law', 4),
 ('Stella', 4),
 ('Jensen', 4),
 ('Leon', 4),
 ('Harry', 4),
 ('Marty', 4),
 ('Stevie', 4),
 ('Milo', 4),
 ('Callie', 4),
 ('Lynn', 4),
 ('Remy', 4),
 ('Carlotta', 4),
 ('Jonathan Maberry.', 4),
 ('Top', 4),
 ('Bunny', 4),
 ('Kyra', 4),
 ('Wyatt', 4),
 ('Westy', 4),
 ('Kristy', 4),
 ('Cesca', 4),
 ('Lori', 4),
 ('Burton', 4),
 ('Haynes', 4),
 ('Rossetti', 4),
 ('Jude', 4),
 ('Anka', 4),
 ('Gretchen', 4),
 ('Lena', 4),
 ('5', 3),
 ('Kristine', 3),
 ('10', 3),
 ('Elizabeth', 3),
 ('Katie', 3),
 ('Jamie', 3),
 ('TSTL', 3),
 ('Chance', 3),
 ('secondary', 3),
 ('Roarke', 3),
 ('Torolf', 3),
 ('Hildy', 3),
 ('Calla', 3),
 ('Finn', 3),
 ('about 20', 3),
 ('Marshall', 3),
 ('5 years', 3),
 ('Tommy', 3),
 ('Shelly Laurenston', 3),
 ('Carolyn', 3),
 ('Connie', 3),
 ('Anton', 3),
 ('Anica', 3),
 ('Jasper', 3),
 ('the last quarter', 3),
 ('Monica', 3),
 ('John', 3),
 ('50', 3),
 ('Neil', 3),
 ('Mackenzie', 3),
 ('Mimi', 3),
 ('Raven', 3),
 ('Cori', 3),
 ('Emmaline', 3),
 ('Kaci', 3),
 ('Aura', 3),
 ('Javier', 3),
 ('James', 3),
 ('more than one', 3),
 ('Charli', 3),
 ('Armstrong', 3),
 ('Tessa', 3),
 ('3/4', 3),
 ('Chasen', 3),
 ('Trey', 3),
 ('Vivi', 3),
 ('Gia', 3),
 ('Tori', 3),
 ('Elli', 3),
 ('Shea', 3),
 ('Jaz', 3),
 ('Robyn', 3),
 ('Shohn', 3),
 ('one day', 3),
 ('Noctem Falls', 3),
 ('Zaal', 3),
 ('Bud', 3),
 ('years', 3),
 ('Jacko', 3),
 ('Bailey', 3),
 ('Sven', 3),
 ('Aaron', 3),
 ('Sebastian', 3),
 ('Selena', 3),
 ('Axe', 3),
 ('Elise', 3),
 ('Hope', 3),
 ('Sawyer', 3),
 ('Malcolm', 3),
 ('Dewey', 3),
 ('Spenser', 3),
 ('Theo', 3),
 ('Emily', 3),
 ('Jayna', 3),
 ('SWAT', 3),
 ('Annie', 3),
 ('Harlow', 3),
 ('Charley', 3),
 ('Jessie', 3),
 ('Silas', 3),
 ('Maia', 3),
 ('Clare', 3),
 ('20', 3),
 ('Ari', 3),
 ('a week', 3),
 ('Jim', 3),
 ('Henry', 3),
 ('Dahlia', 3),
 ('River', 3),
 ('Jayce', 3),
 ('GR', 3),
 ('Seb', 3),
 ('Naomi', 3),
 ('Nightingale', 3),
 ('60', 3),
 ('Tilly', 3),
 ('Gary', 3),
 ('Bjornolf', 3),
 ('Lahn', 3),
 ('Asia', 3),
 ('Cameron', 3),
 ('Erica', 3),
 ('Jeanette', 3),
 ('Gage', 3),
 ('Ann Aguirre', 3),
 ("Ann Aguirre's", 3),
 ('Puller', 3),
 ('Jennifer', 3),
 ('Knight', 3),
 ('Margo', 3),
 ('Poppy', 3),
 ('Dara', 3),
 ('Rebeka', 3),
 ('Sera', 3),
 ('Melina', 3),
 ('Libby', 3),
 ('Gillie', 3),
 ('Angelina', 3),
 ('Gina', 3),
 ('Killian', 3),
 ('Erik', 3),
 ('Sloane', 3),
 ('Kari', 3),
 ('Circenn', 3),
 ('Thorn', 3),
 ('Raley', 3),
 ('Syn', 3),
 ('Terri', 3),
 ('Native American', 2),
 ('Julia', 2),
 ('Chloe', 2),
 ('30-40', 2),
 ('Bennent', 2),
 ('Laurel', 2),
 ('Merrick', 2),
 ('Fisher', 2),
 ('Mikael', 2),
 ('Pack', 2),
 ('Sydney', 2),
 ('the first half', 2),
 ('Grace', 2),
 ('Eli', 2),
 ('Stephanie Plum', 2),
 ('Ava', 2),
 ('a mile', 2),
 ('the day', 2),
 ('6', 2),
 ('America', 2),
 ('Hank', 2),
 ('Indy', 2),
 ('Ally', 2),
 ('Brian', 2),
 ('The last quarter', 2),
 ('the Dark Room', 2),
 ('Cass', 2),
 ('Mercy', 2),
 ('Marguerite', 2),
 ('Christian', 2),
 ('MC', 2),
 ('Tatiana', 2),
 ('Edward', 2),
 ('Melvin', 2),
 ('Shana', 2),
 ('Raphael', 2),
 ('Giff', 2),
 ('30', 2),
 ("O'Kane", 2),
 ('Alix', 2),
 ('Devyn', 2),
 ('Zee', 2),
 ('Demonica', 2),
 ('Dragos', 2),
 ('the next day', 2),
 ('Jessica', 2),
 ('Florida', 2),
 ('Brotherhood', 2),
 ('Lenobia', 2),
 ('Demetrius', 2),
 ('Isa', 2),
 ('Montgomery', 2),
 ('Rorie', 2),
 ('Myrna', 2),
 ('Jen', 2),
 ('Olivia', 2),
 ('Dan', 2),
 ('Jayne', 2),
 ('Rainbow', 2),
 ('Erin', 2),
 ('Vic', 2),
 ('Madoc', 2),
 ('60%', 2),
 ('Eight', 2),
 ('Dakota', 2),
 ('Boris', 2),
 ('so many years', 2),
 ('Calvin', 2),
 ('Brody', 2),
 ('last week', 2),
 ('Friday', 2),
 ('Domitian', 2),
 ('Sarita', 2),
 ('Corey', 2),
 ('Blay', 2),
 ('180', 2),
 ('Brothers', 2),
 ('Zel', 2),
 ('Elle', 2),
 ('Cooper', 2),
 ('Mason', 2),
 ('Kori', 2),
 ('Marco', 2),
 ('Jackie', 2),
 ("V'Dan", 2),
 ('Kimi', 2),
 ('Michael', 2),
 ('Wade', 2),
 ('Joan', 2),
 ('Rob', 2),
 ('Shavlis', 2),
 ('Kennedy', 2),
 ('Hadley', 2),
 ('Deb', 2),
 ('all the years', 2),
 ('a few years', 2),
 ('Walt', 2),
 ('Scarlett', 2),
 ('Ranch', 2),
 ('Libbie', 2),
 ('India', 2),
 ('Danika', 2),
 ('Malachi', 2),
 ('Cherise', 2),
 ('UK', 2),
 ('Nicholas', 2),
 ('Kristen Bell', 2),
 ('Veronica Mars', 2),
 ('Neptune', 2),
 ('Lamb', 2),
 ('every too weeks', 2),
 ('Noelle', 2),
 ('Isen', 2),
 ('Celia', 2),
 ('Renee/Shori', 2),
 ('Marz', 2),
 ('Mark', 2),
 ('Trisha', 2),
 ('Bill', 2),
 ('Elizabeth Smart', 2),
 ('Levi', 2),
 ('Garrett', 2),
 ('Tina', 2),
 ('Ash', 2),
 ('Section 8', 2),
 ('Everly', 2),
 ("Crow's Row", 2),
 ('Boone', 2),
 ('Brandee', 2),
 ('the 6 year', 2),
 ('Compton', 2),
 ('Nick the loosy goosy playboy', 2),
 ('Jessa', 2),
 ('Walter', 2),
 ('Travis', 2),
 ('Audrey', 2),
 ('Tiff', 2),
 ('Sarah Booth', 2),
 ('Tinkie', 2),
 ('Indigo', 2),
 ('100%', 2),
 ('the weekend', 2),
 ('Julie', 2),
 ('Jessi', 2),
 ('Caitlyn', 2),
 ('Julio', 2),
 ('Kendal', 2),
 ('Jo', 2),
 ('night', 2),
 ('Pestilence', 2),
 ('Ageis', 2),
 ('200', 2),
 ('Ilex', 2),
 ('Trif', 2),
 ('Kayla', 2),
 ('Mexico', 2),
 ('the last minute', 2),
 ('Tens', 2),
 ('Meridian', 2),
 ('55', 2),
 ('Taye', 2),
 ('Mockingbird', 2),
 ('Primes', 2),
 ('Evalle', 2),
 ('Vinh', 2),
 ('Jenk', 2),
 ('Drea', 2),
 ('New Orleans', 2),
 ('Paige', 2),
 ('Mel', 2),
 ('Lucan', 2),
 ('Ronan', 2),
 ('Mineral City', 2),
 ('Susan', 2),
 ('Choices', 2),
 ('Nyx', 2),
 ('Amish', 2),
 ('Griffin', 1),
 ('Trixa', 1),
 ('25', 1),
 ('2 weeks', 1),
 ('about a quarter', 1),
 ('5 months later', 1),
 ('Luce', 1),
 ('Imogene', 1),
 ('two year old', 1),
 ('a new school year', 1),
 ('the school year', 1),
 ('20 years', 1),
 ('Jilly', 1),
 ('the next one', 1),
 ('sassy', 1),
 ('4th of July', 1),
 ('Gay Pride week', 1),
 ('Disney', 1),
 ('Mickey', 1),
 ('Halle Puma', 1),
 ('the night', 1),
 ('Merc', 1),
 ('age 13 to 18', 1),
 ('13', 1),
 ('about 35', 1),
 ('Corine', 1),
 ('Sirantha Jax', 1),
 ('Ryu', 1),
 ('The second half', 1),
 ('only 1-2', 1),
 ('Lusty TX series', 1),
 ('Peabody', 1),
 ('the same man years earlier', 1),
 ('Hilda', 1),
 ('viking', 1),
 ('SEALS', 1),
 ('Seduction and Snacks', 1),
 ('Virgil Flowers', 1),
 ('Dalton', 1),
 ('Larry', 1),
 ('Zebrowski', 1),
 ('Dolph', 1),
 ('Claudia', 1),
 ('about 1 1/2 weeks', 1),
 ('mid-series', 1),
 ('Ax', 1),
 ('Carole', 1),
 ('Smithie', 1),
 ('Tex', 1),
 ('Loopy Loo', 1),
 ('McKays', 1),
 ('College', 1),
 ('Huntress', 1),
 ('Cat', 1),
 ('Webb', 1),
 ('Lincoln Rhyme', 1),
 ('125', 1),
 ('Thom', 1),
 ('Sasha', 1),
 ('Hawk', 1),
 ('Sienna', 1),
 ('weekend', 1),
 ('One weekend', 1),
 ('Shay', 1),
 ('Ellen', 1),
 ('Argeneau', 1),
 ('Carmelina', 1),
 ('Leonello', 1),
 ('Evanovich', 1),
 ('Vinnie', 1),
 ('Moon Man', 1),
 ('Grandma Mazur', 1),
 ('Lula', 1),
 ('Reyes', 1),
 ('Charly', 1),
 ('Breakneck', 1),
 ('Perry', 1),
 ('Derek', 1),
 ('Feehan', 1),
 ('SA', 1),
 ('Stubin', 1),
 ('Germany', 1),
 ('Nazis', 1),
 ('WWI', 1),
 ('Robert Pattinson', 1),
 ('Twilight', 1),
 ('2 days', 1),
 ('Three', 1),
 ('Glen', 1),
 ('about 50', 1),
 ('Dracula', 1),
 ('40 minutes', 1),
 ('50th', 1),
 ('Camp Kioga', 1),
 ('as much as Six', 1),
 ('Dallas', 1),
 ('the span of minutes', 1),
 ('60 or so', 1),
 ('FBI Ice Queen', 1),
 ('Years later', 1),
 ('Menagerie', 1),
 ('Anatoly', 1),
 ('Nightside', 1),
 ('season', 1),
 ('200 hundred', 1),
 ('Ian put Harte', 1),
 ('Schyler', 1),
 ('Schyuler', 1),
 ('Blue Bloods', 1),
 ('Julianna', 1),
 ('Emerson', 1),
 ('10 year old', 1),
 ('2 months later', 1),
 ('Spy Wyr', 1),
 ('Pia', 1),
 ('M&M', 1),
 ('Phil', 1),
 ('the last 300 hundred years', 1),
 ("the Seelie Queen's", 1),
 ('Aeric', 1),
 ('Phaendir', 1),
 ('Seelie', 1),
 ('Master Z', 1),
 ('Tara', 1),
 ('Gypsy', 1),
 ('Whit', 1),
 ('Zora', 1),
 ('Rachael', 1),
 ('Knights', 1),
 ('Brandon', 1),
 ('Brotherhoood', 1),
 ('marrige', 1),
 ('Halle', 1),
 ('Megan', 1),
 ('DMP', 1),
 ('Bevy', 1),
 ('BBF', 1),
 ('Isadora', 1),
 ('200 years', 1),
 ('Landon', 1),
 ('Flynn', 1),
 ('Plum Orchard', 1),
 ('Nox', 1),
 ('Manor', 1),
 ("Eveline's", 1),
 ('The Guy Next Door', 1),
 ('Williams', 1),
 ('Frey', 1),
 ('Culebra', 1),
 ('Ortiz', 1),
 ('Deveraux', 1),
 ('Charlene', 1),
 ('Two', 1),
 ('Rock', 1),
 ('the last couple of months', 1),
 ('Cadogan House', 1),
 ('Merit', 1),
 ('Reagan', 1),
 ('Mira', 1),
 ('Etain', 1),
 ('months', 1),
 ('NY', 1),
 ('Joseph and Gilda', 1),
 ('Torr', 1),
 ('Gilda', 1),
 ('Joseph', 1),
 ('Carmen', 1),
 ('Tynan', 1),
 ('9 years', 1),
 ('Sheli', 1),
 ('Curran', 1),
 ('Gutshot', 1),
 ('Vayl', 1),
 ('Bergman', 1),
 ('a few days', 1),
 ('Sarah Byrnes', 1),
 ("the mid 1800's", 1),
 ("Hell's Eight", 1),
 ('Kita', 1),
 ('Firth', 1),
 ('Bobby', 1),
 ('Nathaniel', 1),
 ('DMS', 1),
 ('Church', 1),
 ('WWII Germany', 1),
 ('Nazi', 1),
 ('Lars', 1),
 ('Aya', 1),
 ('Mikael Blomkvist', 1),
 ('12', 1),
 ('14', 1),
 ('Amber', 1),
 ('Lycaonia', 1),
 ('Gavriel', 1),
 ('Sugar Rush', 1),
 ('Joss', 1),
 ('Braden', 1),
 ('Declan', 1),
 ('only 16', 1),
 ('Brett', 1),
 ('RS', 1),
 ('1973', 1),
 ('Bernice', 1),
 ('Heron', 1),
 ('Texas', 1),
 ('Pam', 1),
 ('the Italian Forestry', 1),
 ('Immortals', 1),
 ('Everette', 1),
 ('Demetria', 1),
 ('Tons', 1),
 ('tons', 1),
 ('Terry', 1),
 ('the next 100 nights', 1),
 ('Blue', 1),
 ('Vishous', 1),
 ('Thor', 1),
 ('six', 1),
 ('Blay and Xcor', 1),
 ('Lassiter', 1),
 ('Trez', 1),
 ('Theresa', 1),
 ('John Matthew', 1),
 ('Darius', 1),
 ('Rhage', 1),
 ('Faith', 1),
 ('Lake', 1),
 ('Davis Cain', 1),
 ('Davis', 1),
 ('Half-Moon Hollow', 1),
 ("Lori Foster's", 1),
 ('2017', 1),
 ('Laney', 1),
 ('Silence of the Lambs', 1),
 ('Hannibal', 1),
 ('Starling', 1),
 ('Morrissey', 1),
 ('MT', 1),
 ('Sanctum', 1),
 ('Case', 1),
 ('TJ', 1),
 ("Blue's", 1),
 ('all those years ago', 1),
 ('Janya', 1),
 ('Nathan and Amelia', 1),
 ('Fierce', 1),
 ('Fredericka', 1),
 ('Ricka', 1),
 ('Shadow Falls', 1),
 ('inch', 1),
 ('Terran', 1),
 ('the Cimarron Ranch', 1),
 ('all night long', 1),
 ('Nate', 1),
 ('Durands', 1),
 ('Fiji', 1),
 ('Faith and Mason', 1),
 ('the first year', 1),
 ('Delta', 1),
 ('Nonna', 1),
 ('Kaleb', 1),
 ('Sahara', 1),
 ('Austin', 1),
 ('Presley', 1),
 ('Saint', 1),
 ('Chaos', 1),
 ('Denver', 1),
 ('Dayna', 1),
 ('un-Ian', 1),
 ('Abby', 1),
 ('the week', 1),
 ('Petunia', 1),
 ('Kira', 1),
 ('Meaghan', 1),
 ('Nichol', 1),
 ('Clayton', 1),
 ('The initial days', 1),
 ('Grover', 1),
 ('Pandora', 1),
 ('Murphy', 1),
 ("Hope's Crossing", 1),
 ('Ramie', 1),
 ('Trick', 1),
 ('DCO', 1),
 ('Clayne', 1),
 ('Tammy Jo', 1),
 ('Elliot', 1),
 ('Brooks', 1),
 ('Sullivan', 1),
 ('Jazz', 1),
 ('Wainwright', 1),
 ('every week', 1),
 ('Zip', 1),
 ('Benedict', 1),
 ('Johnson', 1),
 ('Olympiad', 1),
 ('Commerce City', 1),
 ('Kane', 1),
 ('11-12 years old', 1),
 ('53', 1),
 ('Nina', 1),
 ('Wanda', 1),
 ('Jewish', 1),
 ('Italian', 1),
 ('Tammy', 1),
 ('Fable', 1),
 ('Billy', 1),
 ('Dilya', 1),
 ('2 days later', 1),
 ('Joaquin', 1),
 ('the first season', 1),
 ('Alliance', 1),
 ('JD', 1),
 ('Zone', 1),
 ('Tina wade', 1),
 ('Zonie', 1),
 ('Saturn', 1),
 ('ACME', 1),
 ('Vanderventer', 1),
 ('Miami', 1),
 ('Shayne', 1),
 ('many years ago', 1),
 ('WTH', 1),
 ('4,000', 1),
 ('15 years', 1),
 ('Delaney', 1),
 ('the second half', 1),
 ('Rocky', 1),
 ('Tasha', 1),
 ('about 40', 1),
 ('more than 1', 1),
 ('Gunner', 1),
 ('Jem', 1),
 ("Seb's", 1),
 ('Jethro', 1),
 ('Place', 1),
 ('Wolf', 1),
 ('Strange Neighbors', 1),
 ('Supernatural', 1),
 ('One Night Nick', 1),
 ('the 3 decades', 1),
 ('Mother Nature', 1),
 ('the middle of April', 1),
 ('35 minutes', 1),
 ('Ashyn', 1),
 ('Moria', 1),
 ('30 minutes', 1),
 ('a few years ago', 1),
 ('Missy', 1),
 ('Graham', 1),
 ('British', 1),
 ('Cristian', 1),
 ('Allanon', 1),
 ('Shea Omshford', 1),
 ('Patty', 1),
 ('day', 1),
 ('Ed', 1),
 ('the boys night', 1),
 ('Francis', 1),
 ('Karen', 1),
 ('Flo', 1),
 ('Kyndred', 1),
 ('Mencheres', 1),
 ('Motorcycle Man', 1),
 ...]

The entity list contains a large number of personal first names, many of them with a relatively high frequency. Across the 34,719 words of this reviewer's reviews there are 4,310 named entities (some spanning multiple words), so this reviewer seems to have a tendency to name the books' characters in their reviews.
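
As an aside, a frequency list like the one above can be obtained by counting the entity strings that spaCy recognises in the reviewer's documents. The cell below is a minimal sketch of that approach, not the exact cell that produced the list; whether any filtering on entity label (e.g. keeping only person entities) was applied above is left open here.

In [ ]:
# Minimal sketch: count named entity strings across this reviewer's spaCy docs.
# Illustrative only; any filtering on entity label (e.g. PERSON entities) is an
# assumption and not shown in the output above.
from collections import Counter

entity_freq = Counter(ent.text for doc in reviewer_docs for ent in doc.ents)

print('total entity mentions:', sum(entity_freq.values()))
print('distinct entity strings:', len(entity_freq))
entity_freq.most_common(20)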

In [261]:
from collections import Counter
from scripts.text_tail_analysis import show_pos_tail_distribution

tf_lemma_pos = Counter([(token.lemma_, token.pos_) for doc in reviewer_docs for token in doc if not token.is_stop and not token.is_punct])

show_pos_tail_distribution(tf_lemma_pos)
Word form	All TF (frac)	TF <= 5 (frac)	TF = 1 (frac)
------------------------------------------------------------
ADJ       	    4835  0.14	   944  0.15	   345  0.17
NOUN      	   11723  0.34	  2337  0.36	   791  0.38
VERB      	   10984  0.32	  1110  0.17	   361  0.18
NUM       	     224  0.01	    48  0.01	    16  0.01
PROPN     	    4258  0.12	  1774  0.27	   450  0.22
SPACE     	    1035  0.03	     0   0.0	     0   0.0
ADV       	    1150  0.03	   225  0.03	    69  0.03
SCONJ     	     353  0.01	     1   0.0	     1   0.0
INTJ      	     102  0.00	    29   0.0	    12  0.01
PRON      	       8  0.00	     2   0.0	     2   0.0
DET       	      19  0.00	     1   0.0	     1   0.0
ADP       	      24  0.00	     9   0.0	     3   0.0
CCONJ     	       2  0.00	     2   0.0	     2   0.0
X         	       2  0.00	     2   0.0	     2   0.0
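
Each row in the table is a coarse POS tag, with the summed token frequency over all lemmas, over the low-frequency tail (lemmas occurring at most five times) and over the hapaxes (lemmas occurring once), each followed by its fraction of the column total. The cell below is a minimal sketch of how such a distribution can be derived from tf_lemma_pos; pos_counts is a hypothetical helper, and the repo's own show_pos_tail_distribution may differ in its details and formatting.

In [ ]:
# Sketch: sum the (lemma, pos) frequencies per POS tag for the full vocabulary,
# the tail (TF <= 5) and the hapaxes (TF = 1). Illustrative only; pos_counts is
# a hypothetical helper, not the function from scripts.text_tail_analysis.
from collections import Counter

def pos_counts(tf_lemma_pos, max_tf=None):
    counts = Counter()
    for (lemma, pos), tf in tf_lemma_pos.items():
        if max_tf is None or tf <= max_tf:
            counts[pos] += tf
    return counts

all_pos = pos_counts(tf_lemma_pos)
tail_pos = pos_counts(tf_lemma_pos, max_tf=5)
hapax_pos = pos_counts(tf_lemma_pos, max_tf=1)

for pos, tf in all_pos.most_common():
    print(f'{pos: <10}{tf: >8}{tf / sum(all_pos.values()): >6.2f}'
          f'{tail_pos[pos]: >8}{tail_pos[pos] / sum(tail_pos.values()): >6.2f}'
          f'{hapax_pos[pos]: >8}{hapax_pos[pos] / sum(hapax_pos.values()): >6.2f}')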
In [259]:
from scripts.text_tail_analysis import show_tail_lemmas

show_tail_lemmas(tf_lemma_pos, tf_threshold=1, pos='NOUN', num_lemmas=100)
jacket          Griffin         Trixa           loud            request         
shelf           kooky           march           beat            drummer         
normal          laughter        enjoyment       location        blow            
ramification    push            shove           dive            translate       
increment       Luce            bud             thug            unexplained     
Imogene         brainy          nascar          prevail         unthinkable     
forefront       straighten      tantrum         school          confide         
sarcasm         phantom         innuendo        storm           inhabitant      
racism          soccer          coward          childlike       acceptation     
better          odd             uncomfortableness junior        attorney        
reclaim         Jilly           loner           absolute        capable         
chronicle       leak            dialogue        awwww           stake           
claim           together        4th             July            hybrid          
public          selfishness     Pack            gang            silliness       
Gay             Pride           Disney          Mickey          scope           
surreal         shirt           groveling       Puma            genuine         
Merc            life            choice          stork           18              
guardian        harvest         organ           donation        ultimate        
parental        budget          orphanage       tithe           despicable      
differently     Corine          Chance          Sirantha        goofy           
In [249]:
for genre in genres:
    print(f'{genre: <40}{reviewer_df[reviewer_df[genre] == 1][genre].count(): >5}')
children                                   27
comics, graphic                            11
fantasy, paranormal                       363
fiction                                   598
history, historical fiction, biography     92
mystery, thriller, crime                  339
non-fiction                                38
poetry                                      0
romance                                   566
young-adult                               171

Most of the reviewed books are romance (566 out of 666), with substantial overlap with the mystery, thriller, crime and fantasy, paranormal genres.
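
That overlap can be checked directly on the genre columns, for instance by counting how many of the romance-tagged books also carry each of the other genre labels. The cell below is a minimal sketch using the same reviewer_df and genres as above; romance_books is just an intermediate variable introduced here.

In [ ]:
# Sketch: for the romance-tagged books in this reviewer's set, count how many
# also carry each of the other genre labels. Illustrative only.
romance_books = reviewer_df[reviewer_df['romance'] == 1]

for genre in genres:
    if genre == 'romance':
        continue
    overlap = romance_books[romance_books[genre] == 1][genre].count()
    print(f'{genre: <40}{overlap: >5}')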

In [250]:
reviewer_df.author_name.value_counts()
Out[250]:
Paige Tyler              6
Suzanne Brockmann        6
Kelley Armstrong         6
Shayla Black             5
Sherryl Woods            5
                        ..
Fine Cooking Magazine    1
Nathan Ballingrud        1
Jennifer Graham          1
Tracey Garvis-Graves     1
Raine Thomas             1
Name: author_name, Length: 469, dtype: int64
In [ ]: