# This reload library is just used for developing the REPUBLIC hOCR parser
# and can be removed once this module is stable.
%reload_ext autoreload
%autoreload 2
# This is needed to add the repo dir to the path so jupyter
# can load the modules in the scripts directory from the notebooks
import os
import sys
repo_dir = os.path.split(os.getcwd())[0]
print(repo_dir)
if repo_dir not in sys.path:
sys.path.append(repo_dir)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import csv
import os
# Directory holding the Goodreads data files, relative to the notebook.
data_dir = '../data/GoodReads'

# Book-level metadata files.
author_file = os.path.join(data_dir, 'goodreads_book_authors.csv.gz')  # author information
book_file = os.path.join(data_dir, 'goodreads_books.csv.gz')  # basic book metadata

# Review subsets.
books_10k_file = os.path.join(data_dir, 'goodreads_reviews-books_above_10k_lang_reviews.csv.gz')
reviewers_5k_file = os.path.join(data_dir, 'goodreads_reviews-reviewers_above_5k_reviews.csv.gz')
random_1M_file = os.path.join(data_dir, 'goodreads_reviews-random_sample_1M.csv.gz')
We start with a subset of reviews for frequently reviewed books. To see how this subset was created, go to the Filtering Goodreads reviews notebook. This subset contains all reviews for books that have at least 10,000 reviews each.
We first load the reviews into a Pandas dataframe, then add metadata for the reviewed books from some of the datasets with book metadata.
# Read the gzip-compressed, tab-separated review subset into a dataframe
# and show it.
review_df = pd.read_csv(books_10k_file, compression='gzip', sep='\t')
review_df
from dateutil.parser import parse, tz

# Time zone that every parsed timestamp is converted to.
utc = tz.gettz('UTC')


def parse_date(date_str):
    """Parse a date string and convert the result to UTC.

    Returns None when the parser raises a TypeError.
    NOTE(review): presumably that happens for missing values that pandas
    reads as NaN rather than as strings - confirm against the data.
    """
    try:
        parsed = parse(date_str)
        return parsed.astimezone(utc)
    except TypeError:
        return None


# Replace the raw date strings in each date column with parsed UTC timestamps.
for date_column in ['date_added', 'date_updated', 'read_at', 'started_at']:
    review_df[date_column] = review_df[date_column].apply(parse_date)
# Collect the ids of all books that occur in the review dataset.
review_book_ids = set(review_df.book_id.unique())

# Load the basic book metadata (only book, work and author id and book title)
# and keep only the books that occur in the review dataset.
bookmeta_df = pd.read_csv(book_file, sep='\t', compression='gzip',
                          usecols=['book_id', 'work_id', 'author_id', 'title'])
bookmeta_df = bookmeta_df[bookmeta_df.book_id.isin(review_book_ids)]

# Load the author metadata to get author names.
author_df = pd.read_csv(author_file, sep='\t', compression='gzip', usecols=['author_id', 'name'])
author_df = author_df.rename(columns={'name': 'author_name'})

# Merge the book and author metadata into a single dataframe, keeping only
# author names for books in the review dataset. The join key is named
# explicitly: without `on`, pandas joins on every column the two frames
# share, which silently changes as soon as another shared column is loaded.
metadata_df = pd.merge(bookmeta_df, author_df, how='left', on='author_id')

# Add the book metadata to every review.
review_df = pd.merge(review_df, metadata_df, on='book_id')
We remove empty reviews as they are non-reviews (see Filtering Goodreads Reviews for details on how and why we do this).
# Report how many reviews have a length of zero, then keep only the reviews
# with a positive length.
is_empty_review = review_df.review_length == 0
print('Number of empty reviews:', is_empty_review.sum())
review_df = review_df[review_df.review_length > 0]
# This step writes the current dataframe to file,
# so all the merging steps can be skipped in reruns of the notebook.
# The path is built from data_dir so that it points to the same directory
# the input files were read from; the previous hard-coded '../data/Goodreads'
# differs in case from data_dir and fails on case-sensitive file systems.
merged_data_file = os.path.join(data_dir, 'goodreads_reviews-books_above_10k.merged.csv.gzip')
review_df.to_csv(merged_data_file, sep='\t', compression='gzip')
# To reload instead of re-merging (to_csv also wrote the index as the first
# column, and the date columns come back as strings):
#review_df = pd.read_csv(merged_data_file, sep='\t', compression='gzip')
This dataset contains reviews for nine books that each have at least 10,000 reviews:
# number of reviews per (author, title) combination
review_df.groupby(by=['author_name', 'title']).size()
Suzanne Collins has three books, all part of the same trilogy, among the most frequently reviewed books:
# number of reviews per author
review_df['author_name'].value_counts()
There are reviews in different languages:
# number of reviews per review language
review_df['review_lang'].value_counts()
For content analysis, we'll remove the non-English reviews, so content can be more easily compared across reviews.
# keep only the reviews whose language is marked as English
review_df = review_df[review_df['review_lang'] == 'en']
First, we compare how the reviews are spread over time, for all books together and per book.
plt.rcParams['figure.figsize'] = [15, 5]

# Count the reviews per (year, month) of their date_updated timestamp.
update_dates = review_df.date_updated.dt
g = review_df.groupby([update_dates.year, update_dates.month]).size()

# Plot the monthly counts as a bar chart.
ax = g.plot(kind='bar')

# Keep one tick per month, but only label the January of each year so the
# x-axis remains readable.
tick_labels = []
for year_month in g.index.tolist():
    tick_labels.append("%s-%02d" % year_month if year_month[1] == 1 else '')
ax.set_xticks(range(len(g)))
ax.set_xticklabels(tick_labels, rotation=90)

plt.gcf().autofmt_xdate()
plt.xlabel('Review month')
plt.ylabel('Number of reviews')
plt.show()
The first reviews are from late 2007, the last from late 2017. The plot shows that the total number of reviews for these nine books increased from late 2007 with a sudden jump in 2012 and with another jump in 2014. However, with the current scale (over 100,000 reviews) and focus (reviews for nine popular books) we don't see differences in patterns per book. We shift our focus by creating views on numbers of reviews per book.
# Count the reviews per year (of date_updated) and per book title.
g = review_df.groupby([review_df.date_updated.dt.year, 'title']).size()
# Pivot to one row per year and one column per title; the count is set to
# zero for years in which a book has no reviews.
u = g.unstack('title').fillna(0)
# Divide each title's yearly counts by its total over all years to get
# proportions. Dividing by the column sums normalises every column present in
# the pivot at once, so a title without any dated review (absent from the
# pivot) no longer raises a KeyError as the per-title loop did.
u = u / u.sum()
# plot as bar chart
u.plot(kind='bar')
We notice that there are some marked differences in how reviews of a book are spread over time. For some, there is a large burst just after release (especially Fifty Shades of Grey, with 50% of its reviews in 2012, after which the number of reviews drops off rapidly), while for others the reviews are more spread out, like Twilight and particularly The Book Thief, which was released in 2005, had a small fraction of its reviews in 2007, but got an increasing number of reviews up to a peak in 2014, a full 9 years after its release, and was still receiving many reviews in 2017.
We start with analysing the reviews for a single book. A random pick from the book ids:
# draw one book id at random from the ids in the review dataset
candidate_book_ids = list(review_book_ids)
np.random.choice(candidate_book_ids)
We create a new dataframe by selecting only the reviews for the randomly selected book.
# Select the reviews of a single book and show its distinct title(s).
book_id = 7260188
book_df = review_df[review_df['book_id'] == book_id]
book_df['title'].drop_duplicates()
The chosen book is Mockingjay, the third book in The Hunger Games trilogy by Suzanne Collins. Let's start with a quick look at the ratings to know if we can expect positive and/or negative reviews:
# number of reviews per rating value
book_df['rating'].value_counts()
The ratings of zero are not actual ratings, but non-ratings, i.e. the reviewer wrote a review but provided no explicit rating.
plt.rcParams['figure.figsize'] = [15, 5]

# Count the reviews per year (of date_added) and per rating value, with one
# row per rating and one column per year.
review_year = book_df.date_added.dt.year
g = book_df.groupby([review_year, 'rating']).size()
u = g.unstack('date_added')

# Average rating per year, computed once instead of re-filtering the
# dataframe for every year.
# NOTE(review): ratings of 0 are included in this average, as before.
avg_rating = book_df.groupby(review_year).rating.mean()
print('year\tavg. rating')
for year in u.columns:
    print(f'{year}\t{avg_rating.loc[year]: >4.2f}')

# Turn the counts into fractions per year. The pandas sum skips the NaN
# cells of (year, rating) combinations that do not occur; the builtin sum()
# used before returned NaN for such a year and wiped out all its fractions.
u = u / u.sum()

# Reshape to one row per year and one column per rating, and plot.
g = u.stack()
u = g.unstack('rating')
u.plot(kind='bar')
The plot shows the fraction of reviews per year that gets a rating of 1-5 stars (or no rating, represented by the zero values).
The majority of reviews have a positive rating, and although the fraction of 5-star reviews drops somewhat after the first year (the lowest average rating is in 2014), the majority remains positive. This is typical of online reviews. People don't choose books to read randomly, but those which they expect to like. Furthermore, people who liked a book are more likely willing to put effort into reviewing it.
Let's look at the differences in review length:
# number of reviews per review length, ordered from short to long
book_df['review_length'].value_counts().sort_index()
from collections import Counter

lengths = book_df.review_length
avg_length = lengths.mean()

# count the number of reviews of each length
counts = lengths.value_counts().sort_index()

print('The shortest review (in text characters):', lengths.min())
print('The longest review (in text characters):', lengths.max())
print('The average review length:', avg_length)
print('The standard deviation in review lengths:', lengths.std())
print('\nNumber of reviews with fewer than 100 characters:', (lengths < 100).sum())
print('Number of reviews of below average length:', (lengths < avg_length).sum())
print('Number of reviews of above average length:', (lengths > avg_length).sum())

# Series.iteritems() was removed in pandas 2.0; Series.items() yields the
# same (length, count) pairs.
dist = dict(counts.items())
book_df.review_length.value_counts()

# Plot the number of reviews per review length on a logarithmic x-axis,
# with the average length marked by a red dotted line.
x, y = zip(*counts.items())
plt.plot(x, y)
plt.axvline(x=avg_length, color='red', linestyle='dotted')
plt.xscale('log')
The plot above shows the distribution of review lengths in number of characters per review. There is a large spread in review lengths. There are thousands of reviews with fewer than 100 characters. Based on typical average word lengths in English of just over 4 characters per word, plus whitespace between words, that means that these are reviews with fewer than 20 words. The average length is 628 characters (the red dotted line), while the longest is almost 18,000 characters long (roughly 3600 words).
Slight tangent on the distribution: The standard deviation is higher than the average length, signaling that this distribution is highly skewed: most reviews are on the left (shorter than the average), with a long tail of much longer reviews to the right. See the notebook on Analysing Distributions for a detailed analysis of the different types of distributions and our arguments on why it is important to know about them and take them into account when interpreting data.
Let's sample a review and look at the text.
# Draw one review with a fixed seed and show its text.
random_seed = 1205921
sample_df = book_df.sample(n=1, random_state=random_seed)
review_text_col = sample_df.columns.tolist().index('review_text')
sample_df.iloc[0, review_text_col]
This review describes a somewhat negative reading experience due to the violence in the book, but the reviewer found the story interesting and the ending surprising.
Let's compare a small sample of 10 reviews:
from scripts.text_tail_analysis import get_dataframe_review_texts

# Draw ten reviews with the fixed seed and print them one by one,
# numbered from 1.
sample_size = 10
sample_df = book_df.sample(n=sample_size, random_state=random_seed)
for review_num, review_text in enumerate(get_dataframe_review_texts(sample_df), start=1):
    print(f'review {review_num}:', review_text)
    print('\n')
Many reviews are very short, just one or two short sentences. Many reviewers mention the ending. This book being the last of a trilogy, this is not unexpected, as this book wraps up a longer narrative. We see quite some difference of opinion.
Taking a first step into a more quantitative analysis of the content, we do a Keyword in Context (KWiC) search for the words 'end', 'ends', 'ended', 'ending' and 'endings' to get insight into what reviewers say about it.
import re


def kwic(pattern, reviews, word_boundaries=True, context_size=40):
    """Print a Keyword in Context (KWiC) line for every match of a pattern.

    Each printed line consists of the matched keyword, left-aligned in a
    column of 15 characters, followed by the matched text with up to
    ``context_size`` characters of context on either side.

    pattern: regular expression to search for.
    reviews: iterable of review texts (strings).
    word_boundaries: if True, the pattern only matches at word boundaries.
    context_size: number of context characters shown before and after a match.
    """
    if word_boundaries:
        # The non-capturing group keeps a top-level alternation such as
        # 'end|ending' entirely inside the word boundaries, without changing
        # the numbering of the caller's own groups.
        pattern = r'\b(?:' + pattern + r')\b'
    regex = re.compile(pattern)
    for review in reviews:
        for match in regex.finditer(review):
            # Show the first capture group when the pattern has one that
            # took part in the match, otherwise show the whole match (a
            # pattern without groups used to raise an IndexError here).
            keyword = match[0]
            if regex.groups and match[1] is not None:
                keyword = match[1]
            start = max(match.start() - context_size, 0)
            end = match.end() + context_size
            print(f'{keyword: <15}{review[start:end]}')
# show every variant of 'end' in its context within the sampled reviews
pattern = '(end|ends|ended|ending|endings)'
sample_texts = get_dataframe_review_texts(sample_df)
kwic(pattern, sample_texts)
Another way to get insight in the content of multiple reviews is to make frequency lists.
import re

# Count how often each word occurs across the sampled review texts.
tf = Counter()
for text in get_dataframe_review_texts(sample_df):
    # Extract the runs of word characters. Splitting on non-word characters
    # instead produces an empty string whenever a text starts or ends with
    # punctuation, and that empty string was then counted as a word.
    words = re.findall(r'\w+', text)
    tf.update(words)
tf.most_common(20)
Among the top 20 most frequent words, we find a domain-generic term, 'book', as well as the name of a character in the book, 'Katniss'.
How often do variants of 'end' and 'ending' appear in these 10 reviews?
# frequency of each variant of 'end' in the word frequency list
for term in ('end', 'ends', 'ended', 'ending', 'endings'):
    print(f'{term}:', tf[term])

# total number of counted words and number of distinct words
print('Number of words:', sum(tf.values()))
print('Number of distinct words:', len(tf))
We can also use some of the many wonderful open source Natural Language Processing (NLP) tools to have more control over the textual content. We use Spacy to parse the reviews to have access to the individual sentences and words, and get additional information on word forms, part-of-speech, lemmas, etc.
We start with listing all entities that Spacy identified in the sample of reviews.
import spacy

# load the large English model and parse every sampled review into a doc
nlp = spacy.load('en_core_web_lg')
docs = [nlp(text) for text in get_dataframe_review_texts(sample_df)]

# count the text of every entity found in any of the docs
tf = Counter(entity.text for doc in docs for entity in doc.ents)
tf.most_common()
There is only a short list of entities found in the 10 reviews, most appearing only once. If we look not only at named entities, but at all noun phrases, we get a longer list:
# count the text of every noun chunk in the docs instead of every entity
tf = Counter(chunk.text for doc in docs for chunk in doc.noun_chunks)
tf.most_common()
Many of these noun chunks are pronouns like 'I', 'me', 'you', 'she', 'they', 'them', 'we'. These are common in reviews, as reviewers often describe their personal reading experience and the effect that the book had on them. In a small sample, they get in the way of seeing what content aspects are mentioned.
Spacy adds word form information to each word in the document. We can easily filter out common stopwords to get a better view of the content words that are mentioned.
# count the surface form of every token that is not a stopword
tf = Counter(token.text for doc in docs for token in doc if not token.is_stop)
tf.most_common(20)
Now we see many punctuation symbols. Let's filter those out as well.
# count the surface form of every token that is neither stopword nor punctuation
tf = Counter(
    token.text
    for doc in docs
    for token in doc
    if not (token.is_stop or token.is_punct)
)
tf.most_common(20)
The most common words are clearly related to the book domain (such as book, read, series, story) and the review domain (like, loved, felt, love, good). Notice that there are many morphological variants of each other.
We can also count the word lemmas instead of the surface variants in the text:
# count the lemma of every token that is neither stopword nor punctuation
tf = Counter(
    token.lemma_
    for doc in docs
    for token in doc
    if not (token.is_stop or token.is_punct)
)
tf.most_common(20)
Now we also see end as a common word.
With 10 short reviews we can only see a few commonalities and distinctions. Several mention the ending; some like it and some don't. A quantitative perspective doesn't give us much beyond what a close reading of the reviews would give us.
If we zoom out to a larger group of 10,000 reviews, we get a more stable picture of what aspects are commonly mentioned. But now a different problem rears up.
import time

import spacy

from scripts.text_tail_analysis import read_spacy_docs_for_dataframe, select_dataframe_spacy_docs

nlp = spacy.load('en_core_web_lg')
fname = '../data/goodreads-reviews-books_above_10k.doc_bin'

# Load the spaCy docs for the reviews in review_df and report how long it took.
start = time.time()
review_docs = read_spacy_docs_for_dataframe(fname, review_df, nlp)
elapsed = time.time() - start
print('took:', elapsed, 'seconds')
print('number of spacy docs loaded:', len(review_docs))

# Select the docs of the reviews of the chosen book.
book_docs = select_dataframe_spacy_docs(book_df, review_docs, as_dict=True)
print('number of spacy docs selected:', len(book_docs))
# Draw a larger sample of reviews with the fixed seed.
sample_size = 10000
sample_df = book_df.sample(sample_size, random_state=random_seed)

# Select the docs of the sampled reviews from review_docs. The sample used to
# be parsed with nlp() first as well, but that result was overwritten by this
# selection straight away, so the expensive parsing step has been dropped.
docs = select_dataframe_spacy_docs(sample_df, review_docs, as_dict=False)

# calculate the term frequency of individual words
tf = Counter(
    token.lemma_
    for doc in docs
    for token in doc
    if not token.is_stop and not token.is_punct
)
tf.most_common(20)
This list is very similar to the one for ten reviews. It contains the book and review domain terms, plus the names of the book, author and main characters.
Plain word lists are a quick way to get an overview of what is common across a set of reviews. Apart from total word counts, we can also count each word once per document regardless of how frequently the reviewer uses it, so that we get insight in how many reviewers mention a specific term, e.g. 'ending'. With each review being a document, this frequency is known as the document frequency.
from scripts.text_tail_analysis import get_doc_word_token_set

# Document frequency: count every lemma in the token set of each doc.
df = Counter(
    lemma
    for doc in docs
    for lemma in get_doc_word_token_set(doc, use_lemma=True)
)
df.most_common(20)
This is quite insightful. There are 3713 reviews (37% of the 10,000 in the sample) that mention the word end and 2949 reviews (could be many of the same reviews) that mention ending. Also, 2375 reviewers mention the word character, and 1960 mention story.
But what is the problem that rears up here? Let's look at the total number of words and distinct word forms:
# total number of counted words and number of distinct words in tf
total_words = sum(tf.values())
distinct_words = len(tf)
print('Number of words:', total_words)
print('Number of distinct words:', distinct_words)
The 10,000 reviews contain 487,298 words in total, and 27,632 distinct words. Above, we have looked at only the 20 most frequent ones. What are these remaining 27,612 words?
This is where the highly skewed distribution of word frequencies throws up barriers to analysis. How do we get a good overview of what those low-frequency words are?
# For several list sizes, report the summed frequency of the most frequent
# terms and the fraction of all counted words they account for.
sizes = [10, 20, 100, 200]
total_freq = sum(tf.values())
for size in sizes:
    sum_top = sum(freq for _, freq in tf.most_common(size))
    print(f'Sum frequency of top {size} terms: {sum_top} (fraction: {sum_top / total_freq: >.2f})')
These top 20 terms represent only 25% of all words. Even if we look at the top 200 words, we're ignoring half of the text.
# count (lemma, part-of-speech) pairs of all tokens that are neither
# stopword nor punctuation
tf_lemma_pos = Counter(
    (token.lemma_, token.pos_)
    for doc in docs
    for token in doc
    if not (token.is_stop or token.is_punct)
)
tf_lemma_pos.most_common(20)
One thing we can do to organise items in the long tail is to categorise or classify them.
from collections import defaultdict
from scripts.text_tail_analysis import show_pos_tail_distribution

# count (lemma, part-of-speech) pairs of all tokens that are neither
# stopword nor punctuation
tf_lemma_pos = Counter([(token.lemma_, token.pos_) for doc in docs for token in doc if not token.is_stop and not token.is_punct])

# The function was only referenced here, never called, so the cell produced
# none of the output that the text below discusses.
# NOTE(review): assumes the function takes the (lemma, pos) frequency index
# as its only required argument - confirm against scripts/text_tail_analysis.py.
show_pos_tail_distribution(tf_lemma_pos)
Above we see the proportion of Part-Of-Speech tags across all words and across words that occur at most five times and at most once. Remember, this is after removal of stopwords and punctuation.
First, the largest categories overall are nouns (36%), verbs (28%), adjectives (15%), proper nouns (10%) and adverbs (5%). Proper nouns refer to single identifiable entities.
Among the less frequent words, the proportions of nouns and adverbs remain stable, the proportion of verbs drops, while the proportions of adjectives and proper nouns go up.
In other words, the tail has relatively many adjectives and entities, but also many other nouns. In terms of content analysis, these are important categories. Of course, with 1000 reviews and only a few thousand of these words, it is possible to go through all of them to get insights in what they are and how they relate to the book, the reading experience or something else. If we were to scale up to tens of thousands or millions of reviews, this would become increasingly infeasible.
from scripts.text_tail_analysis import get_lemma_pos_df_index

# Group 1 holds the reviews with a rating above 3, group 2 those with a
# rating below 3.
df_group1 = book_df[book_df.rating > 3]
df_group2 = book_df[book_df.rating < 3]
book_docs_group1 = select_dataframe_spacy_docs(df_group1, review_docs, as_dict=True)
book_docs_group2 = select_dataframe_spacy_docs(df_group2, review_docs, as_dict=True)
print(len(book_docs_group1), len(book_docs_group2))

token_pos_types = ['ADJ', 'ADV', 'NOUN', 'PROPN', 'VERB']

# (lemma, pos) frequency index per group
docs_group1 = [book_docs_group1[review_id] for review_id in book_docs_group1]
docs_group2 = [book_docs_group2[review_id] for review_id in book_docs_group2]
docfreq_group1 = get_lemma_pos_df_index(docs_group1, keep_pron=True)
docfreq_group2 = get_lemma_pos_df_index(docs_group2, keep_pron=True)
total_group1 = len(book_docs_group1)
total_group2 = len(book_docs_group2)

# Per part-of-speech type, list those of the 1000 most common terms of
# group 1 whose proportion in group 2 is at least 1.5 times their proportion
# in group 1.
top_terms_group1 = docfreq_group1.most_common(1000)
for pos_type in token_pos_types:
    for term, freq in top_terms_group1:
        lemma, pos = term
        if pos != pos_type:
            continue
        prop_group1 = freq / total_group1
        prop_group2 = docfreq_group2[term] / total_group2
        prop = prop_group2 / prop_group1
        if prop < 1.5:
            continue
        print(f'{lemma: <20}{pos: <6}{freq: >6}{docfreq_group2[term]: >6}{prop_group1: >8.4f}{prop_group2: >8.4f}{prop: >6.2f}')
Adjectives:
Adverbs:
Nouns:
Pronouns:
If we compare the proper nouns, we see that the negative reviews make a comparison to the Twilight series.
Verbs:
# This cell uses get_tail_groupings and liwc, which the notebook only imports
# and creates in later cells; running the notebook from top to bottom raised a
# NameError here. Both are therefore set up locally, in the same way as the
# later cells do it.
from scripts.liwc import LIWC
from scripts.text_tail_analysis import get_tail_groupings

# This dictionary is part of LIWC 2007, which is a commercial product, so not available in our Github repo
liwc = LIWC('../data/LIWC2007_English131104.dic')

tail_groupings = get_tail_groupings(docs_group2, docfreq_group2, token_pos_types, liwc, max_threshold=5000, min_threshold=10)
tail_df = pd.DataFrame(tail_groupings)
book_terms = ['book', 'novel', 'story', 'plot', 'character', 'twist', 'development']
# show the groupings of nouns whose dependency word is 'bad'
tail_df[(tail_df.dependency_word == 'bad') & (tail_df.tail_pos == 'NOUN')]
from scripts.text_tail_analysis import has_lemma_pos, sentence_iter

# Print every sentence of the group 2 reviews that contains the noun
# 'writing', followed by every sentence that contains the adjective 'bad'.
for lemma, pos in [('writing', 'NOUN'), ('bad', 'ADJ')]:
    for sent in sentence_iter(docs_group2):
        if not has_lemma_pos(sent, lemma, pos):
            continue
        print(sent)
def _print_docfreq_ratios(min_ratio=None, max_ratio=None):
    """Print group 1 terms whose group2/group1 proportion ratio is in range.

    For each part-of-speech type in token_pos_types, goes through the 1000
    most common terms of docfreq_group1 and prints the terms for which the
    ratio between their proportion in group 2 and in group 1 is not below
    min_ratio and not above max_ratio. A bound of None is not applied.
    """
    top_terms = docfreq_group1.most_common(1000)
    for pos_type in token_pos_types:
        for term, freq in top_terms:
            lemma, pos = term
            if pos != pos_type:
                continue
            prop_group1 = freq / total_group1
            prop_group2 = docfreq_group2[term] / total_group2
            prop = prop_group2 / prop_group1
            if min_ratio is not None and prop < min_ratio:
                continue
            if max_ratio is not None and prop > max_ratio:
                continue
            print(f'{lemma: <20}{pos: <6}{freq: >6}{docfreq_group2[term]: >6}{prop_group1: >8.4f}{prop_group2: >8.4f}{prop: >6.2f}')


# Terms that are relatively less common in group 2 than in group 1 ...
_print_docfreq_ratios(max_ratio=0.66)
# ... and terms that are about equally common in both groups.
_print_docfreq_ratios(min_ratio=0.66, max_ratio=1.5)
The word 'end' is used slightly more in positive reviews, while 'ending' is used more in negative reviews.
from scripts.liwc import LIWC
from scripts.text_tail_analysis import get_lemma_pos_tf_index, group_by_head, group_by_child

# This dictionary is part of LIWC 2007, which is a commercial product, so not available in our Github repo
liwc_dict_file = '../data/LIWC2007_English131104.dic'
liwc = LIWC(liwc_dict_file)

# Draw 1000 reviews of the book with the fixed seed and select their docs.
sample_size = 1000
sample_df = book_df.sample(n=sample_size, random_state=random_seed)
sample_docs = select_dataframe_spacy_docs(sample_df, review_docs, as_dict=True)

# Build the (lemma, pos) term frequency index over the sampled docs.
token_pos_types = ['ADJ', 'NOUN', 'PROPN', 'VERB']
doc_list = [sample_docs[review_id] for review_id in sample_docs]
tf_lemma_pos = get_lemma_pos_tf_index(doc_list)
from scripts.text_tail_analysis import get_tail_groupings

# Tail groupings with thresholds 0 and 5; show the adjectives whose
# dependency word is 'book'.
tail_groupings = get_tail_groupings(doc_list, tf_lemma_pos, token_pos_types, liwc,
                                    max_threshold=5, min_threshold=0)
tail_df = pd.DataFrame(tail_groupings)
book_terms = ['book', 'novel', 'story', 'plot', 'character', 'twist', 'development']
is_adjective = tail_df.tail_pos == 'ADJ'
tail_df[is_adjective & (tail_df.dependency_word == 'book')]

# Tail groupings with thresholds 10 and 50; show the rows whose dependency
# word is 'describe'.
tail_groupings = get_tail_groupings(doc_list, tf_lemma_pos, token_pos_types, liwc,
                                    max_threshold=50, min_threshold=10)
tail_df = pd.DataFrame(tail_groupings)
book_terms = [
    'book', 'novel', 'story', 'plot', 'character', 'twist', 'development',
    'pace', 'scene', 'setting', 'narrative', 'theme', 'event',
]
author_terms = ['writing', 'style', 'write', 'author', 'writer', 'voice', 'describe', 'explain']
reader_terms = ['reader', 'feel', 'feeling', 'make', 'pull', 'throw']
tail_df[tail_df.dependency_word == 'describe']
# For a few adjectives, print the lemmas and hypernyms of each of their
# synsets and the similarity of the synset to the synset 'affect.v.01'.
# NOTE(review): `wn` is not imported anywhere in this notebook; it looks like
# NLTK's WordNet corpus reader - confirm and add the import.
# NOTE(review): the original indentation was lost; every statement after the
# inner `for` is assumed to belong to its body - confirm.
terms = ['memorable', 'chilling', 'overwrought', 'gripping']
for term in terms:
    syns = wn.synsets(term)
    for syn in syns:
        print(syn.lemmas())
        print(syn.hypernyms())
        #print(syn.hyponyms())
        affect = wn.synset('affect.v.01')
        print(syn.wup_similarity(affect))
        print()
# number of reviews per book, including the book id
review_df.groupby(['book_id', 'author_name', 'title']).size()

# Draw 1000 reviews of the book with id 2767052. The fixed seed makes this
# sample reproducible across reruns, like the other samples in this notebook;
# without it every run analysed a different set of reviews.
sample_size = 1000
hg1_df = review_df[review_df.book_id == 2767052]
sample_hg1_df = hg1_df.sample(sample_size, random_state=random_seed)
sample_hg1_df
# Parse the sampled reviews of the second book. The review texts are read
# with get_dataframe_review_texts, as everywhere else in this notebook; the
# name get_sample_review_texts that was used here is not defined.
docs_hg1 = [nlp(text) for text in get_dataframe_review_texts(sample_hg1_df)]

# Build a shared (lemma, pos) frequency index over both sets of docs and group
# the terms of each set by child.
tf_lemma_pos = get_lemma_pos_tf_index(docs + docs_hg1)
# NOTE(review): child_group was used below without ever being created. It is
# assumed to be the grouping for `docs`, built in the same way as the one for
# docs_hg1 - confirm.
child_group = group_by_child(docs, tf_lemma_pos, token_pos_types, max_threshold=5)
child_group_hg1 = group_by_child(docs_hg1, tf_lemma_pos, token_pos_types, max_threshold=5)
shared_tokens = [token for token in child_group_hg1 if token in child_group]

# Show, per part-of-speech type, the lemmas grouped under 'Katniss'.
token_lemma_pos = ('Katniss', 'PROPN')
if token_lemma_pos in child_group_hg1:
    print(token_lemma_pos)
    for token_pos in token_pos_types:
        print('\t', token_pos, '\t', [lemma for lemma, pos in child_group_hg1[token_lemma_pos] if pos == token_pos])
    print()

# For every token that occurs in both groupings and has a summed count of at
# least 20 in the second one, show the grouped lemmas of both side by side.
#for token_lemma_pos in child_group_hg1:
for token_lemma_pos in shared_tokens:
    if sum(child_group_hg1[token_lemma_pos].values()) < 20:
        continue
    print(token_lemma_pos)
    for token_pos in token_pos_types:
        print('\t', token_pos, '\t', [lemma for lemma, pos in child_group[token_lemma_pos] if pos == token_pos])
        print('\t', token_pos, '\t', [lemma for lemma, pos in child_group_hg1[token_lemma_pos] if pos == token_pos])
    print()
def filter_doc_terms(doc, filter_terms):
    """Return the tokens of doc that occur in filter_terms, in their original order."""
    return list(filter(lambda token: token in filter_terms, doc))
def doc_generator(docs, use_sentences=False):
    """Yield the docs themselves, or the sentences of each doc.

    With use_sentences set, the items of each doc's `sents` attribute are
    yielded in order; otherwise each doc is yielded as is.
    """
    if not use_sentences:
        yield from docs
        return
    for doc in docs:
        yield from doc.sents
def get_cooc(docs, filter_terms=None, use_sentences=False, use_lemma=False):
    """Count how often pairs of tokens occur together.

    docs: iterable of documents.
    filter_terms: if given and non-empty, only tokens in this collection are
        paired. The terms must be hashable.
    use_sentences: count co-occurrence per sentence instead of per document.
    use_lemma: passed on to get_doc_token_set.

    Returns a Counter that maps each sorted pair of tokens to the number of
    documents (or sentences) in which both occur.
    """
    # `combinations` is not imported anywhere else in this notebook.
    from itertools import combinations

    # Build the lookup set once, so that filtering a document does not scan
    # the full term list for every token.
    allowed_terms = frozenset(filter_terms) if filter_terms else None
    cooc = Counter()
    for doc in doc_generator(docs, use_sentences=use_sentences):
        # NOTE(review): get_doc_token_set is not defined or imported in this
        # notebook (only get_doc_word_token_set is) - confirm which is meant.
        token_set = get_doc_token_set(doc, use_lemma=use_lemma)
        if allowed_terms:
            token_set = filter_doc_terms(token_set, allowed_terms)
        # sorting makes (a, b) and (b, a) count as the same pair
        cooc.update(combinations(sorted(token_set), 2))
    return cooc
# Terms with a document frequency of at least 100, leaving out the
# single-space token.
common_terms = [
    term
    for term, freq in df.most_common()
    if not (freq < 100 or term == ' ')
]

# co-occurrence of the common terms within documents ...
cooc = get_cooc(docs, filter_terms=common_terms, use_sentences=False, use_lemma=True)
cooc.most_common(50)
# ... and within sentences
cooc = get_cooc(docs, filter_terms=common_terms, use_sentences=True, use_lemma=True)
cooc.most_common(50)

from helper import get_pmi_cooc

# Print the first eleven term pairs with their value in pmi_cooc.
pmi_cooc = get_pmi_cooc(df, cooc, filter_terms=common_terms)
for ti, term_pair in enumerate(pmi_cooc):
    pair_value = pmi_cooc[term_pair]
    print(term_pair, pair_value)
    if ti == 10:
        break
# `combinations` is used below but was not imported anywhere in this notebook.
from itertools import combinations

from helper import get_doc_content_chunks
from scripts.pmi import PMICOOC

# Collect the content chunks of all docs and represent each token by its
# lemma, except for tokens whose lemma is '-PRON-', which keep their text.
token_sets = [sent_chunks for doc in docs for sent_chunks in get_doc_content_chunks(doc)]
token_sets = [
    [token.lemma_ if token.lemma_ != '-PRON-' else token.text for token in token_set]
    for token_set in token_sets
]
# NOTE(review): this PMICOOC object is replaced by the get_pmi_cooc result a
# few lines down before it is used - confirm whether it is still needed.
pmi_cooc = PMICOOC(token_sets, filter_terms=common_terms)

# Frequencies of the individual tokens and of the token pairs within a chunk.
token_freq = Counter(token for token_set in token_sets for token in token_set)
cooc_freq = Counter(
    token_pair
    for token_set in token_sets
    for token_pair in combinations(token_set, 2)
)
pmi_cooc = get_pmi_cooc(token_freq, cooc_freq, filter_terms=common_terms)

# Print the first eleven term pairs with their value in pmi_cooc.
for ti, term_pair in enumerate(pmi_cooc):
    print(term_pair, pmi_cooc[term_pair])
    if ti == 10:
        break
from scripts.pmi import PMICOOC

pmi_cooc = PMICOOC(token_sets, filter_terms=common_terms)

# print the entries returned by highest(5) with their value ...
for top_term in pmi_cooc.highest(5):
    print(top_term, pmi_cooc[top_term])

# ... followed by every entry with its value
for term in pmi_cooc:
    term_value = pmi_cooc[term]
    print(term, term_value)
compare genres
differences in subjectivity are not noticeable at small scale. Need a particular, larger-scale focus to bring them out. But they can drown again in very large sets
topics need large scale
named entities are manageable at small scale, but become harder to deal with at large scale: mostly long tail, unknown, lower accuracy
many aspects become harder to summarise and organise at large scale
import math

# first ten rows of the review dataframe
review_df.iloc[0:10,]

# Number of chunks of 10000 needed to cover 113000 items, rounded up.
# NOTE(review): 113000 presumably approximates the number of reviews - confirm.
chunks = math.ceil(113000 / 10000)
for chunk in range(chunks):
    print(chunk)