{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "We will start with a relatively simple question, but with a difficult journey to get any answers:\n", "\n", "- What are the differences in reception between two fiction genres in the context of Goodreads?\n", "\n", "We look at Goodreads reviews at different scales and with different selection criteria." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/marijnkoolen/Code/Huygens/scale\n" ] } ], "source": [ "# This reload library is just used for developing the REPUBLIC hOCR parser \n", "# and can be removed once this module is stable.\n", "%reload_ext autoreload\n", "%autoreload 2\n", "\n", "# This is needed to add the repo dir to the path so jupyter\n", "# can load the modules in the scripts directory from the notebooks\n", "import os\n", "import sys\n", "repo_dir = os.path.split(os.getcwd())[0]\n", "print(repo_dir)\n", "if repo_dir not in sys.path:\n", " sys.path.append(repo_dir)\n", "\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import json\n", "import csv\n", "import os\n", "\n", "data_dir = '../data/GoodReads'\n", "\n", "books_10k_file = os.path.join(data_dir, 'goodreads_reviews-books_above_10k_lang_reviews.csv.gz')\n", "reviewers_5k_file = os.path.join(data_dir, 'goodreads_reviews-reviewers_above_5k_reviews.csv.gz')\n", "random_1M_file = os.path.join(data_dir, 'goodreads_reviews-random_sample_1M.csv.gz')\n", "author_file = os.path.join(data_dir, 'goodreads_book_authors.csv.gz') # author information\n", "book_file = os.path.join(data_dir, 'goodreads_books.csv.gz') # basic book metadata\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_idbook_idreview_idratingdate_addeddate_updatedread_atstarted_atn_votesn_commentsreview_lengthreview_text
08842281e1d1347389f2ab93d60773d4d16981a5d2c3628987712d0e05c4f90798eb673Mon Dec 05 10:46:44 -0800 2016Wed Mar 22 11:37:04 -0700 2017NaNNaN1093Recommended by Don Katz. Avail for free in Dec...
18842281e1d1347389f2ab93d60773d4d81910708fb75b37b3613a34e39169f139870f315Fri Nov 18 17:43:26 -0800 2016Thu Aug 03 22:54:31 -0700 2017Mon Jul 24 09:32:34 -0700 2017Mon May 08 07:52:12 -0700 20172502585Best book of the series, and best book about A...
28842281e1d1347389f2ab93d60773d4d40955299706d01666058b1fb2a96b29a1260b5Sun Nov 18 16:31:28 -0800 2012Wed Dec 21 10:43:14 -0800 2016Fri Apr 17 00:00:00 -0700 2015Mon Apr 06 00:00:00 -0700 2015514734A truly inspirational book by a truly inspirat...
38842281e1d1347389f2ab93d60773d4d4986701bb7de32f9fadc36627e61aaef7a931424Thu Aug 04 10:02:02 -0700 2011Thu Aug 04 10:02:02 -0700 2011NaNNaN6473Found the Goodreads down image in this, and ma...
48842281e1d1347389f2ab93d60773d4d77566cedb8b21ea6ad95b05fa3868e05488e65Wed Mar 12 16:37:16 -0700 2008Wed Mar 22 11:46:03 -0700 2017Fri Oct 19 00:00:00 -0700 2012Wed Sep 19 00:00:00 -0700 201242284Seven amazing stories. Each one you think can'...
.......................................
10080068ba77e3c745ebddccc6306fc3c6bb25e17419895c7b6304251a1d94d6d1af8313f8ae75Mon Jul 08 11:47:23 -0700 2013Sat Jul 22 15:50:08 -0700 2017NaNSat Jul 22 00:00:00 -0700 201700177No finer Maine writer ever lived than Sarah Or...
1008007e223be160b89f218dbee70b5fbdccf762289246947c31eb080291307e4c2c4e9642640035Sun Aug 10 20:41:33 -0700 2014Tue Aug 26 13:53:08 -0700 2014Mon Aug 11 21:36:42 -0700 2014Tue Mar 25 00:00:00 -0700 201400976Love Songs With Bright Blue Chippiness Happi...
1008008e223be160b89f218dbee70b5fbdccf76228911459514f23d8a4f835e3a417afd679548595Sun Aug 10 14:21:44 -0700 2014Tue Aug 26 13:41:10 -0700 2014Sun Aug 10 14:25:39 -0700 2014Sun Aug 03 00:00:00 -0700 201400249Great Times. My Brand New 2014 KIA OPTIMA EX...
1008009e223be160b89f218dbee70b5fbdccf76203693887c1395ba0a319423707d8ffff79aeafc5Fri Jun 06 22:32:23 -0700 2014Tue Aug 26 14:07:08 -0700 2014Sun Aug 10 16:25:37 -0700 2014Wed Dec 25 00:00:00 -0800 201300365Jail. = Boring. I Beat People Down. \"You Are...
1008010e223be160b89f218dbee70b5fbdccf7618518801d2ed77d013ca33fe0eaa9a4013b352c75Thu Sep 19 02:49:29 -0700 2013Tue Aug 26 15:00:58 -0700 2014Tue Aug 12 23:38:26 -0700 2014Sat Aug 10 00:00:00 -0700 201300708I Was Trying To Add This Book Here On Goodre...
\n", "

1008011 rows × 12 columns

\n", "
" ], "text/plain": [ " user_id book_id \\\n", "0 8842281e1d1347389f2ab93d60773d4d 16981 \n", "1 8842281e1d1347389f2ab93d60773d4d 8191070 \n", "2 8842281e1d1347389f2ab93d60773d4d 40955 \n", "3 8842281e1d1347389f2ab93d60773d4d 4986701 \n", "4 8842281e1d1347389f2ab93d60773d4d 77566 \n", "... ... ... \n", "1008006 8ba77e3c745ebddccc6306fc3c6bb25e 174198 \n", "1008007 e223be160b89f218dbee70b5fbdccf76 22892469 \n", "1008008 e223be160b89f218dbee70b5fbdccf76 22891145 \n", "1008009 e223be160b89f218dbee70b5fbdccf76 20369388 \n", "1008010 e223be160b89f218dbee70b5fbdccf76 18518801 \n", "\n", " review_id rating \\\n", "0 a5d2c3628987712d0e05c4f90798eb67 3 \n", "1 8fb75b37b3613a34e39169f139870f31 5 \n", "2 299706d01666058b1fb2a96b29a1260b 5 \n", "3 bb7de32f9fadc36627e61aaef7a93142 4 \n", "4 cedb8b21ea6ad95b05fa3868e05488e6 5 \n", "... ... ... \n", "1008006 95c7b6304251a1d94d6d1af8313f8ae7 5 \n", "1008007 47c31eb080291307e4c2c4e964264003 5 \n", "1008008 9514f23d8a4f835e3a417afd67954859 5 \n", "1008009 7c1395ba0a319423707d8ffff79aeafc 5 \n", "1008010 d2ed77d013ca33fe0eaa9a4013b352c7 5 \n", "\n", " date_added date_updated \\\n", "0 Mon Dec 05 10:46:44 -0800 2016 Wed Mar 22 11:37:04 -0700 2017 \n", "1 Fri Nov 18 17:43:26 -0800 2016 Thu Aug 03 22:54:31 -0700 2017 \n", "2 Sun Nov 18 16:31:28 -0800 2012 Wed Dec 21 10:43:14 -0800 2016 \n", "3 Thu Aug 04 10:02:02 -0700 2011 Thu Aug 04 10:02:02 -0700 2011 \n", "4 Wed Mar 12 16:37:16 -0700 2008 Wed Mar 22 11:46:03 -0700 2017 \n", "... ... ... 
\n", "1008006 Mon Jul 08 11:47:23 -0700 2013 Sat Jul 22 15:50:08 -0700 2017 \n", "1008007 Sun Aug 10 20:41:33 -0700 2014 Tue Aug 26 13:53:08 -0700 2014 \n", "1008008 Sun Aug 10 14:21:44 -0700 2014 Tue Aug 26 13:41:10 -0700 2014 \n", "1008009 Fri Jun 06 22:32:23 -0700 2014 Tue Aug 26 14:07:08 -0700 2014 \n", "1008010 Thu Sep 19 02:49:29 -0700 2013 Tue Aug 26 15:00:58 -0700 2014 \n", "\n", " read_at started_at \\\n", "0 NaN NaN \n", "1 Mon Jul 24 09:32:34 -0700 2017 Mon May 08 07:52:12 -0700 2017 \n", "2 Fri Apr 17 00:00:00 -0700 2015 Mon Apr 06 00:00:00 -0700 2015 \n", "3 NaN NaN \n", "4 Fri Oct 19 00:00:00 -0700 2012 Wed Sep 19 00:00:00 -0700 2012 \n", "... ... ... \n", "1008006 NaN Sat Jul 22 00:00:00 -0700 2017 \n", "1008007 Mon Aug 11 21:36:42 -0700 2014 Tue Mar 25 00:00:00 -0700 2014 \n", "1008008 Sun Aug 10 14:25:39 -0700 2014 Sun Aug 03 00:00:00 -0700 2014 \n", "1008009 Sun Aug 10 16:25:37 -0700 2014 Wed Dec 25 00:00:00 -0800 2013 \n", "1008010 Tue Aug 12 23:38:26 -0700 2014 Sat Aug 10 00:00:00 -0700 2013 \n", "\n", " n_votes n_comments review_length \\\n", "0 1 0 93 \n", "1 25 0 2585 \n", "2 5 1 4734 \n", "3 6 4 73 \n", "4 4 2 284 \n", "... ... ... ... \n", "1008006 0 0 177 \n", "1008007 0 0 976 \n", "1008008 0 0 249 \n", "1008009 0 0 365 \n", "1008010 0 0 708 \n", "\n", " review_text \n", "0 Recommended by Don Katz. Avail for free in Dec... \n", "1 Best book of the series, and best book about A... \n", "2 A truly inspirational book by a truly inspirat... \n", "3 Found the Goodreads down image in this, and ma... \n", "4 Seven amazing stories. Each one you think can'... \n", "... ... \n", "1008006 No finer Maine writer ever lived than Sarah Or... \n", "1008007 Love Songs With Bright Blue Chippiness Happi... \n", "1008008 Great Times. My Brand New 2014 KIA OPTIMA EX... \n", "1008009 Jail. = Boring. I Beat People Down. \"You Are... \n", "1008010 I Was Trying To Add This Book Here On Goodre... 
\n", "\n", "[1008011 rows x 12 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# the review dataframe\n", "review_df = pd.read_csv(random_1M_file, sep='\\t', compression='gzip')\n", "\n", "review_df" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from dateutil.parser import parse, tz\n", "\n", "def parse_date(date_str):\n", " try:\n", " return parse(date_str).astimezone(utc)\n", " except TypeError:\n", " return None\n", "\n", "utc = tz.gettz('UTC')\n", "\n", "review_df['date_added'] = review_df.date_added.apply(parse_date)\n", "review_df['date_updated'] = review_df.date_updated.apply(parse_date)\n", "review_df['read_at'] = review_df.read_at.apply(parse_date)\n", "review_df['started_at'] = review_df.started_at.apply(parse_date)\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of empty reviews: 411\n" ] } ], "source": [ "print('Number of empty reviews:', len(review_df[review_df.review_length == 0]))\n", "review_df = review_df[review_df.review_length > 0]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# get a list of book ids that are in the review dataset\n", "review_book_ids = set(review_df.book_id.unique())\n", "\n", "# load basic book metadata (only book and author id and book title)\n", "metadata_columns = ['book_id', 'work_id', 'author_id', 'title', 'num_pages', 'publication_year']\n", "bookmeta_df = pd.read_csv(book_file, sep='\\t', compression='gzip', usecols=metadata_columns)\n", "\n", "def get_first(row):\n", " try:\n", " return first[row['work_id']]\n", " except KeyError:\n", " return None\n", "\n", "first = bookmeta_df[bookmeta_df.publication_year.notna()].groupby(['work_id']).publication_year.min()\n", "bookmeta_df['first_publication_year'] = bookmeta_df.apply(get_first, axis=1)\n", "\n", "# filter the book 
metadata to only the book ids in the review dataset\n", "bookmeta_df = bookmeta_df[bookmeta_df.book_id.isin(review_book_ids)]\n", "\n", "# load the author metadata to get author names \n", "author_df = pd.read_csv(author_file, sep='\\t', compression='gzip', usecols=['author_id', 'name'])\n", "author_df = author_df.rename(columns={'name': 'author_name'})\n", "\n", "# merge the book and author metadata into a single dataframe, \n", "# keeping only author names for books in the review dataset\n", "metadata_df = pd.merge(bookmeta_df, author_df, how='left')\n", "\n", "# merge the review dataset with the book metadata\n", "review_df = pd.merge(review_df, metadata_df, on='book_id')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "genre_file = os.path.join(data_dir, 'goodreads_book_genres_initial.csv.gz') # book genre information\n", "\n", "genremeta_df = pd.read_csv(genre_file, sep='\\t', compression='gzip')\n", "genre_df = genremeta_df[genremeta_df.book_id.isin(review_book_ids)]\n", "\n", "groups = genre_df.groupby(['book_id', 'genres']).size()\n", "genre_df = groups.unstack('genres').fillna(0)\n", "genre_df = genre_df.reset_index()\n", "print('number of books with genre information:', len(genre_df))\n", "\n", "review_df = pd.merge(review_df, genre_df, on='book_id', how='left')\n", "\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# This step writes the current dataframe to file, \n", "# so all the merging steps can be skipped in reruns of the notebook\n", "merged_data_file = '../data/Goodreads/goodreads_reviews-random_sample_1M.genre_merged.csv.gzip'\n", "#review_df.to_csv(merged_data_file, sep='\\t', compression='gzip')\n", "\n", "review_df = pd.read_csv(merged_data_file, sep='\\t', compression='gzip')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "number of books with genre 
information: 394888\n", "['children', 'comics, graphic', 'fantasy, paranormal', 'fiction', 'history, historical fiction, biography', 'mystery, thriller, crime', 'non-fiction', 'poetry', 'romance', 'young-adult']\n", "Number of reviews per genre:\n", "children 127656\n", "comics, graphic 90404\n", "fantasy, paranormal 446156\n", "fiction 918241\n", "history, historical fiction, biography 354004\n", "mystery, thriller, crime 409302\n", "non-fiction 199731\n", "poetry 34223\n", "romance 573330\n", "young-adult 408106\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
genresbook_idchildrencomics, graphicfantasy, paranormalfictionhistory, historical fiction, biographymystery, thriller, crimenon-fictionpoetryromanceyoung-adult
011.00.01.01.00.01.00.00.01.01.0
121.00.01.01.00.01.00.00.01.01.0
231.00.01.01.00.01.00.00.00.01.0
341.00.01.01.00.01.00.00.00.01.0
451.00.01.01.00.01.00.00.00.01.0
....................................
394883364880990.00.00.00.00.00.00.00.01.00.0
394884364942991.01.01.01.00.01.00.00.01.01.0
394885364983280.00.01.01.00.00.00.00.01.00.0
394886365084860.01.00.01.00.00.00.00.01.00.0
394887365141960.00.00.00.00.00.00.00.01.00.0
\n", "

394888 rows × 11 columns

\n", "
" ], "text/plain": [ "genres book_id children comics, graphic fantasy, paranormal fiction \\\n", "0 1 1.0 0.0 1.0 1.0 \n", "1 2 1.0 0.0 1.0 1.0 \n", "2 3 1.0 0.0 1.0 1.0 \n", "3 4 1.0 0.0 1.0 1.0 \n", "4 5 1.0 0.0 1.0 1.0 \n", "... ... ... ... ... ... \n", "394883 36488099 0.0 0.0 0.0 0.0 \n", "394884 36494299 1.0 1.0 1.0 1.0 \n", "394885 36498328 0.0 0.0 1.0 1.0 \n", "394886 36508486 0.0 1.0 0.0 1.0 \n", "394887 36514196 0.0 0.0 0.0 0.0 \n", "\n", "genres history, historical fiction, biography mystery, thriller, crime \\\n", "0 0.0 1.0 \n", "1 0.0 1.0 \n", "2 0.0 1.0 \n", "3 0.0 1.0 \n", "4 0.0 1.0 \n", "... ... ... \n", "394883 0.0 0.0 \n", "394884 0.0 1.0 \n", "394885 0.0 0.0 \n", "394886 0.0 0.0 \n", "394887 0.0 0.0 \n", "\n", "genres non-fiction poetry romance young-adult \n", "0 0.0 0.0 1.0 1.0 \n", "1 0.0 0.0 1.0 1.0 \n", "2 0.0 0.0 0.0 1.0 \n", "3 0.0 0.0 0.0 1.0 \n", "4 0.0 0.0 0.0 1.0 \n", "... ... ... ... ... \n", "394883 0.0 0.0 1.0 0.0 \n", "394884 0.0 0.0 1.0 1.0 \n", "394885 0.0 0.0 1.0 0.0 \n", "394886 0.0 0.0 1.0 0.0 \n", "394887 0.0 0.0 1.0 0.0 \n", "\n", "[394888 rows x 11 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "genres = list(genre_df.columns)[1:]\n", "print(genres)\n", "\n", "print('Number of reviews per genre:')\n", "for genre in genres:\n", " print(f'{genre: <40}{len(review_df[review_df[genre] == 1]): >10}')\n", "\n", "genre_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The genres have been added as separate columns to the review dataframe, so that it's easy to select reviews for books in a specific genre. \n", "\n", "The *poetry* and *comics, graphics* genres are the smallest genres, with fewer than 100,000 reviews. The *fiction* genre is by far the largest, as it is operates as a aggregate genre with various fiction sub-genres. That is, most of the other genres overlap with the fiction genre. 
We can check the co-occurrence matrix to see the overlap in genres in more detail." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
genreschildrencomics, graphicfantasy, paranormalfictionhistory, historical fiction, biographymystery, thriller, crimenon-fictionpoetryromanceyoung-adult
genres
children1.0000000.1915900.1755010.1467000.1123360.0887490.1118770.2253520.0574950.306383
comics, graphic0.1269931.0000000.1102280.0877300.1066680.0661260.1392490.0833440.0384400.113899
fantasy, paranormal0.4385810.4155781.0000000.3942000.2440370.4360220.0751480.2156610.3743910.558983
fiction0.8224990.7420680.8844041.0000000.7358280.8890480.4004190.6821940.8071160.894502
history, historical fiction, biography0.2815700.4033590.2447660.3289551.0000000.2862030.6171350.3656160.2484290.255079
mystery, thriller, crime0.2167970.2437010.4262150.3873560.2789321.0000000.0972840.0755270.3370440.342905
non-fiction0.1920560.3606380.0516220.1226020.4226710.0683661.0000000.4116160.0349970.093471
poetry0.0674210.0376180.0258190.0364030.0436410.0092500.0717361.0000000.0147790.027005
romance0.1920560.1937190.5004370.4808680.3310790.4608840.0680990.1650081.0000000.555633
young-adult0.5780810.3242160.4220380.3010230.1920140.2648540.1027340.1703060.3138461.000000
\n", "
" ], "text/plain": [ "genres children comics, graphic \\\n", "genres \n", "children 1.000000 0.191590 \n", "comics, graphic 0.126993 1.000000 \n", "fantasy, paranormal 0.438581 0.415578 \n", "fiction 0.822499 0.742068 \n", "history, historical fiction, biography 0.281570 0.403359 \n", "mystery, thriller, crime 0.216797 0.243701 \n", "non-fiction 0.192056 0.360638 \n", "poetry 0.067421 0.037618 \n", "romance 0.192056 0.193719 \n", "young-adult 0.578081 0.324216 \n", "\n", "genres fantasy, paranormal fiction \\\n", "genres \n", "children 0.175501 0.146700 \n", "comics, graphic 0.110228 0.087730 \n", "fantasy, paranormal 1.000000 0.394200 \n", "fiction 0.884404 1.000000 \n", "history, historical fiction, biography 0.244766 0.328955 \n", "mystery, thriller, crime 0.426215 0.387356 \n", "non-fiction 0.051622 0.122602 \n", "poetry 0.025819 0.036403 \n", "romance 0.500437 0.480868 \n", "young-adult 0.422038 0.301023 \n", "\n", "genres history, historical fiction, biography \\\n", "genres \n", "children 0.112336 \n", "comics, graphic 0.106668 \n", "fantasy, paranormal 0.244037 \n", "fiction 0.735828 \n", "history, historical fiction, biography 1.000000 \n", "mystery, thriller, crime 0.278932 \n", "non-fiction 0.422671 \n", "poetry 0.043641 \n", "romance 0.331079 \n", "young-adult 0.192014 \n", "\n", "genres mystery, thriller, crime non-fiction \\\n", "genres \n", "children 0.088749 0.111877 \n", "comics, graphic 0.066126 0.139249 \n", "fantasy, paranormal 0.436022 0.075148 \n", "fiction 0.889048 0.400419 \n", "history, historical fiction, biography 0.286203 0.617135 \n", "mystery, thriller, crime 1.000000 0.097284 \n", "non-fiction 0.068366 1.000000 \n", "poetry 0.009250 0.071736 \n", "romance 0.460884 0.068099 \n", "young-adult 0.264854 0.102734 \n", "\n", "genres poetry romance young-adult \n", "genres \n", "children 0.225352 0.057495 0.306383 \n", "comics, graphic 0.083344 0.038440 0.113899 \n", "fantasy, paranormal 0.215661 0.374391 0.558983 \n", "fiction 0.682194 
0.807116 0.894502 \n", "history, historical fiction, biography 0.365616 0.248429 0.255079 \n", "mystery, thriller, crime 0.075527 0.337044 0.342905 \n", "non-fiction 0.411616 0.034997 0.093471 \n", "poetry 1.000000 0.014779 0.027005 \n", "romance 0.165008 1.000000 0.555633 \n", "young-adult 0.170306 0.313846 1.000000 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "temp_df = genre_df[genres].fillna(0)\n", "cooc = temp_df.T.dot(temp_df)\n", "for genre in genres:\n", " cooc[genre] = cooc[genre] / cooc.loc[genre, genre]\n", "cooc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Above we see the co-occurrence of genres per book (a book can have multiple genre labels). The data in the co-occurrence matrix shows proportions. So the diagonal cells from top left to bottom right are always 1.0 (a genre necessarily co-occurs with itself). \n", "\n", "The general *fiction* genre overlaps strongly with most other genres. That is, the other genres are sub-genres of fiction (we selected only books that had at least a *fiction* genre label). \n", "\n", "The *poetry* and *comics, graphic* genres are very distinct from the other subgenres. \n", "\n", "The genre *fantasy, paranormal* has a moderate overlap with *children*, *mystery, thriller, crime*, *romance* and *young-adult*.\n", "\n", "The *history, historical fiction, biography* genre has more overlap with *comics, graphic*, *poetry* and especially *non-fiction*. \n", "\n", "For genre comparison, we will choose three genres:\n", "\n", "- *history, historical fiction, biography*\n", "- *mystery, thriller, crime*\n", "- *romance*\n", "\n", "\n", "We'll start with a quantitative analysis of the metadata.\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
childrencomics, graphicfantasy, paranormalfictionhistory, historical fiction, biographymystery, thriller, crimenon-fictionpoetryromanceyoung-adult
children1.0000000.1987190.1608450.1243260.1075360.0928240.1034090.2306340.0523940.220813
comics, graphic0.1407301.0000000.0853600.0782870.1272020.0584190.2041050.1001670.0283070.085341
fantasy, paranormal0.5621510.4212651.0000000.4605810.3085360.5377400.0883490.2675100.4559030.614448
fiction0.8942860.7951640.9479291.0000000.8229230.9411170.5151780.7878040.8975020.952561
history, historical fiction, biography0.2982080.4980970.2448090.3172571.0000000.2748970.6707020.3958740.2402050.240575
mystery, thriller, crime0.2976200.2644910.4933210.4194990.3178381.0000000.1113400.1015110.3806360.394062
non-fiction0.1617940.4509310.0395510.1120590.3784140.0543321.0000000.4349710.0343820.073177
poetry0.0618300.0379190.0205200.0293620.0382710.0084880.0745301.0000000.0141560.022729
romance0.2353120.1795160.5858560.5603810.3890270.5331760.0986930.2371501.0000000.667596
young-adult0.7059210.3852480.5620460.4233590.2773410.3929100.1495210.2710460.4752061.000000
\n", "
" ], "text/plain": [ " children comics, graphic \\\n", "children 1.000000 0.198719 \n", "comics, graphic 0.140730 1.000000 \n", "fantasy, paranormal 0.562151 0.421265 \n", "fiction 0.894286 0.795164 \n", "history, historical fiction, biography 0.298208 0.498097 \n", "mystery, thriller, crime 0.297620 0.264491 \n", "non-fiction 0.161794 0.450931 \n", "poetry 0.061830 0.037919 \n", "romance 0.235312 0.179516 \n", "young-adult 0.705921 0.385248 \n", "\n", " fantasy, paranormal fiction \\\n", "children 0.160845 0.124326 \n", "comics, graphic 0.085360 0.078287 \n", "fantasy, paranormal 1.000000 0.460581 \n", "fiction 0.947929 1.000000 \n", "history, historical fiction, biography 0.244809 0.317257 \n", "mystery, thriller, crime 0.493321 0.419499 \n", "non-fiction 0.039551 0.112059 \n", "poetry 0.020520 0.029362 \n", "romance 0.585856 0.560381 \n", "young-adult 0.562046 0.423359 \n", "\n", " history, historical fiction, biography \\\n", "children 0.107536 \n", "comics, graphic 0.127202 \n", "fantasy, paranormal 0.308536 \n", "fiction 0.822923 \n", "history, historical fiction, biography 1.000000 \n", "mystery, thriller, crime 0.317838 \n", "non-fiction 0.378414 \n", "poetry 0.038271 \n", "romance 0.389027 \n", "young-adult 0.277341 \n", "\n", " mystery, thriller, crime non-fiction \\\n", "children 0.092824 0.103409 \n", "comics, graphic 0.058419 0.204105 \n", "fantasy, paranormal 0.537740 0.088349 \n", "fiction 0.941117 0.515178 \n", "history, historical fiction, biography 0.274897 0.670702 \n", "mystery, thriller, crime 1.000000 0.111340 \n", "non-fiction 0.054332 1.000000 \n", "poetry 0.008488 0.074530 \n", "romance 0.533176 0.098693 \n", "young-adult 0.392910 0.149521 \n", "\n", " poetry romance young-adult \n", "children 0.230634 0.052394 0.220813 \n", "comics, graphic 0.100167 0.028307 0.085341 \n", "fantasy, paranormal 0.267510 0.455903 0.614448 \n", "fiction 0.787804 0.897502 0.952561 \n", "history, historical fiction, biography 0.395874 0.240205 0.240575 \n", 
"mystery, thriller, crime 0.101511 0.380636 0.394062 \n", "non-fiction 0.434971 0.034382 0.073177 \n", "poetry 1.000000 0.014156 0.022729 \n", "romance 0.237150 1.000000 0.667596 \n", "young-adult 0.271046 0.475206 1.000000 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from itertools import combinations\n", "\n", "review_genre_df = review_df[genres].fillna(0)\n", "\n", "cooc = review_genre_df.T.dot(review_genre_df)\n", "\n", "for genre in genres:\n", " cooc[genre] = cooc[genre] / cooc.loc[genre, genre]\n", "\n", "cooc\n", "\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "compare_genres = ['history, historical fiction, biography', 'mystery, thriller, crime', 'romance']\n", "genre_df = {}\n", "for genre in compare_genres:\n", " genre_df[genre] = review_df[review_df[genre] == 1]\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We check the rating distribution to see if there are differences in rating behaviour between genres." ] }, { "cell_type": "code", "execution_count": 283, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 283, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "rating_series = []\n", "for genre in compare_genres:\n", " group = review_df.groupby(['rating', genre]).size()\n", " u = group.unstack(genre)\n", " u = u.rename(columns={1.0: genre})\n", " u[genre] = u[genre] / sum(u[genre])\n", " rating_series.append(u[genre])\n", "\n", "pd.concat(rating_series, axis=1).plot(kind='bar')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There are no big differences. The distributions look fairly similar in terms of rating behaviour. The *romance* genre has a slightly higher proportion of 5-star ratings and a lower proportion of 3- and 4-star ratings than the other two genres. \n", "\n", "### Number of reviews per reviewer, author and book\n", "\n", "Some books and authors are very popular and are reviewed by many different readers, which leads to these books and authors having more influence on the overall picture we get for a genre than books that are relatively obscure. So understanding differences between genres in terms of the *content* of reviews is aided by looking at differences in the actors that influence what content is generated and how it is generated. \n", "\n", "Another set of actors influencing this review generation process are the reviewers. Some reviewers write many reviews and have developed conventions for how to write them and what to include in them, others only write an occasional review and perhaps write whatever comes to mind. Some reviewers are very elaborate and discuss the story, writing style and reading experience of a book in detail, while others are succinct and focus on the most salient aspect. Some focus more on narrative, others on aesthetics and yet others on their own thoughts and feelings.\n", "\n", "If there are differences in reviewers across genres, or in popularity of authors and books, these can help explain differences in content. 
We first look at total numbers of reviews, reviewers, authors and books across the three genres.\n", "\n" ] }, { "cell_type": "code", "execution_count": 284, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Genre\t\t\t\t\t\tReviews\t\tReviewers\tAuthors\t\tBooks\n", "history, historical fiction, biography 321138 114796 57498 88774\n", "mystery, thriller, crime 377140 122850 39462 82799\n", "romance 526310 136499 44263 122257\n" ] } ], "source": [ "print('Genre\\t\\t\\t\\t\\t\\tReviews\\t\\tReviewers\\tAuthors\\t\\tBooks')\n", "stats_columns = ['review_id', 'user_id', 'author_id', 'work_id']\n", "freq = {}\n", "for genre in compare_genres:\n", " stats_string = ''\n", " for column in stats_columns:\n", " freq[(genre, column)] = genre_df[genre][column].nunique()\n", " stats_string += f'{freq[(genre, column)]: >16}'\n", " print(f'{genre: <38}{stats_string}')\n", " " ] }, { "cell_type": "code", "execution_count": 285, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Genre Reviewers Authors Books \n", "Genre mean median mean median mean median \n", "history, historical fiction, biography 2.80 1 5.59 1 3.62 1\n", "mystery, thriller, crime 3.07 2 9.56 2 4.55 2\n", "romance 3.86 2 11.89 2 4.30 2\n" ] } ], "source": [ "print(f'{\"Genre\": <44}{\"Reviewers\": <16}{\"Authors\": <16}{\"Books\": <16}')\n", "print(f'{\"Genre\": <44}{\"mean median\": <16}{\"mean median\": <16}{\"mean median\": <16}')\n", "\n", "for genre in compare_genres:\n", " stats_string = ''\n", " for column in stats_columns[1:]:\n", " prop = freq[(genre, 'review_id')] / freq[(genre, column)]\n", " median = np.median(genre_df[genre][column].value_counts())\n", " stats_string += f'{prop: >10.2f}{median: >6.0f}'\n", " print(f'{genre: <38}{stats_string}')\n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Above we show the average number of reviews per reviewer, author and book (both the mean and the median). 
There are some significant differences between the three genres. In the *romance* genre, individual reviewers write more reviews, and there are more reviews per author, especially in comparison with *history, historical fiction, biography*. Reviewers tend to read (or at least review) more books, and also more books by the same author. **This may have consequences for the comparison.** If individual reviewers have personal characteristics that influence reviews (e.g. a tendency to write long or short reviews, to use certain vocabulary, to focus their reviews on certain book aspects), this has an influence on what are statistical commonalities and differences. \n", "\n", "With more reviews by the same reviewer, their idiosyncrasies have relatively high frequency. Also, the reviews of books by the same author may all mention the same author name, or if the books belong to a series, characters and places may recur, which also results in higher relative frequencies.\n", "\n", "So, in the comparative analysis, one question is whether to compensate for these kinds of differences and if so, how. For instance, should we balance the selection to only include a single review per reviewer, book and author, or leave the natural imbalance intact? \n", "\n", "**If the goal of the comparison is to say something about reviews in a certain genre in general**, we should perhaps let the different frequencies come through as characteristics for individual genres. **If, on the other hand, the goal is to compare the reception of books in a genre**, these imbalances should probably be compensated for, or at least be taken into account in interpreting the observed similarities and differences. \n", "\n", "But there is a problem with the analysis above, which is signaled by the difference between the mean and median. 
We calculated the mean number of reviews per reviewer, author and book, but statistics like the mean are only meaningful if the data is roughly normally distributed (see the [Analyzing Distributions](./Analyzing-Distributions.ipynb) notebook for an elaboration on the problem). When the data distribution is very skewed, such statistics are deceptive. When most items in a set have a low frequency, and there is a small number of outliers with a very high frequency, the outliers drive up the mean, such that the majority of the items are below average.\n" ] }, { "cell_type": "code", "execution_count": 287, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "from collections import Counter\n", "from scripts.helper import ecdf\n", "\n", "for genre in compare_genres:\n", " dist = Counter([int(count) for count in genre_df[genre].user_id.value_counts()])\n", " x = dist.keys()\n", " y = list(dist.values())\n", " y_prob = [y_point / sum(y) for y_point in y]\n", " plt.scatter(x,y_prob, label=genre)\n", "plt.xscale('log')\n", "plt.yscale('log')\n", "plt.ylabel('Proportion of reviewers')\n", "plt.xlabel('Number of reviews per reviewer')\n", "plt.legend()\n", "plt.show()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Above we see the number of reviews per reviewer against the proportion of reviewers. At the top left we see that the vast majority of reviewers in all three genres write only one or a few reviews. The proportion is shown on a logarithmic scale, so the dots for a single review per reviewer are between 40% and 60% of all reviewers, while all dots for more than 10 reviews per reviewer quickly fall far below 1%. But the genres show a difference in their distributions at the higher end (i.e. above 10 reviews per reviewer). What this means is, **there is a larger proportion of reviewers in the *romance* genre who write many reviews than in the other genres, but these represent only a few percent of all reviewers.**\n", "\n", "\n", "Next, we look at the number of reviews per author." ] }, { "cell_type": "code", "execution_count": 288, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "from collections import Counter\n", "from scripts.helper import ecdf\n", "\n", "for genre in compare_genres:\n", " dist = Counter([int(count) for count in genre_df[genre].author_id.value_counts()])\n", " x = dist.keys()\n", " y = list(dist.values())\n", " y_prob = [y_point / sum(y) for y_point in y]\n", " plt.scatter(x,y_prob, label=genre)\n", "plt.xscale('log')\n", "plt.yscale('log')\n", "plt.ylabel('Proportion of authors')\n", "plt.xlabel('Number of reviews per auhtor')\n", "plt.legend()\n", "plt.show()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Above we see the log-log distribution of the number of reviews per author. The *history, historical fiction, biography* genre has relatively many authors with only a single review (the blue dot in the top left is higher than the top left green and orange dots), and therefore fewer authors with multiple reviews (more to the right, the blue dots tend to be below the green and orange dots). In other words, for most authors in this genre a comparison is difficult because there is only a single review for them. **But for all three genres, the majority of authors have only one or a few reviews, so the much higher average for *romance* authors than for *history, historical fiction, biography* authors is caused by a relatively small set of outliers with a very high number of reviews.**\n", "\n", "A possible explanation is that there is a subset of authors writing *romance* novels or *mystery, thriller, crime* novels who are more prolific than authors of *historical* novels and *biographies*." 
] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# group by author and count the number of works they wrote\n", "g = metadata_df.groupby(['author_id']).work_id.nunique()\n", "# turn series into a dataframe and rename the column to reflect the number of works per author\n", "u = g.reset_index()\n", "u = u.rename(columns={'work_id': 'author_works_num'})\n", "# merge the new dataframe with the metadata dataframe created at the top of this notebook\n", "meta_df = pd.merge(metadata_df, u, on='author_id')\n", "# create a dataframe for all books and their genre labels, with one column per genre\n", "groups = genremeta_df.groupby(['book_id', 'genres']).size()\n", "genremeta_df = groups.unstack('genres').fillna(0)\n", "genremeta_df = genremeta_df.reset_index()\n", "# merge the new genre metadata frame with the extended book metadata frame \n", "# so we can connect the number of works per author with the genre labels\n", "meta_df = pd.merge(meta_df, genremeta_df, on='book_id')\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "history, historical fiction, biography \tauthors: 57498\tworks: 190384\tsingle work authors: 33085\n", "mystery, thriller, crime \tauthors: 39462\tworks: 186422\tsingle work authors: 15953\n", "romance \tauthors: 44263\tworks: 196750\tsingle work authors: 18319\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "fig = plt.figure(tight_layout=True, figsize=(12,4))\n", "plt.subplot(1,2,1)\n", "for genre in compare_genres:\n", " temp_df = meta_df[meta_df[genre] == 1]\n", " temp_df = temp_df[['author_id', 'author_works_num']].drop_duplicates()\n", " s = temp_df.author_works_num.value_counts().sort_index()\n", " u = s.reset_index()\n", " u = u.rename(columns={'index': 'author_works_num', 'author_works_num': 'num_authors'})\n", " plt.scatter(u.author_works_num, u.num_authors, label=genre)\n", "plt.xlabel('Number of works per author')\n", "plt.ylabel('Number of authors')\n", "plt.xscale('log')\n", "plt.yscale('log')\n", "plt.legend()\n", "\n", "plt.subplot(1,2,2)\n", "for genre in compare_genres:\n", " temp_df = meta_df[meta_df[genre] == 1]\n", " temp_df = temp_df[['author_id', 'author_works_num']].drop_duplicates()\n", " s = temp_df.author_works_num.value_counts().sort_index()\n", " u = s.reset_index()\n", " u = u.rename(columns={'index': 'author_works_num', 'author_works_num': 'num_authors'})\n", " plt.scatter(u.author_works_num, u.num_authors / sum(u.num_authors), label=genre)\n", " num_single_work_authors = int(u[u.author_works_num == 1].num_authors)\n", " print(f'{genre: <40}\\tauthors: {sum(u.num_authors): >6}\\tworks: {sum(u.author_works_num * u.num_authors): >8}\\tsingle work authors: {num_single_work_authors: >6}')\n", "plt.xlabel('Number of works per author')\n", "plt.ylabel('Proportion of authors')\n", "plt.xscale('log')\n", "plt.yscale('log')\n", "plt.legend()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In the entire dataset, there are 186,360 different authors of *history, historical fiction, biography* books, which is the same as the number of different authors for *romance* and *mystery, thriller, crime* combined. 
**So one thing to take into account is that the *history* genre has many more distinct authors than the other two.** \n", "\n", "Above are two distribution plots. The first shows the number of works per author set off against the *number of authors* (how many authors have written X books). The second shows the number of works per author set off against the *proportion of authors* (what proportion of authors have written X books).\n", "\n", "First, there are many more authors in the *history* genre who write only one book than in the other genres. In the plot on the left, it is clear that for all three genres, the majority of authors write only one or a few books. But the first blue dots are above the other colored dots, which means that there are many more *history* authors with few books. At the higher end, the long tail of highly prolific authors, there are few differences as the colored dots overlap so much that only the green dots are visible. **In other words, it is not that there are fewer prolific authors in the *history* genre than in the other two genres, but that there are many more *history* authors, and therefore, *proportionally* many more *history* authors with few books.** This is what the plot on the right shows. From 4 works per author and more, the blue dots are well below the others, showing that the *proportion* of authors with 4 works or more is lower in the *history* genre than in the other genres. \n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "from collections import Counter\n", "from scripts.helper import ecdf\n", "\n", "for genre in compare_genres:\n", " dist = Counter([int(count) for count in genre_df[genre].work_id.value_counts()])\n", " x = dist.keys()\n", " y = list(dist.values())\n", " y_prob = [y_point / sum(y) for y_point in y]\n", " plt.scatter(x,y_prob, label=genre)\n", "plt.xscale('log')\n", "plt.yscale('log')\n", "plt.ylabel('Proportion of books')\n", "plt.xlabel('Number of reviews per book')\n", "plt.legend()\n", "plt.show()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Above we see the number of reviews per book set off against the proportion of books. The three distributions show no big differences. In other words, book popularity behaves no differently between the three genres.\n", "\n", "From the analyses of these three aspects (number of reviews per reviewer, per author and per book), we have learned that in the *romance* genre, there is a longer tail of more prolific reviewers than for the other genres, while the *history* genre has a higher peak of authors with just a single work. **In other words, there is *less* overlap in authors in the *history* reviews, and there is *more* overlap of reviewers in the *romance* reviews.** The *romance* genre is therefore *less* heterogeneous than the *mystery, thriller, crime* genre and especially than the *history, historical fiction, biography* genre.\n" ] }, { "cell_type": "code", "execution_count": 295, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "book_id publication_year date_updated\n", "1 2006.0 2014 111\n", " 2015 24\n", " 2016 51\n", " 2017 43\n", "2 2004.0 2007 6\n", " ... 
\n", "36473367 2017.0 2017 1\n", "36477938 2017.0 2017 5\n", "36494299 2015.0 2017 1\n", "36498328 2003.0 2017 1\n", "36508486 2017.0 2017 1\n", "Length: 463959, dtype: int64" ] }, "execution_count": 295, "metadata": {}, "output_type": "execute_result" } ], "source": [ "review_df.groupby(['book_id', 'publication_year', review_df.date_updated.dt.year]).size()" ] }, { "cell_type": "code", "execution_count": 326, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "history, historical fiction, biography 1951.0 2020.0\n", "mystery, thriller, crime 1951.0 2020.0\n", "romance 1951.0 2020.0\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 326, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "for genre in compare_genres:\n", " temp_df = genre_df[genre][(genre_df[genre].publication_year > 1950) & (genre_df[genre].publication_year <= 2020)]\n", " year_min = temp_df.publication_year.min()\n", " year_max = temp_df.publication_year.max()\n", " print(genre, year_min, year_max)\n", " temp_df.publication_year.value_counts().sort_index().plot(kind='bar', label=genre)\n", "\n", "plt.legend()\n", "\n" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYoAAAEKCAYAAAAMzhLIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy86wFpkAAAACXBIWXMAAAsTAAALEwEAmpwYAABDyUlEQVR4nO2deXxU1dn4v88MAwlbwqasClZEISSEVUEFxYa2EdlEq1jFtfprLa++RaVWTK2ttNhaqG2ttq5tLYiA1dRiXShYbQEBQa3UF9zYNCIJAglkeX5/3JlhJpnlTjKTyfJ8P5/5zNyzPufcM/e595xzn0dUFcMwDMOIhifdAhiGYRhNG1MUhmEYRkxMURiGYRgxMUVhGIZhxMQUhWEYhhETUxSGYRhGTNqkW4BU0L17d+3fv3+6xTAMw2g2vPHGG5+pao9IcS1SUfTv358NGzakWwzDMIxmg4h8GC3Opp4MwzCMmJiiMAzDMGJiisIwDMOISYtcozBaFpWVlezcuZOKiop0i2IYzZ6MjAz69u2Lz+dznccUhZ+Vm3axcNU2dpeW0zs7k7mTBjE1v0+6xTKAnTt30qlTJ/r374+IpFscw2i2qCr79u1j586dDBgwwHU+m3rCURLzlm9lV2k5CuwqLWfe8q2s3LQr3aIZQEVFBd26dTMlYRgNRETo1q1bwk/npiiAhau2UV5ZHRZWXlnNwlXb0iSRURtTEoaRHOrzXzJFAewuLU8o3GhdfPDBB+Tk5ESMmz9/Pi+++GLUvCtXruSdd95JlWhBioqKuPfeeyPGjR07NmbeH//4x/Wq82tf+xqlpaUJ53v00Uf59re/XSf8yJEjnHfeeQwbNowlS5ZwzTXXxOy71atX89prrwWPH3jgAR5//PGE5YlFrHMfT75UEkuuVGBrFEDv7Ex2RVAKvbMz0yCN0Zy46667YsavXLmS888/n8GDB7sus6qqijZtkvfXDL2YRuLHP/4x3/ve91yXp6qoKn/9618bKloYmzZtAmDz5s0AXHzxxTHTr169mo4dOwYV4fXXX59UeeLxu9/9LinlJPt8pwJ7ogDmThpEps8bFpbp8zJ30qA0SWQ0hJWbdjFuwcsMuK2YcQteTspaU3V1Nddeey1DhgyhoKCA8nLnxmL27NksW7YMgNtuu43BgweTm5vLd7/7XV577TX+8pe/MHfuXIYNG8b27dvZvHkzp59+Orm5uUybNo39+/cDMGHCBP7nf/6HkSNH8qMf/YgBAwZQWVkJwIEDB8KOo/HOO+8wYcIETjrpJBYvXhwM79ixIwB79uzh7LPPZtiwYeTk5LB27Vpuu+02ysvLGTZsGLNmzQLg5z//OTk5OeTk5PCLX/wCcO5gBw0axOWXX05OTg4ff/wx
/fv357PPPgPg8ccfJzc3l7y8PL7xjW8A8OyzzzJmzBjy8/M577zz+OSTT6LK/umnn3LZZZexfv36YF9NmDAhaGHhb3/7G8OHDycvL4+JEyfywQcf8MADD3DfffcxbNgw1q5dG/ZUFaufb731VkaPHs0pp5zC2rVr4577qqoqZs2axWmnncaFF17I4cOHg2UF5HvyyScZOnQoOTk53HrrrcG8v//97znllFMYPXo01157bfBJavbs2Vx//fWMGTOGW265hXXr1nHGGWeQn5/P2LFj2bbNmfZ+9NFHmTJlChMmTGDgwIH84Ac/CJYdaUxu376d4cOHB9O89957Ycf1JnB30JI+I0aM0ERZsXGnjr3nJe1/63M69p6XdMXGnQmXYaSGd955x3XaFRt36qnff15PvPW54OfU7z/foPP5/vvvq9fr1U2bNqmq6syZM/WJJ55QVdUrrrhCn3rqKf3ss8/0lFNO0ZqaGlVV3b9/f1h8gKFDh+rq1atVVfWOO+7QOXPmqKrq+PHj9YYbbgimmz17tq5YsUJVVX/729/qzTffHFPGO++8U8844wytqKjQkpIS7dq1qx49elRVVTt06KCqqvfee6/efffdqqpaVVWlBw4cCItXVd2wYYPm5OTowYMH9YsvvtDBgwfrxo0b9f3331cR0ddffz2Y9sQTT9SSkhJ96623dODAgVpSUqKqqvv27VNV1c8//zzYHw899FCwDY888oh+61vfqtOGV155RQsLC4PH48eP1/Xr1+unn36qffv21R07doSVf+edd+rChQvD+iBwHKufA3IUFxfrxIkTY/br+++/r4C++uqrqqp65ZVXBusIyLdr1y7t16+ffvrpp1pZWannnHOOrlixQnft2qUnnnii7tu3T48ePapnnnlmsN1XXHGFFhYWalVVlaqqlpWVaWVlpaqq/v3vf9fp06cH+6pnz5762Wef6eHDh3XIkCG6fv36mGNywoQJwfB58+bp4sWL67Qr0n8K2KBRrqn2ROFnan4f/nnbuby/oJB/3naubY1tpqRqY8KAAQMYNmwYACNGjOCDDz4Ii8/KyiIjI4Orr76a5cuX0759+zpllJWVUVpayvjx4wG44oorWLNmTTA+dKrlmmuu4ZFHHgHgkUce4corr4wrY2FhIe3ataN79+4cd9xxde7gR40axSOPPEJRURFbt26lU6dOdcp49dVXmTZtGh06dKBjx45Mnz49eNd94okncvrpp9fJ8/LLLzNz5ky6d+8OQNeuXQFnW/OkSZMYOnQoCxcu5O23347bhkj861//4uyzzw5u5wyUH414/Tx9+nQg8nmMRL9+/Rg3bhwAl112Ga+++mpY/Pr165kwYQI9evSgTZs2zJo1izVr1rBu3TrGjx9P165d8fl8zJw5MyzfzJkz8Xq9QZlnzpxJTk4ON910U1hfffnLX6Zbt25kZmYyffr0YP3RxmRg7FRXV7NkyRIuvfTSuG2MhykKo0WRqo0J7dq1C/72er1UVVWFxbdp04Z169Zx4YUX8txzz/GVr3wl4To6dOgQ/D1u3Dg++OADVq9eTXV1tauFy3gynn322axZs4Y+ffowe/bshBd+Q+Vzw4033si3v/1ttm7dym9/+9sm88JkoJ8i9VEkau8SStYOvND+vOOOOzjnnHN46623ePbZZ8P6Klr90c73jBkzeP7553nuuecYMWIE3bp1a7Cspij8FO8opmBZAbmP5VKwrIDiHcXpFsmoB9E2IKR6Y8LBgwcpKyvja1/7Gvfddx9vvvkmAJ06deKLL74AnKeOLl26BO/Qn3jiieBdbyQuv/xyLr300rCnifvvv5/777+/XjJ++OGHHH/88Vx77bVcc801bNy4EQCfzxdc/zjrrLNYuXIlhw8f5tChQ6xYsYKzzjorZrnnnnsuTz31FPv27QPg888/B5y75D59nCfzxx57rF4yA5x++umsWbOG999/P6z80L4NJdF+Bti1axcTJ06MGPfRRx/x
+uuvA/CnP/2JM888Myx+9OjR/OMf/+Czzz6jurqaJ598kvHjxzNq1Cj+8Y9/sH//fqqqqnj66aej1h/aV48++mhY3N///nc+//xzysvLWblyZfDpJhoZGRlMmjSJG264wdWTqBtMUeAoiaJX72DPoT0oyp5Deyh69Q5TFs2QdG1M+OKLLzj//PPJzc3lzDPP5Oc//zkAX//611m4cCH5+fls376dxx57jLlz55Kbm8vmzZuZP39+1DJnzZrF/v37ueSSS4Jh7777br3vEFevXk1eXh75+fksWbKEOXPmAHDdddeRm5vLrFmzGD58OLNnz2b06NGMGTOGa665hvz8/JjlDhkyhNtvv53x48eTl5fHzTffDDhbdmfOnMmIESOC01L1oUePHjz44INMnz6dvLy84BTd5MmTWbFiRXAxO5RE+hmchf5oO48GDRrEr371K0477TT279/PDTfcEBbfq1cvFixYwDnnnENeXh4jRoxgypQp9OnTh+9973uMHj2acePG0b9/f7KysiLWccsttzBv3jzy8/PrPOWMHj2aGTNmkJuby4wZMxg5cmTMtoAzdjweDwUFBXHTukGcNYyWxciRIzURfxQFfzqTPZVldcJ7+bJ44dJXI+QwGpP//Oc/nHbaaa7TtxRzLMuWLeOZZ57hiSeeCIadf/75LF++nLZt26ZRspbH/fffzwknnMAFF1yQ1HIPHjxIx44dqaqqYtq0aVx11VVMmzbNdf5HH32UDRs2JPwUee+991JWVsYPf/jDiPGR/lMi8oaqRtRCTXvzbiOx92gpRJh33Hu0tNFlMRrO1Pw+zVIxhHLjjTfy/PPP13lX4bnnnkuTRC2bSC8AJoOioiJefPFFKioqKCgoYOrUqSmpJ5Rp06axfft2Xn755aSV2eSfKETkJOB2IEtVL3STJ+Enit+dxh5fXZ3Zq7KKF675j+tyjNSQ6BOFYRixSfSJIqVrFCLysIh8KiJv1Qr/iohsE5H/E5HbYpWhqjtU9epUyjnniJeMmpqwsIyaGuYc8UbJYRiG0XpI9WL2o0DYPkER8QK/Ar4KDAYuEZHBIjJURJ6r9TkuxfIBUHjWfIr2H6RXZRWiSq/KKor2H6TwrNgLYIZhGK2BlK5RqOoaEelfK3g08H+qugNARP4MTFHVe4DzUylPVHIvohAofOkuKNsJWX3hvIWQe1FaxDEMw2hKpGMxuw/wccjxTmBMtMQi0g34EZAvIvP8CiVSuuuA6wBOOOGExKXKvcgUg2EYRgSa/HsUqrpPVa9X1S9FUxL+dA+q6khVHdmjR4/GFNEwEmbz5s1Jt74aidLSUn79618Hj1evXs3557t7cA81oR5qAC/UGGBjUV+T5kZySIei2AX0Cznu6w8zjFZDfRSFG3MTtamtKNxSXV3NXXfdxXnnnZdw3khl1RdVpaamhr/+9a9kZ2c3WBajfqRDUawHBorIABFpC3wd+Esa5DBaKluWwn05UJTtfG9Z2qDiPvjgA0499VRmz57NKaecwqxZs3jxxRcZN24cAwcOZN26ddTU1DBw4EBKSkoAqKmp4eSTT6akpISnnnqKnJwc8vLyOPvsszl69Cjz589nyZIlQSc9hw4d4qqrrmL06NHk5+fzzDPPAM4LVxdccAHnnnsuEydO5PLLL2flypVB2WbNmhVMG4nbbruN7du3M2zYMObOnQs4L4FdeOGFnHrqqcyaNYvAFvn+/ftz6623Mnz4cJ566qkwE+rR+MMf/sDo0aMZNmwY3/zmN4NKoWPHjvzv//4veXl5QfMXkfjkk0+YNm0aeXl55OXl8dprr8U0ae7mXABR+9OoJ9HMyibjAzwJ7AEqcdYirvaHfw34L7AduD3Z9dbHzHiyMHPlyScRM+P65hLVu49XvbPzsc/dxzvh9SRg0nnLli1aXV2tw4cP1yuvvFJramp05cqVOmXKFFVVLSoq0vvuu09VVVetWhU0FZ2Tk6M7dzrjIGB+vLap7Xnz5gXNRO/f
v18HDhyoBw8e1EceeUT79OkTNK29evXqYH2lpaXav3//oHnqaLIPGTIkePzKK69o586d9eOPP9bq6mo9/fTTde3atarqmA3/yU9+EkwbaiI9YFI7kK6kpETfeecdPf/884PmzG+44QZ97LHHVFUV0CVL4vf5RRddFOyzqqoqLS0tjWnS3O25iNafhkOTMjOuqpeoai9V9alqX1X9vT/8r6p6ijrrDj9KVn0iMllEHiwrq2uOozFYuWkX85ZvZVdpOQrsKi1n3vKtSXGcY7jkpbugspal2MpyJ7wBDBgwgKFDh+LxeBgyZAgTJ05ERBg6dGjQvPNVV10VtMj68MMPBw2yjRs3jtmzZ/PQQw9FnYZ54YUXWLBgAcOGDWPChAlUVFTw0UcfAY6Z6YBp7fHjx/Pee+9RUlLCk08+yYwZMxL2jjZ69Gj69u2Lx+Nh2LBhYaa243mVC+Wll17ijTfeYNSoUQwbNoyXXnqJHTt2AI410xkzZsQt4+WXXw7aTvJ6vUFbSNFMmoO7cxGrP43EaVEmPFT1WeDZkSNHXpuO+mP5QmjuJiWaDWU7Ewt3SahJZ4/HEzz2eDzBtYN+/fpx/PHH8/LLL7Nu3Tr++Mc/Ao4v53//+98UFxczYsQI3njjjTrlqypPP/00gwaFGy/897//Xce89+WXX84f/vAH/vznPwd9VtS3LbVNbSdiSlxVueKKK7jnnrp7TDIyMoK+FupDLDncnIto/WnUjya/66k5kSpfCEYCZPVNLDzJXHPNNVx22WVhTmm2b9/OmDFjuOuuu+jRowcff/xxHRPZkyZN4pe//GVwvSDgPzoSs2fPDrooDfjijmYmO5op7mQwceJEli1bxqeffgo45r8//PDDiGnnzZvHihUrIpbxm9/8BnAWvZM1G5BIfxrxMUWRRNLlC8EIYeJ88NXqb1+mE94IXHDBBRw8eDDMD8DcuXOD/pTHjh1LXl4e55xzDu+8805wMfuOO+6gsrKS3NxchgwZwh133BG1juOPP57TTjstrI5oZrK7devGuHHjyMnJCS5mJ4vBgwdz9913U1BQQG5uLl/+8pfZs2dPxLRbt26lZ8+edcIXLVrEK6+8wtChQxkxYgTvvPNOUmRLpD+N+DR5o4D1IVGjgMkisEYROv2U6fNyz/ShNvXUABI2CrhlqbMmEXjLfuL8RnuZcsOGDdx00011/CMkk8OHDzN06FA2btwYnNNPlZnsZDFp0iRWrVqVbjEMP2ZmPI1Mze/Dm/tf5un3H6LGux9PdRdmDLjWlERjk6a37BcsWMBvfvOb4NpEKnjxxRe5+uqruemmm8Kc4KTKTHayMCXRvGlRTxQiMhmYfPLJJ1/73nvvNXr9xTuKKXqtiIrqY/5uM7wZFI0tovCkwkaXp6VgZsYNI7k0KTPjjY2qPquq10VzN5hqFm1cFKYkACqqK1i0cVFa5DEMw0gGLUpRpJu9h/YmFG4YhtEcMEWRRHp2qLurI1a4YRhGc8AURRKZM3wOGd6MsLAMbwZzhs9Jk0SGYRgNx3Y9JZHAgvWijYvYe2gvPTv0ZM7wObaQbRhGs6ZFKYqQXU9pk6HwpEJTDC2coKE0jz2QG62DFjXS073rCUi6iWsjcYp3FFOwrIDcx3IpWFZA8Y7iBpdZ2/T11VdfTU5ODkOHDmXJkiWA4xRo/PjxTJkyhZNOOonbbruNP/7xj4wePZqhQ4eyfft2AJ599lnGjBlDfn4+5513Hp988gkARUVFXHXVVUyYMIGTTjqJxYsXB+t//PHHyc3NJS8vj2984xsAlJSUMGPGDEaNGsWoUaP45z//2eB2GkZEopmVbc6ftJkZT4GJayMxM+PPbX9ORz4xUnMezQl+Rj4xUp/b/lyDZAg1fb1s2TI977zztKqqSvfu3av9+vXT3bt36yuvvKJZWVm6e/duraio0N69e+v8+fNVVfUXv/iFzpkzR1VVP//8c62pqVFV1YceekhvvvlmVVW988479YwzztCK
igotKSnRrl276tGjR/Wtt97SgQMHaklJiapq0OT4JZdcEjQR/uGHH+qpp57aoDYarYdEzYy3qKmntBPLxLX5424UYr3L0tApwYDp65tuuolLLrkEr9fL8ccfz/jx41m/fj2dO3dm1KhR9OrVC4AvfelLFBQUADB06FBeeeUVAHbu3MnFF1/Mnj17OHr0KAMGDAjWUVhYSLt27WjXrh3HHXccn3zyCS+//DIzZ86ke/fuAEGT4y+++GKYbaQDBw5w8OBBOnbs2KB2GkZtTFEkkxSZuDbck8p3WdyY4HZjAvvGG2/k5ptv5oILLmD16tUUFRVFzF/bBHhtampq+Ne//kVGRkbUNIaRDFrUGkXaSbOJa6Nx3mU566yzWLJkCdXV1ZSUlLBmzRpGjx7tOn9ZWRl9+jj2vx577LG46c8991yeeuop9u3bBzjmvAEKCgr45S9/GUy3efPmBFphGO4xRZFM0mzi2micd1mmTZsWXFg+99xz+elPfxrRhHY0ioqKmDlzJiNGjAhOJ8ViyJAh3H777YwfP568vDxuvvlmABYvXsyGDRvIzc1l8ODBPPDAA/Vuk2HEokUZBQyQLjPjQFpNXLdUEjUKWLyj2N5lMYwYtGoz403hPYp0mbg2jmHvshhGcmlRU0/aFN6jSAIrN+1i3IKXGXBbMeMWvMzKTbvSLZJhGK2YFvVE0RKo7SVvV2k585ZvBTAHSIZhpIUW9UTREli4aluYK1WA8spqFq7aliaJmgYtcS3NMNJBff5LpiiaGLtLyxMKbw1kZGSwb98+UxaG0UBUlX379iX87o1NPTUxemdnsiuCUuidnRkhdeugb9++7Ny5k5KSknSLYhjNnoyMDPr2TezdLlMUaWLlpl0sXLWN3aXl9M7OZO6kQUzN78PcSYPC1igAMn1e5k4alEZp04vP5wszc2EYRuNiiiINuFmwjqREDMMw0oEpijQQa8F6an6f4McwDKMp0KIWs0Vksog8WFZWlm5RYmIL1oZhNCfiKgoRmSMincXh9yKyUUQKGkO4RGkuL9xFW5huzQvWhmE0Xdw8UVylqgeAAqAL8A1gQUqlauHMnTSITJ83LKy1L1gbhtF0cbNGIf7vrwFPqOrbIiKxMhixsQVrwzCaE24UxRsi8gIwAJgnIp2AmtSK1fKxBWvDMJoLbhTF1cAwYIeqHhaRbsCVKZWqNWDmyA3DaCa4URSPAWuAw0Cpqu4D9qVUqpbOlqXw7HeO+dcu+9g5BlMWhmE0OdwsZj8M9AJ+KSI7RORpEUmeu7DWyEt3HVMSASrLnXDDMIwmRtwnClV9RUTWAKOAc4DrgSHAohTL1nIp25lYuGEYRhqJqyhE5CWgA/A6sBYYpaqfplqwFk1WX2e6KVK4YRhGE8PN1NMW4CiQA+QCOSJib4Y1hInzwVerC32ZTrhhGEYTI66iUNWbVPVsYDrOIvYjQGmK5aoXzcWEB7kXweTFkNUPEOd78mJbyDYMo0ki8ZzBiMi3gbOAEcAHONNPa1X15ZRLV09GjhypGzZsSLcYhmEYzQYReUNVR0aKc7M9NgP4OfCGqlYlVTLDMAyjyeNm6ulewIdj4wkR6SEi5kXGMAyjleDGeuydwK3APH+QD/hDKoUyDMMwmg5udj1NAy4ADgGo6m6gUyqFMgzDMJoObhTFUXVWvBVARDqkViTDMAyjKeFGUSwVkd8C2SJyLfAi8FBqxTIMwzCaCm5MeNwrIl8GDgCDgPmq+veUS9bCKd5RzKKNi9h7aC89O/RkzvA5FJ5UmG6xDMMw6uBmeyx+xWDKIUkU7yim6LUiKqorANhzaA9FrxUBmLIwDKPJEXXqSURe9X9/ISIHQj5fiMiBxhOx5bFo46KgkghQUV3Boo1mZ9EwjKZH1CcKVT3T/207nJLM3kN7Ewo3DMNIJ27eo1gsImc0hjCthZ4deiYUbhiGkU7c7Hp6A7hDRLaLyL0iEtEWiOGeOcPnkOHNCAvL8GYwZ7j5gzIMo+nh
xoTHY6r6NRzHRduAn4jIeymXrB40F+uxhScVUjS2iF4deiEIvTr0omhskS1kG4bRJIlrPTaYUGQ0cDEwBfiPqk5OpWANwazHGoZhJEYs67Fu1ih+6n+CuAvYCoxsykrCMAzDSC5u3qPYDpyhqp+lWhjDMAyj6eFGUTwEXCoiJ6nqXSJyAtBTVdelWDajFis37WLhqm3sLi2nd3YmcycNYmp+n2MJtiyFl+6Csp2O/+2J881rnmEYDcbNrqdfAWcAl/iPv/CHGY3Iyk27mLd8K7tKy1FgV2k585ZvZeWmXU6CLUvh2e9A2ceAOt/PfscJNwzDaABuFMUYVf0WUAGgqvuBtimVyqjDwlXbKK+sDgsrr6xm4aptzsFLd0FleXimynIn3DAMowG4URSVIuLlmJnxHkBNSqUy6rC7tDx2eNnOyBmjhRuGYbjEjaJYDKwAjhORHwGvAj9OqVRGHXpnZ8YOz+obOWO0cMMwDJfEVBQi4gHeB24B7gH2AFNV9alGkM0IYe6kQWT6vGFhmT4vcycNcg4mzgdfLWXiy3TCDcMwGkDMXU+qWiMiv1LVfODdRpLJiEBgd1PUXU+B3U1Rdj2Z/wvDMOqLm+2xL4nIDGC5un2N20gJU/P71N0Oe18txXDTW7Xicyiu+pyiHl2pEAHM/4VhGInhZo3im8BTwBHzR9GEiLcdNiR+UZesoJIIYP4vDMNwixujgJ1U1aOqbVW1s/+4c2MIZ8Qg3nbYkPi9bbxEwvxfGIbhBjdPFEZTJMq215qynQy4rZiakPieVdUR05r/C8Mw3GCKoplyODPyRX53TTfU/x1gzv5SMmrCX30x/xeGYbglls/sAY0piJEYP628mMMa/oL88vadmXpCNzqeehtTT+jG8vbODGHhocMUffY5vaqqEYjr/6J4RzEFywrIfSyXgmUFFO8oTnVzDMNowsTa9bQMGCEiL6nqxMYSyHDHYwdH87nnKLe0WUpv2ccfOhzHz3q0p8ZTjgBHfOX8oEc3DpZkcvnhTyls043CUfGNBBbvKKbotSIqqisA2yFlGEZsReERke8Bp4jIzbUjVfXnqRPLiEfv7Ez+Unomfzl6JgAd+i7A4ykNS1Pjqebebj25/Jb/ui530cZFQSURILBDyhSFYbROYq1RfB2oxlEmnSJ8jFThf/+BomznO4IF2NpvaouvNGJR2iZyeDSi7YSyHVKG0XqJ+kShqgH/2FtU9flGlKneiMhkYPLJJ5+cblHqT+D9h8DW18D7ERA2bVT7TW1PdRe0zf46xfXy72yK5MsiNH8grGeHnuw5tKdOObZDyjBaL3F9ZotIFnAncLY/6B/AXapalmLZ6k2z9pl9X47/JbpaZPULf+u6FrXXFsDZ2VQ0tojKsmHMW741zEy5zyugUFlz7Pxn+rx8/ZwSntu9OGI5NvVkGC2XBvnMBh7GcVZ0kf9zAHgkeeIZYdTTXHjhSYUUjS2iV4deCBK2symSL4vKag1TEuD4t3hhXZ+o5RiG0TpxY+vpS6o6I+T4ByKyOUXyGFl9ozxRxDcXXnhSYcQLejRfFpHYXVoetRzDMFonbhRFuYicqaqvAojIOMD9lcdIjInzKX5xLos6t2dvGy89q6qZc+AwhQ0wF947O5NdfmXRpvMm2vVYhfhK0cpsjpRMoupAfljaSJj1WcNovbhRFNcDj/vXKgD2A1ekTqTWTXHHDhR170aFVgKwx9eGou7doGMH6ntZnjtpEPOWb6UycwMZvZYjHqdsaVtKRq/lVABVB/LD/VuEymTvVhhGq8aNUcA3VTUPyAVyVTVfVbekXrTWyaKNi4JKIkCFVjbI0uvU/D7cM30o7Y9/IagkAoinknY9VtEnO5N7pg8NN2MeKlOUdysMw2j5uHmiAEBVzbR4I5Cq9xim5vdh/pbSiHHetmX887ZzG10mwzCaB64VhdE4xHyPYcvSqB7sGlx2g2X6GMQLWu1s5U1QNsMwmi5mPbaJMWf4HDK8GWFhGd4M
5nQfE9tRUUPKjmNF1p1MOEoC6iWbYRhNF1dPFCIyFugfml5VH0+RTK2awOJwnR1Gz9wa3VGRyzv3qGXHWZBOSKZ6ymYYRtPFzZvZTwBfAjbj2H4CUFX9TmpFqz/N+s3saBRlA5HOlUBRaePKEiCqTAHSKJthGAkR681sN08UI4HBGk+jGKmlAS/ipYxoMoXGG4bR7HGzRvEWYBbh0s3E+eALfxmuuHM2Bcdn19vBUIMdFEWQKYgv04k3DKPZ4+aJojvwjoisA44EAlX1gpRJZdQlMNfv3/VU3KMvRZ3aUlHp2GZM9CW4pLxEFyaT7XoyjJaKmzWK8ZHCVfUfKZEoCbTINYpaFCwriLhltVeHXrxw4Qspz28YRsuiQWsUqvoPETkeGOUPWqeqnyZTQCNxGvoSnL1EZxiGW+KuUYjIRcA6YCaOmfF/i8iFqRbMiE20l+TcOhhqaH7DMFoPbhazbwdGqeoVqno5MBq4I7ViGfGo78tzycpvGEbrwc1itqfWVNM+7I3utFPfl+fC8n/0LxbtWMFeD/SsgTknfiV6/i1L4flbofxz5zizK3z1J+4WrBtoeqRuObZwbhiNiZvF7IU4lmOf9AddDGxR1VtTLFu9aQ2L2Q2mtm9ucLa0Tl5c96K7ZSk88y2oPhoe7vHB1F/HvkgnUk+i8jakPMMwwmiQK1RVnQs8iN/MOPBgU1YShkteuiu6SZBIaWsrCYCaysjp61tPouU0pDzDMFzjytaTqj4NPJ1iWYzGJBHf3LH8dcfx5V1fH+BJq8cwjAYT9YlCRAKuT78QkQMhny9ExHxTNHeimdeIFB7LFEc8Mx2J1FOfcupbnmEYromqKFT1TP93J1XtHPLppKqdG09EI0CiJjdipg8xv1HcoT0FfXuT278fBcdn1y134nzwtg1P17c3xR07xTfTEcnMRxzzHhHlTsBcSINNkxiGEYab9yiecBNmpJaAyY09h/agaNDkRrSLYNz0uRfB5MUU9+hHUfeu7PG1QUXYU1lWt9zciyg+63qKunc7ls7XhqLjjqO4Y4fYgvvrIasfIM53jIXnqHJ37BBSDs6uJ6hTXqL9ZBhGfNzsetqoqsNDjtvg7HoanGrh6ktL3PWUqMkNt+mTna6hmGkSw0gP9dr1JCLzROQLIDd0fQL4BHgmRbIaUUjU5Ibb8GSnayhmmsQwmh6x1ijuAbKAx2utT3RT1XmNJ6IBiZvccBue7HQNxUyTGEbTI+YaharWcMwYoJFGEjW54TZ9stM1FDNNYhhNDzfvUWwUkVGquj7l0hhRSdRkh9v0yU7X2O1Mdn7DMOriZjH7XeBk4EPgECA4PrNzUy8eiMhUoBDoDPxeVeOuSLbExWzDMIxU0lCf2ZMaUPHDwPnAp6qaExL+FWAR4AV+p6oLopWhqiuBlSLSBbgXsK0rhmEYjYgbx0UfikgecJY/aK2qvumy/EeB+4HHAwEi4gV+BXwZ2AmsF5G/4CiNe2rlvyrEcu33/fmMhhDJkitEtso6sADee6FhVl9r15eKMgNlJMtKrWEYYbiZepoDXAss9wdNwzEM+EtXFYj0B54LPFGIyBlAkapO8h/Pg+Auq0j5BVgA/F1VX4xRz3XAdQAnnHDCiA8//NCNeK2LSBZYvW1B1THwF49ErbTGsviaYJkrN+1i4aptjDzwdxa0/T2Zx9y3O2XkXQpv/qnhVmoNo5XSIOuxwNXAGFWdr6rzgdNxFEd96QN8HHK80x8WjRuB84ALReT6aIlU9UFVHamqI3v06NEA8VowkSywVh91pyQgcSutsSy+JlDmyk27mLd8K7tKy5nbZmm4kgiU8cajybFSaxhGHdysUQhQHXJc7Q9rFFR1MbC4sepr0STDwmoiZbhNGyfdwlXbKK90hmBv+SxyIq2OHG5WZQ2jwbhRFI/g+MlegaMgpgC/b0Cdu4B+Icd9/WFGqsnq66xDNLSMZNcXp8zdpceeFHZrd/pGUhaBtZUEyzYMIz5uFrN/LiKr
gTMBBa5U1U0NqHM9MFBEBuAoiK8DlzagPMMtE+fHXKMo7tCeRV2y2dvGS8+qaubsLwU4FlZdw5wvTcPtGwnF+dNYtP1p9no9wfIKDx0OT+S3/Fq8ozjs3Yez+57Nqg9WUXqklI6nQk11Jkc+uYCbZSz7e2zgkxAZC48qxYPOYVHZm+F1HVVHhmUF9k6FYTSAuIvZACIyHGfXUw3wT1Xd6KpwkSeBCUB3HBtRd6rq70Xka8AvcHY6PayqP6qX9HXrmwxMPvnkk6997733klFkyyPKrqfitXdR1F6p8BxbtvIhqNZQJcdmGjO8GRSNLYp7sQ1Yca2orjiWV5WijkMp/HhrWP3FHTvUSRsJrRFAEE9NWJlTuo/kmdK369QVMdyl/IbR2oi1mO1m19N8YCaOhzsBpgJPqerdSZYzadgLd4kTzepqJNxYYk3EimsidUfCIx5qtMZ1uFmSNYy6NPSFu1lAnqpW+AtbAGwGmqyiMBInEeuqbtImYsW1oZZdIymDWOFmSdYwEsPN9tjdQKiVtXbY4nOLIxHrqm7SJmLFtaGWXT0SeRhHCzdLsoaRGG4URRnwtog8KiKPAG8BpSKyWERs22oLIZLVVZ/HRxsJf+h0a4k1ESuukdJGwoMnojwzT5kZsa5o4WZJ1jASw83U0wr/J8Dq1IhipJNoVlcjhblZCE7EimuktKG7ngCy2mYxb8y8qGXmH5efULhhGO5xu+upLXCK/3Cbqrp8lbdxsV1PhmEY9aNBJjxEZALwHo5Bvl8D/xWRs5MpYLJQ1WdV9bqsrKx0i2IYhtFicDP19DOgQFW3AYjIKcCTwIhUCmYYhmE0DdwoCl9ASQCo6n9FxJdCmYwkEbC4uru0nN7ZmcydNAigTtjU/Og2GQNl7CotxyNQ45+pzM70UXTBkKh5Y9UdrawNH37Ok//+mGpVvCJcMqYfd08dmrwOcdFGrwjVqvRx0TeG0Vpw88LdIziGAP/gD5oFeFX1qhTLVm/shbtjFlcDxvQAfF4BhcqaY+c80+flnulDI14QI5URis8jLJyZVyev27pDERz7MLW57PQTUqosYrUxVt8YRkujoWbGrwfeAb7j/7wD3JA88YxUEGpxNUBltda5UJdXVrNw1TYiEamMsPJqNGJet3WHEi3myX830IhhHGK1MVbfGEZrIubUk98b3Zuqeirw88YRyUgGoRZX65vWTRmR0iRSdzyqXezKawjxZE1mWwyjuRLziUJVq4FtInJCI8nTIERksog8WFZWlm5R0k7v7MwGp3VTRqQ0idQdD6+k1vVJPFmT2RbDaK64WaNYA+QD64BDgXBVvSC1otUfW6OoO/d+gedVbvUtpRefsVu789Oqi/hLzZmR5+H9Fma1bCf7tSOqShc5GJYPYFqbf/KjTstpX743zB+2lu1kt3bjxephTPRsprd8xh66c2/1xayoGhcm5wWeV7mlzVJ6y2eU0pG2VNLB78GuBvAISDRfE5ldne/y/ZDZxf/783C/33H8Zkfqp4A8e+jO7hG3MOqCbybPH7f59TaaKA21Hjs+Uriq/iMJsqUEUxQOsfxMH9a2/NT3/xhWeF1dJRHDz/VhbcttldfQoW0bfuh9kDYxTIMr4a4Qq7wZ3C3X8+jB0XgEzpdXWeD7He3laANbGgMXfrMD/TTiwN/rypNMf9yR+tb8ehtNhHopChHJwFnIPhnYCvxeVatSJmUSMUVRi/tyInuay+oHN73lLm3tfFA/b3mhdbqpKxlEamckoskT1Xuey3LjlZ9oOYaRAuprZvwxoBJYC3wVGAyYNbXmSDS/0ZHC3fiYbogf6tC8jeXPuqG+u5PljzuR82AYTYhYi9mDVfUyVf0tcCGOhzujORLNb3SkcDc+prP61t8XdWi+xvJn7baeaOnE27By46U3v95GEyfWE0XQ8J+qVkmKd58YKSSSr2y/r2pXaUMJzffsdyhuK1H9bO9p48WDsyjdq6qaOQcOQ4gP66we7TmS3Zdy/9jK
9E+Dltcaa4EysmtqUIUDfr/YZx8+zJr27YN1hx5nqnLYX45HPAxYMZX3v3g/zJlRrw69gtZki3cUs+j4bPZ2gazQeqprmJOVR+G2V+L2393/upun/vtU1DoSOg+G0YSItUZRzbFdTgJkAof9v1VVOzeKhAlg1mNjkMhum9C0wd1E++vkK159B0UfrKAi5MLuU0VVqfLUfVj1iRdFqErWUpcqhCqV2scuyPBmMOXkKTzzf89E9dmd4c2gqO9XKNy0Imr/3f2vu1mybUn0/AE/3bbryWiiNGjXU3PEFrMbh4b6um4qRPOtHUo8P9t5j+fFLMP8dBtNnYaa8DCMiLQU39PxlATEb2u8MlpKXxmtE1MURr1pKb6no/nWDiVeW+OV0VL6ymidmKIw6o1bP9tu4tJFNN/atdPE87M985SZDcpvGE2ZpvWvNZoV8fxs7zm0Jzj/H9j9E5o+q10WR6qOUF7t7ALK9Dp2lQLHAQJlZLfLRlU5cPRA0K/2mp1rwvxsB44z22RyuOpwMP+ATgNi7noK9a2d1S4rrB43fra/f/r3AWLvejKMZootZhuGYRi2mG0YhmHUH1MUhmEYRkxa1BpFyAt36RalWRPJ33WohdmVm3bxg2ffZv/hyrB8AXemof6mQ8vKbu/jSGU1hyudOfyAr2yo68cbCKsjULZbv92R/GDX9oddu47Qetz6zo7XV6kknXUbrQtbozDCiORDOtRnxcpNu5i77E0qq2OPm0yflxkj+vD0G7tiulP1iHNRDnWT6vMK1TVKDM+px9JG8Nsdz9d3onVE850dr69SSTrrNlomtkZhuCaSD+lQ39ELV22LqyQCeZ7898cxL9bgPB3U9qVdWe3uAg6R/XbH8/WdaB3RfGfH66tUks66jdaHKQojjHj+sxPxIZ1qf9cBasuUCj/XifgGbww/2+ms22h9mKIwwojnPzsRH9Kp9ncdoLZMqfBznYhv8Mbws53Ouo3WhykKI4y5kwaR6Qv3v5Dp8wYXf+dOGoTPG18BZPq8XDKmX52yauMRZ50hFJ9X8LjUMT6PBGULEKkNdfIlUEdo++PVEy1tskln3Ubro0XtejIaTmAhNNpumsC3211PI0/s2ui7nkLbkMpdT/H6KpWks26j9WG7ngzDMAzb9WQYhmHUH1MUhmEYRkxMURiGYRgxaVGKQkQmi8iDZWVl6RbFMAyjxdCiFIWqPquq12VlZaVbFMMwjBZDi1IUhmEYRvIxRWEYhmHExBSFYRiGERNTFIZhGEZMTFEYhmEYMTFFYRiGYcTEjAIaibNlKTx/K5R/7hxndoWv/gRyL3LiXroLyj4G8YA6BgDxdXC+Kw/5C/Gb34uZJpQI6TO7Qs+h8P4aJy6Ap62TvPpoZPlj1hMNgQFnw96tx9odidp9EdpPDSVQNkTv/+duhjceOdZHofKjIF7Q6mPfmV2h6sixvsjsCkOmwXsv+M9hIH2tfg/IETzXEcoFKN8PWX1hYIG/zJ2Q2SU8buL8kLKixIeNrZ11y4yVLhAOkeNC63aT3k1cNOLlqU+ZjYAZBTQSY8tSeOZbdS/CHh8Mvxze/BNUtnLnOYG+2PREdGVVX8RvWlxrefDz+ODEsfD+P5JbXyw5xAM1lfHTxsPbFlSjl+XLhLxL44+taOl8mTB5sfP72e+Ex0WqO1Z6N3HRLuxblsbOEy8+xcQyCmiKwkiM+3KcO8hIBO4mDeuLZOO2P6Oly+rnfEcbu4mkjxd301uRy4z23wnkiRefYmIpCpt6MhKjbGf0OLswHsP6Irm47c9o6WKN20TTJzsuEB4vPo3YYraRGFl9o8dJbK9yrQrri+Titj+jpcvqG3vsJpI+XlysMmOF16fMRsKeKIyoFO8oZtHGRew9tJeeHXoyZ/gcCifOp/jv/8uCLp0o9Ry7z8hUBU8bykPu6LKqa/jKoUOs6tghLC1Ar6pq5uwvBWBBty7B+Ez/VGh5Av62A17pmhQiztx3BDyq1IS0zwPMPPAF
+UeOsqhLNnvbeOkZpX+yamo47Ugl6zLbUROSF/GwtHNHak8l96qq5uzDh1nTvj172njD6hxdXs677dpFPDeheTwQrKsGyK6pQYEDHg9ZNTWoQpnXE4wPzb+3jTeY5oDXQ89acT2rqplTdpDCQ+UUZ/rC2h+WzpvJnJJPoTpyHwXD/OkKD5QG21PcOZtFx2ezt7KMnp37MOfz/RQeOuxEettSnNmORdmdjpV54DCF/kXu4hfnsqhz+4hxPPsditvKsbqra5jzpWkU1jrfwf9RV6lbvy/z2IL6xPlhaxTFHdqzqGsX9rYRei4rcP5/J9UuvVYdof/VKGnrQ4taoxCRycDkk08++dr33nsv3eI0a4p3FFP0WhEV1RXBsAxvBlNOnsLy/y6j0vVUgDoXzQj4VKmGsItmq0UVUUVDLtqu+ydGH8eMj5UvXpnxSKDsDPExpWsuz3y2gYrQPLXS+cSL1lRRFRqmiqpSFdJvGeKj6IujFJbspLhHX4o6taVCjy1WZ6hSVPI5hW26Upw/jaKPisPjxUfRmT8EoOjVOyLGFZ5USPHqOyj6YEWYzBneDIrGFgUv0hH/RyH1R9v1VFz1OUU9usYsO0C0/2qktLGwxWwjYQqWFbDn0J464R7xUFNn66VhNIxkj6teHXrxwoUvRB3HbuKBeud94cIXgOj/o9A0kUgkX33rqI0tZhsJs/fQ3ojhpiSMVJDscRUYv9HGsdv4huatT/mJ5qtvHYlgi9lGRHp26Bkx3CM2ZIzkk+xxFRi/0caxm/iG5I30O1qaROIjhde3jkSwf70RkTnD55DhzQgLy/BmMPOUmfg8vqTU4fP48NgQDCKEz+m3lv4JjKva4602Po+PNtImbliGN4M5w+cA0cexm/iG5A3gJk0kEslX3zoSwaaejIgEFsEi7aTIPy6fBesWUHqkNJg+05sJQHn1sbdKs9pm8ZUBX2HVB6vC0oIzfxoYyKFlRSonHoKgTW/fU1Q8eKjh2FSLRzzMPGUm+cfl1+lvCO+frLZZnNb1NNZ9so4arQnmBVi6bWmdfujVoRdn9z2bNTvXhM1je8TD6ONH8+7+dyOem9A8gfWDwHd2u2xUlQNHD5DVLgtVpexoWTA+NP/eQ3uDaQ4cPUDPDj3D4mqPq9D2R0oHdcdkpLDA+I01jt3ENzSvmzSRSCRffetIBFvMNgzDMGIuZrf851rDMAyjQZiiMAzDMGJiaxQGKzftYuGqbewqLccrQrUqfbIzOefUHrzybgm7S8vpnZ3J3EmDmJrfJ6HyIpGd6eP8vF688m4Ju0rL8QjU+GdA2/s8VNUoR6vdT4n6PFDZRHbtBt4S79Lex4HyStw2Q4CxX+rK5o/LOHQ0/GXG0P5JlC7tfRyprOawv4Pa+5x7w8MROswj4JXIfRloV6gsgbDAmOnS3ue8pV1eSXaEetu28QbjVKG0vDLieIs1DrND6giMSYCFq7aFjdPaYZHGcqR8oeM7MI4TGf+heSLJ6ub/0xSxNYpWzspNu5i3fCvllfHftM70ebln+tCYgz2R8gyjofi8AgqVIZo0UpibfKHjO9I4jjf+4419N/+fdGJrFEZUFq7a5vqiXl5ZzcJV25JWnmE0lMpqraMQIoW5yRc6viON43jjP97Yd/P/aaqYomjl7I4yPVTf9ImWZxhNicD4jTaOY41vN2O/uf4/TFG0cnpnZyY1faLlGUZTIjB+o43jWOPbzdhvrv8PUxStnLmTBpHpc2frP9PnDS4CJqM8w2goPq/g80jcMDf5Qsd3pHEcb/zHG/tu/j9NFVMUrZyp+X24Z/pQ+vjvdLx+s8Z9sjO57PQT6JOdifiP3SzE1S4vEtmZvmDZ4OykCdDe56GtNzHz1r4mNIoDkndp7yORZggw7ktd6dC27oUmzjUvJl3a+4I7ncDp3/ZROswj0ftSQtLUDguMmS7tfWRn+pAo9YbGZWf6wvKGjrdo4VKrjj7ZmSy8MI+FM/PCxmmksNpjOVKa0PEd
Oo7djv/aeWrL2pQXsuNhu54MwzAM2/VkGIZh1B9TFIZhGEZMTFEYhmEYMWlRikJEJovIg2VlZekWxTAMo8XQohSFqj6rqtdlZWWlWxTDMIwWQ4vc9SQiJcCHQBYQ+ngRehztd3fgsySIUbvu+qSLFherXfGOA7+bUjujxbsJa6xz6rad8dLaObVz6uY4Hef0RFXtETFGVVvsB3gw2nGM3xtSUXd90kWLi9Uut+1uSu2MFu8mrLHOqdt22jm1c9qcz2m0T4uaeorAszGOo/1OVd31SRctLla74h0nu63JaGe0eDdhjXVOEynPzmn0eDun7o7T2dY6tMipp4YgIhs0yksnLYnW0k5oPW1tLe2E1tPWptLOlv5EUR8eTLcAjURraSe0nra2lnZC62lrk2inPVEYhmEYMbEnCsMwDCMmpigMwzCMmJiiMAzDMGJiiiIGItJBRB4TkYdEZFa65UklInKSiPxeRJalW5ZUIyJT/ed0iYgUpFueVCEip4nIAyKyTERuSLc8qcT/X90gIuenW5ZUIiITRGSt/7xOaKx6W52iEJGHReRTEXmrVvhXRGSbiPyfiNzmD54OLFPVa4ELGl3YBpJIW1V1h6penR5JG06CbV3pP6fXAxenQ976kmA7/6Oq1wMXAePSIW99SfB/CnArsLRxpUwOCbZVgYNABrCz0YRMxlt/zekDnA0MB94KCfMC24GTgLbAm8BgYB4wzJ/mT+mWPZVtDYlflm65G7GtPwOGp1v2VLYT5wbneeDSdMueqnYCXwa+DswGzk+37Cluq8cffzzwx8aSsdU9UajqGuDzWsGjgf9T5676KPBnYAqOxu7rT9Ps+irBtjZrEmmrOPwEeF5VNza2rA0h0XOqqn9R1a8CzWrqNMF2TgBOBy4FrhWRZvVfTaStqlrjj98PtGssGds0VkVNnD7AxyHHO4ExwGLgfhEpJDVmPtJBxLaKSDfgR0C+iMxT1XvSIl1yiXZebwTOA7JE5GRVfSAdwiWRaOd0As70aTvgr40vVtKJ2E5V/TaAiMwGPgu5mDZnop3T6cAkIBu4v7GEMUURA1U9BFyZbjkaA1XdhzNn3+JR1cU4NwEtGlVdDaxOsxiNhqo+mm4ZUo2qLgeWN3a9zeoRLYXsAvqFHPf1h7VErK0tD2tny6NJtdUUhcN6YKCIDBCRtjgLY39Js0ypwtra8rB2tjyaVFtbnaIQkSeB14FBIrJTRK5W1Srg28Aq4D/AUlV9O51yJgNra8trq7WzZbUTmkdbzSigYRiGEZNW90RhGIZhJIYpCsMwDCMmpigMwzCMmJiiMAzDMGJiisIwDMOIiSkKwzAMIyamKJo5fgN3r4rIV0PCZorI31Jc72oRGZlA+mEi8rVUyuRSjkdF5MIklDNbRJJua0dEPhCR7hHCi0Tku/7fd4nIefUou7+IXBpyPFJEmoQpk2jtTkK5ryW7zNaIKYpmjjovwlwP/FxEMkSkI/Bj4Fv1KU9EUmX/axiQdkXRElDV+ar6Yj2y9sexsBooZ4OqfidpgqUYEfEmmkdVx6ZCltaGKYoWgKq+hWPd9lZgPvAH4HYRWScim0RkCgTvKNeKyEb/Z6w/POA16y/AO+J4CysWkTdF5C0Riebc5xsistmfZrS/rA5+RyzBuv0mCO4CLvanv1hEtopItv+JaJ+IXO7P/7iIfFlEvCKyUETWi8gWEflmoFIRmRsS/oOQtv1HHM91b4vICyKSGUXu88TxhvZf8XtE8yvZR/xybRKRc2KFhyIihSLyeu07Yv9TwBP+uPdE5NqQ/n4uJN394lg+DXCLv751InJyhPqCT0UiMkpEXvOfq3Ui0inaeQYWAGf5z8FNoXKISFcRWenv03+JSG5IGx4W5wlyh4hEVCwi8ht/n74dOCf+8A9E5Ad+ObaKyKn+8G7+c/S2iPwOkCjlHhSRn4nIm8AZInKZv52bReS3/nFyvYgsDMkTfNoTkYMh4ZHGzdxAm0TkPhF52f/7XBH5
YySZWiXpdtphn+R8gA7ANmArcA9wmT88G/ivP749kOEPHwhs8P+eABwCBviPZwAPhZSdFaG+1YE0OI5X3vL//nGUumcD94fkfwAoBHJw7NoEynrPn/464Pv+sHbABmAAUAA8iHNh8QDP+evvD1RxzNHU0oActeR+FPibP+9AHPPNGcD/Ag/705wKfBQnfDaOmedpwFqgS4S6inAczmQC3XHMRvf29/dzIenuB2b7f38A3O7/fXkgnb+s74a04UIchzY7gFH+8M44FqFjnefQeieElP9L4E7/73OBzSH1vuY/B92BfYAvQlu7+r+9OGMjN6Q9N/p//z/gd/7fi4H5/t+FOJ7bukcoV4GL/L9Pw7kh8vmPf+3vox44vhsCeZ4HzvT/Puj/jjZuTgee8qdZC6wDfMCdwDfT/b9uKh8zM95CUNVDIrIEx03iRcBk8c9p41zYTgB24/jXGAZUA6eEFLFOVd/3/94K/Ewc5z7PqeraKNU+6a97jYh0FpFsnD/kBRHqrs1anD/qh8BvgOtEpA+w39+WAiBXjq0nZOFc9Ar8n03+8I7+8I+A91V1sz/8DRzlEYml6vgseE9EduAogDNxLpao6rsi8qG/f6KFg3NBHQkUqOqBKHU9o6rlQLmIvILjkKY0StoAT4Z83xcj3SBgj6qu98t3AJynOqKf52iciXODgKq+7L/j7+yPK1bVI8AREfkUx7tabTecF4nIdTiKqheON7Yt/riAWew3cPxjgHPup/vrKxaR/VHkqgae9v+eCIwA1osIOAr4U1Ut8T/tnI5zo3Eq8M9a5UQbN48DI/xtPQJsxDmnZwHNZlou1ZiiaFnU+D8CzFDVbaGRIlIEfALk4dxVVYREHwr8UNX/ishwnDWFu0XkJVW9K0J9tQ2FaYy6x9RKuwZnHeUE4HacO/MLcRQI/nJuVNVVtcqZBNyjqr+tFd4f548eoBrnQhKJSHLXh4CrylNwnnjc1lVF+LRvRow89ZHtJqKf5/pQu1/DrhsiMgD4Ls6TzX4ReZTwNh2JltcFFapaHagKeExV50VI92ecG6R3gRXqf0QIFZMI48Yv//s4T4iv4Si3c4CTcYzxGdgaRUtlFXCj+G+7RCTfH56FcwdaA3wDZ5qgDiLSGzisqn8AFuL4843Exf70ZwJlqloWo+4vgE6BjKr6Mc5UxkBV3QG8inOxWRPShhtExOcv5xT/nfIq4CpxFu0RkT4icpzrnnGYKSIeEfkSzoV+G46CmhWoC0eBxQoH52loBvC4iAyJUtcUcdY5uuFM9az35xssIu38T2ETa+W5OOT79Rjt2Ab0EpFRfvk6ibMZIdp5DjsHtQht5wQcT3HRnpJq0xnnRqNMRI4HvhonPTjn+VJ/fV8FurjI8xJwYeB8+9dVTvTHrcBxi3oJjtKoTaxxs5ZjY28tzuaQTRGUTavFnihaJj8EfgFsEcd/8PvA+Thzuk+Ls3D8N0KeImoxFFgoIjVAJXBDlHQVIrIJZ073qjh1vwLcJiKbce7slgD/5thFbC3O2sqr/uPf4UwdbfQrnRJgqqq+ICKnAa/7ddFB4DKcu1W3fIQzF90ZuF5VK0Tk18BvRGQrzh3/bFU9EiMcCE5HzQKeEpHJqrq9Vl1b/G3vDvxQVXcDiMhS4C1//2yqlaeLiGzBuRO/JFojVPWoOBsNfinOwn05jovXaOd5C1DtXxh+tFa9RcDD/noPA1fE7sIwOd70j4N3cdZhak/7ROIHwJMi8jbOnfxHLup5R0S+D7zgH1uVOE+lH/qfZP4DDFbVdRHyRhs3n+KMvduB1/3TnhUce7I1MDPjhpEy/FN9B1X13nTLYhgNwaaeDMMwjJjYE4VhGIYRE3uiMAzDMGJiisIwDMOIiSkKwzAMIyamKAzDMIyYmKIwDMMwYmKKwjAMw4jJ/wc0DruUzfw1cwAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "for genre in compare_genres:\n", " counts = (genre_df[genre].date_updated.dt.year - genre_df[genre].first_publication_year).value_counts()\n", " dist = Counter([int(count) for count in counts])\n", " x = dist.keys()\n", " y = list(dist.values())\n", " y_prob = [y_point / sum(y) for y_point in y]\n", " plt.scatter(x,y_prob, label=genre)\n", "plt.xscale('log')\n", "plt.yscale('log')\n", "plt.ylabel('Proportion of reviews')\n", "plt.xlabel('Years between book publication and review')\n", "plt.legend()\n", "plt.show()\n", "\n" ] }, { "cell_type": "code", "execution_count": 324, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "917303\n", "2223\n" ] } ], "source": [ "temp_df = review_df[(review_df.publication_year > 1950) & (review_df.publication_year <= 2020)]\n", "#temp_df.publication_year.value_counts().sort_index().plot(kind='bar')\n", "print(len(temp_df))\n", "\n", "temp_df = review_df[(review_df.publication_year <= 1950) | (review_df.publication_year > 2020)]\n", "#temp_df.publication_year.value_counts().sort_index().plot(kind='bar')\n", "print(len(temp_df))\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "random_seed = 1205921\n", "\n", "sample_df = review_df.sample(100, random_state=random_seed)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from scripts.text_tail_analysis import write_docs_to_bin, read_docs_from_bin\n", "#docs = [nlp(text) for text in reviews_en]\n", "nlp_docs_file = f'../data/review_spacy_docs.random_1M.genre-{genre.replace(' ','_')}.sample-10000.seed-{random_seed}.docbin'\n", "\n", "#write_docs_to_bin(docs, 
nlp_docs_file)\n", "\n", "docs = read_docs_from_bin(nlp_docs_file, nlp)\n", "\n", "# iterate over the docs, then over the entities in each doc and count them\n", "tf = Counter([entity.text for doc in docs for entity in doc.ents])\n", "\n", "print('Total number of entities in the sample:', sum(tf.values()))\n", "tf.most_common(50)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.2" } }, "nbformat": 4, "nbformat_minor": 2 }