{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "We start with a relatively simple question, although getting any answers turns out to be a difficult journey:\n", "\n", "- What are the differences in reception between two fiction genres in the context of Goodreads?\n", "\n", "We look at Goodreads reviews at different scales and with different selection criteria." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/marijnkoolen/Code/Huygens/scale\n" ] } ], "source": [ "# The autoreload extension is only needed while the imported project modules are still changing\n", "# and can be removed once they are stable.\n", "%reload_ext autoreload\n", "%autoreload 2\n", "\n", "# This is needed to add the repo dir to the path so Jupyter\n", "# can load the modules in the scripts directory from the notebooks\n", "import os\n", "import sys\n", "repo_dir = os.path.split(os.getcwd())[0]\n", "print(repo_dir)\n", "if repo_dir not in sys.path:\n", " sys.path.append(repo_dir)\n", "\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import json\n", "import csv\n", "import os\n", "\n", "data_dir = '../data/GoodReads'\n", "\n", "books_10k_file = os.path.join(data_dir, 'goodreads_reviews-books_above_10k_lang_reviews.csv.gz')\n", "reviewers_5k_file = os.path.join(data_dir, 'goodreads_reviews-reviewers_above_5k_reviews.csv.gz')\n", "random_1M_file = os.path.join(data_dir, 'goodreads_reviews-random_sample_1M.csv.gz')\n", "author_file = os.path.join(data_dir, 'goodreads_book_authors.csv.gz') # author information\n", "book_file = os.path.join(data_dir, 'goodreads_books.csv.gz') # basic book metadata\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | user_id | \n", "book_id | \n", "review_id | \n", "rating | \n", "date_added | \n", "date_updated | \n", "read_at | \n", "started_at | \n", "n_votes | \n", "n_comments | \n", "review_length | \n", "review_text | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "8842281e1d1347389f2ab93d60773d4d | \n", "16981 | \n", "a5d2c3628987712d0e05c4f90798eb67 | \n", "3 | \n", "Mon Dec 05 10:46:44 -0800 2016 | \n", "Wed Mar 22 11:37:04 -0700 2017 | \n", "NaN | \n", "NaN | \n", "1 | \n", "0 | \n", "93 | \n", "Recommended by Don Katz. Avail for free in Dec... | \n", "
1 | \n", "8842281e1d1347389f2ab93d60773d4d | \n", "8191070 | \n", "8fb75b37b3613a34e39169f139870f31 | \n", "5 | \n", "Fri Nov 18 17:43:26 -0800 2016 | \n", "Thu Aug 03 22:54:31 -0700 2017 | \n", "Mon Jul 24 09:32:34 -0700 2017 | \n", "Mon May 08 07:52:12 -0700 2017 | \n", "25 | \n", "0 | \n", "2585 | \n", "Best book of the series, and best book about A... | \n", "
2 | \n", "8842281e1d1347389f2ab93d60773d4d | \n", "40955 | \n", "299706d01666058b1fb2a96b29a1260b | \n", "5 | \n", "Sun Nov 18 16:31:28 -0800 2012 | \n", "Wed Dec 21 10:43:14 -0800 2016 | \n", "Fri Apr 17 00:00:00 -0700 2015 | \n", "Mon Apr 06 00:00:00 -0700 2015 | \n", "5 | \n", "1 | \n", "4734 | \n", "A truly inspirational book by a truly inspirat... | \n", "
3 | \n", "8842281e1d1347389f2ab93d60773d4d | \n", "4986701 | \n", "bb7de32f9fadc36627e61aaef7a93142 | \n", "4 | \n", "Thu Aug 04 10:02:02 -0700 2011 | \n", "Thu Aug 04 10:02:02 -0700 2011 | \n", "NaN | \n", "NaN | \n", "6 | \n", "4 | \n", "73 | \n", "Found the Goodreads down image in this, and ma... | \n", "
4 | \n", "8842281e1d1347389f2ab93d60773d4d | \n", "77566 | \n", "cedb8b21ea6ad95b05fa3868e05488e6 | \n", "5 | \n", "Wed Mar 12 16:37:16 -0700 2008 | \n", "Wed Mar 22 11:46:03 -0700 2017 | \n", "Fri Oct 19 00:00:00 -0700 2012 | \n", "Wed Sep 19 00:00:00 -0700 2012 | \n", "4 | \n", "2 | \n", "284 | \n", "Seven amazing stories. Each one you think can'... | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
1008006 | \n", "8ba77e3c745ebddccc6306fc3c6bb25e | \n", "174198 | \n", "95c7b6304251a1d94d6d1af8313f8ae7 | \n", "5 | \n", "Mon Jul 08 11:47:23 -0700 2013 | \n", "Sat Jul 22 15:50:08 -0700 2017 | \n", "NaN | \n", "Sat Jul 22 00:00:00 -0700 2017 | \n", "0 | \n", "0 | \n", "177 | \n", "No finer Maine writer ever lived than Sarah Or... | \n", "
1008007 | \n", "e223be160b89f218dbee70b5fbdccf76 | \n", "22892469 | \n", "47c31eb080291307e4c2c4e964264003 | \n", "5 | \n", "Sun Aug 10 20:41:33 -0700 2014 | \n", "Tue Aug 26 13:53:08 -0700 2014 | \n", "Mon Aug 11 21:36:42 -0700 2014 | \n", "Tue Mar 25 00:00:00 -0700 2014 | \n", "0 | \n", "0 | \n", "976 | \n", "Love Songs With Bright Blue Chippiness Happi... | \n", "
1008008 | \n", "e223be160b89f218dbee70b5fbdccf76 | \n", "22891145 | \n", "9514f23d8a4f835e3a417afd67954859 | \n", "5 | \n", "Sun Aug 10 14:21:44 -0700 2014 | \n", "Tue Aug 26 13:41:10 -0700 2014 | \n", "Sun Aug 10 14:25:39 -0700 2014 | \n", "Sun Aug 03 00:00:00 -0700 2014 | \n", "0 | \n", "0 | \n", "249 | \n", "Great Times. My Brand New 2014 KIA OPTIMA EX... | \n", "
1008009 | \n", "e223be160b89f218dbee70b5fbdccf76 | \n", "20369388 | \n", "7c1395ba0a319423707d8ffff79aeafc | \n", "5 | \n", "Fri Jun 06 22:32:23 -0700 2014 | \n", "Tue Aug 26 14:07:08 -0700 2014 | \n", "Sun Aug 10 16:25:37 -0700 2014 | \n", "Wed Dec 25 00:00:00 -0800 2013 | \n", "0 | \n", "0 | \n", "365 | \n", "Jail. = Boring. I Beat People Down. \"You Are... | \n", "
1008010 | \n", "e223be160b89f218dbee70b5fbdccf76 | \n", "18518801 | \n", "d2ed77d013ca33fe0eaa9a4013b352c7 | \n", "5 | \n", "Thu Sep 19 02:49:29 -0700 2013 | \n", "Tue Aug 26 15:00:58 -0700 2014 | \n", "Tue Aug 12 23:38:26 -0700 2014 | \n", "Sat Aug 10 00:00:00 -0700 2013 | \n", "0 | \n", "0 | \n", "708 | \n", "I Was Trying To Add This Book Here On Goodre... | \n", "
1008011 rows × 12 columns
\n", "genres | \n", "book_id | \n", "children | \n", "comics, graphic | \n", "fantasy, paranormal | \n", "fiction | \n", "history, historical fiction, biography | \n", "mystery, thriller, crime | \n", "non-fiction | \n", "poetry | \n", "romance | \n", "young-adult | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "
1 | \n", "2 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "
2 | \n", "3 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "
3 | \n", "4 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "
4 | \n", "5 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
394883 | \n", "36488099 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "
394884 | \n", "36494299 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "
394885 | \n", "36498328 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "
394886 | \n", "36508486 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "
394887 | \n", "36514196 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "
394888 rows × 11 columns
\n", "genres | \n", "children | \n", "comics, graphic | \n", "fantasy, paranormal | \n", "fiction | \n", "history, historical fiction, biography | \n", "mystery, thriller, crime | \n", "non-fiction | \n", "poetry | \n", "romance | \n", "young-adult | \n", "
---|---|---|---|---|---|---|---|---|---|---|
genres | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
children | \n", "1.000000 | \n", "0.191590 | \n", "0.175501 | \n", "0.146700 | \n", "0.112336 | \n", "0.088749 | \n", "0.111877 | \n", "0.225352 | \n", "0.057495 | \n", "0.306383 | \n", "
comics, graphic | \n", "0.126993 | \n", "1.000000 | \n", "0.110228 | \n", "0.087730 | \n", "0.106668 | \n", "0.066126 | \n", "0.139249 | \n", "0.083344 | \n", "0.038440 | \n", "0.113899 | \n", "
fantasy, paranormal | \n", "0.438581 | \n", "0.415578 | \n", "1.000000 | \n", "0.394200 | \n", "0.244037 | \n", "0.436022 | \n", "0.075148 | \n", "0.215661 | \n", "0.374391 | \n", "0.558983 | \n", "
fiction | \n", "0.822499 | \n", "0.742068 | \n", "0.884404 | \n", "1.000000 | \n", "0.735828 | \n", "0.889048 | \n", "0.400419 | \n", "0.682194 | \n", "0.807116 | \n", "0.894502 | \n", "
history, historical fiction, biography | \n", "0.281570 | \n", "0.403359 | \n", "0.244766 | \n", "0.328955 | \n", "1.000000 | \n", "0.286203 | \n", "0.617135 | \n", "0.365616 | \n", "0.248429 | \n", "0.255079 | \n", "
mystery, thriller, crime | \n", "0.216797 | \n", "0.243701 | \n", "0.426215 | \n", "0.387356 | \n", "0.278932 | \n", "1.000000 | \n", "0.097284 | \n", "0.075527 | \n", "0.337044 | \n", "0.342905 | \n", "
non-fiction | \n", "0.192056 | \n", "0.360638 | \n", "0.051622 | \n", "0.122602 | \n", "0.422671 | \n", "0.068366 | \n", "1.000000 | \n", "0.411616 | \n", "0.034997 | \n", "0.093471 | \n", "
poetry | \n", "0.067421 | \n", "0.037618 | \n", "0.025819 | \n", "0.036403 | \n", "0.043641 | \n", "0.009250 | \n", "0.071736 | \n", "1.000000 | \n", "0.014779 | \n", "0.027005 | \n", "
romance | \n", "0.192056 | \n", "0.193719 | \n", "0.500437 | \n", "0.480868 | \n", "0.331079 | \n", "0.460884 | \n", "0.068099 | \n", "0.165008 | \n", "1.000000 | \n", "0.555633 | \n", "
young-adult | \n", "0.578081 | \n", "0.324216 | \n", "0.422038 | \n", "0.301023 | \n", "0.192014 | \n", "0.264854 | \n", "0.102734 | \n", "0.170306 | \n", "0.313846 | \n", "1.000000 | \n", "
\n", " | children | \n", "comics, graphic | \n", "fantasy, paranormal | \n", "fiction | \n", "history, historical fiction, biography | \n", "mystery, thriller, crime | \n", "non-fiction | \n", "poetry | \n", "romance | \n", "young-adult | \n", "
---|---|---|---|---|---|---|---|---|---|---|
children | \n", "1.000000 | \n", "0.198719 | \n", "0.160845 | \n", "0.124326 | \n", "0.107536 | \n", "0.092824 | \n", "0.103409 | \n", "0.230634 | \n", "0.052394 | \n", "0.220813 | \n", "
comics, graphic | \n", "0.140730 | \n", "1.000000 | \n", "0.085360 | \n", "0.078287 | \n", "0.127202 | \n", "0.058419 | \n", "0.204105 | \n", "0.100167 | \n", "0.028307 | \n", "0.085341 | \n", "
fantasy, paranormal | \n", "0.562151 | \n", "0.421265 | \n", "1.000000 | \n", "0.460581 | \n", "0.308536 | \n", "0.537740 | \n", "0.088349 | \n", "0.267510 | \n", "0.455903 | \n", "0.614448 | \n", "
fiction | \n", "0.894286 | \n", "0.795164 | \n", "0.947929 | \n", "1.000000 | \n", "0.822923 | \n", "0.941117 | \n", "0.515178 | \n", "0.787804 | \n", "0.897502 | \n", "0.952561 | \n", "
history, historical fiction, biography | \n", "0.298208 | \n", "0.498097 | \n", "0.244809 | \n", "0.317257 | \n", "1.000000 | \n", "0.274897 | \n", "0.670702 | \n", "0.395874 | \n", "0.240205 | \n", "0.240575 | \n", "
mystery, thriller, crime | \n", "0.297620 | \n", "0.264491 | \n", "0.493321 | \n", "0.419499 | \n", "0.317838 | \n", "1.000000 | \n", "0.111340 | \n", "0.101511 | \n", "0.380636 | \n", "0.394062 | \n", "
non-fiction | \n", "0.161794 | \n", "0.450931 | \n", "0.039551 | \n", "0.112059 | \n", "0.378414 | \n", "0.054332 | \n", "1.000000 | \n", "0.434971 | \n", "0.034382 | \n", "0.073177 | \n", "
poetry | \n", "0.061830 | \n", "0.037919 | \n", "0.020520 | \n", "0.029362 | \n", "0.038271 | \n", "0.008488 | \n", "0.074530 | \n", "1.000000 | \n", "0.014156 | \n", "0.022729 | \n", "
romance | \n", "0.235312 | \n", "0.179516 | \n", "0.585856 | \n", "0.560381 | \n", "0.389027 | \n", "0.533176 | \n", "0.098693 | \n", "0.237150 | \n", "1.000000 | \n", "0.667596 | \n", "
young-adult | \n", "0.705921 | \n", "0.385248 | \n", "0.562046 | \n", "0.423359 | \n", "0.277341 | \n", "0.392910 | \n", "0.149521 | \n", "0.271046 | \n", "0.475206 | \n", "1.000000 | \n", "