import numpy as np
import pandas as pd
import json
import csv
from collections import Counter
import gzip
import os
data_dir = '/Volumes/Samsung_T5/Data/Book-Reviews/GoodReads/'
author_file = os.path.join(data_dir, 'goodreads_book_authors.csv.gz') # author information
book_file = os.path.join(data_dir, 'goodreads_books.csv.gz') # basic book metadata
genre_file = os.path.join(data_dir, 'goodreads_book_genres_initial.csv.gz') # book genre information
review_file = os.path.join(data_dir, 'goodreads_reviews_dedup-no_text.csv.gz') # exclues text to save memory
review_text_file = os.path.join(data_dir, 'goodreads_reviews_dedup.csv.gz') # includes text
author_df = pd.read_csv(author_file, sep='\t', compression='gzip')
author_df
average_rating | author_id | text_reviews_count | name | ratings_count | |
---|---|---|---|---|---|
0 | 3.98 | 604031 | 7 | Ronald J. Fields | 49 |
1 | 4.08 | 626222 | 28716 | Anita Diamant | 546796 |
2 | 3.92 | 10333 | 5075 | Barbara Hambly | 122118 |
3 | 3.68 | 9212 | 36262 | Jennifer Weiner | 888522 |
4 | 3.82 | 149918 | 96 | Nigel Pennick | 1740 |
... | ... | ... | ... | ... | ... |
829524 | 4.36 | 197551 | 4 | Patty Furbush | 11 |
829525 | 4.33 | 3988103 | 3 | Jim Schlinkman | 6 |
829526 | 4.00 | 13464507 | 2 | Rich Jolly | 18 |
829527 | 3.31 | 7427847 | 1 | sr@ mwrGn | 13 |
829528 | 3.70 | 5401342 | 11 | Barry S. Brown | 43 |
829529 rows × 5 columns
author_df = author_df.rename(columns={'name': 'author_name'})
author_name_df = author_df[['author_id', 'author_name']]
author_name_df
author_id | author_name | |
---|---|---|
0 | 604031 | Ronald J. Fields |
1 | 626222 | Anita Diamant |
2 | 10333 | Barbara Hambly |
3 | 9212 | Jennifer Weiner |
4 | 149918 | Nigel Pennick |
... | ... | ... |
829524 | 197551 | Patty Furbush |
829525 | 3988103 | Jim Schlinkman |
829526 | 13464507 | Rich Jolly |
829527 | 7427847 | sr@ mwrGn |
829528 | 5401342 | Barry S. Brown |
829529 rows × 2 columns
book_df = pd.read_csv(book_file, sep='\t', compression='gzip')
book_df
isbn | text_reviews_count | country_code | language_code | asin | average_rating | author_id | publisher | num_pages | isbn13 | publication_year | book_id | ratings_count | work_id | title | title_without_series | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0312853122 | 1 | US | NaN | NaN | 4.00 | 604031 | St. Martin's Press | 256.0 | 9780312853129 | 1984.0 | 5333265 | 3 | 5400751 | W.C. Fields: A Life on Film | W.C. Fields: A Life on Film |
1 | 0743509986 | 6 | US | NaN | NaN | 3.23 | 626222 | Simon & Schuster Audio | NaN | 9780743509985 | 2001.0 | 1333909 | 10 | 1323437 | Good Harbor | Good Harbor |
2 | NaN | 7 | US | eng | B00071IKUY | 4.03 | 10333 | Nelson Doubleday, Inc. | 600.0 | NaN | 1987.0 | 7327624 | 140 | 8948723 | The Unschooled Wizard (Sun Wolf and Starhawk, ... | The Unschooled Wizard (Sun Wolf and Starhawk, ... |
3 | 0743294297 | 3282 | US | eng | NaN | 3.49 | 9212 | Atria Books | 368.0 | 9780743294294 | 2009.0 | 6066819 | 51184 | 6243154 | Best Friends Forever | Best Friends Forever |
4 | 0850308712 | 5 | US | NaN | NaN | 3.40 | 149918 | NaN | NaN | 9780850308716 | NaN | 287140 | 15 | 278577 | Runic Astrology: Starcraft and Timekeeping in ... | Runic Astrology: Starcraft and Timekeeping in ... |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2693892 | 0563553014 | 3 | US | eng | NaN | 4.05 | 14033 | BBC Audiobooks | 3.0 | 9780563553014 | 1999.0 | 3084038 | 12 | 3115103 | This Sceptred Isle, Vol. 10: The Age of Victor... | This Sceptred Isle, Vol. 10: The Age of Victor... |
2693893 | 178092870X | 2 | US | eng | NaN | 3.50 | 2448 | MX Publishing | 148.0 | 9781780928708 | 2015.0 | 26168430 | 6 | 46130263 | Sherlock Holmes and the July Crisis | Sherlock Holmes and the July Crisis |
2693894 | 178092870X | 2 | US | eng | NaN | 3.50 | 3460250 | MX Publishing | 148.0 | 9781780928708 | 2015.0 | 26168430 | 6 | 46130263 | Sherlock Holmes and the July Crisis | Sherlock Holmes and the July Crisis |
2693895 | 162378140X | 17 | US | eng | NaN | 4.37 | 7789809 | Guerrilla Wordfare | 306.0 | 9781623781408 | 2014.0 | 22017381 | 70 | 41332799 | 101 Nights: Volume One (101 Nights, #1-3) | 101 Nights: Volume One (101 Nights, #1-3) |
2693896 | NaN | 1 | US | NaN | B000W914MC | 3.52 | 621880 | NaN | NaN | NaN | NaN | 11419866 | 7 | 2206102 | The Spanish Duke's Virgin Bride (Innocent Mist... | The Spanish Duke's Virgin Bride (Innocent Mist... |
2693897 rows × 16 columns
book_author_df = pd.merge(book_df, author_name_df, on='author_id', how='left')
book_author_df.author_name.value_counts()
Agatha Christie 4544 Stephen King 4170 Anonymous 2956 William Shakespeare 2499 James Patterson 2486 ... Basuki Raharjo 1 David Silberman 1 Alex Simmons; Illustrator-Denise Shimabu 1 Anna Murdoch 1 Konstantin Leontjev 1 Name: author_name, Length: 671851, dtype: int64
genre_df = pd.read_csv(genre_file, sep='\t', compression='gzip')
genre_df
book_id | genres | |
---|---|---|
0 | 5333265 | history, historical fiction, biography |
1 | 1333909 | fiction |
2 | 1333909 | history, historical fiction, biography |
3 | 7327624 | fantasy, paranormal |
4 | 7327624 | fiction |
... | ... | ... |
5042537 | 2342551 | non-fiction |
5042538 | 22017381 | romance |
5042539 | 22017381 | mystery, thriller, crime |
5042540 | 11419866 | romance |
5042541 | 11419866 | fiction |
5042542 rows × 2 columns
book_author_genre_df = pd.merge(book_author_df, genre_df, on='book_id', how='left')
book_author_genre_df
isbn | text_reviews_count | country_code | language_code | asin | average_rating | author_id | publisher | num_pages | isbn13 | publication_year | book_id | ratings_count | work_id | title | title_without_series | author_name | genres | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0312853122 | 1 | US | NaN | NaN | 4.00 | 604031 | St. Martin's Press | 256.0 | 9780312853129 | 1984.0 | 5333265 | 3 | 5400751 | W.C. Fields: A Life on Film | W.C. Fields: A Life on Film | Ronald J. Fields | history, historical fiction, biography |
1 | 0743509986 | 6 | US | NaN | NaN | 3.23 | 626222 | Simon & Schuster Audio | NaN | 9780743509985 | 2001.0 | 1333909 | 10 | 1323437 | Good Harbor | Good Harbor | Anita Diamant | fiction |
2 | 0743509986 | 6 | US | NaN | NaN | 3.23 | 626222 | Simon & Schuster Audio | NaN | 9780743509985 | 2001.0 | 1333909 | 10 | 1323437 | Good Harbor | Good Harbor | Anita Diamant | history, historical fiction, biography |
3 | NaN | 7 | US | eng | B00071IKUY | 4.03 | 10333 | Nelson Doubleday, Inc. | 600.0 | NaN | 1987.0 | 7327624 | 140 | 8948723 | The Unschooled Wizard (Sun Wolf and Starhawk, ... | The Unschooled Wizard (Sun Wolf and Starhawk, ... | Barbara Hambly | fantasy, paranormal |
4 | NaN | 7 | US | eng | B00071IKUY | 4.03 | 10333 | Nelson Doubleday, Inc. | 600.0 | NaN | 1987.0 | 7327624 | 140 | 8948723 | The Unschooled Wizard (Sun Wolf and Starhawk, ... | The Unschooled Wizard (Sun Wolf and Starhawk, ... | Barbara Hambly | fiction |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6177585 | 178092870X | 2 | US | eng | NaN | 3.50 | 3460250 | MX Publishing | 148.0 | 9781780928708 | 2015.0 | 26168430 | 6 | 46130263 | Sherlock Holmes and the July Crisis | Sherlock Holmes and the July Crisis | James Carlopio | fiction |
6177586 | 162378140X | 17 | US | eng | NaN | 4.37 | 7789809 | Guerrilla Wordfare | 306.0 | 9781623781408 | 2014.0 | 22017381 | 70 | 41332799 | 101 Nights: Volume One (101 Nights, #1-3) | 101 Nights: Volume One (101 Nights, #1-3) | S.E. Reign | romance |
6177587 | 162378140X | 17 | US | eng | NaN | 4.37 | 7789809 | Guerrilla Wordfare | 306.0 | 9781623781408 | 2014.0 | 22017381 | 70 | 41332799 | 101 Nights: Volume One (101 Nights, #1-3) | 101 Nights: Volume One (101 Nights, #1-3) | S.E. Reign | mystery, thriller, crime |
6177588 | NaN | 1 | US | NaN | B000W914MC | 3.52 | 621880 | NaN | NaN | NaN | NaN | 11419866 | 7 | 2206102 | The Spanish Duke's Virgin Bride (Innocent Mist... | The Spanish Duke's Virgin Bride (Innocent Mist... | Chantelle Shaw | romance |
6177589 | NaN | 1 | US | NaN | B000W914MC | 3.52 | 621880 | NaN | NaN | NaN | NaN | 11419866 | 7 | 2206102 | The Spanish Duke's Virgin Bride (Innocent Mist... | The Spanish Duke's Virgin Bride (Innocent Mist... | Chantelle Shaw | fiction |
6177590 rows × 18 columns
review_df = pd.read_csv(review_file, sep='\t', compression='gzip')
review_df
user_id | book_id | review_id | rating | date_added | date_updated | read_at | started_at | n_votes | n_comments | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 8842281e1d1347389f2ab93d60773d4d | 24375664 | 5cd416f3efc3f944fce4ce2db2290d5e | 5 | Fri Aug 25 13:55:02 -0700 2017 | Mon Oct 09 08:55:59 -0700 2017 | Sat Oct 07 00:00:00 -0700 2017 | Sat Aug 26 00:00:00 -0700 2017 | 16 | 0 |
1 | 8842281e1d1347389f2ab93d60773d4d | 18245960 | dfdbb7b0eb5a7e4c26d59a937e2e5feb | 5 | Sun Jul 30 07:44:10 -0700 2017 | Wed Aug 30 00:00:26 -0700 2017 | Sat Aug 26 12:05:52 -0700 2017 | Tue Aug 15 13:23:18 -0700 2017 | 28 | 1 |
2 | 8842281e1d1347389f2ab93d60773d4d | 6392944 | 5e212a62bced17b4dbe41150e5bb9037 | 3 | Mon Jul 24 02:48:17 -0700 2017 | Sun Jul 30 09:28:03 -0700 2017 | Tue Jul 25 00:00:00 -0700 2017 | Mon Jul 24 00:00:00 -0700 2017 | 6 | 0 |
3 | 8842281e1d1347389f2ab93d60773d4d | 22078596 | fdd13cad0695656be99828cd75d6eb73 | 4 | Mon Jul 24 02:33:09 -0700 2017 | Sun Jul 30 10:23:54 -0700 2017 | Sun Jul 30 15:42:05 -0700 2017 | Tue Jul 25 00:00:00 -0700 2017 | 22 | 4 |
4 | 8842281e1d1347389f2ab93d60773d4d | 6644782 | bd0df91c9d918c0e433b9ab3a9a5c451 | 4 | Mon Jul 24 02:28:14 -0700 2017 | Thu Aug 24 00:07:20 -0700 2017 | Sat Aug 05 00:00:00 -0700 2017 | Sun Jul 30 00:00:00 -0700 2017 | 8 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
15739962 | d0f6d1a4edcab80a6010cfcfeda4999f | 1656001 | b3d9a00405f7e96752d67b85deda4c7d | 4 | Mon Jun 04 18:08:44 -0700 2012 | Tue Jun 26 18:58:46 -0700 2012 | NaN | Sun Jun 10 00:00:00 -0700 2012 | 0 | 1 |
15739963 | 594c86711bd7acdaf655d102df52a9cb | 10024429 | 2bcba3579aa1d728e664de293e16aacf | 5 | Fri Aug 01 18:46:18 -0700 2014 | Fri Aug 01 18:47:07 -0700 2014 | NaN | NaN | 0 | 0 |
15739964 | 594c86711bd7acdaf655d102df52a9cb | 6721437 | 7c1a7fcc2614a1a2a29213c11c991083 | 3 | Tue Aug 27 12:49:25 -0700 2013 | Tue Aug 27 12:53:46 -0700 2013 | NaN | NaN | 0 | 0 |
15739965 | 594c86711bd7acdaf655d102df52a9cb | 15788197 | 74a9f9d1db09a90aae3a5acea68c6593 | 2 | Fri May 03 13:06:15 -0700 2013 | Fri May 03 15:35:39 -0700 2013 | Fri May 03 15:35:39 -0700 2013 | Fri May 03 00:00:00 -0700 2013 | 0 | 0 |
15739966 | 594c86711bd7acdaf655d102df52a9cb | 8239301 | f2af741fb7a99ff730cf29e004f127da | 4 | Sat Apr 20 15:18:15 -0700 2013 | Thu May 02 16:51:20 -0700 2013 | Thu May 02 16:51:20 -0700 2013 | Sat Apr 20 00:00:00 -0700 2013 | 0 | 0 |
15739967 rows × 10 columns
review_df = pd.read_csv(review_text_file, sep='\t', compression='gzip')
review_df