In [2]:
import numpy as np
import pandas as pd
import json
import csv
from collections import Counter
import gzip
import os

data_dir = '/Volumes/Samsung_T5/Data/Book-Reviews/GoodReads/'

author_file = os.path.join(data_dir, 'goodreads_book_authors.csv.gz') # author information
book_file = os.path.join(data_dir, 'goodreads_books.csv.gz') # basic book metadata
genre_file = os.path.join(data_dir, 'goodreads_book_genres_initial.csv.gz') # book genre information
review_file = os.path.join(data_dir, 'goodreads_reviews_dedup-no_text.csv.gz') # exclues text to save memory
review_text_file = os.path.join(data_dir, 'goodreads_reviews_dedup.csv.gz') # includes text
In [2]:
author_df = pd.read_csv(author_file, sep='\t', compression='gzip')

author_df
Out[2]:
average_rating author_id text_reviews_count name ratings_count
0 3.98 604031 7 Ronald J. Fields 49
1 4.08 626222 28716 Anita Diamant 546796
2 3.92 10333 5075 Barbara Hambly 122118
3 3.68 9212 36262 Jennifer Weiner 888522
4 3.82 149918 96 Nigel Pennick 1740
... ... ... ... ... ...
829524 4.36 197551 4 Patty Furbush 11
829525 4.33 3988103 3 Jim Schlinkman 6
829526 4.00 13464507 2 Rich Jolly 18
829527 3.31 7427847 1 sr@ mwrGn 13
829528 3.70 5401342 11 Barry S. Brown 43

829529 rows × 5 columns

In [15]:
author_df = author_df.rename(columns={'name': 'author_name'})

author_name_df = author_df[['author_id', 'author_name']]

author_name_df
Out[15]:
author_id author_name
0 604031 Ronald J. Fields
1 626222 Anita Diamant
2 10333 Barbara Hambly
3 9212 Jennifer Weiner
4 149918 Nigel Pennick
... ... ...
829524 197551 Patty Furbush
829525 3988103 Jim Schlinkman
829526 13464507 Rich Jolly
829527 7427847 sr@ mwrGn
829528 5401342 Barry S. Brown

829529 rows × 2 columns

In [3]:
book_df = pd.read_csv(book_file, sep='\t', compression='gzip')

book_df
Out[3]:
isbn text_reviews_count country_code language_code asin average_rating author_id publisher num_pages isbn13 publication_year book_id ratings_count work_id title title_without_series
0 0312853122 1 US NaN NaN 4.00 604031 St. Martin's Press 256.0 9780312853129 1984.0 5333265 3 5400751 W.C. Fields: A Life on Film W.C. Fields: A Life on Film
1 0743509986 6 US NaN NaN 3.23 626222 Simon & Schuster Audio NaN 9780743509985 2001.0 1333909 10 1323437 Good Harbor Good Harbor
2 NaN 7 US eng B00071IKUY 4.03 10333 Nelson Doubleday, Inc. 600.0 NaN 1987.0 7327624 140 8948723 The Unschooled Wizard (Sun Wolf and Starhawk, ... The Unschooled Wizard (Sun Wolf and Starhawk, ...
3 0743294297 3282 US eng NaN 3.49 9212 Atria Books 368.0 9780743294294 2009.0 6066819 51184 6243154 Best Friends Forever Best Friends Forever
4 0850308712 5 US NaN NaN 3.40 149918 NaN NaN 9780850308716 NaN 287140 15 278577 Runic Astrology: Starcraft and Timekeeping in ... Runic Astrology: Starcraft and Timekeeping in ...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2693892 0563553014 3 US eng NaN 4.05 14033 BBC Audiobooks 3.0 9780563553014 1999.0 3084038 12 3115103 This Sceptred Isle, Vol. 10: The Age of Victor... This Sceptred Isle, Vol. 10: The Age of Victor...
2693893 178092870X 2 US eng NaN 3.50 2448 MX Publishing 148.0 9781780928708 2015.0 26168430 6 46130263 Sherlock Holmes and the July Crisis Sherlock Holmes and the July Crisis
2693894 178092870X 2 US eng NaN 3.50 3460250 MX Publishing 148.0 9781780928708 2015.0 26168430 6 46130263 Sherlock Holmes and the July Crisis Sherlock Holmes and the July Crisis
2693895 162378140X 17 US eng NaN 4.37 7789809 Guerrilla Wordfare 306.0 9781623781408 2014.0 22017381 70 41332799 101 Nights: Volume One (101 Nights, #1-3) 101 Nights: Volume One (101 Nights, #1-3)
2693896 NaN 1 US NaN B000W914MC 3.52 621880 NaN NaN NaN NaN 11419866 7 2206102 The Spanish Duke's Virgin Bride (Innocent Mist... The Spanish Duke's Virgin Bride (Innocent Mist...

2693897 rows × 16 columns

In [17]:
book_author_df = pd.merge(book_df, author_name_df, on='author_id', how='left')

book_author_df.author_name.value_counts()
Out[17]:
Agatha Christie                             4544
Stephen King                                4170
Anonymous                                   2956
William Shakespeare                         2499
James Patterson                             2486
                                            ... 
Basuki Raharjo                                 1
David Silberman                                1
Alex Simmons; Illustrator-Denise Shimabu       1
Anna Murdoch                                   1
Konstantin Leontjev                            1
Name: author_name, Length: 671851, dtype: int64
In [18]:
genre_df = pd.read_csv(genre_file, sep='\t', compression='gzip')

genre_df
Out[18]:
book_id genres
0 5333265 history, historical fiction, biography
1 1333909 fiction
2 1333909 history, historical fiction, biography
3 7327624 fantasy, paranormal
4 7327624 fiction
... ... ...
5042537 2342551 non-fiction
5042538 22017381 romance
5042539 22017381 mystery, thriller, crime
5042540 11419866 romance
5042541 11419866 fiction

5042542 rows × 2 columns

In [19]:
book_author_genre_df = pd.merge(book_author_df, genre_df, on='book_id', how='left')

book_author_genre_df
Out[19]:
isbn text_reviews_count country_code language_code asin average_rating author_id publisher num_pages isbn13 publication_year book_id ratings_count work_id title title_without_series author_name genres
0 0312853122 1 US NaN NaN 4.00 604031 St. Martin's Press 256.0 9780312853129 1984.0 5333265 3 5400751 W.C. Fields: A Life on Film W.C. Fields: A Life on Film Ronald J. Fields history, historical fiction, biography
1 0743509986 6 US NaN NaN 3.23 626222 Simon & Schuster Audio NaN 9780743509985 2001.0 1333909 10 1323437 Good Harbor Good Harbor Anita Diamant fiction
2 0743509986 6 US NaN NaN 3.23 626222 Simon & Schuster Audio NaN 9780743509985 2001.0 1333909 10 1323437 Good Harbor Good Harbor Anita Diamant history, historical fiction, biography
3 NaN 7 US eng B00071IKUY 4.03 10333 Nelson Doubleday, Inc. 600.0 NaN 1987.0 7327624 140 8948723 The Unschooled Wizard (Sun Wolf and Starhawk, ... The Unschooled Wizard (Sun Wolf and Starhawk, ... Barbara Hambly fantasy, paranormal
4 NaN 7 US eng B00071IKUY 4.03 10333 Nelson Doubleday, Inc. 600.0 NaN 1987.0 7327624 140 8948723 The Unschooled Wizard (Sun Wolf and Starhawk, ... The Unschooled Wizard (Sun Wolf and Starhawk, ... Barbara Hambly fiction
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
6177585 178092870X 2 US eng NaN 3.50 3460250 MX Publishing 148.0 9781780928708 2015.0 26168430 6 46130263 Sherlock Holmes and the July Crisis Sherlock Holmes and the July Crisis James Carlopio fiction
6177586 162378140X 17 US eng NaN 4.37 7789809 Guerrilla Wordfare 306.0 9781623781408 2014.0 22017381 70 41332799 101 Nights: Volume One (101 Nights, #1-3) 101 Nights: Volume One (101 Nights, #1-3) S.E. Reign romance
6177587 162378140X 17 US eng NaN 4.37 7789809 Guerrilla Wordfare 306.0 9781623781408 2014.0 22017381 70 41332799 101 Nights: Volume One (101 Nights, #1-3) 101 Nights: Volume One (101 Nights, #1-3) S.E. Reign mystery, thriller, crime
6177588 NaN 1 US NaN B000W914MC 3.52 621880 NaN NaN NaN NaN 11419866 7 2206102 The Spanish Duke's Virgin Bride (Innocent Mist... The Spanish Duke's Virgin Bride (Innocent Mist... Chantelle Shaw romance
6177589 NaN 1 US NaN B000W914MC 3.52 621880 NaN NaN NaN NaN 11419866 7 2206102 The Spanish Duke's Virgin Bride (Innocent Mist... The Spanish Duke's Virgin Bride (Innocent Mist... Chantelle Shaw fiction

6177590 rows × 18 columns

In [20]:
review_df = pd.read_csv(review_file, sep='\t', compression='gzip')

review_df
Out[20]:
user_id book_id review_id rating date_added date_updated read_at started_at n_votes n_comments
0 8842281e1d1347389f2ab93d60773d4d 24375664 5cd416f3efc3f944fce4ce2db2290d5e 5 Fri Aug 25 13:55:02 -0700 2017 Mon Oct 09 08:55:59 -0700 2017 Sat Oct 07 00:00:00 -0700 2017 Sat Aug 26 00:00:00 -0700 2017 16 0
1 8842281e1d1347389f2ab93d60773d4d 18245960 dfdbb7b0eb5a7e4c26d59a937e2e5feb 5 Sun Jul 30 07:44:10 -0700 2017 Wed Aug 30 00:00:26 -0700 2017 Sat Aug 26 12:05:52 -0700 2017 Tue Aug 15 13:23:18 -0700 2017 28 1
2 8842281e1d1347389f2ab93d60773d4d 6392944 5e212a62bced17b4dbe41150e5bb9037 3 Mon Jul 24 02:48:17 -0700 2017 Sun Jul 30 09:28:03 -0700 2017 Tue Jul 25 00:00:00 -0700 2017 Mon Jul 24 00:00:00 -0700 2017 6 0
3 8842281e1d1347389f2ab93d60773d4d 22078596 fdd13cad0695656be99828cd75d6eb73 4 Mon Jul 24 02:33:09 -0700 2017 Sun Jul 30 10:23:54 -0700 2017 Sun Jul 30 15:42:05 -0700 2017 Tue Jul 25 00:00:00 -0700 2017 22 4
4 8842281e1d1347389f2ab93d60773d4d 6644782 bd0df91c9d918c0e433b9ab3a9a5c451 4 Mon Jul 24 02:28:14 -0700 2017 Thu Aug 24 00:07:20 -0700 2017 Sat Aug 05 00:00:00 -0700 2017 Sun Jul 30 00:00:00 -0700 2017 8 0
... ... ... ... ... ... ... ... ... ... ...
15739962 d0f6d1a4edcab80a6010cfcfeda4999f 1656001 b3d9a00405f7e96752d67b85deda4c7d 4 Mon Jun 04 18:08:44 -0700 2012 Tue Jun 26 18:58:46 -0700 2012 NaN Sun Jun 10 00:00:00 -0700 2012 0 1
15739963 594c86711bd7acdaf655d102df52a9cb 10024429 2bcba3579aa1d728e664de293e16aacf 5 Fri Aug 01 18:46:18 -0700 2014 Fri Aug 01 18:47:07 -0700 2014 NaN NaN 0 0
15739964 594c86711bd7acdaf655d102df52a9cb 6721437 7c1a7fcc2614a1a2a29213c11c991083 3 Tue Aug 27 12:49:25 -0700 2013 Tue Aug 27 12:53:46 -0700 2013 NaN NaN 0 0
15739965 594c86711bd7acdaf655d102df52a9cb 15788197 74a9f9d1db09a90aae3a5acea68c6593 2 Fri May 03 13:06:15 -0700 2013 Fri May 03 15:35:39 -0700 2013 Fri May 03 15:35:39 -0700 2013 Fri May 03 00:00:00 -0700 2013 0 0
15739966 594c86711bd7acdaf655d102df52a9cb 8239301 f2af741fb7a99ff730cf29e004f127da 4 Sat Apr 20 15:18:15 -0700 2013 Thu May 02 16:51:20 -0700 2013 Thu May 02 16:51:20 -0700 2013 Sat Apr 20 00:00:00 -0700 2013 0 0

15739967 rows × 10 columns

In [ ]:
review_df = pd.read_csv(review_text_file, sep='\t', compression='gzip')

review_df
In [ ]: