{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [
 "import numpy as np\n",
 "import pandas as pd\n",
 "import json\n",
 "import csv\n",
 "from collections import Counter\n",
 "import gzip\n",
 "import os\n",
 "\n",
 "# Root folder holding the gzipped Goodreads dumps.\n",
 "# Set the GOODREADS_DATA_DIR environment variable to point at another copy;\n",
 "# the original external-drive location is kept as the fallback so existing setups still work.\n",
 "data_dir = os.environ.get('GOODREADS_DATA_DIR', '/Volumes/Samsung_T5/Data/Book-Reviews/GoodReads/')\n",
 "\n",
 "# Full paths of the individual dump files, all resolved against data_dir.\n",
 "author_file = os.path.join(data_dir, 'goodreads_book_authors.csv.gz') # author information\n",
 "book_file = os.path.join(data_dir, 'goodreads_books.csv.gz') # basic book metadata\n",
 "genre_file = os.path.join(data_dir, 'goodreads_book_genres_initial.csv.gz') # book genre information\n",
 "review_file = os.path.join(data_dir, 'goodreads_reviews_dedup-no_text.csv.gz') # excludes text to save memory\n",
 "review_text_file = os.path.join(data_dir, 'goodreads_reviews_dedup.csv.gz') # includes text\n"
 ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
average_ratingauthor_idtext_reviews_countnameratings_count
03.986040317Ronald J. Fields49
14.0862622228716Anita Diamant546796
23.92103335075Barbara Hambly122118
33.68921236262Jennifer Weiner888522
43.8214991896Nigel Pennick1740
..................
8295244.361975514Patty Furbush11
8295254.3339881033Jim Schlinkman6
8295264.00134645072Rich Jolly18
8295273.3174278471sr@ mwrGn13
8295283.70540134211Barry S. Brown43
\n", "

829529 rows × 5 columns

\n", "
" ], "text/plain": [ " average_rating author_id text_reviews_count name \\\n", "0 3.98 604031 7 Ronald J. Fields \n", "1 4.08 626222 28716 Anita Diamant \n", "2 3.92 10333 5075 Barbara Hambly \n", "3 3.68 9212 36262 Jennifer Weiner \n", "4 3.82 149918 96 Nigel Pennick \n", "... ... ... ... ... \n", "829524 4.36 197551 4 Patty Furbush \n", "829525 4.33 3988103 3 Jim Schlinkman \n", "829526 4.00 13464507 2 Rich Jolly \n", "829527 3.31 7427847 1 sr@ mwrGn \n", "829528 3.70 5401342 11 Barry S. Brown \n", "\n", " ratings_count \n", "0 49 \n", "1 546796 \n", "2 122118 \n", "3 888522 \n", "4 1740 \n", "... ... \n", "829524 11 \n", "829525 6 \n", "829526 18 \n", "829527 13 \n", "829528 43 \n", "\n", "[829529 rows x 5 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Load the author table; read_table parses tab-separated input by default.\n",
 "author_df = pd.read_table(author_file, compression='gzip')\n",
 "\n",
 "author_df\n"
 ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
author_idauthor_name
0604031Ronald J. Fields
1626222Anita Diamant
210333Barbara Hambly
39212Jennifer Weiner
4149918Nigel Pennick
.........
829524197551Patty Furbush
8295253988103Jim Schlinkman
82952613464507Rich Jolly
8295277427847sr@ mwrGn
8295285401342Barry S. Brown
\n", "

829529 rows × 2 columns

\n", "
" ], "text/plain": [ " author_id author_name\n", "0 604031 Ronald J. Fields\n", "1 626222 Anita Diamant\n", "2 10333 Barbara Hambly\n", "3 9212 Jennifer Weiner\n", "4 149918 Nigel Pennick\n", "... ... ...\n", "829524 197551 Patty Furbush\n", "829525 3988103 Jim Schlinkman\n", "829526 13464507 Rich Jolly\n", "829527 7427847 sr@ mwrGn\n", "829528 5401342 Barry S. Brown\n", "\n", "[829529 rows x 2 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Relabel the generic name column so it stays unambiguous after joining onto the book table.\n",
 "author_df = author_df.rename(columns=dict(name='author_name'))\n",
 "\n",
 "# Two-column id/name frame used as the right-hand side of the author join.\n",
 "author_name_df = author_df.loc[:, ['author_id', 'author_name']]\n",
 "\n",
 "author_name_df\n"
 ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
isbntext_reviews_countcountry_codelanguage_codeasinaverage_ratingauthor_idpublishernum_pagesisbn13publication_yearbook_idratings_countwork_idtitletitle_without_series
003128531221USNaNNaN4.00604031St. Martin's Press256.097803128531291984.0533326535400751W.C. Fields: A Life on FilmW.C. Fields: A Life on Film
107435099866USNaNNaN3.23626222Simon & Schuster AudioNaN97807435099852001.01333909101323437Good HarborGood Harbor
2NaN7USengB00071IKUY4.0310333Nelson Doubleday, Inc.600.0NaN1987.073276241408948723The Unschooled Wizard (Sun Wolf and Starhawk, ...The Unschooled Wizard (Sun Wolf and Starhawk, ...
307432942973282USengNaN3.499212Atria Books368.097807432942942009.06066819511846243154Best Friends ForeverBest Friends Forever
408503087125USNaNNaN3.40149918NaNNaN9780850308716NaN28714015278577Runic Astrology: Starcraft and Timekeeping in ...Runic Astrology: Starcraft and Timekeeping in ...
...................................................
269389205635530143USengNaN4.0514033BBC Audiobooks3.097805635530141999.03084038123115103This Sceptred Isle, Vol. 10: The Age of Victor...This Sceptred Isle, Vol. 10: The Age of Victor...
2693893178092870X2USengNaN3.502448MX Publishing148.097817809287082015.026168430646130263Sherlock Holmes and the July CrisisSherlock Holmes and the July Crisis
2693894178092870X2USengNaN3.503460250MX Publishing148.097817809287082015.026168430646130263Sherlock Holmes and the July CrisisSherlock Holmes and the July Crisis
2693895162378140X17USengNaN4.377789809Guerrilla Wordfare306.097816237814082014.0220173817041332799101 Nights: Volume One (101 Nights, #1-3)101 Nights: Volume One (101 Nights, #1-3)
2693896NaN1USNaNB000W914MC3.52621880NaNNaNNaNNaN1141986672206102The Spanish Duke's Virgin Bride (Innocent Mist...The Spanish Duke's Virgin Bride (Innocent Mist...
\n", "

2693897 rows × 16 columns

\n", "
" ], "text/plain": [ " isbn text_reviews_count country_code language_code \\\n", "0 0312853122 1 US NaN \n", "1 0743509986 6 US NaN \n", "2 NaN 7 US eng \n", "3 0743294297 3282 US eng \n", "4 0850308712 5 US NaN \n", "... ... ... ... ... \n", "2693892 0563553014 3 US eng \n", "2693893 178092870X 2 US eng \n", "2693894 178092870X 2 US eng \n", "2693895 162378140X 17 US eng \n", "2693896 NaN 1 US NaN \n", "\n", " asin average_rating author_id publisher \\\n", "0 NaN 4.00 604031 St. Martin's Press \n", "1 NaN 3.23 626222 Simon & Schuster Audio \n", "2 B00071IKUY 4.03 10333 Nelson Doubleday, Inc. \n", "3 NaN 3.49 9212 Atria Books \n", "4 NaN 3.40 149918 NaN \n", "... ... ... ... ... \n", "2693892 NaN 4.05 14033 BBC Audiobooks \n", "2693893 NaN 3.50 2448 MX Publishing \n", "2693894 NaN 3.50 3460250 MX Publishing \n", "2693895 NaN 4.37 7789809 Guerrilla Wordfare \n", "2693896 B000W914MC 3.52 621880 NaN \n", "\n", " num_pages isbn13 publication_year book_id ratings_count \\\n", "0 256.0 9780312853129 1984.0 5333265 3 \n", "1 NaN 9780743509985 2001.0 1333909 10 \n", "2 600.0 NaN 1987.0 7327624 140 \n", "3 368.0 9780743294294 2009.0 6066819 51184 \n", "4 NaN 9780850308716 NaN 287140 15 \n", "... ... ... ... ... ... \n", "2693892 3.0 9780563553014 1999.0 3084038 12 \n", "2693893 148.0 9781780928708 2015.0 26168430 6 \n", "2693894 148.0 9781780928708 2015.0 26168430 6 \n", "2693895 306.0 9781623781408 2014.0 22017381 70 \n", "2693896 NaN NaN NaN 11419866 7 \n", "\n", " work_id title \\\n", "0 5400751 W.C. Fields: A Life on Film \n", "1 1323437 Good Harbor \n", "2 8948723 The Unschooled Wizard (Sun Wolf and Starhawk, ... \n", "3 6243154 Best Friends Forever \n", "4 278577 Runic Astrology: Starcraft and Timekeeping in ... \n", "... ... ... \n", "2693892 3115103 This Sceptred Isle, Vol. 10: The Age of Victor... 
\n", "2693893 46130263 Sherlock Holmes and the July Crisis \n", "2693894 46130263 Sherlock Holmes and the July Crisis \n", "2693895 41332799 101 Nights: Volume One (101 Nights, #1-3) \n", "2693896 2206102 The Spanish Duke's Virgin Bride (Innocent Mist... \n", "\n", " title_without_series \n", "0 W.C. Fields: A Life on Film \n", "1 Good Harbor \n", "2 The Unschooled Wizard (Sun Wolf and Starhawk, ... \n", "3 Best Friends Forever \n", "4 Runic Astrology: Starcraft and Timekeeping in ... \n", "... ... \n", "2693892 This Sceptred Isle, Vol. 10: The Age of Victor... \n", "2693893 Sherlock Holmes and the July Crisis \n", "2693894 Sherlock Holmes and the July Crisis \n", "2693895 101 Nights: Volume One (101 Nights, #1-3) \n", "2693896 The Spanish Duke's Virgin Bride (Innocent Mist... \n", "\n", "[2693897 rows x 16 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Load the basic book metadata; read_table parses tab-separated input by default.\n",
 "book_df = pd.read_table(book_file, compression='gzip')\n",
 "\n",
 "book_df\n"
 ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Agatha Christie 4544\n", "Stephen King 4170\n", "Anonymous 2956\n", "William Shakespeare 2499\n", "James Patterson 2486\n", " ... \n", "Basuki Raharjo 1\n", "David Silberman 1\n", "Alex Simmons; Illustrator-Denise Shimabu 1\n", "Anna Murdoch 1\n", "Konstantin Leontjev 1\n", "Name: author_name, Length: 671851, dtype: int64" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Left join: every book row is kept and gains author_name where author_id matches.\n",
 "book_author_df = book_df.merge(author_name_df, how='left', on='author_id')\n",
 "\n",
 "# Count of book rows per author name.\n",
 "book_author_df['author_name'].value_counts()"
 ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
book_idgenres
05333265history, historical fiction, biography
11333909fiction
21333909history, historical fiction, biography
37327624fantasy, paranormal
47327624fiction
.........
50425372342551non-fiction
504253822017381romance
504253922017381mystery, thriller, crime
504254011419866romance
504254111419866fiction
\n", "

5042542 rows × 2 columns

\n", "
" ], "text/plain": [ " book_id genres\n", "0 5333265 history, historical fiction, biography\n", "1 1333909 fiction\n", "2 1333909 history, historical fiction, biography\n", "3 7327624 fantasy, paranormal\n", "4 7327624 fiction\n", "... ... ...\n", "5042537 2342551 non-fiction\n", "5042538 22017381 romance\n", "5042539 22017381 mystery, thriller, crime\n", "5042540 11419866 romance\n", "5042541 11419866 fiction\n", "\n", "[5042542 rows x 2 columns]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Load the book-to-genre table; read_table parses tab-separated input by default.\n",
 "genre_df = pd.read_table(genre_file, compression='gzip')\n",
 "\n",
 "genre_df\n"
 ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
isbntext_reviews_countcountry_codelanguage_codeasinaverage_ratingauthor_idpublishernum_pagesisbn13publication_yearbook_idratings_countwork_idtitletitle_without_seriesauthor_namegenres
003128531221USNaNNaN4.00604031St. Martin's Press256.097803128531291984.0533326535400751W.C. Fields: A Life on FilmW.C. Fields: A Life on FilmRonald J. Fieldshistory, historical fiction, biography
107435099866USNaNNaN3.23626222Simon & Schuster AudioNaN97807435099852001.01333909101323437Good HarborGood HarborAnita Diamantfiction
207435099866USNaNNaN3.23626222Simon & Schuster AudioNaN97807435099852001.01333909101323437Good HarborGood HarborAnita Diamanthistory, historical fiction, biography
3NaN7USengB00071IKUY4.0310333Nelson Doubleday, Inc.600.0NaN1987.073276241408948723The Unschooled Wizard (Sun Wolf and Starhawk, ...The Unschooled Wizard (Sun Wolf and Starhawk, ...Barbara Hamblyfantasy, paranormal
4NaN7USengB00071IKUY4.0310333Nelson Doubleday, Inc.600.0NaN1987.073276241408948723The Unschooled Wizard (Sun Wolf and Starhawk, ...The Unschooled Wizard (Sun Wolf and Starhawk, ...Barbara Hamblyfiction
.........................................................
6177585178092870X2USengNaN3.503460250MX Publishing148.097817809287082015.026168430646130263Sherlock Holmes and the July CrisisSherlock Holmes and the July CrisisJames Carlopiofiction
6177586162378140X17USengNaN4.377789809Guerrilla Wordfare306.097816237814082014.0220173817041332799101 Nights: Volume One (101 Nights, #1-3)101 Nights: Volume One (101 Nights, #1-3)S.E. Reignromance
6177587162378140X17USengNaN4.377789809Guerrilla Wordfare306.097816237814082014.0220173817041332799101 Nights: Volume One (101 Nights, #1-3)101 Nights: Volume One (101 Nights, #1-3)S.E. Reignmystery, thriller, crime
6177588NaN1USNaNB000W914MC3.52621880NaNNaNNaNNaN1141986672206102The Spanish Duke's Virgin Bride (Innocent Mist...The Spanish Duke's Virgin Bride (Innocent Mist...Chantelle Shawromance
6177589NaN1USNaNB000W914MC3.52621880NaNNaNNaNNaN1141986672206102The Spanish Duke's Virgin Bride (Innocent Mist...The Spanish Duke's Virgin Bride (Innocent Mist...Chantelle Shawfiction
\n", "

6177590 rows × 18 columns

\n", "
" ], "text/plain": [ " isbn text_reviews_count country_code language_code \\\n", "0 0312853122 1 US NaN \n", "1 0743509986 6 US NaN \n", "2 0743509986 6 US NaN \n", "3 NaN 7 US eng \n", "4 NaN 7 US eng \n", "... ... ... ... ... \n", "6177585 178092870X 2 US eng \n", "6177586 162378140X 17 US eng \n", "6177587 162378140X 17 US eng \n", "6177588 NaN 1 US NaN \n", "6177589 NaN 1 US NaN \n", "\n", " asin average_rating author_id publisher \\\n", "0 NaN 4.00 604031 St. Martin's Press \n", "1 NaN 3.23 626222 Simon & Schuster Audio \n", "2 NaN 3.23 626222 Simon & Schuster Audio \n", "3 B00071IKUY 4.03 10333 Nelson Doubleday, Inc. \n", "4 B00071IKUY 4.03 10333 Nelson Doubleday, Inc. \n", "... ... ... ... ... \n", "6177585 NaN 3.50 3460250 MX Publishing \n", "6177586 NaN 4.37 7789809 Guerrilla Wordfare \n", "6177587 NaN 4.37 7789809 Guerrilla Wordfare \n", "6177588 B000W914MC 3.52 621880 NaN \n", "6177589 B000W914MC 3.52 621880 NaN \n", "\n", " num_pages isbn13 publication_year book_id ratings_count \\\n", "0 256.0 9780312853129 1984.0 5333265 3 \n", "1 NaN 9780743509985 2001.0 1333909 10 \n", "2 NaN 9780743509985 2001.0 1333909 10 \n", "3 600.0 NaN 1987.0 7327624 140 \n", "4 600.0 NaN 1987.0 7327624 140 \n", "... ... ... ... ... ... \n", "6177585 148.0 9781780928708 2015.0 26168430 6 \n", "6177586 306.0 9781623781408 2014.0 22017381 70 \n", "6177587 306.0 9781623781408 2014.0 22017381 70 \n", "6177588 NaN NaN NaN 11419866 7 \n", "6177589 NaN NaN NaN 11419866 7 \n", "\n", " work_id title \\\n", "0 5400751 W.C. Fields: A Life on Film \n", "1 1323437 Good Harbor \n", "2 1323437 Good Harbor \n", "3 8948723 The Unschooled Wizard (Sun Wolf and Starhawk, ... \n", "4 8948723 The Unschooled Wizard (Sun Wolf and Starhawk, ... \n", "... ... ... 
\n", "6177585 46130263 Sherlock Holmes and the July Crisis \n", "6177586 41332799 101 Nights: Volume One (101 Nights, #1-3) \n", "6177587 41332799 101 Nights: Volume One (101 Nights, #1-3) \n", "6177588 2206102 The Spanish Duke's Virgin Bride (Innocent Mist... \n", "6177589 2206102 The Spanish Duke's Virgin Bride (Innocent Mist... \n", "\n", " title_without_series author_name \\\n", "0 W.C. Fields: A Life on Film Ronald J. Fields \n", "1 Good Harbor Anita Diamant \n", "2 Good Harbor Anita Diamant \n", "3 The Unschooled Wizard (Sun Wolf and Starhawk, ... Barbara Hambly \n", "4 The Unschooled Wizard (Sun Wolf and Starhawk, ... Barbara Hambly \n", "... ... ... \n", "6177585 Sherlock Holmes and the July Crisis James Carlopio \n", "6177586 101 Nights: Volume One (101 Nights, #1-3) S.E. Reign \n", "6177587 101 Nights: Volume One (101 Nights, #1-3) S.E. Reign \n", "6177588 The Spanish Duke's Virgin Bride (Innocent Mist... Chantelle Shaw \n", "6177589 The Spanish Duke's Virgin Bride (Innocent Mist... Chantelle Shaw \n", "\n", " genres \n", "0 history, historical fiction, biography \n", "1 fiction \n", "2 history, historical fiction, biography \n", "3 fantasy, paranormal \n", "4 fiction \n", "... ... \n", "6177585 fiction \n", "6177586 romance \n", "6177587 mystery, thriller, crime \n", "6177588 romance \n", "6177589 fiction \n", "\n", "[6177590 rows x 18 columns]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Left join on book_id: each genre row recorded for a book yields its own output row.\n",
 "book_author_genre_df = book_author_df.merge(genre_df, how='left', on='book_id')\n",
 "\n",
 "book_author_genre_df"
 ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_idbook_idreview_idratingdate_addeddate_updatedread_atstarted_atn_votesn_comments
08842281e1d1347389f2ab93d60773d4d243756645cd416f3efc3f944fce4ce2db2290d5e5Fri Aug 25 13:55:02 -0700 2017Mon Oct 09 08:55:59 -0700 2017Sat Oct 07 00:00:00 -0700 2017Sat Aug 26 00:00:00 -0700 2017160
18842281e1d1347389f2ab93d60773d4d18245960dfdbb7b0eb5a7e4c26d59a937e2e5feb5Sun Jul 30 07:44:10 -0700 2017Wed Aug 30 00:00:26 -0700 2017Sat Aug 26 12:05:52 -0700 2017Tue Aug 15 13:23:18 -0700 2017281
28842281e1d1347389f2ab93d60773d4d63929445e212a62bced17b4dbe41150e5bb90373Mon Jul 24 02:48:17 -0700 2017Sun Jul 30 09:28:03 -0700 2017Tue Jul 25 00:00:00 -0700 2017Mon Jul 24 00:00:00 -0700 201760
38842281e1d1347389f2ab93d60773d4d22078596fdd13cad0695656be99828cd75d6eb734Mon Jul 24 02:33:09 -0700 2017Sun Jul 30 10:23:54 -0700 2017Sun Jul 30 15:42:05 -0700 2017Tue Jul 25 00:00:00 -0700 2017224
48842281e1d1347389f2ab93d60773d4d6644782bd0df91c9d918c0e433b9ab3a9a5c4514Mon Jul 24 02:28:14 -0700 2017Thu Aug 24 00:07:20 -0700 2017Sat Aug 05 00:00:00 -0700 2017Sun Jul 30 00:00:00 -0700 201780
.................................
15739962d0f6d1a4edcab80a6010cfcfeda4999f1656001b3d9a00405f7e96752d67b85deda4c7d4Mon Jun 04 18:08:44 -0700 2012Tue Jun 26 18:58:46 -0700 2012NaNSun Jun 10 00:00:00 -0700 201201
15739963594c86711bd7acdaf655d102df52a9cb100244292bcba3579aa1d728e664de293e16aacf5Fri Aug 01 18:46:18 -0700 2014Fri Aug 01 18:47:07 -0700 2014NaNNaN00
15739964594c86711bd7acdaf655d102df52a9cb67214377c1a7fcc2614a1a2a29213c11c9910833Tue Aug 27 12:49:25 -0700 2013Tue Aug 27 12:53:46 -0700 2013NaNNaN00
15739965594c86711bd7acdaf655d102df52a9cb1578819774a9f9d1db09a90aae3a5acea68c65932Fri May 03 13:06:15 -0700 2013Fri May 03 15:35:39 -0700 2013Fri May 03 15:35:39 -0700 2013Fri May 03 00:00:00 -0700 201300
15739966594c86711bd7acdaf655d102df52a9cb8239301f2af741fb7a99ff730cf29e004f127da4Sat Apr 20 15:18:15 -0700 2013Thu May 02 16:51:20 -0700 2013Thu May 02 16:51:20 -0700 2013Sat Apr 20 00:00:00 -0700 201300
\n", "

15739967 rows × 10 columns

\n", "
" ], "text/plain": [ " user_id book_id \\\n", "0 8842281e1d1347389f2ab93d60773d4d 24375664 \n", "1 8842281e1d1347389f2ab93d60773d4d 18245960 \n", "2 8842281e1d1347389f2ab93d60773d4d 6392944 \n", "3 8842281e1d1347389f2ab93d60773d4d 22078596 \n", "4 8842281e1d1347389f2ab93d60773d4d 6644782 \n", "... ... ... \n", "15739962 d0f6d1a4edcab80a6010cfcfeda4999f 1656001 \n", "15739963 594c86711bd7acdaf655d102df52a9cb 10024429 \n", "15739964 594c86711bd7acdaf655d102df52a9cb 6721437 \n", "15739965 594c86711bd7acdaf655d102df52a9cb 15788197 \n", "15739966 594c86711bd7acdaf655d102df52a9cb 8239301 \n", "\n", " review_id rating \\\n", "0 5cd416f3efc3f944fce4ce2db2290d5e 5 \n", "1 dfdbb7b0eb5a7e4c26d59a937e2e5feb 5 \n", "2 5e212a62bced17b4dbe41150e5bb9037 3 \n", "3 fdd13cad0695656be99828cd75d6eb73 4 \n", "4 bd0df91c9d918c0e433b9ab3a9a5c451 4 \n", "... ... ... \n", "15739962 b3d9a00405f7e96752d67b85deda4c7d 4 \n", "15739963 2bcba3579aa1d728e664de293e16aacf 5 \n", "15739964 7c1a7fcc2614a1a2a29213c11c991083 3 \n", "15739965 74a9f9d1db09a90aae3a5acea68c6593 2 \n", "15739966 f2af741fb7a99ff730cf29e004f127da 4 \n", "\n", " date_added date_updated \\\n", "0 Fri Aug 25 13:55:02 -0700 2017 Mon Oct 09 08:55:59 -0700 2017 \n", "1 Sun Jul 30 07:44:10 -0700 2017 Wed Aug 30 00:00:26 -0700 2017 \n", "2 Mon Jul 24 02:48:17 -0700 2017 Sun Jul 30 09:28:03 -0700 2017 \n", "3 Mon Jul 24 02:33:09 -0700 2017 Sun Jul 30 10:23:54 -0700 2017 \n", "4 Mon Jul 24 02:28:14 -0700 2017 Thu Aug 24 00:07:20 -0700 2017 \n", "... ... ... 
\n", "15739962 Mon Jun 04 18:08:44 -0700 2012 Tue Jun 26 18:58:46 -0700 2012 \n", "15739963 Fri Aug 01 18:46:18 -0700 2014 Fri Aug 01 18:47:07 -0700 2014 \n", "15739964 Tue Aug 27 12:49:25 -0700 2013 Tue Aug 27 12:53:46 -0700 2013 \n", "15739965 Fri May 03 13:06:15 -0700 2013 Fri May 03 15:35:39 -0700 2013 \n", "15739966 Sat Apr 20 15:18:15 -0700 2013 Thu May 02 16:51:20 -0700 2013 \n", "\n", " read_at started_at \\\n", "0 Sat Oct 07 00:00:00 -0700 2017 Sat Aug 26 00:00:00 -0700 2017 \n", "1 Sat Aug 26 12:05:52 -0700 2017 Tue Aug 15 13:23:18 -0700 2017 \n", "2 Tue Jul 25 00:00:00 -0700 2017 Mon Jul 24 00:00:00 -0700 2017 \n", "3 Sun Jul 30 15:42:05 -0700 2017 Tue Jul 25 00:00:00 -0700 2017 \n", "4 Sat Aug 05 00:00:00 -0700 2017 Sun Jul 30 00:00:00 -0700 2017 \n", "... ... ... \n", "15739962 NaN Sun Jun 10 00:00:00 -0700 2012 \n", "15739963 NaN NaN \n", "15739964 NaN NaN \n", "15739965 Fri May 03 15:35:39 -0700 2013 Fri May 03 00:00:00 -0700 2013 \n", "15739966 Thu May 02 16:51:20 -0700 2013 Sat Apr 20 00:00:00 -0700 2013 \n", "\n", " n_votes n_comments \n", "0 16 0 \n", "1 28 1 \n", "2 6 0 \n", "3 22 4 \n", "4 8 0 \n", "... ... ... 
\n", "15739962 0 1 \n", "15739963 0 0 \n", "15739964 0 0 \n", "15739965 0 0 \n", "15739966 0 0 \n", "\n", "[15739967 rows x 10 columns]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Load the review dump that omits the review text; read_table parses tab-separated input by default.\n",
 "review_df = pd.read_table(review_file, compression='gzip')\n",
 "\n",
 "review_df\n"
 ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# NOTE: rebinding review_df here replaces the no-text frame loaded in the previous cell\n",
 "# with the dump that includes the review text.\n",
 "review_df = pd.read_table(review_text_file, compression='gzip')\n",
 "\n",
 "review_df\n"
 ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }