In [2]:
import numpy as np
import pandas as pd
import json
import csv
from collections import Counter
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
import gzip
import os

data_dir = '/Volumes/Samsung_T5/Data/Book-Reviews/GoodReads/'

files = [
    #'goodreads_book_authors.json.gz',
    'goodreads_reviews_dedup.json.gz',
]

def detect_lang(text):
    # langdetect raises LangDetectException on text it cannot classify
    # (e.g. empty strings), and non-string input can raise TypeError.
    try:
        return detect(text)
    except (LangDetectException, TypeError):
        return 'unknown'

def parse_review(review):
    # Flatten carriage returns and newlines so each review stays on a
    # single TSV row, then derive length and language columns.
    review['review_text'] = review['review_text'].replace('\r', ' ').replace('\n', ' ')
    review['review_length'] = len(review['review_text'])
    review['review_lang'] = detect_lang(review['review_text'])
    return review
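
# For example (hypothetical input; langdetect's guess on short strings
# can vary):
#   parse_review({'review_text': 'Great book!\nLoved it.'})
#   -> {'review_text': 'Great book! Loved it.', 'review_length': 21,
#       'review_lang': 'en'}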

def read_json(data_file):
    # Stream one JSON record per line from a gzipped JSON-lines file.
    with gzip.open(data_file, 'rt') as fh:
        for line in fh:
            yield json.loads(line.strip())
            
def json_to_csv(json_file, csv_file):
    reader = read_json(json_file)
    first = next(reader)
    headers = list(first.keys())
    if 'review_text' in headers:
        # Move review_text to the last column and add the two derived
        # columns, so the long free-text field does not sit mid-row.
        headers.remove('review_text')
        headers.extend(['review_length', 'review_lang', 'review_text'])
        first = parse_review(first)
    with gzip.open(csv_file, 'wt') as fh:
        writer = csv.writer(fh, delimiter='\t')
        writer.writerow(headers)
        writer.writerow([first.get(header) for header in headers])
        for record in reader:
            if 'review_text' in record:
                record = parse_review(record)
            writer.writerow([record.get(header) for header in headers])
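
# Note: csv.writer quotes minimally by default, so any tab characters left
# inside review_text are quoted rather than splitting the row.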


def inspect_fields(data_file):
    # Count how often each field name occurs across all records; unequal
    # counts mean some records are missing fields.
    header_count = Counter()
    for record in read_json(data_file):
        header_count.update(record.keys())
    if max(header_count.values()) > min(header_count.values()):
        print('non-equal field counts')
    else:
        print('equal field counts')
    print(header_count)
    return header_count.keys()


for filename in files:
    if '.json' in filename:
        print(f'transforming {filename} to CSV')
    else:
        print(f'skipping {filename}')
        continue
    json_file = os.path.join(data_dir, filename)
    # Replace only the extension, so 'json' elsewhere in the path is untouched.
    csv_file = json_file.replace('.json.gz', '.csv.gz')

    if csv_file != json_file:
        #inspect_fields(json_file)
        json_to_csv(json_file, csv_file)
transforming goodreads_reviews_dedup.json.gz to CSV
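As a quick sanity check (a sketch, not part of the original run), pandas, imported above, can read back the first few rows of the TSV written by json_to_csv; nrows keeps the read small.

In [ ]:
# Sketch: spot-check the review TSV produced above (assumes the file
# was written by the cell above; pandas infers gzip from the .gz suffix).
sample = pd.read_csv(
    os.path.join(data_dir, 'goodreads_reviews_dedup.csv.gz'),
    sep='\t', nrows=5,
)
sample[['review_lang', 'review_length', 'review_text']]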
In [25]:
json_file = os.path.join(data_dir, 'goodreads_book_genres_initial.json.gz')
csv_file = json_file.replace('.json.gz', '.csv.gz')

headers = ['book_id', 'genre']  # one row per (book, genre) pair
with gzip.open(csv_file, 'wt') as fh:
    writer = csv.writer(fh, delimiter='\t')
    writer.writerow(headers)
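    # 'genres' here is a dict mapping genre name -> count, so iterating
    # yields the genre names; e.g. a (hypothetical) record
    #   {'book_id': '1', 'genres': {'history': 10, 'fiction': 3}}
    # is flattened into two (book_id, genre) rows.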
    for record in read_json(json_file):
        for genre in record['genres']:
            writer.writerow([record['book_id'], genre])
In [35]:
json_file = os.path.join(data_dir, 'goodreads_books.json.gz')
csv_file = json_file.replace('.json.gz', '.csv.gz')

headers = [
    'isbn', 'text_reviews_count', 'country_code', 'language_code', 'asin',
    'average_rating', 'author_id', 'publisher', 'num_pages', 'isbn13',
    'publication_year', 'book_id', 'ratings_count', 'work_id', 'title',
    'title_without_series',
]

doc_count = 0
with gzip.open(csv_file, 'wt') as fh:
    writer = csv.writer(fh, delimiter='\t')
    writer.writerow(headers)
    for ri, record in enumerate(read_json(json_file)):
        # One row per primary author; an empty role string appears to mark
        # the main author in this dataset, so it is treated as primary.
        for author in record['authors']:
            if author['role'].lower() in ['', 'author', 'creator']:
                doc = {header: record[header] for header in headers if header in record}
                doc['author_id'] = author['author_id']
                # doc.get() returns None for any field absent from the record.
                row = [doc.get(header) for header in headers]
                doc_count += 1
                writer.writerow(row)
        if (ri+1) % 100000 == 0:
            print(ri+1, 'records parsed', doc_count, 'rows printed')
100000 records parsed 113676 rows printed
200000 records parsed 227587 rows printed
300000 records parsed 342076 rows printed
400000 records parsed 456535 rows printed
500000 records parsed 570463 rows printed
600000 records parsed 684301 rows printed
700000 records parsed 798336 rows printed
800000 records parsed 912376 rows printed
900000 records parsed 1027553 rows printed
1000000 records parsed 1141468 rows printed
1100000 records parsed 1255976 rows printed
1200000 records parsed 1370538 rows printed
1300000 records parsed 1484713 rows printed
1400000 records parsed 1599219 rows printed
1500000 records parsed 1713095 rows printed
1600000 records parsed 1827324 rows printed
1700000 records parsed 1941476 rows printed
1800000 records parsed 2055309 rows printed
1900000 records parsed 2169593 rows printed
2000000 records parsed 2283752 rows printed
2100000 records parsed 2397019 rows printed
2200000 records parsed 2510567 rows printed
2300000 records parsed 2624920 rows printed