import numpy as np
import pandas as pd
import json
import csv
from collections import Counter
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
import gzip
import os
# Root directory on the external drive holding the GoodReads JSON dumps.
data_dir = '/Volumes/Samsung_T5/Data/Book-Reviews/GoodReads/'
# Gzipped JSON-lines files to convert; commented-out entries are skipped for now.
files = [
#'goodreads_book_authors.json.gz',
'goodreads_reviews_dedup.json.gz',
]
def detect_lang(text):
    """Best-effort language detection.

    Returns the langdetect language code for *text*, or 'unknown' when
    detection fails (non-string input, or text langdetect cannot classify).
    """
    try:
        lang = detect(text)
    except (LangDetectException, TypeError):
        lang = 'unknown'
    return lang
def parse_review(review):
    """Normalize a review record in place.

    Flattens CR/LF in 'review_text' to spaces, then adds derived
    'review_length' and 'review_lang' fields. Returns the same dict.
    """
    text = review['review_text'].replace('\r', ' ').replace('\n', ' ')
    review['review_text'] = text
    review['review_length'] = len(text)
    review['review_lang'] = detect_lang(text)
    return review
def read_json(data_file):
    """Yield one decoded dict per line of a gzipped JSON-lines file."""
    with gzip.open(data_file, 'rt') as handle:
        for raw_line in handle:
            yield json.loads(raw_line.strip())
def json_to_csv(json_file, csv_file):
    """Convert a gzipped JSON-lines file to a gzipped tab-separated CSV.

    Column order comes from the first record. For review files, the derived
    'review_length' and 'review_lang' columns are added and 'review_text' is
    moved to the last column so the long free-text field ends each row.
    Missing fields in later records are written as empty cells.

    If the input file is empty, no output file is written.
    """
    reader = read_json(json_file)
    # next(..., None) instead of bare next(): an empty input file would
    # otherwise raise StopIteration here.
    first = next(reader, None)
    if first is None:
        return
    headers = list(first.keys())
    if 'review_text' in headers:
        # Move review_text to the end and insert the derived columns.
        headers.remove('review_text')
        headers.extend(['review_length', 'review_lang', 'review_text'])
        # Guarded like the loop below: parse_review assumes 'review_text'
        # exists and would KeyError on non-review files.
        first = parse_review(first)
    with gzip.open(csv_file, 'wt') as fh:
        writer = csv.writer(fh, delimiter='\t')
        writer.writerow(headers)
        writer.writerow([first.get(header) for header in headers])
        for record in reader:
            if 'review_text' in record:
                record = parse_review(record)
            writer.writerow([record.get(header) for header in headers])
def inspect_fields(data_file):
    """Report whether all records in a JSON-lines file share the same fields.

    Counts how many records carry each field name, prints whether the counts
    are uniform, and returns the set of field names seen (dict-keys view).
    """
    header_count = Counter()
    for record in read_json(data_file):
        header_count.update(record.keys())
    if not header_count:
        # Empty file: max()/min() on an empty Counter would raise ValueError.
        print('no records found')
        return header_count.keys()
    max_count = max(header_count.values())
    min_count = min(header_count.values())
    if max_count > min_count:
        print('non-equal field counts')
    else:
        print('equal field counts')
    print(header_count)
    return header_count.keys()
# Convert each listed JSON dump to CSV; entries without '.json' are skipped.
for filename in files:
    if '.json' not in filename:
        # Bug fix: these f-strings had no placeholders and printed literal
        # placeholder text instead of the filename.
        print(f'skipping {filename}')
        continue
    print(f'transforming {filename} to CSV')
    json_file = os.path.join(data_dir, filename)
    csv_file = json_file.replace('json', 'csv')
    # Guard: if replace() changed nothing we would overwrite the input file.
    if csv_file != json_file:
        #inspect_fields(json_file)
        json_to_csv(json_file, csv_file)
# Explode the book -> genres mapping into one (book_id, genre) row per genre.
json_file = os.path.join(data_dir, 'goodreads_book_genres_initial.json.gz')
csv_file = json_file.replace('json', 'csv')
headers = ['book_id', 'genres']
with gzip.open(csv_file, 'wt') as fh:
    writer = csv.writer(fh, delimiter='\t')
    writer.writerow(headers)
    for record in read_json(json_file):
        # NOTE(review): each row holds a single genre even though the header
        # reads 'genres' — kept as-is so downstream consumers are unaffected.
        for genre in record['genres']:
            writer.writerow([record['book_id'], genre])
# Flatten books to one row per primary author (role '', 'author' or 'creator'),
# keeping only the columns listed in `headers`.
json_file = os.path.join(data_dir, 'goodreads_books.json.gz')
csv_file = json_file.replace('json', 'csv')
headers = [
    'isbn', 'text_reviews_count', 'country_code', 'language_code', 'asin', 'average_rating',
    'author_id', 'publisher', 'num_pages',
    'isbn13', 'publication_year', 'book_id', 'ratings_count', 'work_id', 'title', 'title_without_series'
]
doc_count = 0
with gzip.open(csv_file, 'wt') as fh:
    writer = csv.writer(fh, delimiter='\t')
    writer.writerow(headers)
    for ri, record in enumerate(read_json(json_file)):
        for author in record['authors']:
            if author['role'].lower() in ('', 'author', 'creator'):
                doc = {header: record[header] for header in headers if header in record}
                doc['author_id'] = author['author_id']
                # Bug fix: doc[header] raised KeyError for any record missing
                # a listed column; .get() writes an empty cell instead.
                row = [doc.get(header) for header in headers]
                doc_count += 1
                writer.writerow(row)
        # Progress marker every 100k input records.
        if (ri + 1) % 100000 == 0:
            print(ri + 1, 'records parsed', doc_count, 'rows printed')