In [8]:
import requests
import xml.etree.ElementTree as ET
from pymarc import marcxml

class OCLC_API(object):

    def __init__(self, api_key, request_wait=0.5):
        self.xisbn_url = "http://xisbn.worldcat.org/webservices/xid/isbn/"
        self.xisbn_query = "?method=getEditions&format=json&fl=*"
        self.search_url = "http://www.worldcat.org/webservices/catalog/search/worldcat/sru"
        self.api_key = api_key
        self.request_wait = request_wait

    def make_author_title_query(self, author, title, record_schema="marcxml"):
        query = 'srw.au+all+"%s"+and+srw.ti+all+"%s"' % (author, title)
        schema = "info%%3Asrw%%2Fschema%%2F1%%2F%s" % (record_schema)
        return "%s?query=%s&wskey=%s&recordSchema=%s" % (self.search_url, query, self.api_key, schema)

    def get_editions(self, isbn):
        isbn_url = "%s%s%s" % (self.xisbn_url, isbn, self.xisbn_query)
        response = requests.get(isbn_url)
        if 'list' in response.json():
            return response.json()['list']
        return []

    def map_isbns(self, isbns):
        mapped = {}
        for index, isbn in enumerate(isbns):
            mapped[isbn] = self.get_editions(isbn)
            print(index+1, " mappings retrieved")
            time.sleep(self.request_wait)
        return mapped

    def find_work_records(self, author, title, record_schema="marcxml"):
        url = self.make_author_title_query(author, title)
        response = requests.get(url)
        root = ET.fromstring(response.text)
        for child in root:
            if child.tag == "{http://www.loc.gov/zing/srw/}records":
                return child
        return None

    def get_isbns(self, records):
        if isinstance(records, list):
            return [self.get_isbn(record) for record in records]
        else:
            return []

    def get_isbn(self, record):
        temp_file = "tempfile.xml"
        with open(temp_file, 'wb') as fh:
            fh.write(ET.tostring(record))
        reader = marcxml.parse_xml_to_array(temp_file)
        for record in reader:
            if record:
                metadata = {
                    "author": record.author(),
                    "title": record.title(),
                    "isbn": [field['a'] for field in record.get_fields('020')],
                    "notes": [note['a'] for note in record.notes()],
                    "publisher": record.publisher(),
                    "pubyear": record.pubyear()
                }
                return metadata
                return record.isbn()
        return None

    def find_work_metadata(self, author, title, record_schema="marcxml"):
        records = self.find_work_records(author, title)
        return self.get_isbns(records)

    
def test_oclc_api():
    oclc_api_key = "5GymVhuhaTbrsUg0NSgTGEdH8OevLipG6LIGaZAsLqT5ib1MaaGMQTDKkJlr9Ywjh0ooUFcdcnL4RQX3"
    oclc = OCLC_API(oclc_api_key)
    author = "zadie smith"
    title = "witte tanden"
    isbns = oclc.find_work_metadata(author, title)
    print(isbns)


test_oclc_api()

[]


In [50]:
import re
import json
import copy
from bs4 import BeautifulSoup

class LeestafelParser(object):
    
    def __init__(self, oclc_api_key):
        self.oclc = OCLC_API(oclc_api_key)
    
    def is_review(self, article):
        if self.has_title_header(article):
            return True
        return False

    def has_title_header(self, para):
        titles_soup = para.find_all(class_="article_heading1")
        if titles_soup:
            return True
        return False

    def get_title_info(self, para, review):
        review["title"] = self.get_title(para)
        for author in self.get_author(para):
            # add any authors beyond the main author that the page is about
            if author not in review["author"]:
                review["author"].append(author)

    def get_title(self, para):
        titles_soup = para.find_all(class_="article_heading1")
        for title_soup in titles_soup:
            if title_soup.text == "":
                continue
            return title_soup.text
            break
        return None

    def get_author(self, content_soup):
        authors = []
        try:
            author_soup = content_soup.find_all(class_="componentheading")[0]
            return [author_soup.text] # return as list, may have co-authors on certain reviewed books
        except AttributeError:
            return []
        except IndexError:
            pass
        authors_soup = content_soup.find_all(class_="article_heading2")
        if not authors_soup:
            return []
        for author_soup in authors_soup:
            if author_soup.text != "" and author_soup.text != " ":
                authors += [author_string for author_string in author_soup.stripped_strings]
        return authors

    def has_messageboard_link(self, para):
        links = para.find_all('a')
        if not links:
            return False
        for link in links:
            if "http://leestafel.messageboard.nl/forum/viewtopic.php?" in link['href']:
                return True
        return False

    def normalize_isbn(self, text):
        text = re.sub(r"ISBN(10|13)?:?", r"ISBN", text)
        if "ISBN 978" in text:
            while re.match(r"ISBN [0-9]{,8} [0-9]", text):
                text = re.sub(r"ISBN ([0-9]{,11}) +([0-9])", r"ISBN \1\2", text)
        else:
            while re.match(r"ISBN [0-9]{,8} [0-9]", text):
                text = re.sub(r"ISBN ([0-9]{,8}) +([0-9])", r"ISBN \1\2", text)
        return text

    def get_isbn(self, text, review):
        text = self.normalize_isbn(text)
        m = re.search("ISBN ([0-9Xx]{10,13})", text)
        review["isbn"] = [m.group(1)]

    def has_isbn(self, text):
        if "ISBN" in text:
            text = self.normalize_isbn(text)
        m = re.search("ISBN(10|13)?:? ([0-9Xx]{10,13})", text)
        return True if m else False

    def has_reviewer_metadata(self, text):
        if r"©" in text:
            return True
        return False
    
    def extract_review_date(self, copyright, review):
        months = ""
        m = re.search(r"((\d{1,2} +)?(januari|februari|maart|april|mei|juni|juli|augustus|september|oktober|november|december) +(\d{4}))", copyright)
        if m:
            review["review_date"] = m.group(0)
            copyright = re.sub(m.group(0), "", copyright)
            copyright = re.sub("[,\. ]+$", "", copyright)
        return copyright

    def get_reviewer_metadata(self, text, review):
        # Assumption: reviewer signs off with copyright symbol, followed by name, followed by date
        # Example: © Alice, 23-05-2011
        copyright = re.sub(r".*©", "©", text).replace("©,","©").replace("\n","")
        #copyright = copyright
        copyright = self.extract_review_date(copyright, review)
        review["reviewer"] = re.sub("© +","", copyright).strip()

    def get_articles(self, content_soup):
        return content_soup.find_all(class_="article_column")

    def get_review(self, author, article_soup):
        review = { "author": copy.copy(author), "review_text": "", "unparsed_metadata": [] }
        if self.has_title_header(article_soup):
            self.get_title_info(article_soup, review)
        self.status = "text"
        for p in article_soup.find_all('p'):
            self.parse_paragraph(p, review)
        if "title" not in review:
            return None
        if "isbn" not in review.keys():
            review["isbn"] = self.lookup_isbn(review["author"][0], review["title"])
        return review

    def lookup_isbn(self, author, title):
        editions = self.oclc.find_work_metadata(author, title)
        return [edition["isbn"] for edition in editions if "isbn" in edition]

    def parse_paragraph(self, p, review):
        paragraph = p.get_text().replace("\u00a0", " ") # remobe non-breaking spaces
        if self.has_title_header(p):
            return True
        if self.has_messageboard_link(p):
            return True
        if self.has_reviewer_metadata(paragraph):
            self.get_reviewer_metadata(paragraph, review)
        if self.has_isbn(paragraph):
            self.status = "metadata" # from here all paragraphs are metadata
            self.get_isbn(paragraph, review)
        if self.status == "metadata":
            review["unparsed_metadata"].append(paragraph)
        else:
            review["review_text"] += " " + paragraph
    
    def get_content(self, fname):
        with open(fname, 'rt') as fh:
            soup = BeautifulSoup(fh, "lxml")
            return soup.find(id='main')

    def parse_review_page(self, fname):
        content_soup = self.get_content(fname)
        author = self.get_author(content_soup)
        articles_soup = self.get_articles(content_soup)
        if not articles_soup: # no review articles found
            return []
        reviews = [self.get_review(author, article_soup) for article_soup in articles_soup]
        return [review for review in reviews if review] # skip null or None

site = "leestafel.info"
files = !ls {site}/a*
#print(files)

oclc_api_key_file = "oclc-api-key"
with open(oclc_api_key_file, 'rt') as fh:
    oclc_api_key = fh.read().strip()

leestafel_parser = LeestafelParser(oclc_api_key)

output_file = "leestafel.a.json"

with open(output_file, 'wt') as fh:
    for fname in files:
        print(fname)
        try:
            reviews = leestafel_parser.parse_review_page(fname)
            fh.write(json.dumps(reviews) + "\n")
        except AttributeError as err: # catch all for Beautiful soup errors, need to improve
            print(err)
            print("Skipping", fname)
            pass


leestafel.info/a-beck
leestafel.info/a-berebrouckx
leestafel.info/a-bohlmeijer
leestafel.info/a-bon
leestafel.info/a-chambers
leestafel.info/a-coudenys-a-a-van-lierde
leestafel.info/a-de-saint-exupery
leestafel.info/a-desmond-a-j-moore
leestafel.info/a-el-baraka
leestafel.info/a-el-baraka?format=feed&type=atom
'NoneType' object has no attribute 'find_all'
Skipping leestafel.info/a-el-baraka?format=feed&type=atom
leestafel.info/a-el-baraka?format=feed&type=rss
'NoneType' object has no attribute 'find_all'
Skipping leestafel.info/a-el-baraka?format=feed&type=rss
leestafel.info/a-f-th-van-der-heijden
leestafel.info/a-havukainen-a-s-toivonen
leestafel.info/a-kranendonk
leestafel.info/a-kruijssen
leestafel.info/a-lindgren
leestafel.info/a-lootens
leestafel.info/a-machowiak-a-d-mizieliski
leestafel.info/a-mg-schmidt
leestafel.info/a-moravia-a-a-elkann
leestafel.info/a-munoz-molina
leestafel.info/a-sax
leestafel.info/a-smids
leestafel.info/a-steinhoefel
leestafel.info/a-v-praag
leestafel.info

leestafel.info/anita-shreve
leestafel.info/anita-van-den-bogaart
leestafel.info/anja-van-biene
leestafel.info/anja-vereijken
leestafel.info/anke-de-jong-koele
leestafel.info/anke-dorpmanns-a-maikel-verkoelen
leestafel.info/anke-kranendonk
leestafel.info/anke-kranendonk-romans
leestafel.info/anke-scheeren
leestafel.info/anke-werker-a-danielle-schothorst
leestafel.info/ann-brashares
leestafel.info/ann-de-bode
leestafel.info/ann-downer
leestafel.info/ann-lootens
leestafel.info/anna-bikont
leestafel.info/anna-brouwer
leestafel.info/anna-chojnacka
leestafel.info/anna-coudenys
leestafel.info/anna-currey
leestafel.info/anna-dale
leestafel.info/anna-drijver
leestafel.info/anna-enquist
leestafel.info/anna-fredriksson
leestafel.info/anna-kemp-a-sara-ogilvie
leestafel.info/anna-lawrence-pietroni
leestafel.info/anna-llenas
leestafel.info/anna-mcpartlin
leestafel.info/anna-nilsen
leestafel.info/anna-pasternak
leestafel.info/anna-van-praag
leestafel.info/anna-woltz
leestafel.info/annabel-pitcher
lee

leestafel.info/astrid-sy
leestafel.info/astrid-witte
leestafel.info/ate-hoekstra
leestafel.info/atticus-lish
leestafel.info/audrey-niffenegger
leestafel.info/august-willemsen
leestafel.info/augusta-verburg
leestafel.info/augusta-verburg-4-6-jr
leestafel.info/augusto-cruz
leestafel.info/auke-van-stralen
leestafel.info/aurora-marsotto
leestafel.info/ayana-mathis
leestafel.info/ayano-imai
leestafel.info/ayu-utami
