# Gist paultopia/3e38653f1a061d7d6ab5c985b35b324c
# Created February 12, 2019 02:04
# Note: GitHub flagged this file as containing hidden or bidirectional Unicode
# text that may be interpreted or compiled differently than it appears; review
# it in an editor that reveals hidden Unicode characters.
import json
import os.path
import pickle

from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
# If modifying these scopes, delete the file token.pickle.
SCOPES = ['https://www.googleapis.com/auth/documents.readonly']

# This is all based on the Google Docs API Python quickstart:
# https://developers.google.com/docs/api/quickstart/python
# To run this, you need to follow the activation and credential download steps
# on that page first.
DOCUMENT_ID = '1w8kimbrzfIwvbfaUFEVzL5x1XrUc_Fy8YxhR6mRn10A'
# Matches document:
# https://docs.google.com/document/d/1w8kimbrzfIwvbfaUFEVzL5x1XrUc_Fy8YxhR6mRn10A/edit?usp=sharing
def download_test_document():
    """Download the test document through the Docs API and save it as JSON.

    Cached credentials are read from ``token.pickle`` when that file exists.
    If they are missing or not valid they are refreshed, or a new
    installed-app authorization flow is run from ``credentials.json``, and the
    resulting credentials are written back to ``token.pickle``.

    The document identified by ``DOCUMENT_ID`` is then fetched, its title is
    printed, and the full response is written to ``testdoc.json`` (4-space
    indent, sorted keys).
    """
    creds = None
    # The file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists('token.pickle'):
        # NOTE(security): unpickling runs code embedded in the file, so
        # token.pickle must only ever come from this script's own earlier run.
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server()
        # Save the credentials for the next run.
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)
    service = build('docs', 'v1', credentials=creds)
    # Retrieve the document's contents from the Docs service.
    document = service.documents().get(documentId=DOCUMENT_ID).execute()
    print('The title of the document is: {}'.format(document.get('title')))
    # json.dump escapes non-ASCII by default, so the file is plain ASCII; the
    # explicit encoding just keeps the write independent of the locale.
    with open("testdoc.json", 'w', encoding='utf-8') as tj:
        json.dump(document, tj, indent=4, sort_keys=True)
# download_test_document() above is step 1: it saves a local copy of the
# document in JSON format to parse through.
# Step 2 is the parsing code below.
# Helper function for making sense of the heavily nested JSON:
def pretty_print(data):
    """Print ``data`` as JSON text with a 2-space indent and sorted keys."""
    print(json.dumps(data, indent=2, sort_keys=True))
def check_italics(textrun):
    """Return the text of ``textrun`` if it is italic, otherwise ``False``.

    ``textrun`` is a dict with a ``"content"`` entry and a ``"textStyle"``
    dict. A run whose ``"textStyle"`` is missing or empty counts as not
    italic instead of raising ``KeyError``.
    """
    style = textrun.get("textStyle") or {}
    if style.get("italic"):
        return textrun["content"]
    return False
def check_smallcaps(textrun):
    """Return the text of ``textrun`` if it is in small caps, otherwise ``False``.

    ``textrun`` is a dict with a ``"content"`` entry and a ``"textStyle"``
    dict. A run whose ``"textStyle"`` is missing or empty counts as not small
    caps instead of raising ``KeyError``.
    """
    style = textrun.get("textStyle") or {}
    if style.get("smallCaps"):
        return textrun["content"]
    return False
def print_article_title_journal(footnote):
    """Print every "title, journal" pair found in one footnote.

    Within each paragraph of the footnote, a small-caps text run is taken as a
    journal name when the text run two positions before it is italic (the
    article title). Each match is printed as
    ``"<title>, <journal>, in note <last character of footnoteId>"``.

    Entries of ``footnote["content"]`` without a ``"paragraph"`` and paragraph
    elements without a ``"textRun"`` are skipped, so one such entry no longer
    aborts the whole footnote with ``KeyError``. A footnote with no
    ``"content"`` prints nothing.
    """
    for content in footnote.get("content", []):
        paragraph = content.get("paragraph")
        if paragraph is None:
            continue  # not a paragraph; nothing to scan
        textruns = [
            element["textRun"]
            for element in paragraph.get("elements", [])
            if "textRun" in element
        ]
        for idx, item in enumerate(textruns):
            # Only runs at index 3 or later are considered, so idx - 2 can
            # never wrap around to the end of the list.
            if idx < 3:
                continue
            if not check_smallcaps(item):
                continue
            candidate_title = textruns[idx - 2]
            if check_italics(candidate_title):  # italic run = article title
                print("{}, {}, in note {}".format(
                    candidate_title["content"],
                    item["content"],
                    footnote["footnoteId"][-1],
                ))
def parse_test_document():
    """Load ``testdoc.json`` and print the journal articles cited in its footnotes.

    All footnotes are first dumped with ``pretty_print``. Then each footnote is
    passed to ``print_article_title_journal``, which prints an article title
    and journal only for footnotes citing a journal article rather than a
    book. A document with no ``"footnotes"`` entry is treated as having none.
    """
    with open("testdoc.json", encoding="utf-8") as tj:
        document = json.load(tj)
    # For convenience, keep the footnotes as a list of dicts rather than one
    # big dict keyed by footnote id.
    footnotes = list(document.get("footnotes", {}).values())
    pretty_print(footnotes)
    for footnote in footnotes:
        try:
            print_article_title_journal(footnote)
        except (LookupError, TypeError, AttributeError):
            # Best effort: skip idiosyncratic footnote JSON (e.g. entries
            # without content) but let KeyboardInterrupt/SystemExit through.
            continue
# Script entry point: step 1 downloads the document, step 2 parses the saved copy.
if __name__ == '__main__':
    download_test_document()
    parse_test_document()
# To comment on the original gist, sign in to GitHub (or sign up for free)
# and join the conversation there.