#!/usr/bin/python3
# Scrape a what-if.xkcd.com article and reconstruct its Markdown source.
import sys

import requests
from requests.exceptions import RequestException
import lxml.html

def get_page(url, utf8=False):
    # Fetch a page over HTTP, exiting on a network error or a non-OK status.
    try:
        r = requests.get(url)
    except RequestException as e:
        print('Exception occurred while requesting ' + url + ': ' + str(e))
        sys.exit(1)
    if r.status_code != requests.codes.ok:
        print('HTTP status code: ' + str(r.status_code) + '; url: ' + url)
        sys.exit(2)
    if utf8:
        # Override requests' guessed encoding; the caller knows it is UTF-8.
        r.encoding = 'utf-8'
    return r.text

def get_title(url):
    # Fetch a page and return its <title> text (used for link titles).
    html = get_page(url)
    doc = lxml.html.document_fromstring(html)
    ts = doc.xpath('//title')
    return ts[0].text.strip() if len(ts) >= 1 else 'TODO'

def full_url(url, context_url):
    # Resolve 'url' against 'context_url'; assumes 'context_url' is absolute.
    proto, tail = context_url.split(':', 1)
    context_base = proto + '://' + tail.lstrip('/').split('/', 1)[0]
    if url.startswith('#'):
        # Fragment-only link: attach it to the context page.
        context_page = context_url.split('#', 1)[0]
        return context_page + url
    elif url.startswith('//'):
        # Protocol-relative URL: inherit the context's scheme.
        return proto + ':' + url
    elif url.startswith('/'):
        # Root-relative URL: attach it to the context's scheme and host.
        return context_base.rstrip('/') + '/' + url.lstrip('/')
    elif url.startswith(('http://', 'https://', 'ftp://')):
        return url
    else:
        # Do we need to support relative links like
        # 'smth.html', './smth.html', or '../smth.html'?
        raise ValueError("bad url in 'full_url':\nurl: " + url + '\n')

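# A sketch of the expected resolutions (illustrative; these examples are
# not part of the original gist), assuming the context URL is
# 'http://what-if.xkcd.com/5/':
#   full_url('#fn1', ...)                  -> 'http://what-if.xkcd.com/5/#fn1'
#   full_url('//imgs.xkcd.com/a.png', ...) -> 'http://imgs.xkcd.com/a.png'
#   full_url('/imgs/a.png', ...)           -> 'http://what-if.xkcd.com/imgs/a.png'
#   full_url('http://example.com/x', ...)  -> 'http://example.com/x' (unchanged)
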
def innerHTML(node, strip=True):
    # Serialize a node's children (text and markup) without the node's own tag.
    # http://stackoverflow.com/a/24151860/1598057
    res = node.text or ''
    for child in node:
        # tostring() also serializes child.tail, so no separate handling here.
        res += lxml.html.tostring(child, encoding='unicode')
    if strip:
        res = res.strip()
    return res

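# Illustration (not from the gist): for a node parsed from
# '<p>Pinned <em>by</em> gravity</p>', innerHTML(node) yields
# 'Pinned <em>by</em> gravity': the children's markup and tail text,
# minus the surrounding <p> tag.
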
def process_footnotes(s):
    # Flush queued footnote bodies as Markdown footnote definitions.
    for fn in s['footnotes']:
        s['res'] += '[^%s]: %s' % (fn['num'], fn['body']) + s['par_sep']
    s['footnotes'] = []

def process_toplevel_a(a, s):
    # The article header: an <a> wrapping an <h1>; emit the title, then its URL.
    s['res'] += a.xpath('./h1')[0].text.strip() + s['line_break']
    s['res'] += full_url(a.get('href'), context_url=s['base_url'])
    s['res'] += s['par_sep']

def process_a(a, s):
    # Emit a reference-style Markdown link and queue its target for the
    # link-definition list written by postprocess_references().
    s['res'] += '[%s][%d]' % (innerHTML(a), s['ref_counter'])
    ref = {
        'num': s['ref_counter'],
        'url': full_url(a.get('href'), context_url=s['base_url']),
    }
    s['references'].append(ref)
    s['ref_counter'] += 1

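# Example (illustrative): if the third link encountered is
# <a href="/imgs/a.png">this picture</a>, the text emitted here is
# '[this picture][3]', and (3, 'http://what-if.xkcd.com/imgs/a.png') is
# queued so postprocess_references() can emit the matching definition.
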
def ret_span(span, orig_s):
    # Render a span's contents to Markdown using a fresh parser state.
    # TODO: do this in some other way (the state template is duplicated below)
    s = {
        'base_url': orig_s['base_url'],
        'ref_counter': 1,
        'fn_counter': 1,
        'par_sep': '\n\n',
        'line_break': '\n',
        'res': '',
        'footnotes': [],
        'references': [],
    }
    process_toplevel_p(span, s)
    return s['res'].strip()

def process_span(span, s):
    # TODO: extract the footnote number, don't renumber via 'fn_counter'
    if span.get('class') != 'ref':
        s['res'] += lxml.html.tostring(span, encoding='unicode')
        return
    # A what-if footnote: emit the inline marker, queue the body for later.
    s['res'] += '[^%s]' % s['fn_counter']
    footnote = {
        'num': s['fn_counter'],
        'body': ret_span(span.xpath('./span[@class="refbody"]')[0], s),
    }
    s['footnotes'].append(footnote)
    s['fn_counter'] += 1

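# Example (a sketch; the exact what-if markup is assumed here, not quoted
# from the site): a footnote like
#   <span class="ref"><span class="refbody">Citation needed.</span></span>
# yields the inline marker '[^1]' plus a queued body that
# process_footnotes() later flushes as '[^1]: Citation needed.'
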
def process_strong(strong, deep_level, s):
    # Alternate '**'/'__' by nesting depth so nested emphasis stays parseable.
    s['res'] += '**' if (deep_level % 2 == 0) else '__'
    s['res'] += strong.text or ''
    for child in strong:
        tail_added = False
        if child.tag == 'em':
            process_em(child, deep_level + 1, s)
        elif child.tag == 'strong':
            process_strong(child, deep_level + 1, s)
        elif child.tag == 'a':
            process_a(child, s)
        elif child.tag == 'span':
            process_span(child, s)
        else:
            s['res'] += lxml.html.tostring(child, encoding='unicode')
            tail_added = True  # tostring() already serialized child.tail
        if not tail_added:
            s['res'] += child.tail or ''
    s['res'] += '**' if (deep_level % 2 == 0) else '__'

def process_em(em, deep_level, s):
    # TODO: deduplicate code (mirrors process_strong with '*'/'_' markers)
    s['res'] += '*' if (deep_level % 2 == 0) else '_'
    s['res'] += em.text or ''
    for child in em:
        tail_added = False
        if child.tag == 'em':
            process_em(child, deep_level + 1, s)
        elif child.tag == 'strong':
            process_strong(child, deep_level + 1, s)
        elif child.tag == 'a':
            process_a(child, s)
        elif child.tag == 'span':
            process_span(child, s)
        else:
            s['res'] += lxml.html.tostring(child, encoding='unicode')
            tail_added = True
        if not tail_added:
            s['res'] += child.tail or ''
    s['res'] += '*' if (deep_level % 2 == 0) else '_'

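# Illustration (not from the gist): markers alternate with depth, so
# '<strong>bold <em>and italic</em></strong>' is rendered as
# '**bold _and italic_**' (even depths use * and **, odd depths use _ and __).
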
def maybe_formula(par):
    # Convert a MathJax display formula \[ ... \] into $$ ... $$ form.
    if par.lstrip().startswith(r'\[') and par.rstrip().endswith(r'\]'):
        return '$$ ' + par.strip()[2:-2].strip() + ' $$'
    else:
        return par

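# Example (illustrative): maybe_formula(r'\[ E = mc^2 \]') returns
# '$$ E = mc^2 $$'; any paragraph not wrapped in \[ ... \] passes
# through unchanged.
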
def process_toplevel_p(p, s):
    # The question and attribution paragraphs become Markdown blockquotes.
    is_question = 'id' in p.attrib and p.attrib['id'] == 'question'
    is_attribute = 'id' in p.attrib and p.attrib['id'] == 'attribute'
    if is_question or is_attribute:
        s['res'] += '> '
    # TODO: check for a formula over the entire fragment, not just p.text
    s['res'] += maybe_formula(p.text or '')
    for child in p:
        tail_added = False
        if child.tag == 'em':
            process_em(child, 0, s)
        elif child.tag == 'strong':
            process_strong(child, 0, s)
        elif child.tag == 'a':
            process_a(child, s)
        elif child.tag == 'span':
            process_span(child, s)
        elif child.tag == 'sup':
            s['res'] += '<sup>' + ret_span(child, s) + '</sup>'
        else:
            s['res'] += lxml.html.tostring(child, encoding='unicode')
            tail_added = True
        if not tail_added:
            s['res'] += child.tail or ''
    if is_question:
        # Keep the question and the attribution in one quoted block.
        s['res'] += s['line_break'] + '>' + s['line_break']
    else:
        s['res'] += s['par_sep']
    process_footnotes(s)

def process_toplevel_img(img, s):
    # Emit the image title, a [labels] stub, and the resolved image URL.
    url = full_url(img.get('src'), context_url=s['base_url'])
    s['res'] += '%s' % img.get('title') + s['line_break']
    s['res'] += '[labels]' + s['line_break']
    s['res'] += 'TODO' + s['line_break']
    s['res'] += '[/labels]' + s['line_break']
    s['res'] += 'render: %s' % url + s['par_sep']

def postprocess_references(s):
    # Append link definitions, fetching each target page for its title.
    for r in s['references']:
        # TODO: check for spaces and so on
        t = get_title(r['url'])
        s['res'] += '[%s]: %s "%s"' % (r['num'], r['url'], t) + s['par_sep']

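# Example (illustrative): a reference collected as
# {'num': 3, 'url': 'http://example.com/'} becomes the link definition
# '[3]: http://example.com/ "Example Domain"', with the quoted title
# fetched live from the target page by get_title().
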
url = 'http://what-if.xkcd.com'
html = get_page(url, utf8=True)
doc = lxml.html.document_fromstring(html)
article = doc.xpath('//body//article')[0]

# Dump the raw article markup for debugging.
with open('a.html', 'w', encoding='utf-8') as f:
    print(innerHTML(article), file=f)

parser_state = {
    'base_url': url,
    'ref_counter': 1,
    'fn_counter': 1,
    'par_sep': '\n\n',
    'line_break': '\n',
    'res': '',
    'footnotes': [],
    'references': [],
}

for child in article:
    if child.tag == 'a':
        process_toplevel_a(child, parser_state)
    elif child.tag == 'p':
        process_toplevel_p(child, parser_state)
    elif child.tag == 'img':
        process_toplevel_img(child, parser_state)

postprocess_references(parser_state)

with open('a.md', 'w', encoding='utf-8') as f:
    print(parser_state['res'].rstrip(), file=f)

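# Typical run (illustrative; assumes network access to what-if.xkcd.com):
#   $ python3 whatif2md.py    # hypothetical filename for this script
# It writes a.html (the raw <article> markup, kept for debugging) and
# a.md (the reconstructed Markdown source of the latest what-if article).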