Skip to content

Instantly share code, notes, and snippets.

@ids1024
Created March 18, 2018 01:04
Show Gist options
  • Save ids1024/1530bca58dace62971dbbf8270f36c6d to your computer and use it in GitHub Desktop.
Save ids1024/1530bca58dace62971dbbf8270f36c6d to your computer and use it in GitHub Desktop.
import os
from urllib.request import urlretrieve
import tarfile
import re
import bs4
# Fetch the susv4tc2 archive (if it is not already present in the working
# directory) and, for every archive member that looks like a header page under
# basedefs/, write the text of its <pre> elements to output/<header name>.

TAR = "susv4tc2.tar.bz2"
DLURL = "http://pubs.opengroup.org/onlinepubs/9699919799/download/" + TAR

# Matches archive members named like "susv4tc2/basedefs/<something>.h.html";
# group 1 is the "<something>.h" part, which becomes the output file name.
_HEADER_PAGE = re.compile("susv4tc2/basedefs/(.*\\.h)\\.html")

if not os.path.exists(TAR):
    # Download under a temporary name and rename only after the transfer
    # finished: an interrupted download must not leave a truncated file that
    # the existence check above would treat as a complete archive next run.
    partial = TAR + ".part"
    urlretrieve(DLURL, partial)
    os.replace(partial, TAR)

os.makedirs("output", exist_ok=True)

with tarfile.open(TAR) as tar:
    for member in tar.getmembers():
        m = _HEADER_PAGE.match(member.name)
        if not m:
            continue
        name = m.group(1)

        page = tar.extractfile(member)
        if page is None:
            # extractfile() returns None for members that are not regular
            # files or links; there is nothing to parse in that case.
            continue
        with page:
            html = bs4.BeautifulSoup(page, 'lxml')

        # Collect every line of every <pre> element, skipping lines that
        # begin with '['.
        # NOTE(review): presumably those are annotation markers rather than
        # header source text -- confirm against a sample page.
        src = '\n'.join(
            line
            for pre in html.find_all('pre')
            for line in pre.text.splitlines()
            if not line.startswith('[')
        )

        # Pin the encoding so the result does not depend on the platform's
        # locale (and cannot fail on non-ASCII characters in the page text).
        with open("output/" + name, 'w', encoding='utf-8') as hfile:
            hfile.write(src)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment