Forked from simonw/index_django_docs_to_elasticsearch.py
Created August 21, 2016 17:34
-
-
Save willingc/d9051e55e773c46083129570f4bc5975 to your computer and use it in GitHub Desktop.
Example code for converting Django docs into a format suitable for import into Elasticsearch - for a talk I'm giving.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import re | |
import os | |
import urlparse | |
import hashlib | |
import json | |
import sys | |
# Matches a reST title that is both over- and underlined: a run of three or
# more '=' or '*' characters, the title text on the next line, then the very
# same run repeated (the backreference \1 forces the two rules to be equal).
title_re = re.compile(r'([=*]{3,})\n([^\n]+)\n\1\n')


def find_title(rst):
    """Return the text of the first over/underlined title in *rst*.

    Only titles written with a matching rule above and below are detected,
    e.g.::

        =================
        FAQ: Installation
        =================

    Returns the title line ('FAQ: Installation' above), or an empty string
    when no such title is present.
    """
    # search() stops at the first hit instead of collecting every title in
    # the document the way findall() does; only the first one is ever used.
    match = title_re.search(rst)
    if match is None:
        return ''
    return match.group(2)
def walk_documentation(path='.', base_url=''):
    """Yield one document dict per ``.txt`` file found beneath *path*.

    path: directory to walk recursively (resolved with os.path.realpath).
    base_url: URL prefix the relative document paths are joined onto.

    Each yielded dict has the keys:
      'title'      - first title found by find_title(), or '' if none
      'path'       - file path relative to *path*
      'top_folder' - first component of that relative path
      'url'        - base_url joined with the relative path, with the
                     '.txt' suffix replaced by a trailing '/'
      'content'    - full text of the file
      'id'         - SHA-1 hex digest of the url
    """
    # Function-scope imports keep this block self-contained and let it run
    # on both Python 2 (urlparse) and Python 3 (urllib.parse).
    import io
    try:
        from urllib.parse import urljoin
    except ImportError:
        from urlparse import urljoin

    # urljoin() replaces the last path segment of a base that lacks a
    # trailing slash ('.../en/1.10' + 'faq/' -> '.../en/faq/'), so make
    # sure a non-empty base always ends with one.
    if base_url and not base_url.endswith('/'):
        base_url += '/'

    root = os.path.realpath(path)
    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            filepath = os.path.join(dirpath, filename)
            # Context manager closes the handle promptly; explicit UTF-8
            # avoids depending on the platform's default encoding.
            with io.open(filepath, encoding='utf-8') as handle:
                content = handle.read()
            # Figure out path relative to the walked root.
            relpath = os.path.relpath(filepath, root)
            # URLs always use '/', whatever the local path separator is.
            url_path = relpath.replace(os.sep, '/')
            url = urljoin(base_url, url_path[:-len('.txt')] + '/')
            # hashlib needs bytes; text strings are encoded first.
            url_bytes = url if isinstance(url, bytes) else url.encode('utf-8')
            yield {
                'title': find_title(content),
                'path': relpath,
                'top_folder': url_path.split('/')[0],
                'url': url,
                'content': content,
                'id': hashlib.sha1(url_bytes).hexdigest(),
            }
def usage(extra_error=''):
    """Write usage help to stderr and exit with status 1.

    extra_error: optional message appended after the usage text.

    The help goes to stderr rather than stdout because the script's stdout
    is meant to be piped into curl (see the example line below); usage text
    on stdout would be sent along as if it were data.

    Raises SystemExit(1) via sys.exit; never returns.
    """
    prog = sys.argv[0]
    lines = [
        "Outputs sphinx docs in a format that can be bulk loaded into elasticsearch",
        "Usage: %s <path-to-folder> <base-url-of-hosted-docs>" % prog,
        "e.g.",
        # '-XPOST' (with the dash): a bare 'XPOST' would be read by curl as
        # an extra URL argument.
        " %s ~/django/docs/ https://docs.djangoproject.com/en/1.10/ | curl -s -XPOST localhost:9200/docsearch/doc/_bulk --data-binary @-" % prog,
    ]
    if extra_error:
        lines.append(extra_error)
    sys.stderr.write("\n".join(lines) + "\n")
    sys.exit(1)
if __name__ == '__main__':
    # Command line: <path-to-folder> <base-url-of-hosted-docs>.
    # usage() exits the process, so execution only continues past each
    # check when the arguments are acceptable.
    args = sys.argv[1:]
    if len(args) != 2:
        usage()
    path, base_url = args
    if not base_url.startswith(('http://', 'https://')):
        usage('Docs URL must start with http or https')
    for document in walk_documentation(path, base_url):
        # Two JSON lines per document: an 'index' action carrying the
        # document id, followed by the document itself.
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(json.dumps({'index': {'_id': document['id']}}))
        print(json.dumps(document))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment