Created
July 2, 2014 17:25
-
-
Save brandonmwest/9a4a7f3eb3cd474f666f to your computer and use it in GitHub Desktop.
Convert Jekyll HTML with custom Liquid tags and codeblocks to well-structured Markdown.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems'
require 'json'
require 'nokogiri'
require 'nokogiri-pretty'
require 'iconv'
require 'pandoc-ruby'
require 'fileutils'
# Convert Jekyll HTML pages (with custom Liquid tags and codeblocks) to Markdown.
#
# For every *.html file under the source tree this script:
#   1. converts the page to Markdown with pandoc,
#   2. puts the original front matter and the original codeblocks back, because
#      pandoc does not preserve their formatting,
#   3. normalizes the line breaks around the custom Liquid tags,
#   4. pretty-prints JSON, XML and <table> fragments,
#   5. writes the result next to the source as *.md and DELETES the *.html file.

# Path fragments that exclude a file from conversion (plain substring match).
SKIPPED_PATH_FRAGMENTS = [
  '_layouts', '_includes', '_assets', 'index.html', 'search.html',
  'code_workshop.html', 'api_workshop.html', 'error.html'
].freeze

# First "--- ... ---" section of a document; group 2 is the text between the markers.
FRONT_MATTER = /(\s?---\s?)(.*?)(\s?---\s?)/m

# A complete {% codeblock ... %} ... {% endcodeblock %} section.
CODEBLOCK = /{%\s?codeblock .*?\s+%}.*?{\%\s?endcodeblock\s?%}/m

# [pattern, replacement] pairs that put line breaks around Liquid tags.
# "\\1" re-inserts the tag arguments captured by the pattern. The pairs are
# applied in this order; each pattern may swallow one whitespace character
# after the tag, so reordering them can change the output.
TAG_SPACING = [
  [/{%\s?anchor\s?(.*?)\s?%}\s?/,      "\n{% anchor \\1 %}\n"],
  [/{%\s?endanchor\s?%}\s?/,           "\n{% endanchor %}\n"],
  [/{%\s?info\s?%}\s?/,                "\n{% info %}\n"],
  [/{%\s?endinfo\s?%}\s?/,             "\n{% endinfo %}\n"],
  [/{%\s?warning\s?%}\s?/,             "\n{% warning %}\n"],
  [/{%\s?endwarning\s?%}\s?/,          "\n{% endwarning %}\n"],
  [/{%\s?apiexample\s?(.*?)\s?%}\s?/,  "\n{% apiexample \\1 %}"],
  [/{%\s?endapiexample\s?%}\s?/,       "{% endapiexample %}\n"],
  [/{%\s?requestblock\s?%}\s?/,        "\n{% requestblock %}\n"],
  [/{%\s?endrequestblock\s?%}\s?/,     "\n{% endrequestblock %}\n"],
  [/{%\s?parameter\s(.*?)\s?%}\s?/,    "\n {% parameter \\1 %}"],
  [/{%\s?endparameter\s?%}\s?/,        "\n {% endparameter %}\n"],
  [/{%\s?requesturl\s?(.*?)\s?%}\s?/,  "\n {% requesturl \\1 %}"],
  [/{%\s?endrequesturl\s?%}\s?/,       "\n {% endrequesturl %}\n"],
  [/{%\s?requestdata\s?(.*?)\s?%}\s?/, "\n {% requestdata \\1 %}"],
  [/{%\s?endrequestdata\s?%}\s?/,      "\n {% endrequestdata %}\n"]
].freeze

# Fragments stripped from a pretty-printed table: the XML declaration and the
# presentational attributes found in the pandoc output.
TABLE_NOISE = [
  /<\?xml version="1\.0" encoding="ISO-8859-1"\?>\s*/,
  /\s*class="odd"\s*/,
  /\s*class="even"\s*/,
  /\s*class="header"\s*/,
  /\s*align="left"\s*/,
  /\s*markdown="1"\s*/
].freeze

# @param html_file [String] path of a candidate HTML file
# @return [Boolean] true when the path contains one of SKIPPED_PATH_FRAGMENTS
def skip_file?(html_file)
  SKIPPED_PATH_FRAGMENTS.any? { |fragment| html_file.include?(fragment) }
end

# @param html_file [String] path ending in ".html"
# @return [String] the same path with only the trailing ".html" replaced by ".md"
def markdown_path_for(html_file)
  # Anchored so an earlier ".html" inside a directory name is left alone.
  html_file.sub(/\.html\z/, '.md')
end

# Replaces the first front-matter section of +contents+ with the one found in +html+.
#
# @param contents [String] pandoc output
# @param html [String] original HTML source
# @return [String] +contents+ with the original front matter; unchanged when
#   +html+ has no front matter
def restore_front_matter(contents, html)
  front_matter = html.match(FRONT_MATTER)
  return contents if front_matter.nil?

  # Block form: the front matter is inserted verbatim, so backslash sequences
  # in it are not treated as backreferences.
  contents.sub(FRONT_MATTER) { "---\n#{front_matter[2]}\n---\n" }
end

# Swaps each codeblock in +contents+ for the codeblock at the same ordinal
# position in +html+, surrounded by blank lines.
#
# @param contents [String] pandoc output
# @param html [String] original HTML source (not modified)
# @return [String] +contents+ with the original codeblocks; a converted block
#   is kept as-is when +html+ has no codeblock left to pair it with
def restore_codeblocks(contents, html)
  remaining = html.dup
  contents.gsub(CODEBLOCK) do |converted|
    # slice! returns the first remaining original block and removes it, which
    # keeps the two documents in step.
    original = remaining.slice!(CODEBLOCK)
    next converted if original.nil?

    # NOTE(review): as written this substitution replaces "&" with "&" and so
    # changes nothing; it presumably lost an HTML entity - confirm the intent.
    "\n\n" + original.gsub(/&(?!amp)/, "&") + "\n\n"
  end
end

# Applies every TAG_SPACING rule, in order.
#
# @param contents [String]
# @return [String] +contents+ with line breaks normalized around Liquid tags
def normalize_tag_spacing(contents)
  TAG_SPACING.inject(contents) do |text, (pattern, replacement)|
    text.gsub(pattern, replacement)
  end
end

# @param body [String] text expected to be JSON
# @param html_file [String] file name used in the diagnostic message
# @return [String] pretty-printed JSON, or +body+ unchanged (after printing a
#   message) when it does not parse
def pretty_json_body(body, html_file)
  JSON.pretty_generate(JSON.parse(body))
rescue JSON::ParserError
  puts "\ninvalid JSON or non-JSON javascript block in #{html_file}: #{body}\n"
  body
end

# Pretty-prints the body of every JSON codeblock and JSON response block.
#
# @param contents [String]
# @param html_file [String] file name used in diagnostic messages
# @return [String]
def pretty_print_json(contents, html_file)
  text = contents.gsub(/({%\s?codeblock lang:json\s?%})(.*?)({\%\s?endcodeblock\s?%})/m) do
    "\n{% codeblock lang:json %}\n#{pretty_json_body(Regexp.last_match(2), html_file)}\n{% endcodeblock %}\n"
  end
  text.gsub(/(\s?{%\s?response json\s?%}\s?)(.*?)({\%\s?endresponse\s?%}\s?)/m) do
    "\n {% response json %}\n#{pretty_json_body(Regexp.last_match(2), html_file)}\n {% endresponse %}\n"
  end
end

# @param body [String] text expected to be XML
# @param html_file [String] file name used in the diagnostic message
# @return [String, nil] pretty-printed XML, or nil (after printing a message)
#   when parsing or formatting raises
def pretty_xml_body(body, html_file)
  Nokogiri.XML(body, nil, "UTF-8").human
rescue StandardError
  puts "\ninvalid XML block in #{html_file}: #{body}\n"
  nil
end

# Pretty-prints the body of every XML codeblock and XML response block.
# A block whose body cannot be processed is left exactly as it was.
#
# @param contents [String]
# @param html_file [String] file name used in diagnostic messages
# @return [String]
def pretty_print_xml(contents, html_file)
  text = contents.gsub(/(\s?{%\s?codeblock lang:xml\s?%})(.*?)({\%\s?endcodeblock\s?%})/m) do |block|
    pretty = pretty_xml_body(Regexp.last_match(2), html_file)
    pretty.nil? ? block : "\n{% codeblock lang:xml %}\n#{pretty}\n{% endcodeblock %}\n"
  end
  text.gsub(/(\s?{%\s?response xml\s?%}\s?)(.*?)({\%\s?endresponse\s?%}\s?)/m) do |block|
    pretty = pretty_xml_body(Regexp.last_match(2), html_file)
    pretty.nil? ? block : "\n {% response xml %}\n#{pretty}\n {% endresponse %}\n"
  end
end

# Re-renders every <table> with a fixed class attribute and pretty-printed
# markup, then strips the TABLE_NOISE fragments. A table that cannot be parsed
# is left exactly as it was.
#
# @param contents [String]
# @param html_file [String] file name used in diagnostic messages
# @return [String]
def pretty_print_tables(contents, html_file)
  contents.gsub(/(<table.*?>)(.*?)(<\/table>)/m) do |table|
    begin
      # String#delete always returns a String, also when the table has no newline.
      table_html = '<table class="table table-bordered table-striped">' +
                   Regexp.last_match(2).delete("\n") + '</table>'
      xml = Nokogiri.XML(table_html, nil, "UTF-8")
    rescue StandardError
      puts "\ninvalid HTML block in #{html_file}:\n"
      next table
    end
    pretty_html = Iconv.conv 'UTF-8', 'iso8859-1', xml.human
    TABLE_NOISE.each { |noise| pretty_html.gsub!(noise, "") }
    pretty_html
  end
end

#convert HTML to markdown
html_files = File.join("/Users/brandonwest/SendGrid/docs/source", "**", "*.html")

Dir.glob(html_files) do |html_file|
  next if skip_file?(html_file)

  puts "Converting #{html_file}"
  html = File.open(html_file, "r:UTF-8") { |file| file.read }

  #Convert to markdown!
  contents = PandocRuby.html(html).convert({:f => :html, :to => "markdown_mmd-pipe_tables" }, 'no-wrap', 'parse-raw', 'atx-headers')

  contents = restore_front_matter(contents, html)
  contents = restore_codeblocks(contents, html)
  contents = normalize_tag_spacing(contents)
  contents = pretty_print_json(contents, html_file)
  contents = pretty_print_xml(contents, html_file)
  contents = pretty_print_tables(contents, html_file)

  # Drop leftover markdown="1" attributes and un-escape underscores.
  contents = contents.gsub(/\s*markdown="1"\s*/, "").gsub(/\\_/, "_")

  output_path = markdown_path_for(html_file)#.sub('source_html','source')
  FileUtils.mkdir_p(File.dirname(output_path))
  FileUtils.rm_f(output_path)
  File.open(output_path, "w:UTF-8") { |file| file.write(contents) }

  ##CAREFUL! The source HTML file is removed once the Markdown has been written.
  File.delete(html_file)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment