brandonmwest · July 2, 2014 17:25
diff --git a/html2markdown.rb b/html2markdown.rb
 require 'rubygems'
 require 'json'
 require 'nokogiri'
 require 'nokogiri-pretty'
 require 'iconv'
 require 'pandoc-ruby'
 require 'fileutils'

 # Convert the HTML pages under the docs source tree to Markdown.
 #
 # For every page found, the script: runs pandoc over the HTML, puts the
 # original front matter and codeblocks back, adds line breaks around Liquid
 # tags, pretty-prints JSON / XML samples and HTML tables, writes the result
 # to a sibling .md file and finally DELETES the .html source.
 #
 # Usage: ruby html2markdown.rb [SOURCE_ROOT]
 # SOURCE_ROOT is optional and defaults to the previously hard-coded path.

 # First "--- ... ---" span; capture 2 is the front-matter body.
 FRONT_MATTER_RE = /(\s?---\s?)(.*?)(\s?---\s?)/m

 # One whole {% codeblock ... %} ... {% endcodeblock %} span.
 CODEBLOCK_RE = /{%\s?codeblock .*?\s+%}.*?{\%\s?endcodeblock\s?%}/m

 # Pages whose path contains any of these fragments are left untouched.
 SKIPPED_PATH_FRAGMENTS = %w[
   _layouts _includes _assets index.html search.html
   code_workshop.html api_workshop.html error.html
 ].freeze

 # Attributes (and the XML declaration) removed from re-rendered tables.
 TABLE_NOISE_PATTERNS = [
   /<\?xml version="1\.0" encoding="ISO-8859-1"\?>\s*/,
   /\s*class="odd"\s*/,
   /\s*class="even"\s*/,
   /\s*class="header"\s*/,
   /\s*align="left"\s*/,
   /\s*markdown="1"\s*/
 ].freeze

 # True when +path+ must not be converted. Fragments are compared literally,
 # so the "." in "search.html" no longer acts as a regex wildcard.
 def skip_file?(path)
   return true if path == '.' || path == '..'

   SKIPPED_PATH_FRAGMENTS.any? { |fragment| path.include?(fragment) }
 end

 # Output path for +html_file+: only a trailing ".html" is rewritten, so a
 # directory name that happens to contain ".html" is left alone.
 def markdown_path_for(html_file)
   html_file.sub(/\.html\z/, '.md')
 end

 # Returns +contents+ with its first front-matter span replaced by the
 # front matter found in +html+. When +html+ has none, +contents+ is
 # returned unchanged instead of raising on nil. The block form of sub keeps
 # backslashes in the front matter literal.
 def restore_front_matter(contents, html)
   front_matter = html.match(FRONT_MATTER_RE)
   return contents unless front_matter

   contents.sub(FRONT_MATTER_RE) { "---\n" + front_matter[2] + "\n---\n" }
 end

 # Returns +contents+ with each codeblock replaced, in order, by the
 # corresponding codeblock taken from +html+ (bare "&" escaped as "&amp;").
 # If +html+ runs out of codeblocks the converted block is kept as-is rather
 # than raising. +html+ itself is not modified.
 def restore_codeblocks(contents, html)
   remaining = html.dup
   contents.gsub(CODEBLOCK_RE) do |converted|
     # slice! removes the first remaining original so the two stay in step
     original = remaining.slice!(CODEBLOCK_RE)
     next converted unless original

     ("\n\n" + original + "\n\n").gsub(/&(?!amp)/, "&amp;")
   end
 end

 # Returns +contents+ with line breaks (and, for request sub-tags, a
 # two-space indent) placed around the Liquid tags used by the docs.
 # The passes run in the original order.
 def space_liquid_tags(contents)
   text = contents.gsub(/{%\s?anchor\s?(.*?)\s?%}\s?/) { "\n{% anchor #{Regexp.last_match(1)} %}\n" }
   text = text.gsub(/{%\s?endanchor\s?%}\s?/, "\n{% endanchor %}\n")

   text = text.gsub(/{%\s?info\s?%}\s?/, "\n{% info %}\n")
   text = text.gsub(/{%\s?endinfo\s?%}\s?/, "\n{% endinfo %}\n")

   text = text.gsub(/{%\s?warning\s?%}\s?/, "\n{% warning %}\n")
   text = text.gsub(/{%\s?endwarning\s?%}\s?/, "\n{% endwarning %}\n")

   text = text.gsub(/{%\s?apiexample\s?(.*?)\s?%}\s?/) { "\n{% apiexample #{Regexp.last_match(1)} %}" }
   text = text.gsub(/{%\s?endapiexample\s?%}\s?/, "{% endapiexample %}\n")

   text = text.gsub(/{%\s?requestblock\s?%}\s?/, "\n{% requestblock %}\n")
   text = text.gsub(/{%\s?endrequestblock\s?%}\s?/, "\n{% endrequestblock %}\n")

   text = text.gsub(/{%\s?parameter\s(.*?)\s?%}\s?/) { "\n  {% parameter #{Regexp.last_match(1)} %}" }
   text = text.gsub(/{%\s?endparameter\s?%}\s?/, "\n  {% endparameter %}\n")

   text = text.gsub(/{%\s?requesturl\s?(.*?)\s?%}\s?/) { "\n  {% requesturl #{Regexp.last_match(1)} %}" }
   text = text.gsub(/{%\s?endrequesturl\s?%}\s?/, "\n  {% endrequesturl %}\n")

   text = text.gsub(/{%\s?requestdata\s?(.*?)\s?%}\s?/) { "\n  {% requestdata #{Regexp.last_match(1)} %}" }
   text = text.gsub(/{%\s?endrequestdata\s?%}\s?/, "\n  {% endrequestdata %}\n")

   # NOTE(review): this repeats the parameter pass above and therefore adds a
   # second "\n  " in front of every parameter tag. It looks accidental, but
   # it is kept because it changes the generated output -- confirm and drop.
   text = text.gsub(/{%\s?parameter\s(.*?)\s?%}\s?/) { "\n  {% parameter #{Regexp.last_match(1)} %}" }
   text.gsub(/{%\s?endparameter\s?%}\s?/, "\n  {% endparameter %}\n")
 end

 # Pretty-printed form of the JSON text +raw+. When +raw+ does not parse,
 # the problem is reported on stdout and +raw+ is returned untouched.
 def pretty_json_body(raw, html_file)
   parsed = JSON.parse(raw)
 rescue StandardError
   puts "\ninvalid JSON or non-JSON javascript block in #{html_file}: #{raw}\n)"
   puts raw
   raw
 else
   JSON.pretty_generate(parsed)
 end

 # Returns +contents+ with the bodies of JSON codeblocks and JSON response
 # blocks pretty-printed. The closing response tag is indented by two spaces
 # in every case (the valid-JSON path used to emit it unindented, unlike the
 # invalid-JSON and XML paths).
 def pretty_print_json(contents, html_file)
   text = contents.gsub(/({%\s?codeblock lang:json\s?%})(.*?)({\%\s?endcodeblock\s?%})/m) do
     "\n{% codeblock lang:json %}\n" + pretty_json_body(Regexp.last_match(2), html_file) + "\n{% endcodeblock %}\n"
   end

   text.gsub(/(\s?{%\s?response json\s?%}\s?)(.*?)({\%\s?endresponse\s?%}\s?)/m) do
     "\n  {% response json %}\n" + pretty_json_body(Regexp.last_match(2), html_file) + "\n  {% endresponse %}\n"
   end
 end

 # Returns +contents+ with the bodies of XML codeblocks and XML response
 # blocks re-rendered through Nokogiri. If parsing raises, the block is
 # reported and kept verbatim (a bare `next` used to blank it out).
 def pretty_print_xml(contents, html_file)
   text = contents.gsub(/(\s?{%\s?codeblock lang:xml\s?%})(.*?)({\%\s?endcodeblock\s?%})/m) do |block|
     raw = Regexp.last_match(2)
     begin
       xml = Nokogiri.XML(raw, nil, "UTF-8")
     rescue StandardError
       puts "\ninvalid XML block in #{html_file}: #{raw}\n)"
       next block
     end
     "\n{% codeblock lang:xml %}" + "\n" + xml.human + "\n" + "{% endcodeblock %}\n"
   end

   text.gsub(/(\s?{%\s?response xml\s?%}\s?)(.*?)({\%\s?endresponse\s?%}\s?)/m) do |block|
     raw = Regexp.last_match(2)
     begin
       xml = Nokogiri.XML(raw, nil, "UTF-8")
     rescue StandardError
       puts "\ninvalid XML block in #{html_file}: #{raw}\n)"
       next block
     end
     "\n  {% response xml %}" + "\n" + xml.human + "\n" + "  {% endresponse %}\n"
   end
 end

 # Wraps the table body +inner+ in the standard table tag with all newlines
 # removed. Uses String#delete: the former gsub! returned nil for bodies
 # without a newline, which made the whole table disappear.
 def table_markup(inner)
   '<table class="table table-bordered table-striped">' + inner.to_s.delete("\n") + '</table>'
 end

 # Returns +table_html+ without the XML declaration and the presentational
 # attributes listed in TABLE_NOISE_PATTERNS.
 def strip_table_noise(table_html)
   TABLE_NOISE_PATTERNS.inject(table_html) { |text, noise| text.gsub(noise, "") }
 end

 # Returns +contents+ with every <table> re-rendered through Nokogiri and
 # cleaned up. A table that cannot be processed is reported and kept as-is.
 def pretty_print_tables(contents, html_file)
   contents.gsub(/(<table.*?>)(.*?)(<\/table>)/m) do |table|
     begin
       xml = Nokogiri.XML(table_markup(Regexp.last_match(2)), nil, "UTF-8")
     rescue StandardError
       puts "\ninvalid HTML block in #{html_file}:\n)"
       next table
     end

     # presumably xml.human comes back as ISO-8859-1 (the declaration removed
     # in strip_table_noise says so) -- hence the conversion to UTF-8
     strip_table_noise(Iconv.conv('UTF-8', 'iso8859-1', xml.human))
   end
 end

 source_root = ARGV.fetch(0, "/Users/brandonwest/SendGrid/docs/source")
 html_files = File.join(source_root, "**", "*.html")

 Dir.glob(html_files) do |html_file|
   next if skip_file?(html_file)

   puts "Converting #{html_file}"

   # block form closes the handle even if reading raises
   html = File.open(html_file, "r:UTF-8", &:read)

   #Convert to markdown!
   contents = PandocRuby.html(html).convert({:f => :html, :to => "markdown_mmd-pipe_tables" }, 'no-wrap', 'parse-raw', 'atx-headers')

   # codeblocks are put back from the original HTML to preserve formatting
   contents = restore_front_matter(contents, html)
   contents = restore_codeblocks(contents, html)
   contents = space_liquid_tags(contents)
   contents = pretty_print_json(contents, html_file)
   contents = pretty_print_xml(contents, html_file)
   contents = pretty_print_tables(contents, html_file)

   contents = contents.gsub(/\s*markdown="1"\s*/, "")
   contents = contents.gsub(/\\_/, "_")

   output_path = markdown_path_for(html_file)#.sub('source_html','source')

   FileUtils.mkdir_p(File.dirname(output_path))
   FileUtils.rm_f(output_path)
   File.open(output_path, "w:UTF-8") { |out| out.write(contents) }

   ##CAREFUL! the HTML source is removed once the markdown has been written
   File.delete(html_file)
 end
	require 'rubygems'
	require 'json'
	require 'nokogiri'
	require 'nokogiri-pretty'
	require 'iconv'
	require 'pandoc-ruby'
	require 'fileutils'

	#convert HTML to markdown
	#
	# This copy of the script had been damaged by markdown escaping (escaped
	# block pipes, "*" quantifiers and glob stars dropped, "&amp;" decoded,
	# doubled spaces collapsed). It is restored here to valid Ruby and also
	# guards the spots that raised or silently dropped content.
	#
	# Usage: ruby html2markdown.rb [SOURCE_ROOT]
	# SOURCE_ROOT is optional and defaults to the previously hard-coded path.
	source_root = ARGV.fetch(0, "/Users/brandonwest/SendGrid/docs/source")
	html_files = File.join(source_root, "**", "*.html")

	# pages whose path contains one of these fragments are not converted
	skipped_fragments = ['_layouts', '_includes', '_assets', 'index.html', 'search.html', 'code_workshop.html', 'api_workshop.html', 'error.html']

	front_matter_re = /(\s?---\s?)(.*?)(\s?---\s?)/m
	codeblock_re = /{%\s?codeblock .*?\s+%}.*?{\%\s?endcodeblock\s?%}/m

	#need to pull all codeblocks out, stash them, and put them back in to preserve formatting

	Dir.glob html_files do |html_file|
	  next if html_file == '.' || html_file == '..'
	  # literal comparison: the "." in "search.html" is not a wildcard here
	  next if skipped_fragments.any? { |fragment| html_file.include?(fragment) }

	  puts "Converting #{html_file}"

	  # block form closes the handle even if reading raises
	  html = File.open(html_file, "r:UTF-8", &:read)

	  #replace the front-matter from the HTML file
	  front_matter = html.match(front_matter_re)

	  #Convert to markdown!
	  contents = PandocRuby.html(html).convert({:f => :html, :to => "markdown_mmd-pipe_tables" }, 'no-wrap', 'parse-raw', 'atx-headers')
	  if front_matter
	    # block form keeps any backslashes in the front matter literal;
	    # pages without front matter are simply left as pandoc produced them
	    contents.sub!(front_matter_re) { "---\n" + front_matter[2] + "\n---\n" }
	  end

	  ##replace the codeblocks with the original ones
	  remaining_html = html.dup
	  contents.gsub!(codeblock_re) do |match|
	    #take (and remove) the first remaining codeblock from the html so we keep the indexes sync'd
	    original = remaining_html.slice!(codeblock_re)
	    # keep pandoc's version when the html has no codeblock left
	    next match unless original

	    ("\n\n" + original + "\n\n").gsub(/&(?!amp)/, "&amp;")
	  end

	  #put linebreaks before and after anchors and info blocks
	  contents.gsub!(/{%\s?anchor\s?(.*?)\s?%}\s?/) { |match| "\n{% anchor #{$1} %}\n" }
	  contents.gsub!(/{%\s?endanchor\s?%}\s?/,"\n{% endanchor %}\n")

	  contents.gsub!(/{%\s?info\s?%}\s?/,"\n{% info %}\n")
	  contents.gsub!(/{%\s?endinfo\s?%}\s?/,"\n{% endinfo %}\n")

	  contents.gsub!(/{%\s?warning\s?%}\s?/,"\n{% warning %}\n")
	  contents.gsub!(/{%\s?endwarning\s?%}\s?/,"\n{% endwarning %}\n")

	  contents.gsub!(/{%\s?apiexample\s?(.*?)\s?%}\s?/) { |match| "\n{% apiexample #{$1} %}" }
	  contents.gsub!(/{%\s?endapiexample\s?%}\s?/,"{% endapiexample %}\n")

	  contents.gsub!(/{%\s?requestblock\s?%}\s?/,"\n{% requestblock %}\n")
	  contents.gsub!(/{%\s?endrequestblock\s?%}\s?/,"\n{% endrequestblock %}\n")

	  contents.gsub!(/{%\s?parameter\s(.*?)\s?%}\s?/) { |match| "\n  {% parameter #{$1} %}" }
	  contents.gsub!(/{%\s?endparameter\s?%}\s?/,"\n  {% endparameter %}\n")

	  contents.gsub!(/{%\s?requesturl\s?(.*?)\s?%}\s?/) { |match| "\n  {% requesturl #{$1} %}" }
	  contents.gsub!(/{%\s?endrequesturl\s?%}\s?/,"\n  {% endrequesturl %}\n")

	  contents.gsub!(/{%\s?requestdata\s?(.*?)\s?%}\s?/) { |match| "\n  {% requestdata #{$1} %}" }
	  contents.gsub!(/{%\s?endrequestdata\s?%}\s?/,"\n  {% endrequestdata %}\n")

	  # NOTE(review): repeats the parameter pass above (adds a second "\n  "
	  # before each tag); kept because it affects the output -- confirm intent
	  contents.gsub!(/{%\s?parameter\s(.*?)\s?%}\s?/) { |match| "\n  {% parameter #{$1} %}" }
	  contents.gsub!(/{%\s?endparameter\s?%}\s?/,"\n  {% endparameter %}\n")

	  #Pretty print the JSON
	  contents.gsub!(/({%\s?codeblock lang:json\s?%})(.*?)({\%\s?endcodeblock\s?%})/m) do |match|
	    raw = $2
	    begin
	      body = JSON.pretty_generate(JSON.parse(raw))
	    rescue JSON::ParserError
	      puts "\ninvalid JSON or non-JSON javascript block in #{html_file}: #{raw}\n)"
	      puts raw
	      # unparseable samples are emitted unchanged
	      body = raw
	    end
	    "\n{% codeblock lang:json %}" + "\n" + body + "\n" + "{% endcodeblock %}\n"
	  end

	  contents.gsub!(/(\s?{%\s?response json\s?%}\s?)(.*?)({\%\s?endresponse\s?%}\s?)/m) do |match|
	    raw = $2
	    begin
	      body = JSON.pretty_generate(JSON.parse(raw))
	    rescue JSON::ParserError
	      puts "\ninvalid JSON or non-JSON javascript block in #{html_file}: #{raw}\n)"
	      puts raw
	      body = raw
	    end
	    # closing tag indented on both paths, matching the XML response block
	    "\n  {% response json %}" + "\n" + body + "\n" + "  {% endresponse %}\n"
	  end


	  #Pretty print the XML
	  contents.gsub!(/(\s?{%\s?codeblock lang:xml\s?%})(.*?)({\%\s?endcodeblock\s?%})/m) do |match|
	    raw = $2
	    begin
	      xml = Nokogiri.XML(raw, nil, "UTF-8")
	    rescue StandardError
	      puts "\ninvalid XML block in #{html_file}: #{raw}\n)"
	      # keep the block as-is; a bare `next` would replace it with ""
	      next match
	    end
	    "\n{% codeblock lang:xml %}" + "\n" + xml.human + "\n" + "{% endcodeblock %}\n"
	  end

	  contents.gsub!(/(\s?{%\s?response xml\s?%}\s?)(.*?)({\%\s?endresponse\s?%}\s?)/m) do |match|
	    raw = $2
	    begin
	      xml = Nokogiri.XML(raw, nil, "UTF-8")
	    rescue StandardError
	      puts "\ninvalid XML block in #{html_file}: #{raw}\n)"
	      next match
	    end
	    "\n  {% response xml %}" + "\n" + xml.human + "\n" + "  {% endresponse %}\n"
	  end


	  #Pretty print all the tables that pandoc mangled
	  contents.gsub!(/(<table.*?>)(.*?)(<\/table>)/m) do |match|
	    begin
	      # String#delete never returns nil (gsub! did when no "\n" was present,
	      # which raised here and dropped the table)
	      table_html = '<table class="table table-bordered table-striped">' + $2.to_s.delete("\n") + '</table>'
	      xml = Nokogiri.XML(table_html, nil, "UTF-8")
	    rescue StandardError
	      puts "\ninvalid HTML block in #{html_file}:\n)"
	      next match
	    end

	    pretty_html = Iconv.conv 'UTF-8', 'iso8859-1', xml.human
	    pretty_html.gsub!(/<\?xml version="1\.0" encoding="ISO-8859-1"\?>\s*/,"")
	    pretty_html.gsub!(/\s*class="odd"\s*/,"")
	    pretty_html.gsub!(/\s*class="even"\s*/,"")
	    pretty_html.gsub!(/\s*class="header"\s*/,"")
	    pretty_html.gsub!(/\s*align="left"\s*/,"")
	    pretty_html.gsub!(/\s*markdown="1"\s*/,"")
	    pretty_html
	  end

	  contents.gsub!(/\s*markdown="1"\s*/,"")
	  contents.gsub!(/\\_/,"_")

	  # only the trailing extension is rewritten
	  output_path = html_file.sub(/\.html\z/,'.md')#.sub('source_html','source')

	  FileUtils.mkdir_p(File.dirname(output_path))
	  FileUtils.rm_f(output_path)

	  File.open(output_path,"w:UTF-8") { |out| out.write(contents) }
	  ##CAREFUL! the HTML source is removed once the markdown has been written
	  File.delete(html_file)
	end