hothero · December 16, 2015 02:19
diff --git a/woconverter.rb b/woconverter.rb
 require 'fileutils'
 require 'date'
 require 'yaml'
 require 'rexml/document'
 require 'ya2yaml'
 require 'uri'

 # Converts a WordPress WXR export into Octopress/Jekyll posts.
 #
 # Usage: ruby woconverter.rb <wordpress-export.xml>
 #
 # Every published item of type "post" is written to
 # _posts/YYYY-MM-DD-<wordpress id>.textile, prefixed with a YAML front matter
 # block. Published pages are matched by the XPath query below but are skipped.

 # Rewrites WordPress SyntaxHighlighter shortcodes ([ruby]...[/ruby] and
 # friends) into Octopress codeblock tags.
 #
 # content - String holding the post body.
 #
 # Returns a new String; the argument is not modified.
 def convert_shortcodes(content)
   # shortcode name => codeblock language; nil emits a bare {% codeblock %}
   languages = {
     'csharp' => 'csharp',
     'shell' => nil,
     'ruby' => 'ruby',
     'html' => 'html',
     'python' => 'python',
     'php' => 'php',
     'javascript' => 'javascript'
   }

   languages.inject(content) do |text, (shortcode, language)|
     opening = language ? "{% codeblock lang:#{language} %}" : '{% codeblock %}'
     text.gsub("[#{shortcode}]", opening).gsub("[/#{shortcode}]", '{% endcodeblock %}')
   end
 end

 # Applies every textual conversion to a post body.
 #
 # content   - String holding the raw post body from the export.
 # site_link - String base URL of the WordPress site (rss/channel/link).
 #
 # Returns the converted body as a new String.
 def convert_content(content, site_link)
   text = convert_shortcodes(content)

   # Drop <div> wrappers but keep what they contain. The tag pattern stops at
   # the first '>' so that markup following the opening tag on the same line is
   # not swallowed together with it.
   text = text.gsub(/<div\b[^>]*>/, '').gsub('</div>', '')

   # Absolute upload and category links become site-relative.
   text = text.gsub("#{site_link}/wp-content/uploads", '/wp-content/uploads')
   text = text.gsub("#{site_link}/category", '/blog/categories')

   # NOTE: blank lines are kept on purpose (stripping them broke the layout),
   # and <pre> blocks are left untouched.

   # <h1>..<h3> become '#', '##', '###' headers.
   (1..3).each do |level|
     text = text.gsub(%r{<h#{level}>([^<]*)</h#{level}>}, "#{'#' * level} \\1")
   end

   text
 end

 # Decodes percent-escapes in a link so non-ASCII (e.g. Chinese) permalinks
 # stay readable. URI.unescape was removed in Ruby 3.0; the parser object
 # still provides the same decoding.
 #
 # link - String that may contain %XX sequences.
 #
 # Returns the decoded String.
 def unescape_link(link)
   URI::DEFAULT_PARSER.unescape(link)
 end

 export_path = ARGV[0]

 if export_path.nil?
   warn 'usage: ruby woconverter.rb <wordpress-export.xml>'
 else
   # Parse inside a block so the export file handle is closed afterwards.
   doc = File.open(export_path) { |io| REXML::Document.new(io) }

   # FileUtils.rmdir only removes empty directories and raises when the
   # directory is missing or not empty, so removing and re-creating "_posts"
   # was at best a no-op. Just make sure the directory exists.
   FileUtils.mkdir_p '_posts'

   site_link = REXML::XPath.first(doc, 'rss/channel/link').text

   # all published articles and pages
   doc.elements.each("rss/channel/item[wp:status = 'publish' and (wp:post_type = 'post' or wp:post_type = 'page')]") do |item|
     post = item.elements
     p post['wp:post_name'].text

     # Pages are selected by the query but are not exported; skip them before
     # doing any conversion work.
     post_type = post['wp:post_type'].text
     next unless post_type.downcase == 'post'

     wordpress_id = post['wp:post_id'].text
     # The WordPress id is used as slug instead of wp:post_name.
     published_on = Date.parse(post['wp:post_date'].text)
     date_string = published_on.strftime('%Y-%m-%d')
     filename = "_posts/#{date_string}-#{wordpress_id}.textile"

     # to_s guards against items whose <title> or body is empty (nil text).
     title_string = post['title'].text.to_s.encode('UTF-8')

     # Split <category> children into tags and categories by their domain:
     #   <category domain="post_tag" nicename="warning">...</category>
     #   <category domain="category" nicename="os">...</category>
     categories = []
     tags = []
     post.each('category') do |cat|
       case cat.attribute('domain').to_s.downcase
       when 'post_tag' then tags << cat.text
       when 'category' then categories << cat.text
       end
     end

     raw_content = post['content:encoded'].text.to_s.encode('UTF-8')
     content = convert_content(raw_content, site_link)

     # Site-relative permalink with percent-escapes decoded.
     article_link = unescape_link(post['link'].text.gsub(site_link, ''))

     puts "Converting: #{filename.sub('_posts/', '')}"

     File.open(filename, 'w') do |f|
       f.puts '---'
       f.puts "layout: #{post_type}"
       f.puts "title: \"#{title_string.gsub('"', '&quot;')}\""
       f.puts "date: #{date_string}"
       f.puts "wordpress_id: #{wordpress_id}"
       f.puts "permalink: #{article_link}"
       f.puts 'comments: true'
       f.puts "categories: [#{categories.join(', ')}]"
       f.puts "tags: [#{tags.join(', ')}]"

       # for SEO
       f.puts "keywords: #{tags.join(', ')}"

       f.puts '---'
       f.puts content
     end
   end
 end
	require 'fileutils'
	require 'date'
	require 'yaml'
	require 'rexml/document'
	require 'ya2yaml'
	require 'uri'

	# Converts a WordPress WXR export into Octopress/Jekyll posts.
	#
	# Usage: ruby woconverter.rb <wordpress-export.xml>
	#
	# Every published item of type "post" is written to
	# _posts/YYYY-MM-DD-<wordpress id>.textile, prefixed with a YAML front matter
	# block. Published pages are matched by the XPath query below but are skipped.

	# Rewrites WordPress SyntaxHighlighter shortcodes ([ruby]...[/ruby] and
	# friends) into Octopress codeblock tags.
	#
	# content - String holding the post body.
	#
	# Returns a new String; the argument is not modified.
	def convert_shortcodes(content)
	  # shortcode name => codeblock language; nil emits a bare {% codeblock %}
	  languages = {
	    'csharp' => 'csharp',
	    'shell' => nil,
	    'ruby' => 'ruby',
	    'html' => 'html',
	    'python' => 'python',
	    'php' => 'php',
	    'javascript' => 'javascript'
	  }

	  languages.inject(content) do |text, (shortcode, language)|
	    opening = language ? "{% codeblock lang:#{language} %}" : '{% codeblock %}'
	    text.gsub("[#{shortcode}]", opening).gsub("[/#{shortcode}]", '{% endcodeblock %}')
	  end
	end

	# Applies every textual conversion to a post body.
	#
	# content   - String holding the raw post body from the export.
	# site_link - String base URL of the WordPress site (rss/channel/link).
	#
	# Returns the converted body as a new String.
	def convert_content(content, site_link)
	  text = convert_shortcodes(content)

	  # Drop <div> wrappers but keep what they contain. The tag pattern stops at
	  # the first '>' so that markup following the opening tag on the same line is
	  # not swallowed together with it.
	  text = text.gsub(/<div\b[^>]*>/, '').gsub('</div>', '')

	  # Absolute upload and category links become site-relative.
	  text = text.gsub("#{site_link}/wp-content/uploads", '/wp-content/uploads')
	  text = text.gsub("#{site_link}/category", '/blog/categories')

	  # NOTE: blank lines are kept on purpose (stripping them broke the layout),
	  # and <pre> blocks are left untouched.

	  # <h1>..<h3> become '#', '##', '###' headers.
	  (1..3).each do |level|
	    text = text.gsub(%r{<h#{level}>([^<]*)</h#{level}>}, "#{'#' * level} \\1")
	  end

	  text
	end

	# Decodes percent-escapes in a link so non-ASCII (e.g. Chinese) permalinks
	# stay readable. URI.unescape was removed in Ruby 3.0; the parser object
	# still provides the same decoding.
	#
	# link - String that may contain %XX sequences.
	#
	# Returns the decoded String.
	def unescape_link(link)
	  URI::DEFAULT_PARSER.unescape(link)
	end

	export_path = ARGV[0]

	if export_path.nil?
	  warn 'usage: ruby woconverter.rb <wordpress-export.xml>'
	else
	  # Parse inside a block so the export file handle is closed afterwards.
	  doc = File.open(export_path) { |io| REXML::Document.new(io) }

	  # FileUtils.rmdir only removes empty directories and raises when the
	  # directory is missing or not empty, so removing and re-creating "_posts"
	  # was at best a no-op. Just make sure the directory exists.
	  FileUtils.mkdir_p '_posts'

	  site_link = REXML::XPath.first(doc, 'rss/channel/link').text

	  # all published articles and pages
	  doc.elements.each("rss/channel/item[wp:status = 'publish' and (wp:post_type = 'post' or wp:post_type = 'page')]") do |item|
	    post = item.elements
	    p post['wp:post_name'].text

	    # Pages are selected by the query but are not exported; skip them before
	    # doing any conversion work.
	    post_type = post['wp:post_type'].text
	    next unless post_type.downcase == 'post'

	    wordpress_id = post['wp:post_id'].text
	    # The WordPress id is used as slug instead of wp:post_name.
	    published_on = Date.parse(post['wp:post_date'].text)
	    date_string = published_on.strftime('%Y-%m-%d')
	    filename = "_posts/#{date_string}-#{wordpress_id}.textile"

	    # to_s guards against items whose <title> or body is empty (nil text).
	    title_string = post['title'].text.to_s.encode('UTF-8')

	    # Split <category> children into tags and categories by their domain:
	    #   <category domain="post_tag" nicename="warning">...</category>
	    #   <category domain="category" nicename="os">...</category>
	    categories = []
	    tags = []
	    post.each('category') do |cat|
	      case cat.attribute('domain').to_s.downcase
	      when 'post_tag' then tags << cat.text
	      when 'category' then categories << cat.text
	      end
	    end

	    raw_content = post['content:encoded'].text.to_s.encode('UTF-8')
	    content = convert_content(raw_content, site_link)

	    # Site-relative permalink with percent-escapes decoded.
	    article_link = unescape_link(post['link'].text.gsub(site_link, ''))

	    puts "Converting: #{filename.sub('_posts/', '')}"

	    File.open(filename, 'w') do |f|
	      f.puts '---'
	      f.puts "layout: #{post_type}"
	      f.puts "title: \"#{title_string.gsub('"', '&quot;')}\""
	      f.puts "date: #{date_string}"
	      f.puts "wordpress_id: #{wordpress_id}"
	      f.puts "permalink: #{article_link}"
	      f.puts 'comments: true'
	      f.puts "categories: [#{categories.join(', ')}]"
	      f.puts "tags: [#{tags.join(', ')}]"

	      # for SEO
	      f.puts "keywords: #{tags.join(', ')}"

	      f.puts '---'
	      f.puts content
	    end
	  end
	end