Created
January 26, 2022 11:36
-
-
Save xbelanch/e44185648944e576d05a95f9cbe0107f to your computer and use it in GitHub Desktop.
Clean all the shitty stuff from Moodle book to Markdown
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
## Clean HTML Moodle Book and export to markdown
#
# Usage: pass the exported Moodle Book HTML file as the first argument.
# Progress messages are written with `warn`; the converted document is
# written to $stdout.
require 'yaml'
require 'nokogiri'
require 'paru/pandoc'
require 'paru/filter'
require 'date'

warn <<-'EOF'
__ ___ __ __ ____ __
/ |/ /____ ____ ____/ // /___ / __ ) ____ / /_
/ /|_/ // __ \ / __ \ / __ // // _ \ / __ |/ __ \ / __/
/ / / // /_/ // /_/ // /_/ // // __// /_/ // /_/ // /_
/_/ /_/ \____/ \____/ \__,_//_/ \___//_____/ \____/ \__/
EOF

# Fail early with a usage hint instead of a TypeError raised by File.open(nil)
abort "Usage: #{File.basename($PROGRAM_NAME)} MOODLE_BOOK.html" if ARGV.empty?

# Measure and Benchmark Time for Ruby Methods
# https://stackoverflow.com/questions/11406410/measure-and-benchmark-time-for-ruby-methods
start = Time.now

# Stream that receives the final document
output = $stdout

# Stylesheet used to pretty-print the cleaned HTML
# https://gist.github.com/mislav/398334
# The path is relative, so 'tidy.xsl' is looked up in the current working
# directory. File.read is used so that no open file handle is left behind.
tidy = Nokogiri::XSLT(File.read('tidy.xsl'))

# Configuration lives in ./config, relative to the current working directory
BASE_DIR = File.absolute_path('.')
CONFIG_DIR = File.join(BASE_DIR, 'config')
yaml_default = File.join(CONFIG_DIR, 'default.yaml')

# Open the exported Moodle Book HTML (parsed with the noblanks option)
moodleBook = File.open(ARGV[0]) { |file| Nokogiri::HTML(file, &:noblanks) }
=begin | |
__ __ __ __ _ | |
\ \ / //\ | \/ | | | |
\ \_/ // \ | \ / | | | |
\ // /\ \ | |\/| | | | |
| |/ ____ \| | | | |____ | |
|_/_/ \_\_| |_|______| | |
=end | |
# Load default yaml data.
# An empty YAML file loads as a falsy value, so fall back to an empty hash to
# keep the merge at the end of this section working.
metadata = YAML.load_file(yaml_default) || {}

# Text of the second cell of the given row of the "book_info" table
book_info_cell = lambda do |row|
  moodleBook.xpath(%(//div[@class="book_info"]/table/tbody/tr[#{row}]/td[2])).text
end

# Metadata extracted from the document itself
# NOTE(review): 'title' and 'printed' both read row 3 -- this looks like a
# copy/paste slip; confirm which row of the table holds the "printed" value.
base_conf_from_moodlebok = {
  'title' => book_info_cell.call(3),
  'author' => 'Diferents autors',
  'course' => book_info_cell.call(2),
  'printed' => book_info_cell.call(3),
  'edition' => book_info_cell.call(5),
  'date' => Date.today.to_s
}

# Give every empty field an explicit placeholder
base_conf_from_moodlebok.each do |key, value|
  base_conf_from_moodlebok[key] = "#{key.capitalize} is not defined" if value.empty?
end

# Values taken from the document override the defaults
metadata.merge!(base_conf_from_moodlebok)
=begin | |
metadata.merge! | |
=end | |
=begin | |
_____ _ _ _____ _______ ______ _____ _____ | |
/ ____| | | | /\ | __ \__ __| ____| __ \ / ____| | |
| | | |__| | / \ | |__) | | | | |__ | |__) | (___ | |
| | | __ | / /\ \ | ___/ | | | __| | _ / \___ \ | |
| |____| | | |/ ____ \| | | | | |____| | \ \ ____) | | |
\_____|_| |_/_/ \_\_| |_| |______|_| \_\_____/ | |
=end | |
warn "Add chapters and subchapters as a H2 and H3"

# \d+ (instead of a single \d) so that numbers with more than one digit,
# such as "10. " or "2.11. ", are stripped as well
chapter_numbering = /^\d+\.\s/
subchapter_numbering = /^\d+\.\d+\.\s/

# Builds a table-of-contents entry from the <a> directly under +item+:
# the title without its leading number and the fragment its href points to
toc_entry = lambda do |item, numbering|
  link = item.at('./a')
  {
    :title => link['title'].sub(numbering, ''), # remove numbered chapter
    :href => link['href'].sub(/^.*#/, '') # remove http...
  }
end

chapters = moodleBook.xpath('//div[@class="book_toc_numbered"]/ul/li').map do |chapter|
  # Only the <li> children of the first nested <ul> are subchapters; any other
  # child node of that list carries no link
  subchapters = chapter.xpath('./ul[1]/li').map do |subchapter|
    toc_entry.call(subchapter, subchapter_numbering)
  end
  toc_entry.call(chapter, chapter_numbering).merge(:subchapters => subchapters)
end

# Anchor name => [heading tag, heading text].
# Chapters are registered before subchapters and the first registration wins,
# so each anchor is replaced exactly once.
headings = {}
chapters.each do |chapter|
  headings[chapter[:href]] ||= ['h2', chapter[:title]]
end
chapters.each do |chapter|
  chapter[:subchapters].each do |subchapter|
    headings[subchapter[:href]] ||= ['h3', subchapter[:title]]
  end
end

# Replace the div.book_chapter anchors with h2 (chapter) / h3 (subchapter)
moodleBook.xpath('//div[@class="book_chapter"]/a').each do |book_chapter|
  tag, title = headings[book_chapter['name']]
  next unless tag

  new_node = moodleBook.create_element(tag)
  # content= (not inner_html=) so "<" or "&" in a title stay literal text
  new_node.content = title
  book_chapter.replace(new_node)
end
=begin | |
_____ ______ __ __ ______ ________ | |
| __ \| ____| \/ |/ __ \ \ / / ____| | |
| |__) | |__ | \ / | | | \ \ / /| |__ | |
| _ /| __| | |\/| | | | |\ \/ / | __| | |
| | \ \| |____| | | | |__| | \ / | |____ | |
|_| \_\______|_| |_|\____/ \/ |______| | |
=end | |
# Remove namespaces
warn "Removing namespaces"
moodleBook.remove_namespaces!

# Elements matched by the CSS :empty selector
# https://stackoverflow.com/questions/20123176/cleaning-xml-document-recursively-from-empty-tags-with-nokogiri
warn "Removing empty tags"
moodleBook.search(':empty').remove

# Whole elements dropped from the document, processed in this order:
# scripts, book info, elements carrying an onclick attribute, skiplinks and
# the numbered table of contents
# https://stackoverflow.com/questions/1980845/removing-the-script-elements-of-an-html
[
  ["Removing scripts", "//script"],
  ["Removing book info", "//*[contains(@class,'book_info')]"],
  ["Removing onclicks", "//*[@onclick]"],
  ["Removing skiplinks", "//*[contains(@class,'skiplinks')]"],
  ["Removing table of contents", "//*[contains(@class,'book_toc_numbered')]"]
].each do |message, query|
  warn message
  moodleBook.xpath(query).remove
end

# Attributes stripped from the elements that carry them: ids starting with
# "yui", and the whole class attribute when it mentions "clearfix"
# https://stackoverflow.com/questions/1556028/how-do-i-do-a-regex-search-in-nokogiri-for-text-that-matches-a-certain-beginning
[
  ["Removing every shitty YUI reminders", "//*[starts-with(@id, 'yui')]", "id"],
  ["Removing clearfixes class", "//*[contains(@class,'clearfix')]", "class"]
].each do |message, query, attribute|
  warn message
  moodleBook.xpath(query).each { |tag| tag.remove_attribute(attribute) }
end
# Unnecessary divs
warn "Removing divs generated automatically by Moodle Book Module"
# Every <div> is unwrapped, i.e. replaced by its own children, exactly once.
# A separate id/role check is not needed: `attr("id") || attr("role") == id`
# was true for any div with an id (|| binds looser than ==) and led to
# replace being called again on a div that had already been detached, while
# all divs were unwrapped afterwards regardless.
moodleBook.xpath("//div").each do |div|
  next unless div.parent # defensive: a detached node cannot be replaced

  div.replace(div.children)
end
# Paragraphs whose text is the empty string are dropped
warn "Removing blank or empty paragraphs"
moodleBook.xpath("//p").select { |paragraph| paragraph.text.empty? }.each(&:remove)

# Spans are unwrapped: each one gives way to its children.
# //span only yields element nodes, so no element? check is required.
warn "Removing shitty spams... well, I said spans"
moodleBook.xpath("//span").each do |span|
  span.replace(span.children)
end
# Strange li attributes
# remove_attribute does nothing when the attribute is absent, so an element
# with a "module-" id but no data-draggroups attribute no longer raises
# NoMethodError (attributes["data-draggroups"] is nil in that case).
warn "Removing strange li attributes"
moodleBook.xpath("//li[starts-with(@id, 'module-')]").each do |tag|
  tag.remove_attribute("id")
  tag.remove_attribute("data-draggroups")
end

# Calendar ids duplicate
warn "Removing duplicated ids"
moodleBook.xpath("//h2[starts-with(@id, 'calendari-')]").each do |tag|
  tag.remove_attribute("id")
end

# Strange p attributes
warn "Removing paragraphs attributes"
moodleBook.xpath("//p[starts-with(@id, 'module-')]").each do |tag|
  tag.remove_attribute("id")
  tag.remove_attribute("data-draggroups")
end
# strong inline tags inside headers
warn "Removing inline tags like strong inside text header"
=begin
moodleBook.xpath('//*[name()="h3"]').css('strong').each { |strong|
strong.replace(" " + strong.text() + " ")
}
=end
# Every h3 is swapped for a fresh h3 that holds only the text of the old one,
# which drops any inline markup inside the heading.
moodleBook.xpath('//*[name()="h3"]').each do |node|
  plain_heading = moodleBook.create_element('h3')
  # content= keeps "<" and "&" as literal text; concatenating the text into an
  # HTML string would have it parsed as markup again
  plain_heading.content = node.text
  node.replace(plain_heading)
end
# Styles
warn "Removing styles"
moodleBook.xpath("//@style").remove

# Strip gaps between HTML tags and line breaks
# https://stackoverflow.com/questions/8965897/how-do-you-strip-gaps-between-html-tags-with-nokogiri?rq=1
# Three passes, applied in this order:
#   1. nodes matched by //text()[1]      -> leading whitespace removed
#   2. nodes matched by //text()[last()] -> trailing whitespace removed
#   3. every text node                   -> first "\n" dropped, then trimmed
# Note that sub only removes the FIRST newline of each text node.
[
  ['//text()[1]', ->(text) { text.lstrip }],
  ['//text()[last()]', ->(text) { text.rstrip }],
  ['//text()', ->(text) { text.sub("\n", "").strip }]
].each do |query, cleanup|
  moodleBook.xpath(query).each do |text_node|
    text_node.content = cleanup.call(text_node.content)
  end
end

# Replace return footnote char https://www.fileformat.info/info/unicode/char/21a9/index.htm
# Applied to every attribute and every text node of the document
moodleBook.xpath('//@*', '//text()').each do |node|
  replaced = node.content.gsub('↩', 'Torna')
  node.content = replaced
end
=begin | |
_ _ _____ _____ ______ _ _ _______ __ __ _ | |
| \ | |_ _/ ____| ____| | | | |__ __| \/ | | | |
| \| | | || | | |__ | |__| | | | | \ / | | | |
| . ` | | || | | __| | __ | | | | |\/| | | | |
| |\ |_| || |____| |____ | | | | | | | | | | |____ | |
|_| \_|_____\_____|______| |_| |_| |_| |_| |_|______| | |
=end | |
# Run the cleaned document through the XSLT stylesheet and serialize the
# result as HTML
# https://gist.github.com/mislav/398334
tidied_document = tidy.transform(moodleBook)
niceBook = tidied_document.to_html
# Uncomment the next two lines to output the pretty HTML version and stop
# output.puts niceBook
# exit
=begin | |
__ __ _____ _ _______ ______ ___ _ | |
| \/ | /\ | __ \| |/ / __ \ / __ \ \ / / \ | | | |
| \ / | / \ | |__) | ' /| | | | | | \ \ /\ / /| \| | | |
| |\/| | / /\ \ | _ /| < | | | | | | |\ \/ \/ / | . ` | | |
| | | |/ ____ \| | \ \| . \| |__| | |__| | \ /\ / | |\ | | |
|_| |_/_/ \_\_| \_\_|\_\_____/ \____/ \/ \/ |_| \_| | |
=end | |
# Pandoc converter configured to read "html+raw_html", write "markdown_mmd"
# and use wrap "none"
html_to_markdown = Paru::Pandoc.new do
  from "html+raw_html"
  to "markdown_mmd"
  wrap "none"
end

# Markdown rendering of the tidied HTML
HTML2MARKDOWN = html_to_markdown.convert(niceBook)
=begin | |
____ _ _ _______ _____ _ _ _______ | |
/ __ \| | | |__ __| __ \| | | |__ __| | |
| | | | | | | | | | |__) | | | | | | | |
| | | | | | | | | | ___/| | | | | | | |
| |__| | |__| | | | | | | |__| | | | | |
\____/ \____/ |_| |_| \____/ |_| | |
=end | |
# Final document: the metadata as YAML, a "..." line, a blank line and then
# the Markdown body
document = "#{metadata.to_yaml}...\n\n#{HTML2MARKDOWN}"

# Report the time taken by the script, then emit the document
elapsed = Time.now - start
warn "Time taken: #{elapsed} seconds"
output.puts document
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment