Created
September 22, 2022 13:53
-
-
Save cwe1ss/9427a8c2d0a298b3ef72623ffde69987 to your computer and use it in GitHub Desktop.
Convert Confluence to Markdown (Azure DevOps)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Node.js standard library
const fs = require('fs/promises')
const path = require('path')
//const util = require('util')

// Third-party packages: file-name sanitizing, HTML parsing and HTML -> Markdown conversion
const sanitize = require("sanitize-filename")
const HTMLParser = require('node-html-parser')
const TurndownService = require('turndown')
const TurndownPluginGfmService = require('@guyplusplus/turndown-plugin-gfm')
const TurndownPluginConfluenceToGfmService = require('turndown-plugin-confluence-to-gfm')

// One shared converter instance, extended with the GFM and Confluence-to-GFM plugins.
const turndownService = new TurndownService()
TurndownPluginGfmService.gfm(turndownService)
TurndownPluginConfluenceToGfmService.confluenceGfm(turndownService)

///////////////////////////////////////
// Source and destination folders

// A directory which has been exported from Confluence using the HTML export.
const htmlDirectory = 'C:/temp/Confluence-export/ABC'

// The target directory
const markdownDirectory = 'c:/temp/AzDevOps/docs/ABC'
; (async () => { | |
// Name of the folder (below the target directory) that receives the copied attachments.
// It is also used to build the relative attachment links inside the converted pages.
const newAttachmentsDirectoryName = '.attachments'

///////////////////////////////////////
// Read folder structure, page titles, etc from index.html
/**
 * Recursively walks a <ul> node of the export's index.html and appends one entry per <li> to `result`.
 *
 * Each entry has the shape { htmlFileName, markdownFileName, folderName, parents }:
 * - htmlFileName:     the href of the item's <a> ('' when the item has no link)
 * - markdownFileName: the encoded and sanitized page title plus '.md'
 * - folderName:       the encoded and sanitized page title (null when the item has no link)
 * - parents:          folder names of all ancestors, outermost first
 *
 * @param {object|null|undefined} ul  Parsed list node; nothing happens when it is falsy.
 * @param {Array<object>} result      Output array, mutated in place (entries are appended in document order).
 * @param {Array<string>} parentFolders  Folder names of the ancestors of the items in `ul`.
 */
const addChildren = function(ul, result, parentFolders) {
    if (!ul) {
        return
    }

    for (const listItemNode of ul.childNodes) {
        if (listItemNode.tagName !== 'LI') continue

        const obj = {
            htmlFileName: '',
            markdownFileName: '',
            folderName: null,
            parents: parentFolders
        }
        result.push(obj)

        let addAsParent = true
        for (const listItemChildNode of listItemNode.childNodes) {
            if (listItemChildNode.tagName === 'A') {
                obj.htmlFileName = listItemChildNode.attributes['href'] ?? ''

                const pageTitle = (listItemChildNode.innerText ?? '')
                    // Decode the HTML entities that may be left in the link text.
                    // '&amp;' goes last so that e.g. '&amp;quot;' is not decoded twice.
                    .replaceAll('&#39;', '\'')
                    .replaceAll('&apos;', '\'')
                    .replaceAll('&quot;', '"')
                    .replaceAll('&amp;', '&')
                    // A slash would be read as a path separator; it is turned into '|',
                    // which the encoding below then writes as '%7C'.
                    .replaceAll('/', '|')
                    // https://learn.microsoft.com/en-us/azure/devops/project/wiki/wiki-file-structure?view=azure-devops#special-characters-in-wiki-page-titles
                    .replaceAll(':', '%3A')
                    .replaceAll('<', '%3C')
                    .replaceAll('>', '%3E')
                    .replaceAll('*', '%2A')
                    .replaceAll('?', '%3F')
                    .replaceAll('|', '%7C')
                    // Real hyphens must be encoded before spaces are turned into hyphens below.
                    .replaceAll('-', '%2D')
                    .replaceAll('"', '%22')

                // Azure DevOps Wiki doesn't allow spaces
                const sanitizedPageTitle = sanitize(pageTitle).replaceAll(' ', '-')

                obj.markdownFileName = sanitizedPageTitle + '.md'
                obj.folderName = sanitizedPageTitle
            }

            if (listItemChildNode.tagName === 'IMG') {
                // The sole root item is the home page (with an img tag to recognize it) and we don't need a separate folder for it.
                addAsParent = false
            }

            if (listItemChildNode.tagName === 'UL') {
                const newParents = [...parentFolders]
                // An item without a link has no folder name; pushing null would break path.join() later on.
                if (addAsParent && obj.folderName) {
                    newParents.push(obj.folderName)
                }
                addChildren(listItemChildNode, result, newParents)
            }
        }
    }
}
/**
 * Reads index.html from the export directory and returns the list of pages to convert.
 *
 * The page tree is taken from the first list below 'div.pageSection'. An entry for
 * index.html itself is put at the front of the list.
 *
 * @returns {Promise<Array<object>>} Entries as produced by addChildren, preceded by the index entry.
 */
const getFiles = async function() {
    const indexFilePath = path.join(htmlDirectory, 'index.html')
    const htmlContent = await fs.readFile(indexFilePath, { encoding: 'utf-8'} )
    const html = HTMLParser.parse(htmlContent)

    const files = []
    // addChildren ignores a missing list, so an index without page section yields only the index entry.
    addChildren(html.querySelector('div.pageSection ul'), files, [])

    files.unshift({
        htmlFileName: 'index.html',
        markdownFileName: 'index.md',
        // An index without <title> must not abort the whole run.
        pageTitle: html.querySelector('head title')?.innerText ?? '',
        folderName: null,
        parents: []
    })

    return files
}
// All pages of the export (index page first), including their target folder hierarchy.
const files = await getFiles()
// console.log(util.inspect(files, { depth: null, colors: true }))
// return

///////////////////////////////////////
// Copy "attachments"-directory to ".attachments" (Azure DevOps requires this name)

const existingAttachmentsDirectory = path.join(htmlDirectory, 'attachments')
const newAttachmentsDirectory = path.join(markdownDirectory, newAttachmentsDirectoryName)
/**
 * Copies a file, or a directory with everything below it, from `src` to `dest`.
 *
 * Despite the "Sync" in its name the function is asynchronous and must be awaited.
 * Entries are copied one after another. Missing destination directories are created.
 * Based on https://stackoverflow.com/a/22185855
 *
 * @param {string} src   Existing file or directory.
 * @param {string} dest  Target path; for a single file its parent directory must already exist.
 * @returns {Promise<void>} Rejects with the underlying fs error, e.g. when `src` does not exist.
 */
const copyRecursiveSync = async function(src, dest) {
    const stats = await fs.stat(src)

    if (!stats.isDirectory()) {
        await fs.copyFile(src, dest)
        return
    }

    await fs.mkdir(dest, { recursive: true })
    const childItemNames = await fs.readdir(src)
    for (const childItemName of childItemNames) {
        await copyRecursiveSync(path.join(src, childItemName), path.join(dest, childItemName))
    }
}
await copyRecursiveSync(existingAttachmentsDirectory, newAttachmentsDirectory)

///////////////////////////////////////
// Convert pages

// Placeholder that is put in front of "information macro" content before the conversion.
// After the conversion every line carrying it is turned into a Markdown blockquote line.
const blockquoteMarker = '{{tmp:blockquote}}'

// Patterns that pull page ids, attachment ids and names out of the macro scripts.
const gliffyExtractPageIdRegex = /(?:.*container=|^)(\d+)/
const gliffyExtractAttachmentIdRegex = /(?:.*imageAttachmentId=att|^)(\d+)/
const gliffyExtractNameRegex = /(?:.*[\|"]name=)([^|\\]*)/
const balsamiqExtractPageIdRegex = /(?:.*pageid=|^)(\d+)/
const balsamiqExtractDownloadLinkRegex = /(?:.*[\|"]downloadLink=)([^|\\]*)/
const balsamiqExtractNameRegex = /(?:.*&name=)([^&\"]*)/

// Returns the attachment id (the file name without extension taken from the link's href) of the
// last attachment link whose text equals `fileName`, or null when there is none.
// The last match is used because the page lists the most recent version last (per the original comments).
const findLatestAttachmentId = function(attachmentLinks, fileName) {
    const matchingLinks = attachmentLinks.filter(x => x.innerText === fileName)
    if (matchingLinks.length === 0) {
        return null
    }
    const href = matchingLinks[matchingLinks.length - 1].attributes['href'] ?? ''
    return href.substring(href.lastIndexOf('/') + 1, href.lastIndexOf('.'))
}

// Empties the first element matching `selector`, if there is one.
const clearContent = function(root, selector) {
    root.querySelector(selector)?.set_content('')
}

for (const file of files) {
    // List items without a link have no page to convert.
    if (!file.htmlFileName) continue

    const htmlFileFullPath = path.join(htmlDirectory, file.htmlFileName)
    const markdownFileDirectory = path.join(markdownDirectory, ...file.parents)
    const markdownFileFullPath = path.join(markdownFileDirectory, file.markdownFileName)
    console.log(htmlFileFullPath + " --> " + markdownFileFullPath)

    ///////////////////////////////////////
    // Load file content

    let htmlContent = await fs.readFile(htmlFileFullPath, { encoding: 'utf-8'} )

    ///////////////////////////////////////
    // Replace links

    // Path from this page's folder back to the target root.
    const pathToRoot = '../'.repeat(file.parents.length)

    for (const linkFile of files) {
        // replaceAll('', ...) would insert the target between every single character.
        if (!linkFile.htmlFileName) continue

        let target = pathToRoot
        target += linkFile.parents.join('/') + (linkFile.parents.length > 0 ? '/' : '')
        target += linkFile.markdownFileName
        //console.log(' - ' + linkFile.htmlFileName + ': ' + target)

        // A replacer function keeps '$' in page titles from being read as a replacement pattern.
        htmlContent = htmlContent.replaceAll(linkFile.htmlFileName, () => target)
    }

    ///////////////////////////////////////
    // Replace attachment links (since we place files in subfolders)

    const attachmentsUrl = pathToRoot + newAttachmentsDirectoryName + '/'
    htmlContent = htmlContent.replaceAll('"attachments/', '"' + attachmentsUrl)

    const html = HTMLParser.parse(htmlContent)

    ///////////////////////////////////////
    // Remove breadcrumbs section (as we have the navigation bar anyway)
    clearContent(html, '#breadcrumbs')

    ///////////////////////////////////////
    // Remove footer (as that only contains info about when the HTML document was generated and a link to Atlassian)
    clearContent(html, '#footer')

    ///////////////////////////////////////
    // Remove page title (as that would output the page title twice)
    clearContent(html, 'title')

    ///////////////////////////////////////
    // Remove header from content (as Azure DevOps displays a title based on the file name)
    clearContent(html, '#main-header')

    ///////////////////////////////////////
    // Remove query strings from attachment includes (as this doesn't display in Azure DevOps)

    for (const htmlImage of html.querySelectorAll('img')) {
        const src = htmlImage.attributes['src']
        if (!src || !src.includes(newAttachmentsDirectoryName)) continue

        const indexOfQuery = src.indexOf('?')
        if (indexOfQuery >= 0) {
            htmlImage.setAttribute('src', src.substring(0, indexOfQuery))
        }
    }

    ///////////////////////////////////////
    // Remove Page metadata (author, creation time) (since we don't need them in DevOps)
    clearContent(html, '#content div.page-metadata')

    ///////////////////////////////////////
    // Show "information macro"-widgets as blockquotes

    const macroBodies = html.querySelectorAll('div.confluence-information-macro > p.title, div.confluence-information-macro-body, div.confluence-information-macro-body p, div.confluence-information-macro-body div, div.confluence-information-macro-body li, div.confluence-information-macro-body pre')
    for (const macroBody of macroBodies) {
        if (macroBody.innerText?.trim() !== '') {
            macroBody.insertAdjacentHTML('afterbegin', blockquoteMarker)
        }
    }

    ///////////////////////////////////////
    // Replace known custom scripts

    // Links of the page's attachment list; empty when the page has no attachments section.
    const attachments = html.querySelector('#attachments')?.parentNode?.nextElementSibling?.querySelectorAll('a') ?? []

    for (const customScript of html.querySelectorAll('script.ap-iframe-body-script')) {
        const scriptContent = customScript.innerHTML
        let attachmentUrl = null

        if (scriptContent.includes('com.gliffy.integration.confluence')) {

            ///////////////////////////////////////
            // Gliffy Diagrams

            const pageIdMatch = scriptContent.match(gliffyExtractPageIdRegex)
            if (!pageIdMatch || pageIdMatch.length < 2) continue

            // The attachment id is not easy to resolve. If we can't get it directly, we need to find it by name.
            let attachmentId = null
            const attachmentIdMatch = scriptContent.match(gliffyExtractAttachmentIdRegex)
            if (attachmentIdMatch && attachmentIdMatch.length > 1) {
                attachmentId = attachmentIdMatch[1]
            } else {
                // Find the image by attachment name and use the most recent one
                const nameMatch = scriptContent.match(gliffyExtractNameRegex)
                if (!nameMatch || nameMatch.length < 2) continue
                attachmentId = findLatestAttachmentId(attachments, nameMatch[1] + '.png')
            }

            // Without an attachment id there is no image to point to; leave the script untouched.
            if (attachmentId) {
                attachmentUrl = attachmentsUrl + pageIdMatch[1] + '/' + attachmentId + '.png'
            }

        } else if (scriptContent.includes('com.balsamiq.mockups.confluence')) {

            ///////////////////////////////////////
            // Balsamiq Mockups

            const pageIdMatch = scriptContent.match(balsamiqExtractPageIdRegex)
            if (!pageIdMatch || pageIdMatch.length < 2) continue

            // Find the image by attachment name and use the most recent one
            let attachmentFileName = null
            const downloadLinkMatch = scriptContent.match(balsamiqExtractDownloadLinkRegex)
            if (downloadLinkMatch && downloadLinkMatch.length > 1) {
                const downloadLink = downloadLinkMatch[1]
                attachmentFileName = downloadLink.substring(downloadLink.lastIndexOf('/') + 1)
            } else {
                const nameMatch = scriptContent.match(balsamiqExtractNameRegex)
                if (nameMatch && nameMatch.length > 1) {
                    const name = decodeURIComponent(nameMatch[1].replaceAll('+', ' '))
                    console.log('- ' + name)
                    attachmentFileName = 'mockup_' + name + '.png'
                }
            }

            if (attachmentFileName !== null) {
                const attachmentId = findLatestAttachmentId(attachments, attachmentFileName)
                if (attachmentId !== null) {
                    attachmentUrl = attachmentsUrl + pageIdMatch[1] + '/' + attachmentId + '.png'
                }
            }
        }

        if (attachmentUrl) {
            customScript.set_content('')
            customScript.insertAdjacentHTML('beforebegin', '<img src="' + attachmentUrl + '" />')
        }
    }

    ///////////////////////////////////////
    // Fix double strong text (<strong>Some <strong>text</strong> example</strong>)

    for (const strong of html.querySelectorAll('strong strong')) {
        strong.insertAdjacentHTML('beforebegin', strong.innerHTML)
        strong.set_content('')
    }

    ///////////////////////////////////////
    // Convert Attachments to list

    const attachmentsContainer = html.querySelector('#attachments')?.parentNode?.nextElementSibling
    if (attachmentsContainer) {
        attachmentsContainer.set_content(
            '<ul>' +
            attachmentsContainer.innerHTML.replaceAll('<img', '<li><img').replaceAll('<br>', '</li>') +
            '</ul>')
    }

    ///////////////////////////////////////
    // Convert HTML to Markdown

    let markdown = turndownService.turndown(html.toString())

    ///////////////////////////////////////
    // Trim lines at the end (we can't trim at the start since Markdown relies on indentation at the beginning)
    // Fix Quotations

    let lines = markdown.split('\n').map((rawLine) => {
        let line = rawLine
        if (line.includes(blockquoteMarker)) {
            line = '> ' + line.replaceAll(blockquoteMarker, '')
            // A marker on an otherwise empty line must not leave a lone '>' behind.
            if (line.trim() === '>') {
                line = ''
            }
        }
        return line.trimEnd()
    })

    ///////////////////////////////////////
    // Remove excess empty lines (more than one empty line between text)

    markdown = lines.join('\n').replace(/\n{3,}/g, '\n\n')

    ///////////////////////////////////////
    // Remove empty lines between quotes (as that shows up as separate blockquote-items)

    lines = markdown.split('\n')
    for (let i = 0; i < lines.length; i++) {
        const previousLine = i > 0 ? lines[i - 1] : null
        const currentLine = lines[i]
        const nextLine = i < lines.length - 1 ? lines[i + 1] : null
        if (previousLine && previousLine.startsWith('> ') && nextLine && nextLine.startsWith('> ') && currentLine.trim() === '') {
            lines[i] = '>'
        }
    }
    markdown = lines.join('\n').trim()

    ///////////////////////////////////////
    // Write markdown to disk

    await fs.mkdir(markdownFileDirectory, { recursive: true })
    await fs.writeFile(markdownFileFullPath, markdown)
}
///////////////////////////////////////
// Update .order file (we currently only support the root level)

const orderFilePath = path.join(markdownDirectory, '.order')
let orderFileContent = ''
for (const file of files) {
    // Only root-level pages are listed; entries without a file name (list items without a link) are skipped.
    if (file.parents.length !== 0 || !file.markdownFileName) continue

    // Strip only the trailing extension; a '.md' inside the title must stay.
    const pageName = file.markdownFileName.endsWith('.md')
        ? file.markdownFileName.slice(0, -'.md'.length)
        : file.markdownFileName
    orderFileContent += pageName + '\n'
}
await fs.writeFile(orderFilePath, orderFileContent)

console.log('finished')
})() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "name": "confluence-export",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1",
    "start": "node index.js"
  },
  "author": "",
  "license": "ISC",
  "dependencies": {
    "node-html-parser": "^6.1.0",
    "sanitize-filename": "^1.6.3",
    "turndown-plugin-confluence-to-gfm": "^0.5.0"
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment