Created
March 26, 2025 13:23
-
-
Save armornick/5e5a052e05424ffeb0f49581a6c4ffb4 to your computer and use it in GitHub Desktop.
Markdown to Docx converter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { readFileSync, writeFileSync } from "node:fs"; | |
import { marked } from "marked"; | |
import { JSDOM } from "jsdom"; | |
import { | |
AlignmentType, Document, ExternalHyperlink, HeadingLevel, LevelFormat, | |
Packer, Paragraph, ShadingType, Table, TableCell, TableRow, TextRun, | |
WidthType | |
} from "docx"; | |
/** Path of the Markdown file read by `main`. */
const INPUT_FILE = 'sample.md';

/** Path of the Word document written by `main`. */
const OUTPUT_FILE = 'sample.docx';

// Numeric DOM `nodeType` codes that the converters compare `node.nodeType` against.
const ELEMENT_NODE = 1;
const TEXT_NODE = 3;

/** Paragraph spacing shared by headings and body paragraphs. */
const SPACING = { before: 200 };

/** Shading applied to inline code/kbd runs and to every line of a `<pre>` block. */
const SHADING = {
  type: ShadingType.SOLID,
  color: "dee2e6",
  // fill: "212529",
};
// ----------------------------------------------------------------------- | |
/**
 * Converts the direct child nodes of an element into docx inline children.
 *
 * - Text nodes become plain `TextRun`s.
 * - `<a>` becomes an `ExternalHyperlink` pointing at the anchor's `href`.
 * - `<kbd>` / `<code>` become shaded runs.
 * - `<br>` becomes a run holding a single line break.
 * - `<img>` is skipped (a message is logged).
 * - Any other element is flattened to one run of its `textContent`; the run is
 *   bold / italic / struck through when the element itself, or any of its
 *   descendants, is a matching formatting tag.
 *
 * Nodes that are neither text nor elements (e.g. comments) are ignored.
 *
 * @param {Element} element  Only `element.childNodes` is read.
 * @returns {Array} docx inline children (`TextRun` / `ExternalHyperlink`).
 */
const convertChildNodes = (/** @type {Element} */element) => {
  const BOLD_TAGS = ['B', 'STRONG'];
  const ITALIC_TAGS = ['I', 'EM'];
  const STRIKE_TAGS = ['S', 'DEL'];

  // True when the node is one of `tags` or contains one of them at any depth.
  // The same tag list drives both checks so they cannot drift apart.
  const hasFormat = ($node, tags) =>
    tags.includes($node.tagName) ||
    $node.querySelector(tags.join(',').toLowerCase()) != null;

  const children = [];
  for (const $node of element.childNodes) {
    if ($node.nodeType === TEXT_NODE) {
      children.push(new TextRun($node.textContent));
      continue;
    }
    if ($node.nodeType !== ELEMENT_NODE) {
      continue;
    }

    const tag = $node.tagName;
    if (tag === 'A') {
      children.push(new ExternalHyperlink({
        children: [
          new TextRun({
            text: $node.textContent,
            style: 'Hyperlink',
          }),
        ],
        link: $node.href,
      }));
    } else if (tag === 'KBD' || tag === 'CODE') {
      children.push(new TextRun({
        text: $node.textContent,
        shading: SHADING,
      }));
    } else if (tag === 'BR') {
      // A hard line break has no text of its own; emit a break-only run.
      children.push(new TextRun({ break: 1 }));
    } else if (tag === 'IMG') {
      console.log('images not currently supported');
    } else {
      children.push(new TextRun({
        text: $node.textContent,
        bold: hasFormat($node, BOLD_TAGS),
        italics: hasFormat($node, ITALIC_TAGS),
        strike: hasFormat($node, STRIKE_TAGS),
      }));
    }
  }
  return children;
};
/**
 * Shared walker behind `convertUnorderedList` and `convertOrderedList`.
 *
 * Only the list's own `<li>` children are visited. A descendant query such as
 * `querySelectorAll('li')` would also match items of nested lists and emit
 * them a second time at the outer level.
 *
 * For each item:
 *  1. its own content (everything except directly nested `<ul>`/`<ol>`) is
 *     emitted as one paragraph at `level`;
 *  2. every directly nested list is then converted at `level + 1`, using the
 *     nested list's own tag to choose bullets or numbering.
 *
 * An item that consists only of nested lists produces no paragraph of its own.
 *
 * @param {any[]} children   Output array; paragraphs are appended in place.
 * @param {Element} $list    The `<ul>` or `<ol>` element to walk.
 * @param {number} level     Zero-based nesting depth for this list.
 * @param {boolean} isOrdered  `true` for numbering, `false` for bullets.
 */
const convertListItems = (children, $list, level, isOrdered) => {
  for (const $item of $list.children) {
    if ($item.tagName !== 'LI') {
      continue;
    }

    const inlineNodes = [];
    const sublists = [];
    for (const $child of $item.childNodes) {
      const isList = $child.nodeType === ELEMENT_NODE &&
        ($child.tagName === 'UL' || $child.tagName === 'OL');
      (isList ? sublists : inlineNodes).push($child);
    }

    const hasOwnContent = inlineNodes.some(($n) => $n.textContent.trim() !== '');
    if (hasOwnContent || sublists.length === 0) {
      const listOptions = isOrdered
        ? { numbering: { reference: 'numbered-list', level } }
        : { bullet: { level } };
      children.push(new Paragraph({
        // convertChildNodes only reads `.childNodes`, so a plain wrapper is
        // enough to hand it the item's content minus the nested lists.
        children: convertChildNodes({ childNodes: inlineNodes }),
        ...listOptions,
      }));
    }

    for (const $sublist of sublists) {
      convertListItems(children, $sublist, level + 1, $sublist.tagName === 'OL');
    }
  }
};

/**
 * Appends one bulleted paragraph per item of an unordered list, recursing
 * into nested lists with an increased level.
 *
 * @param {any[]} children
 * @param {HTMLUListElement} $list
 * @param {number} [level=0]  Zero-based bullet level of `$list`.
 */
const convertUnorderedList = (children, $list, level = 0) => {
  convertListItems(children, $list, level, false);
};

/**
 * Appends one numbered paragraph per item of an ordered list, recursing into
 * nested lists with an increased level. Paragraphs use the `'numbered-list'`
 * numbering reference.
 *
 * @param {any[]} children
 * @param {HTMLOListElement} $list
 * @param {number} [level=0]  Zero-based numbering level of `$list`.
 */
const convertOrderedList = (children, $list, level = 0) => {
  convertListItems(children, $list, level, true);
};
/**
 * Converts the cells of one HTML table row into docx `TableCell`s.
 *
 * Each cell holds a single paragraph built from the cell's inline content via
 * `convertChildNodes`, so links, code and emphasis are kept. The paragraph is
 * aligned according to the cell's `align` attribute (`left`, `center` or
 * `right`, case-insensitive); any other value, or no attribute, leaves the
 * alignment undefined.
 *
 * @param {HTMLTableRowElement} $row
 * @returns {TableCell[]} One cell per entry of `$row.cells`, in order.
 */
const convertTableRow = ($row) => {
  const alignmentByAttr = new Map([
    ['left', AlignmentType.LEFT],
    ['center', AlignmentType.CENTER],
    ['right', AlignmentType.RIGHT],
  ]);

  return Array.from($row.cells, ($cell) => {
    // getAttribute returns null when absent; Map#get(undefined) is undefined.
    const alignment = alignmentByAttr.get($cell.getAttribute('align')?.toLowerCase());
    return new TableCell({
      children: [new Paragraph({
        children: convertChildNodes($cell),
        alignment,
      })],
    });
  });
};
/**
 * Converts an HTML table into a full-width docx `Table` and appends it to
 * `children`, followed by an empty paragraph.
 *
 * Rows from `<thead>` come first, then the rows of every `<tbody>` in order.
 * A header row is flagged with `tableHeader: true` only when the `<thead>`
 * holds exactly one row; with several header rows none of them is flagged.
 *
 * @param {any[]} children  Output array; mutated in place.
 * @param {HTMLTableElement} $table
 */
const convertTable = (children, $table) => {
  const $headRows = $table.tHead ? Array.from($table.tHead.rows) : [];
  const isSingleHeaderRow = $headRows.length === 1;

  const headerRows = $headRows.map(($row) => {
    const cells = convertTableRow($row);
    return new TableRow(
      isSingleHeaderRow
        ? { children: cells, tableHeader: true }
        : { children: cells },
    );
  });

  const bodyRows = Array.from($table.tBodies)
    .flatMap(($tbody) => Array.from($tbody.rows))
    .map(($row) => new TableRow({ children: convertTableRow($row) }));

  children.push(
    new Table({
      rows: [...headerRows, ...bodyRows],
      width: {
        size: 100,
        type: WidthType.PERCENTAGE,
      },
    }),
    // The trailing empty paragraph keeps consecutive tables from being merged.
    new Paragraph(''),
  );
};
/**
 * Converts a `<pre>` element into one shaded paragraph per line of its text,
 * followed by an unshaded empty paragraph.
 *
 * The text is split on `'\n'`. When it ends with a newline, the resulting
 * empty last segment is dropped; text without a trailing newline keeps its
 * final line. Missing text content is treated as an empty string.
 *
 * @param {any[]} children  Output array; mutated in place.
 * @param {HTMLPreElement} element
 */
const convertPreElement = (children, element) => {
  const lines = (element.textContent ?? '').split('\n');

  // Drop only the empty segment produced by a trailing newline. Popping
  // unconditionally would discard a real last line.
  if (lines[lines.length - 1] === '') {
    lines.pop();
  }

  for (const line of lines) {
    children.push(new Paragraph({
      text: line,
      shading: SHADING,
    }));
  }

  // The empty paragraph keeps the block from merging with following content.
  children.push(new Paragraph(''));
};
/**
 * Converts one block-level node into docx content appended to `children`.
 *
 * - Whitespace-only text nodes are skipped silently.
 * - `H1`–`H6` become heading paragraphs with the matching `HeadingLevel`.
 * - `P` becomes a paragraph of inline runs.
 * - `UL`, `OL`, `TABLE` and `PRE` are handed to their dedicated converters.
 * - `HR` becomes an empty paragraph with a bottom border, plus an empty
 *   paragraph after it.
 * - `DIV` and `BLOCKQUOTE` are transparent: each child node is converted in turn.
 * - Anything else is reported on the console and produces no output.
 *
 * @param {any[]} children  Output array; mutated in place.
 * @param {Element} element
 */
const convertElement = (children, element) => {
  const isBlankText =
    element.nodeType === TEXT_NODE && element.textContent.trim() === '';
  if (isBlankText) {
    return;
  }

  const tag = element.tagName;

  // H1..H6 differ only in the heading level, so derive it from the digit.
  const headingMatch = typeof tag === 'string' ? /^H([1-6])$/.exec(tag) : null;
  if (headingMatch) {
    children.push(new Paragraph({
      text: element.textContent,
      heading: HeadingLevel[`HEADING_${headingMatch[1]}`],
      spacing: SPACING,
    }));
    return;
  }

  switch (tag) {
    case 'P':
      children.push(new Paragraph({
        children: convertChildNodes(element),
        spacing: SPACING,
      }));
      break;

    case 'UL':
      convertUnorderedList(children, element);
      break;

    case 'OL':
      convertOrderedList(children, element);
      break;

    case 'TABLE':
      convertTable(children, element);
      break;

    case 'PRE':
      convertPreElement(children, element);
      break;

    case 'HR':
      children.push(new Paragraph({
        text: '',
        border: {
          bottom: {
            color: 'auto',
            style: 'single',
          },
        },
      }));
      // The empty paragraph keeps the rule from merging with what follows.
      children.push(new Paragraph(''));
      break;

    case 'DIV':
    case 'BLOCKQUOTE':
      for (const $child of element.childNodes) {
        convertElement(children, $child);
      }
      break;

    default:
      console.log(`unrecognized tag: ${tag} (${element})`);
  }
};
/**
 * Builds a docx `Document` from a parsed HTML fragment.
 *
 * Every element child of the fragment is converted with `convertElement`, and
 * the collected content forms the document's single section. The document
 * also defines the `'numbered-list'` numbering that ordered-list paragraphs
 * refer to.
 *
 * @param {DocumentFragment} element  Only `element.children` is read.
 * @returns {Document}
 */
const convertFragToDoc = (/** @type {DocumentFragment} */element) => {
  // WordprocessingML numbering definitions hold at most nine levels (0-8).
  const NUMBERED_LIST_LEVELS = 9;
  // Each level is indented 300 units further than the one above it.
  const INDENT_STEP = 300;

  const children = [];
  for (const $el of element.children) {
    convertElement(children, $el);
  }

  const levels = Array.from({ length: NUMBERED_LIST_LEVELS }, (_, level) => ({
    level,
    format: LevelFormat.DECIMAL_ENCLOSED_FULLSTOP,
    // `%N` is the counter of the N-th level (1-based). Using `%1` on every
    // level would make nested items repeat the top-level item's number.
    text: `%${level + 1}`,
    alignment: AlignmentType.START,
    style: {
      paragraph: {
        indent: { left: INDENT_STEP * (level + 1) },
      },
    },
  }));

  return new Document({
    numbering: {
      config: [
        {
          reference: 'numbered-list',
          levels,
        },
      ],
    },
    sections: [
      {
        properties: {},
        children,
      },
    ],
  });
};
// ----------------------------------------------------------------------- | |
/**
 * Script entry point: reads `INPUT_FILE`, renders it to HTML with `marked`,
 * converts the HTML to a docx document and writes it to `OUTPUT_FILE`.
 *
 * As a side effect the intermediate HTML is also written to `sample.html`.
 *
 * @returns {Promise<void>} Rejects if reading, converting or writing fails.
 */
const main = async () => {
  console.log(`reading ${INPUT_FILE}`);
  const src = readFileSync(INPUT_FILE, 'utf-8');

  const html = marked(src);
  // Keep the intermediate HTML on disk so the conversion input can be inspected.
  writeFileSync('sample.html', html, 'utf-8');

  const fragment = JSDOM.fragment(html);
  const document = convertFragToDoc(fragment);

  console.log(`writing ${OUTPUT_FILE}`);
  const buff = await Packer.toBuffer(document);
  writeFileSync(OUTPUT_FILE, buff);
};

// Handle rejection explicitly instead of leaving the promise floating, and
// report failure through the exit code.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment