Last active
May 19, 2022 23:48
-
-
Save ateucher/a60e539f70bdaff2e13362fda4ec4deb to your computer and use it in GitHub Desktop.
Extract notes from a pptx slide and export to markdown or docx
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Extract speaker notes from a PowerPoint file
#'
#' Reads the notes attached to each slide of a `.pptx` file and writes them
#' to a markdown file named `<pp_file without extension>_notes.md`, with one
#' `## Slide N:` section per slide (slides without notes get an empty
#' section). Optionally renders that markdown file to `.docx`.
#'
#' If the markdown file already exists the user is asked interactively
#' whether to overwrite it; any answer other than "yes" aborts.
#'
#' @param pp_file Path to a `.pptx` file.
#' @param format Either `"md"` (default) or `"docx"`.
#' @return For `format = "md"`, the path of the markdown file that was
#'   written. For `format = "docx"`, the value returned by
#'   `rmarkdown::render()`.
extract_pptx_notes <- function(pp_file, format = c("md", "docx")) {
  # The dependencies are only checked, not attached, so every call into them
  # below must be namespace-qualified.
  for (pkg in c("officer", "xml2", "rmarkdown", "glue")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      stop("package '", pkg, "' required.", call. = FALSE)
    }
  }

  format <- match.arg(format)
  pp <- officer::read_pptx(pp_file)
  file_sans_ext <- tools::file_path_sans_ext(pp_file)

  # Build a lookup between notes-slide index and slide index from the
  # notes-slide metadata: the row name carries the notesSlide number and
  # `target` carries the slide file it belongs to.
  slides_with_notes_meta <- pp$notesSlide$get_metadata()
  slides_with_notes_meta$notes <- as.numeric(
    gsub("notesSlide(\\d{1,3}).*", "\\1", rownames(slides_with_notes_meta))
  )
  # Drop rows that point at the notes master rather than at a slide.
  slides_with_notes_meta <- slides_with_notes_meta[
    !grepl("notesMaster", slides_with_notes_meta$target),
  ]
  slides_with_notes_meta$slide <- as.numeric(
    gsub(".+slide(\\d{1,3})\\.xml", "\\1", slides_with_notes_meta$target)
  )

  if (nrow(slides_with_notes_meta) == 0) {
    stop("No notes in this presentation")
  }

  slide_nums <- seq_len(max(slides_with_notes_meta$slide))

  # Text of every run (`r/t`) in every paragraph of a text body.
  # xpath search from here: https://robaboukhalil.medium.com/your-slide-deck-is-a-zip-file-in-disguise-36bb14f11c0b
  xpath <- "//*[local-name()='txBody']/*[local-name()='p']/*[local-name()='r']/*[local-name()='t']/text()"

  notes <- lapply(slide_nums, \(x) {
    notes_slide <- slides_with_notes_meta[
      slides_with_notes_meta$slide == x,
      "notes"
    ]
    if (length(notes_slide) == 0) {
      return(character(0))
    }
    slide <- pp$notesSlide$get_slide(notes_slide)
    node <- xml2::xml_find_all(slide$get(), xpath)
    # One character element per text node.
    as.character(node)
  })
  names(notes) <- paste("Slide", slide_nums)

  out <- paste0(file_sans_ext, "_notes.md")
  if (file.exists(out)) {
    overwrite <- utils::askYesNo(
      glue::glue("File {out} already exists. Overwrite?")
    )
    # askYesNo() returns NA when the user cancels; only an explicit "yes"
    # may overwrite the file.
    if (!isTRUE(overwrite)) stop("Quitting", call. = FALSE)
    file.remove(out)
  }

  # YAML front matter. The literal is kept flush-left on purpose so the
  # text handed to glue() is unchanged.
  cat(glue::glue('---
title: "{basename(file_sans_ext)}"
output: word_document
date: "{Sys.Date()}"
---\n\n\n'), file = out)

  for (n in names(notes)) {
    cat(paste0("## ", n, ":\n\n"), file = out, append = TRUE)
    if (length(notes[[n]]) > 0) {
      cat(notes[[n]], file = out, sep = "\n\n", append = TRUE)
      # Pandoc needs a blank line before the next heading, otherwise the
      # heading is absorbed into the preceding paragraph.
      cat("\n\n", file = out, append = TRUE)
    }
  }

  if (format == "docx") {
    out_docx <- paste0(file_sans_ext, "_notes.docx")
    return(rmarkdown::render(out, output_file = out_docx))
  }

  out
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment