Created
January 1, 2016 03:43
-
-
Save beancurd1/f850890fe7e7e5069f06 to your computer and use it in GitHub Desktop.
A PowerShell script that uses the iTextSharp library (itextsharp.dll) to extract the first date from each PDF file on a network share, checks whether that date matches an expected date, and emails the result.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# PDF Files Check Script (created by beancurd1, please distribute the code with this section, thanks)
# It uses itextsharp.dll (downloaded from SourceForge) to parse PDF files and extract the first date
# found on page 1, then compares it against a predefined date. PDFs whose date does not match the
# predefined date (or that contain no recognisable date) are listed in the report.
Add-Type -Path .\itextsharp.dll

# Time the whole run; the elapsed seconds are printed in the summary below.
$stopWatch = [System.Diagnostics.Stopwatch]::StartNew()

$validDate = "11 Dec 2015"
$validDateTime = [datetime]$validDate
# Day of month without a leading zero; the search pattern re-adds an optional "0".
$day = $validDateTime.ToString('dd') -replace "^0", ""

# Define valid Month+Year formats here; each one is combined with $day in the search.
$validMYArray = @(' MMM yyyy', ' MMMM yyyy', '/MM/yyyy', '-MMM-yy' |
    ForEach-Object { $validDateTime.ToString($_) })

# Pre-build one pair of regex patterns per accepted format:
#   Good - the expected day followed by the expected month/year
#   Any  - any day number followed by the expected month/year (captured so it can be reported)
# (?<!\d) stops a single-digit day from matching inside a longer number (e.g. valid day "5" must not
# accept "15 Dec 2015"). The suffix is escaped so it is always matched literally.
$datePatterns = @(foreach ($validMY in $validMYArray) {
    $suffix = [regex]::Escape($validMY)
    New-Object PSObject -Property @{
        Good = "(?<!\d)0?$day$suffix"
        Any  = "(?<!\d)([0-3]?\d$suffix)"
    }
})

# PDF Counters
$countTotal = $countGood = $countBad = 0
$PDFPath="\\server\share"
$badPDF="Bad PDF (e.g. incorrect date, empty date):`n`n"

Write-Host "Mapping a Drive..."
New-PSDrive -Name NetworkDrive -PSProvider FileSystem -Root $PDFPath

Write-Host "Parsing PDF Files..."
#################################################################
###   Search PDF Files from UNC folder, parse each PDF        ###
###   output PDFs with incorrect date                         ###
#################################################################
try {
    Get-ChildItem -Path NetworkDrive:\ -Filter *.pdf -Recurse |
    Foreach-Object {
        # Keep the file in its own variable: inside catch{} $_ is the error record, not the file.
        $pdf = $_
        $countTotal++
        $foundDate = $false
        $badDate = ""
        $reader = $null
        try {
            $reader = New-Object iTextSharp.text.pdf.pdfreader -ArgumentList $pdf.FullName
            $pageText = [iTextSharp.text.pdf.parser.PdfTextExtractor]::GetTextFromPage($reader, 1) -split "`n"

            # Search each line for the first date in one of the formats defined above.
            # -cmatch is case-sensitive, so the month name must match the formatted casing.
            :lineScan foreach ($line in $pageText) {
                foreach ($pattern in $datePatterns) {
                    if ($line -cmatch $pattern.Good) {
                        $countGood++
                        $foundDate = $true
                        break lineScan
                    } elseif ($line -cmatch $pattern.Any) {
                        # Remember the incorrect date so it can be shown next to the file name.
                        $badDate = $Matches[1]
                        break lineScan
                    }
                }
            }
        } catch {
            # An unreadable PDF is reported as bad (with an empty date) instead of silently
            # re-using the previous file's reader.
            Write-Warning "Cannot parse $($pdf.FullName): $($_.Exception.Message)"
        } finally {
            # Free the reader for every file; it locks the PDF otherwise.
            if ($reader) { $reader.Dispose() }
        }

        if (-not $foundDate) {
            $countBad++
            $badPDF += $pdf.FullName + " ($badDate)`t`n" #<-Insert a Tab character before `n to avoid Outlook Extra Line Break issue
        }
    }
} finally {
    # Unmap the drive even if the scan was interrupted.
    Remove-PSDrive -Name NetworkDrive
}
$stopWatch.Stop()

# Remove "\\server\share\" (and any sub-folders) from each file path, leaving the file name.
$badPDF = $badPDF -replace "\\\\.*\\", ""
Write-Host "$badPDF`n`n$countTotal PDFs, Good=$countGood, Bad=$countBad $($stopWatch.Elapsed.TotalSeconds) sec
$PDFPath"

Write-Host "Email Result..."
# NOTE(review): From/To/SmtpServer and Body look like placeholders - replace them before use.
# The report text in $badPDF is not included in the mail; set Body to it if the result should be sent.
$messageParameters = @{
    Subject = "PDF Checked has finish"
    Body = "Say something here"
    From = "[email protected]"
    To = "[email protected]"
    SmtpServer = "mailserver"
}
Send-MailMessage @messageParameters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment