Created
October 5, 2016 18:20
-
-
Save Tucker-Eric/30736bbdf8c8496fb43a746c4adbb8ad to your computer and use it in GitHub Desktop.
HTML Image Scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'***************************************************************** | |
Const adSaveCreateOverWrite = 2 | |
Const adTypeBinary = 1 | |
Dim htmlSrc, dstDir | |
htmlSrc = InputBox("Enter Html File Name") & ".html" | |
dstDir = InputBox("Enter Folder Name To Place Images") | |
'***************************************************************** | |
'** Download the image | |
' strResult = GetImage(strSource, strDest) | |
getImgTagUrl htmlSrc, dstDir | |
' If strResult = "OK" Then | |
' wscript.quit(0) | |
' Else | |
' wscript.quit(1) | |
' End If | |
Function GetImage(strPath, dstDir) | |
Dim objXMLHTTP, nF, arr, objFSO, objFile, re | |
Dim objRec, objStream | |
Dim strDest | |
Set re = new RegExp | |
re.pattern = "^.*\/([\w-]+\.\w+)$" | |
strDest = dstDir & "\" & re.Replace(strPath, "$1") | |
'create XMLHTTP component | |
Set objXMLHTTP = CreateObject("Microsoft.XMLHTTP") | |
'get the image specified by strPath | |
objXMLHTTP.Open "GET", strPath, False | |
objXMLHTTP.Send | |
'check if retrieval was successful | |
If objXMLHTTP.statusText = "OK" Then | |
'create binary stream to write image output | |
Set objStream = CreateObject("ADODB.Stream") | |
objStream.Type = adTypeBinary | |
objStream.Open | |
objStream.Write objXMLHTTP.ResponseBody | |
objStream.SavetoFile strDest, adSaveCreateOverwrite | |
objStream.Close | |
GetImage = "OK" | |
Else | |
GetImage = objXMLHTTP.statusText | |
End If | |
End Function | |
Function getImgTagURL(htmlFile, dstDir) | |
Dim HTMLstring | |
Set objFS = CreateObject("Scripting.FileSystemObject") | |
Set file = objFS.OpenTextFile(htmlFile, 1) | |
If NOT objFS.FolderExists(dstDir) Then | |
objFS.CreateFolder(dstDir) | |
End If | |
HTMLstring = file.ReadAll | |
Set RegEx = New RegExp | |
With RegEx | |
.Pattern = "src=[\""\']([^\""\']+)" | |
.IgnoreCase = True | |
.Global = True | |
End With | |
Set Matches = RegEx.Execute(HTMLstring) | |
Set isJs = New RegExp | |
isJs.Pattern = "\.js$" | |
Set isHttp = New RegExp | |
isHttp.Pattern = "^src=[\""\'](https?\:)?\/\/" | |
'Iterate through the Matches collection. | |
URL = "" | |
For Each Match in Matches | |
'We only want the first match. | |
URL = Match.Value | |
If (isJs.Test(URL) = False) AND (isHttp.Test(URL) = True) Then | |
GetImage Replace(URL, "src=""", ""), dstDir | |
End If | |
' Exit For | |
Next | |
'Clean up | |
Set Match = Nothing | |
Set RegEx = Nothing | |
Set isJs = Nothing | |
Set isHttp = Nothing | |
' src=" is hanging on the front, so we will replace it with nothing | |
End Function |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment