Convert a csv of URLs and Page Titles created using WGET to Freemind XML and UL list
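The two files below form a small pipeline: the bash script crawls a site and writes "URL","title" rows to a CSV, and the PHP script turns that CSV into a nested UL list plus a Freemind mind map. A minimal end-to-end run might look like this, assuming the crawler is saved as site_crawler.sh (the gist does not name it); note the PHP script reads wlog.csv by name, so pass wlog as the CSV title:

./site_crawler.sh www.example.com wlog   # crawl, writing titles to wlog.csv
php site_crawler_2_xml.php               # read wlog.csv, echo the UL lists, write sitemap.mm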
#!/bin/bash
#
# Crawls a domain
# Retrieves all visible URLs and their page titles
# Saves to CSV
# $1 = URL (no http(s))
# $2 = CSV title
# MODIFY: the wget --include-directories, --domains and --reject values below are hardcoded. TODO: make them variables.
# Text color variables
txtund=$(tput sgr 0 1)             # Underline
txtbld=$(tput bold)                # Bold
bldred=${txtbld}$(tput setaf 1)    # red
bldblu=${txtbld}$(tput setaf 4)    # blue
bldgreen=${txtbld}$(tput setaf 2)  # green
bldwht=${txtbld}$(tput setaf 7)    # white
txtrst=$(tput sgr0)                # Reset
info=${bldwht}*${txtrst}           # Feedback
pass=${bldblu}*${txtrst}
warn=${bldred}*${txtrst}
ques=${bldblu}?${txtrst}
printf "%s=== Crawling $1 === %s" "$bldgreen" "$txtrst" | |
# wget in spider mode, output captured to the wglog file
# --reject ignores specific file types (images, javascript etc.)
wget --spider --recursive --level=1 --include-directories=/site --domains=www.aa.bb.cc --no-parent --no-host-directories --no-directories --restrict-file-names=nocontrol --execute="robots=off" --no-check-certificate --force-html --no-clobber --reject=bmp,css,gif,ico,jpg,jpeg,js,mp3,mp4,pdf,png,swf,svg,txt,xml,xls --user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0" "$1" 2>&1 | tee wglog
printf " %s========================================== \n" "$bldgreen" | |
printf "%s=== Crawl Finished... ===%s \n" "$bldgreen" "$txtrst" | |
printf "%s=== Begin retreiving page titles... ===%s \n" "$bldgreen" "$txtrst" | |
printf "%s========================================== \n" "$dgreen" | |
printf "%s** Run tail -f $1.csv for progress%s \n" "$bldred" "$txtrst" | |
# From wglog, grab the unique URLs,
# then curl each URL and extract its <title>
grep '^--' wglog | awk '{print $3}' | sort -u | while read -r url; do
    printf "%s* Retrieving title for: %s%s%s\n" "$bldgreen" "$txtrst$txtbld" "$url" "$txtrst"
    title=$(curl -s "$url" | sed -n 's/.*<title>\(.*\)<\/title>.*/\1/ip;T;q')
    printf '"%s","%s"\n' "$url" "$title" >> "$2.csv"
done
# clean up log file
rm wglog
exit
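Each row of the resulting CSV pairs a quoted URL with its quoted page title, e.g. (hypothetical values):

"http://www.example.com/site/about/","About Us"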
<?php

/**
 * @file
 * Convert a CSV of URLs and page titles into a UL list and Freemind XML.
 *
 * Run with the built-in server:
 *   php -S localhost:8000
 *   http://localhost:8000/site_crawler_2_xml.php
 * or from the command line:
 *   php site_crawler_2_xml.php
 */
$url_arrays = read_log_file_to_array();

// Use array merge.
$tree = build_tree($url_arrays);
$list = build_list($tree);
echo $list;

// Use references.
$tree = build_tree2($url_arrays);
$list = build_list($tree);
echo $list;

// Create a Freemind sitemap.
$xml_sitemap = new SimpleXMLElement("<map version=\"1.0.1\"></map>");
array_to_xml($tree, $xml_sitemap);
// asXML() writes the file and returns TRUE on success.
$xml_file = $xml_sitemap->asXML('sitemap.mm');
print_r($xml_file);
/**
 * Read a URL list into an array of (directory names, meta) pairs.
 */
function read_log_file_to_array() {
  $data = [];
  $file = fopen("wlog.csv", "r");
  while (!feof($file)) {
    $line = fgetcsv($file);
    // fgetcsv() returns FALSE (or a single NULL field) for blank/EOF lines.
    if ($line === FALSE || $line[0] === NULL) {
      continue;
    }
    $meta = [];
    $path = str_replace('http://', '', $line[0]);
    $path = str_replace(array("\r\n", "\n"), "", $path);
    $path = rtrim($path, '/');
    $dir_names = explode('/', $path);
    $meta['__path'] = $line[0];
    $meta['__title'] = $line[1];
    $data[] = array($dir_names, $meta);
  }
  fclose($file);
  return $data;
}
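// For the hypothetical CSV row "http://www.example.com/site/about/","About Us"
// the function above appends an element shaped like:
// [
//   ['www.example.com', 'site', 'about'],
//   ['__path' => 'http://www.example.com/site/about/', '__title' => 'About Us'],
// ]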
/**
 * Build a nested tree from the path arrays using array_merge_recursive().
 */
function build_tree($paths) {
  $array = [];
  foreach ($paths as $path) {
    // Build each branch from the leaf upwards, then merge it into the tree.
    $reverse_dir_order = array_reverse($path[0]);
    $first = TRUE;
    foreach ($reverse_dir_order as $dir) {
      $temp = [];
      if ($first) {
        $temp[$dir]['__title'] = $path[1]['__title'];
        $temp[$dir]['__path'] = $path[1]['__path'];
        $first = FALSE;
      }
      else {
        $temp[$dir] = $prev;
      }
      $prev = $temp;
    }
    $array = array_merge_recursive($array, $temp);
  }
  return $array;
}
/**
 * Build a nested tree from the path arrays by walking references.
 */
function build_tree2($path_list) {
  $path_tree = array();
  foreach ($path_list as $path_data) {
    $last_dir =& $path_tree;
    // Descend into the tree, creating one level per directory name.
    foreach ($path_data[0] as $dir) {
      if (!isset($last_dir[$dir])) {
        $last_dir[$dir] = NULL;
      }
      $last_dir =& $last_dir[$dir];
    }
    $last_dir['__title'] = $path_data[1]['__title'];
    $last_dir['__path'] = $path_data[1]['__path'];
  }
  return $path_tree;
}
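// Both builders produce the same shape for unique paths, e.g.:
// ['www.example.com' => ['site' => ['about' => ['__title' => ..., '__path' => ...]]]]
// The reference-based walk simply overwrites the meta for a repeated path,
// whereas array_merge_recursive() would merge the duplicate '__title'/'__path'
// strings into arrays.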
/**
 * Render the tree as a nested HTML UL list.
 */
function build_list($tree, $prefix = '') {
  $ul = '';
  foreach ($tree as $key => $value) {
    $li = '';
    if (is_array($value)) {
      if (array_key_exists('__title', $value)) {
        $li .= "$prefix$key/ <a href=\"http://$prefix$key/\">{$value['__title']}</a>";
      }
      else {
        $li .= "$prefix$key/";
      }
      $li .= build_list($value, "$prefix$key/");
      $ul .= strlen($li) ? "<li>$li</li>" : '';
    }
  }
  return strlen($ul) ? "<ul>$ul</ul>" : '';
}
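// Hypothetical output for the single-row tree above; note each item repeats
// the accumulated path prefix:
// <ul><li>www.example.com/<ul><li>www.example.com/site/<ul>
// <li>www.example.com/site/about/ <a href="http://www.example.com/site/about/">About Us</a></li>
// </ul></li></ul></li></ul>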
/**
 * Convert the tree into nested Freemind <node> elements.
 */
function array_to_xml($array, &$xml_user_info) {
  foreach ($array as $key => $value) {
    if (is_array($value)) {
      if ($key !== '__title' && $key !== '__path') {
        $subnode = $xml_user_info->addChild("node");
        // Directories that were not crawled as pages carry no meta.
        if (isset($value['__path'])) {
          $subnode->addAttribute('LINK', $value['__path']);
        }
        if (isset($value['__title'])) {
          $subnode->addAttribute('TEXT', $value['__title']);
        }
        array_to_xml($value, $subnode);
      }
      else {
        $subnode = $xml_user_info->addChild("item$key");
        array_to_xml($value, $subnode);
      }
    }
  }
}
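For the same hypothetical input, sitemap.mm comes out along these lines; nodes for directories that were never crawled as pages carry no LINK/TEXT attributes:

<?xml version="1.0"?>
<map version="1.0.1"><node><node><node LINK="http://www.example.com/site/about/" TEXT="About Us"/></node></node></map>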