-
-
Save egobude/861fb9879dd47a3db8082c4c46774fd9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace Acme\Bundle\Command; | |
use Symfony\Bundle\FrameworkBundle\Command\ContainerAwareCommand; | |
use Symfony\Component\Console\Input\InputArgument; | |
use Symfony\Component\Console\Input\InputOption; | |
use Symfony\Component\Console\Input\InputInterface; | |
use Symfony\Component\Console\Output\OutputInterface; | |
use Symfony\Component\HttpFoundation\RedirectResponse; | |
use Symfony\Component\DomCrawler\Crawler; | |
use Symfony\Component\HttpKernel\Client; | |
use Symfony\Component\BrowserKit\Cookie; | |
use Symfony\Component\Security\Core\Authentication\Token\UsernamePasswordToken; | |
/**
 * Console command that crawls the Acme site in-process.
 *
 * A second kernel is booted in the "test" environment so its `test.client`
 * service can issue requests without a web server. The client is logged in as
 * the given user, then every same-domain link reachable from the starting
 * link is visited until the queue is empty or the page limit is reached.
 *
 * @author Joe Sexton
 */
class SiteCrawlerCommand extends ContainerAwareCommand
{
    /**
     * Upper bound on consecutive redirects followed for a single request,
     * so that a redirect loop cannot hang the crawl.
     */
    const MAX_REDIRECTS = 10;

    /**
     * @var OutputInterface
     */
    protected $output;

    /**
     * The `router` service of the command's container.
     *
     * @var Router
     */
    protected $router;

    /**
     * The `doctrine.orm.entity_manager` service of the command's container.
     *
     * @var EntityManager
     */
    protected $entityManager;

    /**
     * Host of the starting link; only links on this host are crawled.
     *
     * @var string
     */
    protected $domain = null;

    /**
     * @var string
     */
    protected $username = null;

    /**
     * @var string
     */
    protected $securityFirewall = null;

    /**
     * Maximum number of pages to crawl (starting page included).
     *
     * @var integer
     */
    protected $searchLimit;

    /**
     * index routes containing these keywords only once
     *
     * @var array
     */
    protected $ignoredRouteKeywords = array();

    /**
     * Every in-domain link discovered so far, keyed by URL; each value is an
     * array with a `route` key holding the matched route name ('' if none).
     *
     * @var array
     */
    protected $domainLinks = array();

    /**
     * Queue of discovered URLs that still have to be crawled.
     *
     * @var array
     */
    protected $linksToProcess = array();

    /**
     * Declares the command name, arguments, options and help text.
     */
    protected function configure()
    {
        $this
            ->setName( 'crawler:crawl' )
            ->setDescription( 'Crawls the Acme website.' )
            ->setDefinition(array(
                new InputArgument( 'startingLink', InputArgument::REQUIRED, 'Link to start crawling' ),
                new InputArgument( 'username', InputArgument::REQUIRED, 'Username' ),
                new InputOption( 'limit', null, InputOption::VALUE_REQUIRED, 'Limit the number of links to process, prevents infinite crawling', 20 ),
                new InputOption( 'security-firewall', null, InputOption::VALUE_REQUIRED, 'Firewall name', 'default_firewall' ),
                new InputOption( 'ignore-duplicate-keyword', null, InputOption::VALUE_IS_ARRAY|InputOption::VALUE_REQUIRED, 'Index routes containing this keyword only one time (prevents infinite crawling of routes containng query parameters)', array() ),
            ))
            ->setHelp(<<<EOT
The <info>crawler:crawl</info> command crawls the Acme website:
<info>php app/console crawler:crawl <startingLink> <username></info>
EOT
            );
    }

    /**
     * Runs the crawl: reads the input, authenticates the test client, visits
     * the starting link and then every queued link until the queue is empty
     * or the page limit is reached, and finally prints all links found.
     *
     * @param InputInterface  $input
     * @param OutputInterface $output
     * @throws \InvalidArgumentException when the starting link has no host
     * @throws \RuntimeException         when the user cannot be found
     * @todo   use product sitemap to crawl product pages
     */
    protected function execute( InputInterface $input, OutputInterface $output )
    {
        // user input
        $startingLink = $input->getArgument( 'startingLink' );
        $this->domain = parse_url( $startingLink, PHP_URL_HOST );

        // Without a host every discovered link would be treated as outbound
        // and silently dropped, so refuse to start instead.
        if ( empty( $this->domain ) ) {
            throw new \InvalidArgumentException( sprintf(
                'Could not determine a host from the starting link "%s"; please pass an absolute URL.',
                $startingLink
            ) );
        }

        $this->username             = $input->getArgument( 'username' );
        $this->searchLimit          = (int) $input->getOption( 'limit' );
        $this->securityFirewall     = $input->getOption( 'security-firewall' );
        $this->ignoredRouteKeywords = $input->getOption( 'ignore-duplicate-keyword' );
        $this->output               = $output;
        $this->router               = $this->getContainer()->get( 'router' );
        $this->entityManager        = $this->getContainer()->get( 'doctrine.orm.entity_manager' );

        // start
        $output->writeln('
<info>A super-duper web crawler written by:
___ _____ _
|_ | / ___| | |
| | ___ ___ \ `--. _____ _| |_ ___ _ __
| |/ _ \ / _ \ `--. \/ _ \ \/ / __/ _ \| |_ \
/\__/ / (_) | __/ /\__/ / __/> <| || (_) | | | |
\____/ \___/ \___| \____/ \___/_/\_\\__\___/|_| |_|
</info>');

        // config
        $kernel = $this->_createKernel();
        $client = $kernel->getContainer()->get( 'test.client' );
        $this->_authenticate( $kernel, $client );

        // start crawling
        $output->writeln( sprintf( 'Dominating <comment>%s</comment>, starting at <comment>%s</comment>. At most, <comment>%s</comment> pages will be crawled.', $this->domain, $startingLink, $this->searchLimit ) );

        // crawl starting link
        $crawler = $this->_requestFollowingRedirects( $client, $startingLink );
        $this->_processLinksOnPage( $crawler, $startingLink );
        $pagesCrawled = 1;

        // crawl links found; the counter only advances for pages actually
        // requested so the limit and the final tally are both exact
        while ( ! empty( $this->linksToProcess ) && $pagesCrawled < $this->searchLimit ) {
            $client->getHistory()->clear(); // prevent out of memory errors...

            $url = array_pop( $this->linksToProcess );
            $output->writeln( 'Processing: '.$url );

            $crawler = $this->_requestFollowingRedirects( $client, $url );
            $this->_processLinksOnPage( $crawler, $url );
            $pagesCrawled++;
        }

        // boom, done
        $output->writeln( 'All Links Found:' );
        foreach ( $this->domainLinks as $link => $linkDetails ) {
            $output->writeln( ' '.$link.' : '.$linkDetails['route'] );
        }
        $output->writeln( $pagesCrawled.' links dominated, too easy...' );
    }

    /**
     * Prompts for the starting link and the username when they were not
     * given on the command line.
     *
     * NOTE(review): relies on the `dialog` console helper, which only exists
     * in older Symfony versions — confirm against the installed version.
     *
     * @param InputInterface  $input
     * @param OutputInterface $output
     */
    protected function interact( InputInterface $input, OutputInterface $output )
    {
        if ( ! $input->getArgument( 'startingLink' ) ) {
            $startingLink = $this->getHelper( 'dialog' )->askAndValidate(
                $output,
                'Please enter the link to start at(including the locale):',
                function( $startingLink ) {
                    if ( empty( $startingLink ) ) {
                        throw new \Exception('starting link can not be empty');
                    }
                    return $startingLink;
                }
            );
            $input->setArgument( 'startingLink', $startingLink );
        }

        if ( ! $input->getArgument( 'username' ) ) {
            $username = $this->getHelper( 'dialog' )->askAndValidate(
                $output,
                'Please choose a username:',
                function( $username ) {
                    if ( empty( $username ) ) {
                        throw new \Exception( 'Username can not be empty' );
                    }
                    return $username;
                }
            );
            $input->setArgument( 'username', $username );
        }
    }

    /**
     * Boots a separate application kernel in the "test" environment (debug
     * on); its container provides the `test.client` used for crawling.
     *
     * @return \AppKernel
     */
    protected function _createKernel() {
        $rootDir = $this->getContainer()->get( 'kernel' )->getRootDir();
        require_once( $rootDir . '/AppKernel.php' );

        $kernel = new \AppKernel( 'test', true );
        $kernel->boot();

        return $kernel;
    }

    /**
     * authenticate with a user account to access secured urls
     *
     * Stores a serialized security token in the client's session under the
     * configured firewall key and hands the session cookie to the client.
     *
     * @param AppKernel $kernel unused; kept for interface compatibility
     * @param Client    $client
     * @throws \RuntimeException when no user matches the configured username
     */
    protected function _authenticate( $kernel, $client ) {
        // however you retrieve a user in your application
        $user = $this->entityManager->getRepository( 'Entity:User' )->findOneByUsername( $this->username );
        if ( null === $user ) {
            throw new \RuntimeException( sprintf( 'User "%s" could not be found.', $this->username ) );
        }

        $token = new UsernamePasswordToken( $user, null, $this->securityFirewall, $user->getRoles() );

        // set session
        $session = $client->getContainer()->get('session');
        $session->set('_security_'.$this->securityFirewall, serialize($token));
        $session->save();

        // set cookie
        $cookie = new Cookie($session->getName(), $session->getId());
        $client->getCookieJar()->set($cookie);
    }

    /**
     * Requests a URL with GET and follows redirect responses, giving up
     * after MAX_REDIRECTS hops so a redirect loop cannot stall the crawl.
     *
     * @param Client $client
     * @param string $url
     * @return Crawler crawler for the last response received
     */
    protected function _requestFollowingRedirects( $client, $url ) {
        $crawler = $client->request( 'GET', $url );
        $hops    = 0;

        while ( $client->getResponse() instanceof RedirectResponse ) {
            if ( $hops >= self::MAX_REDIRECTS ) {
                $this->output->writeln( sprintf( '<comment>Gave up following redirects for %s after %d hops.</comment>', $url, $hops ) );
                break;
            }
            $crawler = $client->followRedirect();
            $hops++;
        }

        return $crawler;
    }

    /**
     * get all links on the page as an array of urls
     *
     * Every anchor found is echoed to the output; only http/https links on
     * the crawled domain are returned.
     *
     * @param Crawler $crawler
     * @return array
     */
    protected function _getLinksOnCurrentPage( Crawler $crawler ) {
        $links = $crawler->filter( 'a' )->each( function ( Crawler $node, $i ) {
            return $node->link()->getUri();
        });

        // remove outbound links
        $domainLinks = array();
        foreach ( $links as $link ) {
            $this->output->writeln( 'Link: '.$link );

            // parse_url() returns false for seriously malformed URLs
            $linkParts = parse_url( $link );
            if ( ! is_array( $linkParts ) || empty( $linkParts['host'] ) || empty( $linkParts['scheme'] ) ) {
                continue;
            }
            if ( $linkParts['host'] !== $this->domain ) {
                continue;
            }
            // mailto:, javascript:, ftp: etc. cannot be crawled
            if ( ! in_array( strtolower( $linkParts['scheme'] ), array( 'http', 'https' ), true ) ) {
                continue;
            }

            $domainLinks[] = $link;
        }

        return $domainLinks;
    }

    /**
     * process all links on a page
     *
     * @param Crawler $crawler
     * @param string  $currentUrl
     */
    protected function _processLinksOnPage( Crawler $crawler, $currentUrl ) {
        foreach ( $this->_getLinksOnCurrentPage( $crawler ) as $link ) {
            $this->_processSingleLink( $link, $currentUrl );
        }
    }

    /**
     * process a single link
     *
     * Links seen before are ignored. A new link is queued for crawling
     * unless it is a repeat of an "index once" route, and is always recorded
     * in the list of discovered links.
     *
     * @param string $link
     * @param string $currentUrl
     */
    protected function _processSingleLink( $link, $currentUrl ) {
        if ( isset( $this->domainLinks[$link] ) ) {
            return;
        }

        // check for routes that should only be indexed once
        // do this before we add the link to the domainLinks array since we check that array for duplicates...
        if ( ! $this->_isDuplicateIgnoredRoute( $link ) ) {
            $this->linksToProcess[] = $link;
        }

        // add details to the domainLinks array
        $this->domainLinks[$link] = array(
            'route' => $this->_resolveRouteName( $link ),
            // any other details about a link you would like to know; ie. number of occurances, pages found on, etc...
        );
    }

    /**
     * Returns the name of the route matching the link's path, or '' when
     * the router has no match for it.
     *
     * @param string $link
     * @return string
     */
    protected function _resolveRouteName( $link ) {
        // URLs without a path component (e.g. "http://host") yield null
        $path = parse_url( $link, PHP_URL_PATH );
        if ( ! is_string( $path ) || '' === $path ) {
            $path = '/';
        }

        // the matcher throws instead of returning an empty result when no
        // route fits; such links are still recorded, just without a name
        try {
            $route = $this->router->match( $path );
        } catch ( \Symfony\Component\Routing\Exception\ResourceNotFoundException $e ) {
            return '';
        } catch ( \Symfony\Component\Routing\Exception\MethodNotAllowedException $e ) {
            return '';
        }

        return ( ! empty( $route['_route'] ) ) ? $route['_route'] : '';
    }

    /**
     * Tells whether a link with the given route name has already been
     * discovered.
     *
     * @param string $routeName
     * @return boolean
     */
    protected function _routeIsInQueue( $routeName ) {
        // check each existing link for the same route
        foreach ( $this->domainLinks as $existingLink ) {
            if ( $existingLink['route'] === $routeName ) {
                return true;
            }
        }

        return false;
    }

    /**
     * Tells whether the link belongs to an "index only once" route that has
     * already been discovered.
     *
     * @param string $newLink
     * @return boolean
     */
    protected function _isDuplicateIgnoredRoute( $newLink ) {
        $routeName = $this->_resolveRouteName( $newLink );

        // if the route name contains an ignored route keyword, check if it's in the queue of links to process
        foreach ( $this->ignoredRouteKeywords as $keyword ) {
            // quote the keyword so "/" or other regex metacharacters in it
            // are matched literally instead of breaking the pattern
            $pattern = '/'.preg_quote( $keyword, '/' ).'/';
            if ( preg_match( $pattern, $routeName ) === 1 ) {
                return $this->_routeIsInQueue( $routeName );
            }
        }

        return false;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment