Simple crawler in Go
// originally found @ http://venkat.io/posts/concurrent-crawling/
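
// A bounded concurrent crawler: a fixed pool of workers pulls URLs from a
// buffered channel that serves as the work queue, while a second buffered
// channel acts as a counter capping the total number of page fetches.
//
// Usage (assuming the file is saved as crawler.go):
//   go run crawler.go <seed-url> <num-workers> <max-pages>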
package main
// Everything below comes from the standard library, but the crawler still
// needs quite a few imports.
import (
	"crypto/md5"
	"fmt"
	"io"
	"io/ioutil"
	"net/http"
	"os"
	"regexp"
	"runtime"
	"strconv"
	"sync"
)
var source = os.Args[1]                      // seed URL to start crawling from
var numWorkers, _ = strconv.Atoi(os.Args[2]) // how many worker goroutines to run
var numToCrawl, _ = strconv.Atoi(os.Args[3]) // maximum number of pages to fetch
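// Note: the Atoi errors are ignored; a non-numeric argument silently yields
// zero workers or a zero-page budget.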
var crawled = make(chan int, numToCrawl)  // buffered channel used as a counter of page fetches
var links = make(chan string, numToCrawl) // buffered channel used as the queue of links
func doWork(link string, crawlerID int) {
	//fmt.Println("crawling", crawlerID, link)
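	// Note: regex-based link extraction is simplistic; it only picks up
	// double-quoted absolute http(s) links and ignores relative URLs. A real
	// crawler would use an HTML parser instead.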
	re := regexp.MustCompile(`<a href="(http.*?)"`)
	resp, err := http.Get(link)
	if err != nil {
		return
	}
	defer resp.Body.Close()
	content, _ := ioutil.ReadAll(resp.Body)
	contentString := string(content)
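	// The page body is hashed, but the digest is discarded; it reads like a
	// placeholder for content de-duplication (see the sketch after the code).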
	h := md5.New()
	io.WriteString(h, contentString)
	var _ = h.Sum(nil)
	// Try to add each discovered link to the queue. If the queue is full, the
	// default case returns early: there is no point queueing more links than
	// the fetch budget allows anyway.
	for _, match := range re.FindAllStringSubmatch(contentString, -1) {
		select {
		case links <- match[1]:
		default:
			return
		}
	}
}
func worker(crawlerID int) {
	// If the crawled channel's buffer is full, the fetch budget is exhausted,
	// so there is no more work to do.
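	// Caveat: the receive from links below blocks. If the queue drains before
	// the budget is used up (e.g. a page with no outgoing links), every worker
	// can end up blocked here and the program deadlocks; a production version
	// would need a shutdown signal.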
	for {
		select {
		case crawled <- 1:
			doWork(<-links, crawlerID)
		default:
			return
		}
	}
}
func main() {
	var _ = fmt.Println // keep fmt imported for the debug print in doWork
	// Let the workers use all the logical CPUs on the machine.
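	// (Since Go 1.5 this is the default, so the call is redundant but harmless.)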
	runtime.GOMAXPROCS(runtime.NumCPU())
	var wg sync.WaitGroup
	links <- source
	for i := 0; i < numWorkers; i++ {
		// Increment the WaitGroup counter.
		wg.Add(1)
		// Launch a worker goroutine.
		go func(crawlerID int) {
			// Decrement the counter when the goroutine completes.
			defer wg.Done()
			worker(crawlerID)
		}(i)
	}
	// Wait for all the workers to finish.
	wg.Wait()
	close(crawled)
	close(links)
}
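
The md5 digest computed in doWork is never consulted; it reads like a placeholder for de-duplicating page content. Below is a minimal sketch of how that check could be wired in, assuming a hypothetical package-level seen set guarded by a mutex in the same file (neither name appears in the original):

var (
	seenMu sync.Mutex
	seen   = make(map[[md5.Size]byte]bool) // digests of page bodies already processed
)

// alreadySeen records the digest and reports whether it had been seen before.
func alreadySeen(sum [md5.Size]byte) bool {
	seenMu.Lock()
	defer seenMu.Unlock()
	if seen[sum] {
		return true
	}
	seen[sum] = true
	return false
}

Inside doWork, a guard such as if alreadySeen(md5.Sum(content)) { return }, placed right after the body is read, would then skip pages whose content has already been crawled.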