Skip to content

Instantly share code, notes, and snippets.

@bhardwajRahul
Created May 20, 2025 07:59
Show Gist options
  • Save bhardwajRahul/adf30bbab41240afdbedc69b95e64bd9 to your computer and use it in GitHub Desktop.
Save bhardwajRahul/adf30bbab41240afdbedc69b95e64bd9 to your computer and use it in GitHub Desktop.
Go concurrency web crawler
package main
import (
"fmt"
"sync"
)
// ---------------- INTERFACES AND STRUCTS ----------------
// Fetcher is the page source the crawler reads from. crawl depends only on
// this interface, so any implementation can be supplied.
type Fetcher interface {
// Fetch looks up url and returns the page body together with the URLs that
// page links to. A non-nil err means the page could not be fetched; crawl
// then reports the error and ignores body and urls.
Fetch(url string) (body string, urls []string, err error)
}
// Visited is a concurrency-safe set of URLs that have already been claimed
// for crawling.
//
// The zero value is ready to use: the underlying map is allocated on the
// first call to isVisited. A Visited must not be copied after first use
// because it contains a sync.Mutex; pass it by pointer.
type Visited struct {
	mu      sync.Mutex      // guards visited
	visited map[string]bool // URLs already claimed; allocated lazily
}

// isVisited reports whether url was already recorded and, if it was not,
// records it before returning.
//
// The check and the insert happen under a single lock acquisition, so when
// several goroutines race on the same url exactly one of them observes false
// and all the others observe true.
func (v *Visited) isVisited(url string) bool {
	v.mu.Lock()
	defer v.mu.Unlock()

	// Reading from a nil map is safe, so the lookup can run before the
	// lazy allocation below.
	if _, seen := v.visited[url]; seen {
		return true
	}

	// Allocate on demand so a zero-value Visited does not panic on the
	// write: assigning into a nil map is a run-time panic.
	if v.visited == nil {
		v.visited = make(map[string]bool)
	}
	v.visited[url] = true
	return false
}
// ---------------- CRAWLER FUNCTION ----------------
// crawl fetches url, reports the outcome on standard output, and starts one
// new goroutine per link the fetcher returned for it.
//
// Every call must be preceded by a matching wg.Add(1); crawl calls wg.Done
// when it returns. URLs that visited has already recorded are skipped
// without fetching.
//
// ch is a pool of tokens: crawl receives one before calling fetcher.Fetch
// and sends it back when it returns. The number of tokens placed in ch
// therefore bounds how many fetches are in flight at once; goroutines that
// are merely waiting for a token are not bounded.
func crawl(url string, fetcher Fetcher, visited *Visited, wg *sync.WaitGroup, ch chan struct{}) {
	defer wg.Done()

	if alreadySeen := visited.isVisited(url); alreadySeen {
		return
	}

	// Take a token; the deferred send hands it back on every return path.
	token := <-ch
	defer func() { ch <- token }()

	body, links, fetchErr := fetcher.Fetch(url)
	if fetchErr != nil {
		fmt.Printf("error fetching %s: %v\n", url, fetchErr)
		return
	}
	fmt.Printf("found: %s %q\n", url, body)

	// Register every child with the WaitGroup before starting any of them.
	// This goroutine has not called Done yet, so the counter cannot reach
	// zero in between.
	wg.Add(len(links))
	for i := range links {
		go crawl(links[i], fetcher, visited, wg, ch)
	}
}
// ---------------- MAIN FUNCTION ----------------
// main crawls the link graph served by the package-level fetcher, starting
// at https://golang.org/, and returns once every crawl goroutine has finished.
func main() {
	// Upper bound on the number of fetches in flight at the same time.
	const concurrencyLimit = 20

	// Token pool used as a semaphore. crawl takes a token before fetching
	// and puts it back afterwards, so the pool has to start out full.
	// struct{} is used because the tokens carry no data.
	ch := make(chan struct{}, concurrencyLimit)
	for len(ch) < cap(ch) {
		ch <- struct{}{}
	}

	// Shared record of claimed URLs so no page is fetched twice.
	visited := &Visited{visited: make(map[string]bool)}

	var wg sync.WaitGroup

	// Account for the root crawl before starting it so Wait cannot return
	// before that goroutine has had a chance to run.
	wg.Add(1)
	go crawl("https://golang.org/", fetcher, visited, &wg, ch)

	wg.Wait()
}
// ---------------- FAKE FETCHER IMPLEMENTATION ----------------
// fakeFetcher is a Fetcher backed by canned results held in memory, keyed
// by URL.
type fakeFetcher map[string]*fakeResult

// fakeResult is the canned response for a single fake page.
type fakeResult struct {
	body string   // page body handed back by Fetch
	urls []string // links reported for the page
}

// Fetch returns the canned body and links registered for url.
//
// A url that has no entry, or whose entry is nil, yields an empty body, nil
// links and a "not found" error naming the url. The returned slice is the
// one stored in the map, not a copy, so callers should treat it as read-only.
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	// Indexing a missing key yields a nil pointer, so a single nil check
	// covers both an absent entry and an explicitly nil one (which would
	// otherwise panic when dereferenced).
	res := f[url]
	if res == nil {
		return "", nil, fmt.Errorf("not found: %s", url)
	}
	return res.body, res.urls, nil
}
// fetcher is the canned link graph that main crawls: a fakeFetcher holding
// four pages rooted at "https://golang.org/". The pages link back to one
// another, so a crawler walking it meets the same URLs repeatedly and has to
// de-duplicate them.
//
// "https://golang.org/cmd/" is linked from two pages but has no entry of its
// own, so Fetch reports it as not found.
var fetcher = fakeFetcher{
// Root page: the crawl's starting point.
"https://golang.org/": &fakeResult{
body: "The Go Programming Language",
urls: []string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
// Links back to the root and on to the two package pages.
"https://golang.org/pkg/": &fakeResult{
body: "Packages",
urls: []string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
// Leaf page: every link points back to a page listed above.
"https://golang.org/pkg/fmt/": &fakeResult{
body: "Package fmt",
urls: []string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
// Leaf page: every link points back to a page listed above.
"https://golang.org/pkg/os/": &fakeResult{
body: "Package os",
urls: []string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment