Created
May 20, 2025 07:59
-
-
Save bhardwajRahul/adf30bbab41240afdbedc69b95e64bd9 to your computer and use it in GitHub Desktop.
Go concurrency web crawler
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"sync" | |
) | |
// ---------------- INTERFACES AND STRUCTS ----------------

// Fetcher is the abstraction the crawler uses to load a page.
//
// Fetch takes the URL to load and returns the page body together with the
// URLs found on that page. A non-nil err means the page could not be
// fetched; the crawler reports the error and does not follow any links.
type Fetcher interface {
	Fetch(url string) (body string, urls []string, err error)
}
// Visited is a mutex-guarded set of URLs that have already been claimed for
// crawling. The zero value is ready to use: the underlying map is allocated
// on first use.
type Visited struct {
	mu      sync.Mutex      // guards visited
	visited map[string]bool // keys are the URLs recorded so far
}

// isVisited reports whether url has already been recorded. If it has not,
// the URL is recorded before returning, so for any given URL only the first
// call returns false and every later call returns true.
//
// The lookup and the insert happen under one lock acquisition, so two
// goroutines racing on the same URL cannot both receive false.
func (v *Visited) isVisited(url string) bool {
	v.mu.Lock()
	defer v.mu.Unlock()

	if v.visited == nil {
		// A zero-value Visited carries a nil map, and writing to a nil map
		// panics; allocate it lazily instead.
		v.visited = make(map[string]bool)
	}
	if _, seen := v.visited[url]; seen {
		return true
	}
	v.visited[url] = true
	return false
}
// ---------------- CRAWLER FUNCTION ---------------- | |
// crawl fetches a URL and recursively spawns goroutines for its child URLs | |
func crawl(url string, fetcher Fetcher, visited *Visited, wg *sync.WaitGroup, ch chan struct{}) { | |
defer wg.Done() // Decrement WaitGroup counter when this function completes | |
// Check if this URL has already been visited | |
if visited.isVisited(url) { | |
return // Skip if already visited | |
} | |
// Acquire a concurrency token (this limits how many goroutines run at once) | |
<-ch | |
defer func() { ch <- struct{}{} }() // Return the token when done, using defer for safety | |
// Fetch the URL's body and list of URLs it links to | |
body, urls, err := fetcher.Fetch(url) | |
if err != nil { | |
// Print error if fetch failed, then return | |
fmt.Printf("error fetching %s: %v\n", url, err) | |
return | |
} | |
// Print the result of the successful fetch | |
fmt.Printf("found: %s %q\n", url, body) | |
// Launch new goroutines for each discovered URL | |
for _, u := range urls { | |
wg.Add(1) // Increment WaitGroup counter for the new goroutine | |
go crawl(u, fetcher, visited, wg, ch) // Start crawling the linked URL | |
} | |
} | |
// ---------------- MAIN FUNCTION ---------------- | |
func main() { | |
// Create a Visited tracker to avoid revisiting the same URL | |
visited := Visited{visited: make(map[string]bool)} | |
// Create a WaitGroup to wait for all crawling goroutines to finish | |
var wg sync.WaitGroup | |
// Create a buffered channel to act as a semaphore limiting concurrency to 20 | |
concurrencyLimit := 20 | |
ch := make(chan struct{}, concurrencyLimit) | |
// Fill the channel with 20 empty tokens (struct{} is used because it uses 0 bytes) | |
for i := 0; i < concurrencyLimit; i++ { | |
ch <- struct{}{} | |
} | |
// Start crawling from the initial URL | |
wg.Add(1) | |
go crawl("https://golang.org/", fetcher, &visited, &wg, ch) | |
// Wait until all crawling is complete | |
wg.Wait() | |
} | |
// ---------------- FAKE FETCHER IMPLEMENTATION ----------------

// fakeFetcher is a Fetcher backed by a fixed in-memory table that maps each
// known URL to its canned result.
type fakeFetcher map[string]*fakeResult

// fakeResult is the canned response for one fake page.
type fakeResult struct {
	body string   // page body returned by Fetch
	urls []string // URLs the page links to
}

// Fetch returns the canned body and links stored for url.
//
// If url has no entry, or its entry is nil, Fetch returns an empty body, nil
// links and a "not found" error naming the URL. Lookups on a nil fakeFetcher
// behave the same way, since reading a nil map is allowed.
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	// A nil entry is treated as missing rather than dereferenced, which
	// would panic.
	if res, ok := f[url]; ok && res != nil {
		return res.body, res.urls, nil
	}
	return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a mock implementation of the Fetcher interface | |
var fetcher = fakeFetcher{ | |
"https://golang.org/": &fakeResult{ | |
body: "The Go Programming Language", | |
urls: []string{ | |
"https://golang.org/pkg/", | |
"https://golang.org/cmd/", | |
}, | |
}, | |
"https://golang.org/pkg/": &fakeResult{ | |
body: "Packages", | |
urls: []string{ | |
"https://golang.org/", | |
"https://golang.org/cmd/", | |
"https://golang.org/pkg/fmt/", | |
"https://golang.org/pkg/os/", | |
}, | |
}, | |
"https://golang.org/pkg/fmt/": &fakeResult{ | |
body: "Package fmt", | |
urls: []string{ | |
"https://golang.org/", | |
"https://golang.org/pkg/", | |
}, | |
}, | |
"https://golang.org/pkg/os/": &fakeResult{ | |
body: "Package os", | |
urls: []string{ | |
"https://golang.org/", | |
"https://golang.org/pkg/", | |
}, | |
}, | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment