package main

import (
	"fmt"
	"sync"
	"time"
)

type Fetcher interface {
	// Fetch returns the body of URL and
	// a slice of URLs found on that page.
	Fetch(url string) (body string, urls []string, err error)
}

type SafeCounter struct {
	v   map[string]bool
	mux sync.Mutex
}

var c = SafeCounter{v: make(map[string]bool)}

// checkvisited reports whether url has already been visited, marking it
// as visited on the first call. The pointer receiver matters: a value
// receiver would copy the mutex and defeat the locking.
func (s *SafeCounter) checkvisited(url string) bool {
	s.mux.Lock()
	defer s.mux.Unlock()
	if !s.v[url] {
		s.v[url] = true
		return false
	}
	return true
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
	if depth <= 0 {
		return
	}
	if c.checkvisited(url) {
		return
	}
	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("found: %s %q\n", url, body)
	for _, u := range urls {
		go Crawl(u, depth-1, fetcher)
	}
}

func main() {
	Crawl("http://golang.org/", 4, fetcher)
	time.Sleep(5 * time.Second)
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
	body string
	urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	if res, ok := f[url]; ok {
		return res.body, res.urls, nil
	}
	return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
	"http://golang.org/": &fakeResult{
		"The Go Programming Language",
		[]string{
			"http://golang.org/pkg/",
			"http://golang.org/cmd/",
		},
	},
	"http://golang.org/pkg/": &fakeResult{
		"Packages",
		[]string{
			"http://golang.org/",
			"http://golang.org/cmd/",
			"http://golang.org/pkg/fmt/",
			"http://golang.org/pkg/os/",
		},
	},
	"http://golang.org/pkg/fmt/": &fakeResult{
		"Package fmt",
		[]string{
			"http://golang.org/",
			"http://golang.org/pkg/",
		},
	},
	"http://golang.org/pkg/os/": &fakeResult{
		"Package os",
		[]string{
			"http://golang.org/",
			"http://golang.org/pkg/",
		},
	},
}
It's been one week since I started learning Go. Four days ago, when I saw this gist, I took completing this exercise without a WaitGroup as a challenge, only to find that somebody had already refactored the code that way. Great implementation @jldiaz 🥇
uMap := make(map[string]bool)
var mut sync.Mutex
c := make(chan struct{}) // Create channel
q := make(chan struct{}) // Quit channel
go Crawl("https://golang.org/", 4, fetcher, &uMap, c, q, &mut)
for i := 0; ; {
select {
case <-c:
i = i + 1
case <-q:
i = i - 1
}
if i == 0 {
break
}
}
@jldiaz I found what seems to be the "official" solution at
https://cs.opensource.google/go/x/website/+/master:tour/solutions/webcrawler.go
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
import (
"errors"
"fmt"
"sync"
)
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
// fetched tracks URLs that have been (or are being) fetched.
// The lock must be held while reading from or writing to the map.
// See https://golang.org/ref/spec#Struct_types section on embedded types.
var fetched = struct {
m map[string]error
sync.Mutex
}{m: make(map[string]error)}
var loading = errors.New("url load in progress") // sentinel value
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
if depth <= 0 {
fmt.Printf("<- Done with %v, depth 0.\n", url)
return
}
fetched.Lock()
if _, ok := fetched.m[url]; ok {
fetched.Unlock()
fmt.Printf("<- Done with %v, already fetched.\n", url)
return
}
// We mark the url to be loading to avoid others reloading it at the same time.
fetched.m[url] = loading
fetched.Unlock()
// We load it concurrently.
body, urls, err := fetcher.Fetch(url)
// And update the status in a synced zone.
fetched.Lock()
fetched.m[url] = err
fetched.Unlock()
if err != nil {
fmt.Printf("<- Error on %v: %v\n", url, err)
return
}
fmt.Printf("Found: %s %q\n", url, body)
done := make(chan bool)
for i, u := range urls {
fmt.Printf("-> Crawling child %v/%v of %v : %v.\n", i, len(urls), url, u)
go func(url string) {
Crawl(url, depth-1, fetcher)
done <- true
}(u)
}
for i, u := range urls {
fmt.Printf("<- [%v] %v/%v Waiting for child %v.\n", url, i, len(urls), u)
<-done
}
fmt.Printf("<- Done with %v\n", url)
}
func main() {
Crawl("https://golang.org/", 4, fetcher)
fmt.Println("Fetching stats\n--------------")
for url, err := range fetched.m {
if err != nil {
fmt.Printf("%v failed: %v\n", url, err)
} else {
fmt.Printf("%v was fetched\n", url)
}
}
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f *fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := (*f)[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = &fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
👋 Learning Go here, so I don't have much know-how with this language yet. I'm getting the same results but with a simpler approach.
Maybe using a struct to hold the fetched URLs would make this more organized (a small sketch of that idea follows below).
I'm not sure channels are needed for this exercise.
If you have any thoughts or comments, or see something wrong, please let me know.
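As a rough illustration of that idea (hypothetical names, assuming import "sync"; it is not used in the code below), the struct could bundle the visited map with its lock:

// fetchedSet owns the set of fetched URLs and the lock that guards it.
type fetchedSet struct {
	mu   sync.Mutex
	urls map[string]bool
}

// visit reports whether url is new, recording it on the first call.
func (f *fetchedSet) visit(url string) bool {
	f.mu.Lock()
	defer f.mu.Unlock()
	if f.urls[url] {
		return false // already fetched
	}
	f.urls[url] = true
	return true // first time we see this URL
}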
package main
import (
"fmt"
"sync"
"time"
"github.com/juniormayhe/currentFilename"
)
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
// lock guards writes to the shared map of visited URLs. It must be shared:
// passing a sync.Mutex by value copies the lock and provides no
// synchronization between goroutines.
var lock sync.Mutex

func setVisited(url string, visited map[string]bool) {
	lock.Lock()
	visited[url] = true
	lock.Unlock()
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, fetched map[string]bool) {
	_, exist := fetched[url]
	if depth <= 0 || exist {
		return
	}
	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		setVisited(url, fetched)
		fmt.Println(err)
		return
	}
	fmt.Printf("found: %s %q\n", url, body)
	for _, u := range urls {
		go Crawl(u, depth-1, fetcher, fetched)
	}
	setVisited(url, fetched)
	time.Sleep(time.Second)
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
func main() {
fmt.Println(currentFilename.GetCurrentFileName(), "\n")
fetchedUrls := make(map[string]bool)
Crawl("https://golang.org/", 4, fetcher, fetchedUrls)
fmt.Println(fetchedUrls)
}
A solution with WaitGroup looks out of scope for "A Tour of Go", really, and the solutions with time.Sleep look too disgusting ))
The tutorial explains mutexes, channels, and maps, so the solution should use only these primitives. None of the solutions here sticks to that limitation.
Here is my solution using chained channels. It's not ideal, since I've only been learning Go for about two weeks, but it works and (I hope) covers these conditions:
package main
import (
"fmt"
"sync"
// "time"
)
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
type UrlCache struct {
mu sync.Mutex
cache map[string]int
}
// IsFetched reports whether url has been seen before,
// recording it as seen on the first call.
func (u *UrlCache) IsFetched(url string) bool {
	u.mu.Lock()
	defer u.mu.Unlock()
	_, fetched := u.cache[url]
	if !fetched {
		u.cache[url]++
	}
	return fetched
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, c *UrlCache, out chan string) {
defer close(out)
if depth <= 0 {
return
}
if !c.IsFetched(url) {
body, urls, err := fetcher.Fetch(url)
if err != nil {
// out <- err.Error()
out <- fmt.Sprintf("%v", err)
return
}
out <- fmt.Sprintf("found: %s %q", url, body)
outs := make([]chan string, len(urls))
for i, u := range urls {
outs[i] = make(chan string)
go Crawl(u, depth-1, fetcher, c, outs[i])
}
for _, ch := range outs {
for msg := range ch {
out <- msg
}
}
}
}
func main() {
cache := UrlCache{cache: make(map[string]int)}
out := make(chan string)
go Crawl("https://golang.org/", 4, fetcher, cache, out)
// time.Sleep(time.Second*1)
for msg := range out {
fmt.Println(msg)
}
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
Why didn't you guys use sync.Map for the safe cache instead of implementing it all manually?
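For illustration, a minimal sketch of a sync.Map-based visited set (just a sketch, with hypothetical names and assuming import "sync"; the same check-and-mark semantics as the mutex-based caches in this thread):

// visited tracks URLs that have already been crawled.
var visited sync.Map

// markVisited reports whether url was already recorded, recording it
// atomically otherwise; LoadOrStore does the check-and-set in one step.
func markVisited(url string) bool {
	_, loaded := visited.LoadOrStore(url, true)
	return loaded
}

The trade-off is that sync.Map is tuned for mostly-read workloads and gives up the static typing of a plain map, which is probably why the tour-style solutions stick with map plus sync.Mutex.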
package main
import (
"fmt"
"sync"
)
type Cache struct {
mu sync.Mutex
urls map[string]bool
}
func (c *Cache) InsertUrls(urls []string) {
c.mu.Lock()
defer c.mu.Unlock()
for _, url := range urls {
c.urls[url] = false
}
}
func (c *Cache) UpdateFetchedUrl(url string) {
c.mu.Lock()
defer c.mu.Unlock()
c.urls[url] = true
}
func (c *Cache) IsUrlFetched(url string) bool {
c.mu.Lock()
defer c.mu.Unlock()
return c.urls[url]
}
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, cache *Cache, wg *sync.WaitGroup) {
defer wg.Done()
if depth <= 0 {
return
}
body, urls, err := fetcher.Fetch(url)
// insert new urls in cache
// and update url fetched
cache.InsertUrls(urls)
cache.UpdateFetchedUrl(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
// check cache if url is already fetched
if !cache.IsUrlFetched(u) {
wg.Add(1)
go Crawl(u, depth-1, fetcher, cache, wg)
}
}
return
}
// synced crawl using sync.Mutex and sync.WaitGroup
func SyncCrawl(url string, depth int, fetcher Fetcher) {
var wg sync.WaitGroup
cache := Cache{urls: make(map[string]bool)}
wg.Add(1)
go Crawl(url, depth, fetcher, &cache, &wg)
wg.Wait()
}
func main() {
SyncCrawl("https://golang.org/", 4, fetcher)
fmt.Println("done")
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
package main
import (
"fmt"
"sync"
"time"
)
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
type safemap struct {
m sync.Mutex
mm map[string]bool
}
var safe = safemap{mm: make(map[string]bool)}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
	// Mark this URL as visited under the lock before fetching it.
	safe.m.Lock()
	safe.mm[url] = true
	safe.m.Unlock()
	if depth <= 0 {
		return
	}
	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("found: %s %q\n", url, body)
	for _, u := range urls {
		// Read the visited map under the lock too; an unguarded read
		// here would race with the writes above.
		safe.m.Lock()
		_, visited := safe.mm[u]
		safe.m.Unlock()
		if !visited {
			go Crawl(u, depth-1, fetcher)
		}
	}
}
func main() {
Crawl("https://golang.org/", 4, fetcher)
time.Sleep(time.Second)
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
Could you guys give me some feedback on this code?
Here is my answer, using channels.
package main
import (
"fmt"
"sync"
)
type Fetcher interface {
Fetch(url string) (body string, urls []string, err error)
}
type SafeCounter struct {
mu sync.Mutex
v map[string]bool
}
func (c *SafeCounter) Inc(key string) {
c.mu.Lock()
defer c.mu.Unlock()
c.v[key] = true
}
func (c *SafeCounter) Visited(key string) bool {
c.mu.Lock()
defer c.mu.Unlock()
return c.v[key]
}
func Crawl(url string, depth int, fetcher Fetcher, ch chan string, wg *sync.WaitGroup, c *SafeCounter) error {
defer func() {
if r := recover(); r != nil {
fmt.Println("Recovered in Crawl:", r)
}
}()
if depth <= 0 || c.Visited(url) {
return nil
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
return err
}
wg.Add(len(urls)) // Add the number of URLs to crawl before starting to crawl them
ch <- url + " " + "'" + body + "'"
c.Inc(url)
for _, u := range urls {
go func(u string) {
defer wg.Done()
Crawl(u, depth-1, fetcher, ch, wg, c)
}(u)
}
return nil
}
func main() {
ch := make(chan string)
c := SafeCounter{v: make(map[string]bool)}
var wg sync.WaitGroup
// Increment the WaitGroup before starting the crawl
wg.Add(1)
go func() {
defer wg.Done()
if err := Crawl("https://golang.org/", 4, fetcher, ch, &wg, &c); err != nil {
fmt.Println(err)
}
}()
go func() {
wg.Wait()
close(ch)
}()
for result := range ch {
fmt.Printf("found: %s \n", result)
}
wg.Wait() // Wait for all goroutines to finish before exiting
//fmt.Println(c.v)
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
Here's my solution. Pretty neat imo!
package main
import (
"fmt"
"sync"
)
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
type SafeMap struct {
mu sync.Mutex
m map[string]string
}
func (sm *SafeMap) addCache(url string) {
sm.mu.Lock()
defer sm.mu.Unlock()
sm.m[url] = url
}
func (sm *SafeMap) checkCache(url string) bool {
sm.mu.Lock()
defer sm.mu.Unlock()
_, ok := sm.m[url]
return ok
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, sm *SafeMap) {
defer wg.Done()
if depth <= 0 {
return
}
if sm.checkCache(url) {
return
}
sm.addCache(url)
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
wg.Add(1)
go Crawl(u, depth-1, fetcher, sm)
}
}
var wg sync.WaitGroup
func main() {
sm := SafeMap{m: make(map[string]string)}
wg.Add(1)
Crawl("https://golang.org/", 4, fetcher, &sm)
wg.Wait()
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"https://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"https://golang.org/pkg/",
"https://golang.org/cmd/",
},
},
"https://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"https://golang.org/",
"https://golang.org/cmd/",
"https://golang.org/pkg/fmt/",
"https://golang.org/pkg/os/",
},
},
"https://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
"https://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"https://golang.org/",
"https://golang.org/pkg/",
},
},
}
@zurcacielos @dilyanpalauzov I've found an implementation that uses only channels, maps, and mutexes. I ended up using these primitives to build my own WaitGroup implementation.
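For what it's worth, the core of that idea is small. Here is a rough sketch of a WaitGroup-like counter built only from a mutex and a channel (my own illustration, not the implementation I found; it assumes import "sync", that Add is called before the matching goroutines start, and that the group is not reused):

// waitGroup is a minimal WaitGroup-like counter: Add and Done adjust the
// count under a mutex, and Wait blocks on a channel that is closed when
// the count drops back to zero.
type waitGroup struct {
	mu    sync.Mutex
	count int
	done  chan struct{}
}

func newWaitGroup() *waitGroup {
	return &waitGroup{done: make(chan struct{})}
}

func (w *waitGroup) Add(n int) {
	w.mu.Lock()
	defer w.mu.Unlock()
	w.count += n
}

func (w *waitGroup) Done() {
	w.mu.Lock()
	defer w.mu.Unlock()
	w.count--
	if w.count == 0 {
		close(w.done) // releases everyone blocked in Wait
	}
}

func (w *waitGroup) Wait() {
	<-w.done
}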