    doc, err := html.Parse(resp.Body)
    resp.Body.Close()
    if err != nil {
        return nil, fmt.Errorf("parsing %s as HTML: %v", url, err)
    }

    var links []string
    visitNode := func(n *html.Node) {
        if n.Type == html.ElementNode && n.Data == "a" {
            for _, a := range n.Attr {
                if a.Key != "href" {
                    continue
                }
                link, err := resp.Request.URL.Parse(a.Val)
                if err != nil {
                    continue // ignore bad URLs
                }
                links = append(links, link.String())
            }
        }
    }
    forEachNode(doc, visitNode, nil)
    return links, nil
}
//!-Extract
// Copied from gopl.io/ch5/outline2.
func forEachNode(n *html.Node, pre, post func(n *html.Node)) {
    if pre != nil {
        pre(n)
    }
    for c := n.FirstChild; c != nil; c = c.NextSibling {
        forEachNode(c, pre, post)
    }
    if post != nil {
        post(n)
    }
}
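// The crawl helper called by each main function below is not part of this
// excerpt. A minimal sketch, assuming the Extract function completed above
// is in the same package and that fmt and log are imported: print each URL
// as it is fetched, extract its links, and log (rather than propagate) any
// error so one bad page does not stop the crawl.
func crawl(url string) []string {
    fmt.Println(url)
    list, err := Extract(url)
    if err != nil {
        log.Print(err)
    }
    return list
}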
//!+main
func main() {
    worklist := make(chan []string)

    // Start with the command-line arguments.
    go func() { worklist <- os.Args[1:] }()

    // Crawl the web concurrently.
    seen := make(map[string]bool)
    for list := range worklist {
        for _, link := range list {
            if !seen[link] {
                seen[link] = true
                go func(link string) {
                    worklist <- crawl(link)
                }(link)
            }
        }
    }
}
//!-main
/*
//!+output
$ go build gopl.io/ch8/crawl1
$ ./crawl1 http://gopl.io/
http://gopl.io/
https://golang.org/help/
https://golang.org/doc/
https://golang.org/blog/
...
2015/07/15 18:22:12 Get ...: dial tcp: lookup blog.golang.org: no such host
2015/07/15 18:22:12 Get ...: dial tcp 23.21.222.120:443: socket: too many open files
...
//!-output
*/
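// The transcript above shows crawl1 failing with "too many open files":
// it places no limit on the number of concurrent fetches. The second
// version bounds concurrency with a counting semaphore, a buffered channel
// whose capacity caps the number of simultaneous requests. A minimal
// sketch, assuming a limit of 20 and a hypothetical crawlLimited wrapper
// (the book's crawl2 instead acquires and releases the token around the
// call to Extract inside crawl itself):
var tokens = make(chan struct{}, 20)

func crawlLimited(url string) []string {
    tokens <- struct{}{} // acquire a token; blocks while 20 are held
    list := crawl(url)
    <-tokens // release the token
    return list
}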
//!+
func main() {
    worklist := make(chan []string)
    var n int // number of pending sends to worklist

    // Start with the command-line arguments.
    n++
    go func() { worklist <- os.Args[1:] }()

    // Crawl the web concurrently.
    seen := make(map[string]bool)
    for ; n > 0; n-- {
        list := <-worklist
        for _, link := range list {
            if !seen[link] {
                seen[link] = true
                n++
                go func(link string) {
                    worklist <- crawl(link)
                }(link)
            }
        }
    }
}
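// The third version (gopl.io/ch8/crawl3 in the book's repository) bounds
// concurrency differently: a fixed pool of 20 long-lived crawler
// goroutines, all fed from a channel of unseen links. The opening of its
// main function is not part of this excerpt; the fragment below assumes a
// setup along these lines, following the same pattern as the versions
// above:
func main() {
    worklist := make(chan []string)  // lists of URLs, may have duplicates
    unseenLinks := make(chan string) // de-duplicated URLs

    // Add command-line arguments to worklist.
    go func() { worklist <- os.Args[1:] }()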
    // Create 20 crawler goroutines to fetch each unseen link.
    for i := 0; i < 20; i++ {
        go func() {
            for link := range unseenLinks {
                foundLinks := crawl(link)
                go func() { worklist <- foundLinks }()
            }
        }()
    }

    // The main goroutine de-duplicates worklist items
    // and sends the unseen ones to the crawlers.
    seen := make(map[string]bool)
    for list := range worklist {
        for _, link := range list {
            if !seen[link] {
                seen[link] = true
                unseenLinks <- link
            }
        }
    }
}
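// As in the original, termination is not addressed: once the reachable
// links are exhausted, main blocks receiving from worklist while the 20
// crawler goroutines block receiving from unseenLinks.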