Two versions:
crawler.go
package main

import (
    "fmt"
    "log"
    "os"

    "links" // link-extraction package, as in gopl.io/ch5/links
)

func main() {
    worklist := make(chan []string)

    // Start with the command-line arguments.
    go func() { worklist <- os.Args[1:] }()

    // Crawl the web concurrently.
    seen := make(map[string]bool)
    for list := range worklist {
        for _, link := range list {
            if !seen[link] {
                seen[link] = true
                // One new goroutine per unseen link; the actual
                // concurrency limit is enforced inside crawl.
                go func(link string) {
                    worklist <- crawl(link)
                }(link)
            }
        }
    }
}

// tokens is a counting semaphore used to
// enforce a limit of 20 concurrent requests.
var tokens = make(chan struct{}, 20)

// crawl prints the URL and extracts all links found in its page.
func crawl(url string) []string {
    fmt.Println(url)
    tokens <- struct{}{} // acquire a token
    list, err := links.Extract(url)
    <-tokens // release the token
    if err != nil {
        log.Print(err)
    }
    return list
}
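The tokens channel above is a counting semaphore: a buffered channel whose capacity (20) caps how many goroutines may be inside links.Extract at once. A minimal, self-contained sketch of the same pattern in isolation; the URLs and the sleep are made-up stand-ins for real fetch work:

package main

import (
    "fmt"
    "sync"
    "time"
)

// sem is a counting semaphore: its capacity (3 here) is the
// maximum number of goroutines allowed inside fetch at once.
var sem = make(chan struct{}, 3)

func fetch(url string) {
    sem <- struct{}{}        // acquire a slot; blocks while 3 are busy
    defer func() { <-sem }() // release the slot on return
    fmt.Println("fetching", url)
    time.Sleep(100 * time.Millisecond) // stand-in for a real HTTP request
}

func main() {
    var wg sync.WaitGroup
    for i := 0; i < 10; i++ {
        wg.Add(1)
        go func(i int) {
            defer wg.Done()
            fetch(fmt.Sprintf("https://example.com/page/%d", i))
        }(i)
    }
    wg.Wait() // unlike crawler.go, this demo terminates
}

Sends block once the buffer is full, so at most three fetches run concurrently. The WaitGroup is only there so the demo can exit; crawler.go itself ranges over worklist forever.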
crawler2.go
package main

import (
    "fmt"
    "log"
    "os"
    //"strings" // needed only if the HasPrefix filter below is re-enabled

    "links" // link-extraction package, as in gopl.io/ch5/links
)

func main() {
    worklist := make(chan []string)  // lists of URLs, may have duplicates
    unseenLinks := make(chan string) // de-duplicated URLs

    // Start with the command-line arguments.
    go func() { worklist <- os.Args[1:] }()

    // Create 20 crawler goroutines to fetch each unseen link.
    for i := 0; i < 20; i++ {
        go func() {
            for link := range unseenLinks {
                //if strings.HasPrefix(link, "http://www.lypeng.com") {
                foundLinks := crawl(link)
                // Forward the results on a separate goroutine so the
                // worker is never blocked on worklist while main is
                // blocked sending to unseenLinks (a deadlock).
                go func() { worklist <- foundLinks }()
                //}
            }
        }()
    }

    // The main goroutine de-duplicates worklist items
    // and sends the unseen ones to the crawlers.
    seen := make(map[string]bool)
    for list := range worklist {
        for _, link := range list {
            if !seen[link] {
                seen[link] = true
                unseenLinks <- link
            }
        }
    }
}

// crawl prints the URL and extracts all links found in its page.
func crawl(url string) []string {
    fmt.Println(url)
    list, err := links.Extract(url)
    if err != nil {
        log.Print(err)
    }
    return list
}
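Both files import links, which is not part of the standard library. For reference, here is a sketch of what its Extract function presumably does, modeled on the links package from gopl.io/ch5 (the name and behavior are assumptions based on that book package): fetch the page, parse it with golang.org/x/net/html, and return the href of every <a> element, resolved against the page URL.

// Package links provides a link-extraction helper.
// A sketch modeled on gopl.io/ch5/links, not a drop-in copy.
package links

import (
    "fmt"
    "net/http"

    "golang.org/x/net/html"
)

// Extract makes an HTTP GET request to the specified URL, parses
// the response as HTML, and returns the links in the document.
func Extract(url string) ([]string, error) {
    resp, err := http.Get(url)
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()
    if resp.StatusCode != http.StatusOK {
        return nil, fmt.Errorf("getting %s: %s", url, resp.Status)
    }
    doc, err := html.Parse(resp.Body)
    if err != nil {
        return nil, fmt.Errorf("parsing %s as HTML: %v", url, err)
    }

    var links []string
    var visit func(n *html.Node)
    visit = func(n *html.Node) {
        if n.Type == html.ElementNode && n.Data == "a" {
            for _, a := range n.Attr {
                if a.Key == "href" {
                    // Resolve relative links against the page URL.
                    link, err := resp.Request.URL.Parse(a.Val)
                    if err != nil {
                        continue // ignore malformed URLs
                    }
                    links = append(links, link.String())
                }
            }
        }
        for c := n.FirstChild; c != nil; c = c.NextSibling {
            visit(c)
        }
    }
    visit(doc)
    return links, nil
}

The two crawlers bound concurrency differently: crawler.go starts one goroutine per discovered link and throttles only the HTTP fetches with the tokens semaphore, while crawler2.go keeps a fixed pool of 20 long-lived workers, so no semaphore is needed. In both programs worklist is never closed, so the main loop never terminates on its own.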