go语言渐入佳境[51]-爬虫-正则获取网址

## 添加正则表达式

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
package main

import (
"net/http"
"io/ioutil"
"fmt"
"io"
"golang.org/x/net/html/charset"
"golang.org/x/text/encoding"
"bufio"
"golang.org/x/text/transform"
"regexp"
)

// main downloads the zhenai.com city index page, decodes it to UTF-8 using the
// sniffed character encoding, and prints every city link found in it.
//
// It panics if the request or the body read fails, and stops after reporting
// the status code when the server does not answer 200 OK.
func main() {
	resp, err := http.Get("http://www.zhenai.com/zhenghun")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		// Parsing an error page would only produce misleading output.
		fmt.Println("Error: status code", resp.StatusCode)
		return
	}

	// Sniffing the encoding reads ahead into a bufio buffer. The decoder must
	// read from that same buffered reader, otherwise the bytes already pulled
	// out of resp.Body are lost and the start of the page goes missing.
	// bufio.NewReader hands back an existing *bufio.Reader unchanged when its
	// buffer is large enough, so determineEncoding peeks at this very reader.
	body := bufio.NewReader(resp.Body)
	e := determineEncoding(body)
	utf8reader := transform.NewReader(body, e.NewDecoder())

	all, err := ioutil.ReadAll(utf8reader)
	if err != nil {
		panic(err)
	}
	//fmt.Printf("%s\n",all)
	printCityList(all)
}

// determineEncoding guesses the character encoding of the HTML document read
// from r by inspecting up to its first 1024 bytes.
//
// The bytes are obtained with Peek on a bufio.Reader wrapped around r. If r is
// not already a sufficiently large *bufio.Reader, the peeked data is pulled out
// of r into a private buffer; callers that need to keep reading the document
// should therefore pass a *bufio.Reader and continue reading from it.
//
// It panics on any read error other than the stream being shorter than 1024
// bytes.
func determineEncoding(r io.Reader) encoding.Encoding {
	head, err := bufio.NewReader(r).Peek(1024)
	// Peek reports io.EOF together with the bytes it did get when the stream
	// is shorter than requested; a short document is not an error here.
	if err != nil && err != io.EOF {
		panic(err)
	}
	// Only the encoding is needed; its name and the certainty flag are dropped.
	e, _, _ := charset.DetermineEncoding(head, "")
	return e
}

// cityLinkRe matches the tail of a city anchor tag, from the URL up to the
// closing </a>, for example:
//
//	http://www.zhenai.com/zhenghun/xiamen" data-v-4e064b2c>厦门</a>
//
// The dots in the host name are escaped so they match only a literal '.'.
// [^>]* skips the remaining attributes up to the end of the opening tag (any
// characters except '>', newlines included), and [^<]+ is the link text.
// Compiled once at package scope rather than on every call.
var cityLinkRe = regexp.MustCompile(`http://www\.zhenai\.com/zhenghun/[0-9a-z]+"[^>]*>[^<]+</a>`)

// printCityList writes every city link found in contents to standard output,
// one per line, followed by a "Matches found: N" summary line.
func printCityList(contents []byte) {
	matches := cityLinkRe.FindAll(contents, -1)

	for _, m := range matches {
		fmt.Printf("%s\n", m)
	}

	fmt.Printf("Matches found: %d\n", len(matches))
}