go语言渐入佳境[51]-爬虫-正则表达式进行分组

## 为正则表达式进行分组

```go
package main

import (
"net/http"
"io/ioutil"
"fmt"
"io"
"golang.org/x/net/html/charset"
"golang.org/x/text/encoding"
"bufio"
"golang.org/x/text/transform"
"regexp"
)

// main downloads the zhenai.com city listing page, converts the body to
// UTF-8 using the detected source encoding, and prints every city link
// found in it.
func main() {
	resp, err := http.Get("http://www.zhenai.com/zhenghun")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		fmt.Println("Error: status code", resp.StatusCode)
		// A non-200 response is not the city list; do not try to parse it.
		return
	}

	// Charset sniffing peeks at the start of the body through a
	// bufio.Reader, which pulls those bytes out of resp.Body. Share a
	// single buffered reader between sniffing and decoding so the peeked
	// bytes are still part of what is read below (bufio.NewReader returns
	// its argument unchanged when it is already a large-enough
	// *bufio.Reader).
	body := bufio.NewReader(resp.Body)
	e := determineEncoding(body)
	utf8reader := transform.NewReader(body, e.NewDecoder())

	all, err := ioutil.ReadAll(utf8reader)
	if err != nil {
		panic(err)
	}
	//fmt.Printf("%s\n",all)
	printCityList(all)
}

// determineEncoding guesses the character encoding of the content read from
// r by inspecting up to its first 1024 bytes.
//
// The bytes are peeked through a bufio.Reader. If r is already a
// *bufio.Reader with at least the default buffer size, bufio.NewReader
// returns r itself and the peeked bytes remain readable from r. For any
// other reader the peeked bytes are buffered in a temporary reader and are
// no longer available from r, so callers that need the full content should
// pass a *bufio.Reader and keep reading from it.
//
// It panics on any read error other than io.EOF.
func determineEncoding(r io.Reader) encoding.Encoding {
	head, err := bufio.NewReader(r).Peek(1024)
	// Peek reports io.EOF when fewer than 1024 bytes are available but still
	// returns what it read; a short document is not a failure.
	if err != nil && err != io.EOF {
		panic(err)
	}
	e, _, _ := charset.DetermineEncoding(head, "")
	return e
}

// cityLinkRe matches one city anchor on the listing page. Group 1 captures
// the city URL and group 2 captures the link text (the city name). The dots
// in the host name are escaped so they match only a literal '.'.
var cityLinkRe = regexp.MustCompile(`(http://www\.zhenai\.com/zhenghun/[0-9a-z]+)"[^>]*>([^<]+)</a>`)

// printCityList scans contents for city links and prints one
// "City: <name>,URL:<url>" line per link to standard output, followed by
// the total number of links found.
func printCityList(contents []byte) {
	matches := cityLinkRe.FindAllSubmatch(contents, -1)

	for _, m := range matches {
		// m[0] is the whole match; m[1] is the URL and m[2] the city name.
		fmt.Printf("City: %s,URL:%s\n", m[2], m[1])
	}
	fmt.Printf("Matches found: %d\n", len(matches))
}