本程序用了 goquery 和 regexp 两个包。
goquery 用法如下:
dom,err:=goquery.NewDocumentFromReader(strings.NewReader(result))
if err!=nil{
fmt.Println("HttpGet err :",err)
}
dom.Find(".Programlist .Cont ul p").Each(func(i int, selection *goquery.Selection) {
// if selection != nil {
title += selection.Text() + "\r"
titleS = append(titleS,selection.Text())
// }
})
regexp 用法如下(注意:regexp.MustCompile 在表达式非法时会直接 panic,不会返回 nil):
// fmt.Println(video_cont)
rel2 := regexp.MustCompile(`"title":"流畅","url":"(.*?)"`)
if rel2 == nil {
fmt.Println("准备好了12")
}
arr2 := rel2.FindAllStringSubmatch(video_cont,-1) //获取的数据是二维的切片
package main
import (
"fmt"
"strconv"
"net/http"
"os"
"regexp"
"strings"
"github.com/PuerkitoBio/goquery"
"database/sql"
_ "github.com/go-sql-driver/mysql"
)
// type collectionmwd struct {
// ID int64 `db:"id"`
// title string `db:"title"`
// cover string `db:"cover"` // Because `name` in the MySQL users table is not declared NOT NULL, it may be NULL and a query can return nil; a plain string cannot receive nil, but sql.NullString can.
// videoUrl string `db:"videourl"`
// }
// Database connection settings. The file imports the go-sql-driver/mysql
// driver, so these are presumably combined into its DSN — the code that
// consumes them is outside this view, TODO confirm.
// NOTE(review): the credentials are hard-coded; consider reading them from
// the environment or a config file before sharing or deploying this program.
const (
USERNAME = "root" // database user name
PASSWORD = "root" // database password
NETWORK = "tcp" // network type used to reach the server
SERVER = "localhost" // database host
PORT = 3306 // database port
DATABASE = "guanfu_school" // database (schema) name
)
// main prompts on standard input for the first and last value to crawl (the
// prompt text hints at years 2013 through 2019) and passes the inclusive
// range to Dowork.
//
// Input that cannot be parsed as an integer, or a range whose start is after
// its end, is reported on standard output and main returns without crawling.
func main() {
	var start, end int

	fmt.Printf("请输入起始页:(2013开始,2019结束)")
	// Without this check a non-numeric answer would leave start at 0 and the
	// crawl would run against a nonsensical range.
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("读取起始页失败:", err)
		return
	}

	fmt.Printf("请输入结束页:(2013开始,2019结束)")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("读取结束页失败:", err)
		return
	}

	// A reversed range would make the crawl loop run zero times without any
	// explanation, so report it explicitly instead.
	if start > end {
		fmt.Println("起始页不能大于结束页:", start, end)
		return
	}

	Dowork(start, end)
}
func Dowork (start , end int) {
// return
fmt.Println("正在爬取数据")
// var title string
//开始循环每个年份
//然后每个年份再循环找每一页的数据
for i:=start;i<=end;i++ {
for j:=1;j<=3;j++ {
//写入文件
var img string
var title string
var path string
//插入数据库数据
var imgS = make([]string,0)
var titleS = make([]string,0)
var pathS = make([]string,0)
var url = fmt.Sprintf("http://vod.gxtv.cn/program/28/%s/%s.html",strconv.Itoa(i),strconv.Itoa(j))
//获取一年中每页的内容
result,err := HttpGet(url)
if err != nil {
fmt.Println("HttpGet err :",err)
break
}
// 过滤标题
dom,err:=goquery.NewDocumentFromReader(strings.NewReader(result))
if err!=nil{
fmt.Println("HttpGet err :",err)
}
dom.Find(".Programlist .Cont ul p").Each(func(i int, selection *goquery.Selection) {
// if selection != nil {
title += selection.Text() + "\r"
titleS = append(titleS,selection.Text())
// }
})
fmt.Println("title:",title)
fmt.Println("titleS",titleS)
// 过滤封面url
imgReg := regexp.MustCompile(`
1 {
img += data[1] + "\r"
imgS = append(imgS,data[1])
}
}
fmt.Println("img :",img)
fmt.Println("imgS :",imgS)
//找到详情页路径 再爬取代码
rel := regexp.MustCompile(`