半小时实现一个 go 爬虫

1、Quick Start

只需三步,你就可以部署一个爬取 gocn 网站的所有新闻的爬虫

第一步,你需要去 github 上生成一个自己的 token Settings ——> Developer settings ——> Personal access tokens ——> Generate new token

然后,配置自己的环境变量 export GITHUB_TOKEN=(第一步生成的 token),或者将代码中全局 Token 修改为自己 token

var Token = GetValueFromEnv("GITHUB_TOKEN")

第二步,需要在本地安装 redis,并且启动程序之前需要先启动本地 redis,端口使用默认端口 6379,因为程序默认使用 redis 进行去重。redis 的安装 可以参考 redis安装

第三步,git clone 代码仓库,并且在后台进程中运行爬虫,每 6 个小时爬取当天新闻并进行 github 推送。

git clone https://github.com/lubanproj/crawl.git
cd crawl
go build -v 
./crawl &

2、特性

  • 支持每天定时爬取
  • 支持分页爬取
  • 支持数据去重
  • 支持 github 推送

3、展示效果

半小时实现一个 go 爬虫_第1张图片
详情效果可见:go_read

4、源码分析

(1)爬取网站

// Crawl all gocn topics
func Crawl(url string) {

	pattern := `/topics/\d+`

	collector := colly.NewCollector()
	collector.OnHTML("a[title]", func(e *colly.HTMLElement) {
		// regex match topic
		path := e.Attr("href")
		topic, ok := regexMatch(path, pattern)
		if ok {
			e.Request.Visit(fmt.Sprintf("https://gocn.vip%s",topic))
		}
	})

	redisAddr := ":6379"
	conn, err := redis.Dial("tcp",redisAddr)
	if err != nil {
		log.Fatalf("get redis conn error : %v", err)
	}
	defer conn.Close()

	collector.OnRequest(func(r *colly.Request) {
		topic, ok := regexMatch(r.URL.Path, pattern)
		if ok {
			r.Visit(fmt.Sprintf("https://gocn.vip%s",topic))
			// fmt.Println("content",r.URL)
		}

	})

	collector.OnResponse(func(r *colly.Response) {

		topic := strings.Replace(r.Request.URL.Path,"/topics/","", -1)
		isExist, err := existTopic(conn, topic)

		// the topic has had crawled
		if isExist == 1 || err != nil {
			return
		}

		title, content, ok := parseContent(string(r.Body))
		titleAndContent := fmt.Sprintf("

%s

%s
"
, title, content) fmt.Println("titleAndContent : ", titleAndContent) date := getDate(title) if curDay := time.Now().Format("2006-01-02"); curDay != date { // just climb today's data return } if ok && content != "" && title != "" { pushToGithub(titleAndContent, Token) } saveDB(conn, topic, date) }) collector.Visit(url) }

(2)正则表达式解析内容

// parseContent extracts the daily-news title and the <ol> news list from a
// topic page body. It returns (title, content, ok); ok is false when no
// recognizable daily-news title can be found in body.
func parseContent(body string) (string, string, bool) {

	// First try the <p>-wrapped title form.
	titlePattern := `<p>GoCN(.|\n|\t)*每日新闻(.*?)</p>`
	title, _ := regexMatch(body, titlePattern)

	if title == "" {
		// Fall back to a heading-wrapped title form (<h1>..<h9>).
		headingPattern := `<h[0-9]>GoCN(.|\n|\t)*每日新闻(.|\n|\t)*</h[0-9]>?`
		title, _ = regexMatch(body, headingPattern)
		if title == "" {
			// Neither form matched: nothing to extract.
			return "", "", false
		}

		// Narrow the heading match to the text between the tag brackets,
		// then strip the first '<' and '>' left over from the match.
		innerPattern := `>(.|\n|\t)*每日新闻(.|\n|\t)*<`
		title, _ = regexMatch(title, innerPattern)
		title = strings.Replace(title, "<", "", 1)
		title = strings.Replace(title, ">", "", 1)
	}

	// The news body is the page's ordered list.
	contentPattern := `<ol>(.|\n|\t)*</ol>`
	content, _ := regexMatch(body, contentPattern)

	return title, content, true
}

(3) 推送 github

func pushToGithub(data, token string) error {
	if data == "" {
		return errors.New("params error")
	}

	ctx := context.Background()
	ts := oauth2.StaticTokenSource(
		&oauth2.Token{AccessToken: token},
	)

	tc := oauth2.NewClient(ctx, ts)
	client := github.NewClient(tc)
	c := "feat: add gocn news, date : " + time.Now().Format("2006-01-02")
	sha := ""
	content := &github.RepositoryContentFileOptions{
		Message: &c,
		SHA:     &sha,
		Committer: &github.CommitAuthor{
			Name:  github.String("lubanproj"),
			Email: github.String("[email protected]"),
			Login: github.String("lubanproj"),
		},
		Author: &github.CommitAuthor{
			Name:  github.String("lubanproj"),
			Email: github.String("[email protected]"),
			Login: github.String("lubanproj"),
		},
		Branch: github.String("master"),
	}
	op := &github.RepositoryContentGetOptions{}

	repo, _, _, er := client.Repositories.GetContents(ctx, "lubanproj", "go_read", "README.md", op)
	if er != nil || repo == nil {
		fmt.Println("get github repositories error, date: ", time.Now())
		return er
	}

	content.SHA = repo.SHA
	decodeBytes, err := base64.StdEncoding.DecodeString(*repo.Content)
	if err != nil {
		fmt.Println("decode repo error, ",err)
		return err
	}


	oldContentList := strings.Split(string(decodeBytes), "
") if len(oldContentList) != 2 { fmt.Println("README.md format error") } content.Content = []byte(oldContentList[0] + "
" + data + oldContentList[1]) _, _, err = client.Repositories.UpdateFile(ctx, "lubanproj", "go_read", "README.md", content) if err != nil { println(err) return err } return nil }

具体代码可参考:crawl

你可能感兴趣的:(半小时实现一个 go 爬虫)