支持链接提取

This commit is contained in:
2021-12-20 18:48:35 +08:00
parent dade855613
commit 336658850a
6 changed files with 125 additions and 7 deletions

25
crawler/collector.go Normal file
View File

@ -0,0 +1,25 @@
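// Package crawler wraps github.com/gocolly/colly to start an HTML collector
// whose request and HTML callbacks are supplied by an IRequestHandler.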
//
// Description : crawler ...
//
// Author : go_developer@163.com<白茶清欢>
//
// Date : 2021-12-20 4:46 PM
package crawler
import (
	"errors"

	"github.com/gocolly/colly"
)
// StartHTMLCollector creates a colly collector, registers the callbacks
// supplied by requestHandler on it, and visits visitURL.
//
// domainList is assigned to the collector's AllowedDomains whitelist; per the
// original author's note, leaving it unset allows every domain to be visited.
// Only the OnRequest and OnHTML callbacks of requestHandler are registered here.
//
// It returns an error when requestHandler is a nil interface value; otherwise
// it returns whatever c.Visit(visitURL) returns.
//
// Author : go_developer@163.com<白茶清欢>
//
// Date : 4:47 PM 2021/12/20
func StartHTMLCollector(domainList []string, visitURL string, requestHandler IRequestHandler) error {
	// Calling a method on a nil interface value would panic, so reject it
	// up front and let the caller handle the error.
	if requestHandler == nil {
		return errors.New("crawler: request handler is nil")
	}

	c := colly.NewCollector()
	// Domain whitelist; when not set, all domains are allowed by default.
	c.AllowedDomains = domainList
	c.OnRequest(requestHandler.OnRequest())
	// HTML handling.
	c.OnHTML(requestHandler.OnHTML())
	return c.Visit(visitURL)
}

58
crawler/collector_test.go Normal file
View File

@ -0,0 +1,58 @@
// Package crawler ...
//
// Description : crawler ...
//
// Author : go_developer@163.com<白茶清欢>
//
// Date : 2021-12-20 5:58 PM
package crawler
import (
"fmt"
"testing"
"github.com/gocolly/colly"
)
// TestStartCollector runs StartHTMLCollector against http://www.baidu.com with
// an empty domain whitelist and a testHandler, and fails the test when an
// error is returned.
//
// NOTE(review): this issues a real HTTP request, so it needs network access.
//
// Author : go_developer@163.com<白茶清欢>
//
// Date : 5:59 PM 2021/12/20
func TestStartCollector(t *testing.T) {
	if err := StartHTMLCollector([]string{}, "http://www.baidu.com", &testHandler{}); err != nil {
		// Report through the testing framework rather than panicking, so the
		// failure is attributed to this test and the remaining tests still run.
		t.Fatal("出现异常 : " + err.Error())
	}
}
// testHandler is the handler passed to StartHTMLCollector by
// TestStartCollector. It prints each outgoing request and every link matched
// on the fetched page; its remaining callbacks are placeholders.
type testHandler struct{}

// OnRequest returns a callback that prints the URL of the request it receives.
func (h *testHandler) OnRequest() colly.RequestCallback {
	logRequest := func(req *colly.Request) {
		fmt.Println("开始请求 : ", req.URL)
	}
	return logRequest
}

// OnError prints a fixed error notice.
func (h *testHandler) OnError() {
	fmt.Println("请求异常 : ")
}

// OnResponse does nothing.
func (h *testHandler) OnResponse() {}

// OnHTML returns the "a[href]" selector together with a callback that prints
// the text and the href attribute of each matched element.
func (h *testHandler) OnHTML() (string, colly.HTMLCallback) {
	const selector = "a[href]"
	printLink := func(el *colly.HTMLElement) {
		href := el.Attr("href")
		fmt.Printf("Link found: %q -> %s\n", el.Text, href)
	}
	return selector, printLink
}

// OnScraped does nothing.
func (h *testHandler) OnScraped() {}

8
crawler/define.go Normal file
View File

@ -0,0 +1,8 @@
// Package crawler ...
//
// Description : crawler ...
//
// Author : go_developer@163.com<白茶清欢>
//
// Date : 2021-12-20 4:41 PM
package crawler

30
crawler/i_handler.go Normal file
View File

@ -0,0 +1,30 @@
// Package crawler ...
//
// Description : crawler ...
//
// Author : go_developer@163.com<白茶清欢>
//
// Date : 2021-12-20 4:50 PM
package crawler
import (
"github.com/gocolly/colly"
)
// IRequestHandler handles the results of a request. Implementations supply
// the callbacks for the individual stages of a crawl.
//
// NOTE(review): in the visible StartHTMLCollector only OnRequest and OnHTML
// are registered on the collector; OnError, OnResponse and OnScraped take and
// return nothing and are not wired up there — confirm the intended use.
//
// Author : go_developer@163.com<白茶清欢>
//
// Date : 4:50 PM 2021/12/20
type IRequestHandler interface {
// OnRequest returns the callback to be invoked before a request is issued.
OnRequest() colly.RequestCallback
// OnError is intended to be called when an error occurs during a request.
OnError()
// OnResponse is intended to be called after a response has been received.
OnResponse()
// OnHTML returns a selector string and the callback for matching elements;
// it applies after OnResponse when the received content is HTML.
OnHTML() (string, colly.HTMLCallback)
// OnScraped is intended to be called after OnHTML.
OnScraped()
}