支持链接提取
This commit is contained in:
25
crawler/collector.go
Normal file
25
crawler/collector.go
Normal file
@ -0,0 +1,25 @@
|
||||
// Package crawler wraps the colly scraping library to run HTML collectors.
|
||||
//
|
||||
// Description : crawler ...
|
||||
//
|
||||
// Author : go_developer@163.com<白茶清欢>
|
||||
//
|
||||
// Date : 2021-12-20 4:46 PM
|
||||
package crawler
|
||||
|
||||
import (
	"errors"

	"github.com/gocolly/colly"
)
|
||||
|
||||
// StartHTMLCollector 获取页面爬虫实例
|
||||
//
|
||||
// Author : go_developer@163.com<白茶清欢>
|
||||
//
|
||||
// Date : 4:47 PM 2021/12/20
|
||||
func StartHTMLCollector(domainList []string, visitURL string, requestHandler IRequestHandler) error {
|
||||
c := colly.NewCollector()
|
||||
// 设置域名白名单, 不设置, 默认所有均可访问
|
||||
c.AllowedDomains = domainList
|
||||
c.OnRequest(requestHandler.OnRequest())
|
||||
// html处理
|
||||
c.OnHTML(requestHandler.OnHTML())
|
||||
return c.Visit(visitURL)
|
||||
}
|
58
crawler/collector_test.go
Normal file
58
crawler/collector_test.go
Normal file
@ -0,0 +1,58 @@
|
||||
// Package crawler ...
|
||||
//
|
||||
// Description : crawler ...
|
||||
//
|
||||
// Author : go_developer@163.com<白茶清欢>
|
||||
//
|
||||
// Date : 2021-12-20 5:58 PM
|
||||
package crawler
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
"github.com/gocolly/colly"
|
||||
)
|
||||
|
||||
// TestStartCollector ...
|
||||
//
|
||||
// Author : go_developer@163.com<白茶清欢>
|
||||
//
|
||||
// Date : 5:59 PM 2021/12/20
|
||||
func TestStartCollector(t *testing.T) {
|
||||
if err := StartHTMLCollector([]string{}, "http://www.baidu.com", &testHandler{}); nil != err {
|
||||
panic("出现异常 : " + err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
// testHandler is a stateless stub whose methods are defined below in this
// test file.
type testHandler struct{}
|
||||
|
||||
func (t *testHandler) OnRequest() colly.RequestCallback {
|
||||
return func(r *colly.Request) {
|
||||
fmt.Println("开始请求 : ", r.URL)
|
||||
}
|
||||
}
|
||||
|
||||
func (t *testHandler) OnError() {
|
||||
fmt.Println("请求异常 : ")
|
||||
}
|
||||
|
||||
func (t *testHandler) OnResponse() {
|
||||
|
||||
}
|
||||
|
||||
func (t *testHandler) OnHTML() (string, colly.HTMLCallback) {
|
||||
return "a[href]", func(e *colly.HTMLElement) {
|
||||
link := e.Attr("href")
|
||||
|
||||
// Print link
|
||||
|
||||
fmt.Printf("Link found: %q -> %s\n", e.Text, link)
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
func (t *testHandler) OnScraped() {
|
||||
|
||||
}
|
8
crawler/define.go
Normal file
8
crawler/define.go
Normal file
@ -0,0 +1,8 @@
|
||||
// Package crawler ...
|
||||
//
|
||||
// Description : crawler ...
|
||||
//
|
||||
// Author : go_developer@163.com<白茶清欢>
|
||||
//
|
||||
// Date : 2021-12-20 4:41 PM
|
||||
package crawler
|
30
crawler/i_handler.go
Normal file
30
crawler/i_handler.go
Normal file
@ -0,0 +1,30 @@
|
||||
// Package crawler ...
|
||||
//
|
||||
// Description : crawler ...
|
||||
//
|
||||
// Author : go_developer@163.com<白茶清欢>
|
||||
//
|
||||
// Date : 2021-12-20 4:50 PM
|
||||
package crawler
|
||||
|
||||
import (
|
||||
"github.com/gocolly/colly"
|
||||
)
|
||||
|
||||
// IRequestHandler 请求结果的处理
|
||||
//
|
||||
// Author : go_developer@163.com<白茶清欢>
|
||||
//
|
||||
// Date : 4:50 PM 2021/12/20
|
||||
type IRequestHandler interface {
|
||||
// OnRequest 在发起请求前被调用
|
||||
OnRequest() colly.RequestCallback
|
||||
// OnError 请求过程中如果发生错误被调用
|
||||
OnError()
|
||||
// OnResponse 收到回复后被调用
|
||||
OnResponse()
|
||||
// OnHTML 在OnResponse之后被调用,如果收到的内容是HTML
|
||||
OnHTML() (string, colly.HTMLCallback)
|
||||
// OnScraped 在OnHTML之后被调用
|
||||
OnScraped()
|
||||
}
|
Reference in New Issue
Block a user