支持链接提取
This commit is contained in:
parent
dade855613
commit
336658850a
25
crawler/collector.go
Normal file
25
crawler/collector.go
Normal file
@ -0,0 +1,25 @@
|
||||
// Package crawler ...
|
||||
//
|
||||
// Description : crawler ...
|
||||
//
|
||||
// Author : go_developer@163.com<白茶清欢>
|
||||
//
|
||||
// Date : 2021-12-20 4:46 PM
|
||||
package crawler
|
||||
|
||||
import "github.com/gocolly/colly"
|
||||
|
||||
// StartHTMLCollector 获取页面爬虫实例
|
||||
//
|
||||
// Author : go_developer@163.com<白茶清欢>
|
||||
//
|
||||
// Date : 4:47 PM 2021/12/20
|
||||
func StartHTMLCollector(domainList []string, visitURL string, requestHandler IRequestHandler) error {
|
||||
c := colly.NewCollector()
|
||||
// 设置域名白名单, 不设置, 默认所有均可访问
|
||||
c.AllowedDomains = domainList
|
||||
c.OnRequest(requestHandler.OnRequest())
|
||||
// html处理
|
||||
c.OnHTML(requestHandler.OnHTML())
|
||||
return c.Visit(visitURL)
|
||||
}
|
58
crawler/collector_test.go
Normal file
58
crawler/collector_test.go
Normal file
@ -0,0 +1,58 @@
|
||||
// Package crawler ...
|
||||
//
|
||||
// Description : crawler ...
|
||||
//
|
||||
// Author : go_developer@163.com<白茶清欢>
|
||||
//
|
||||
// Date : 2021-12-20 5:58 PM
|
||||
package crawler
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
"github.com/gocolly/colly"
|
||||
)
|
||||
|
||||
// TestStartCollector ...
|
||||
//
|
||||
// Author : go_developer@163.com<白茶清欢>
|
||||
//
|
||||
// Date : 5:59 PM 2021/12/20
|
||||
func TestStartCollector(t *testing.T) {
|
||||
if err := StartHTMLCollector([]string{}, "http://www.baidu.com", &testHandler{}); nil != err {
|
||||
panic("出现异常 : " + err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
// testHandler is a stateless handler used by the tests in this file; its
// methods only print what the collector hands to them.
type testHandler struct{}
|
||||
|
||||
func (t *testHandler) OnRequest() colly.RequestCallback {
|
||||
return func(r *colly.Request) {
|
||||
fmt.Println("开始请求 : ", r.URL)
|
||||
}
|
||||
}
|
||||
|
||||
func (t *testHandler) OnError() {
|
||||
fmt.Println("请求异常 : ")
|
||||
}
|
||||
|
||||
func (t *testHandler) OnResponse() {
|
||||
|
||||
}
|
||||
|
||||
func (t *testHandler) OnHTML() (string, colly.HTMLCallback) {
|
||||
return "a[href]", func(e *colly.HTMLElement) {
|
||||
link := e.Attr("href")
|
||||
|
||||
// Print link
|
||||
|
||||
fmt.Printf("Link found: %q -> %s\n", e.Text, link)
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
func (t *testHandler) OnScraped() {
|
||||
|
||||
}
|
8
crawler/define.go
Normal file
8
crawler/define.go
Normal file
@ -0,0 +1,8 @@
|
||||
// Package crawler ...
|
||||
//
|
||||
// Description : crawler ...
|
||||
//
|
||||
// Author : go_developer@163.com<白茶清欢>
|
||||
//
|
||||
// Date : 2021-12-20 4:41 PM
|
||||
package crawler
|
30
crawler/i_handler.go
Normal file
30
crawler/i_handler.go
Normal file
@ -0,0 +1,30 @@
|
||||
// Package crawler ...
|
||||
//
|
||||
// Description : crawler ...
|
||||
//
|
||||
// Author : go_developer@163.com<白茶清欢>
|
||||
//
|
||||
// Date : 2021-12-20 4:50 PM
|
||||
package crawler
|
||||
|
||||
import (
|
||||
"github.com/gocolly/colly"
|
||||
)
|
||||
|
||||
// IRequestHandler 请求结果的处理
|
||||
//
|
||||
// Author : go_developer@163.com<白茶清欢>
|
||||
//
|
||||
// Date : 4:50 PM 2021/12/20
|
||||
type IRequestHandler interface {
|
||||
// OnRequest 在发起请求前被调用
|
||||
OnRequest() colly.RequestCallback
|
||||
// OnError 请求过程中如果发生错误被调用
|
||||
OnError()
|
||||
// OnResponse 收到回复后被调用
|
||||
OnResponse()
|
||||
// OnHTML 在OnResponse之后被调用,如果收到的内容是HTML
|
||||
OnHTML() (string, colly.HTMLCallback)
|
||||
// OnScraped 在OnHTML之后被调用
|
||||
OnScraped()
|
||||
}
|
5
go.mod
5
go.mod
@ -14,6 +14,7 @@ require (
|
||||
github.com/gin-gonic/gin v1.7.6
|
||||
github.com/go-redis/redis/v8 v8.11.4
|
||||
github.com/go-redis/redis_rate/v9 v9.1.2
|
||||
github.com/gocolly/colly v1.2.0
|
||||
github.com/lestrrat-go/file-rotatelogs v2.4.0+incompatible
|
||||
github.com/pkg/errors v0.9.1
|
||||
github.com/shirou/gopsutil v3.21.10+incompatible
|
||||
@ -57,7 +58,6 @@ require (
|
||||
github.com/go-playground/validator/v10 v10.4.1 // indirect
|
||||
github.com/go-sql-driver/mysql v1.6.0 // indirect
|
||||
github.com/gobwas/glob v0.2.3 // indirect
|
||||
github.com/gocolly/colly v1.2.0 // indirect
|
||||
github.com/gogo/protobuf v1.3.2 // indirect
|
||||
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
|
||||
github.com/golang/protobuf v1.5.2 // indirect
|
||||
@ -69,7 +69,6 @@ require (
|
||||
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 // indirect
|
||||
github.com/hashicorp/go-uuid v1.0.2 // indirect
|
||||
github.com/hashicorp/hcl v1.0.0 // indirect
|
||||
github.com/jawher/mow.cli v1.2.0 // indirect
|
||||
github.com/jcmturner/aescts/v2 v2.0.0 // indirect
|
||||
github.com/jcmturner/dnsutils/v2 v2.0.0 // indirect
|
||||
github.com/jcmturner/gofork v1.0.0 // indirect
|
||||
@ -116,7 +115,7 @@ require (
|
||||
go.uber.org/atomic v1.7.0 // indirect
|
||||
go.uber.org/multierr v1.6.0 // indirect
|
||||
golang.org/x/crypto v0.0.0-20210920023735-84f357641f63 // indirect
|
||||
golang.org/x/net v0.0.0-20210917221730-978cfadd31cf // indirect
|
||||
golang.org/x/net v0.0.0-20211216030914-fe4d6282115f // indirect
|
||||
golang.org/x/sys v0.0.0-20211123173158-ef496fb156ab // indirect
|
||||
golang.org/x/text v0.3.7 // indirect
|
||||
golang.org/x/time v0.0.0-20211116232009-f0f3c7e86c11 // indirect
|
||||
|
6
go.sum
6
go.sum
@ -292,8 +292,6 @@ github.com/hashicorp/serf v0.9.5/go.mod h1:UWDWwZeL5cuWDJdl0C6wrvrUwEqtQ4ZKBKKEN
|
||||
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
|
||||
github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
|
||||
github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
|
||||
github.com/jawher/mow.cli v1.2.0 h1:e6ViPPy+82A/NFF/cfbq3Lr6q4JHKT9tyHwTCcUQgQw=
|
||||
github.com/jawher/mow.cli v1.2.0/go.mod h1:y+pcA3jBAdo/GIZx/0rFjw/K2bVEODP9rfZOfaiq8Ko=
|
||||
github.com/jcmturner/aescts/v2 v2.0.0 h1:9YKLH6ey7H4eDBXW8khjYslgyqG2xZikXP0EQFKrle8=
|
||||
github.com/jcmturner/aescts/v2 v2.0.0/go.mod h1:AiaICIRyfYg35RUkr8yESTqvSy7csK90qZ5xfvvsoNs=
|
||||
github.com/jcmturner/dnsutils/v2 v2.0.0 h1:lltnkeZGL0wILNvrNiVCR6Ro5PGU/SeBvVO/8c/iPbo=
|
||||
@ -474,7 +472,6 @@ github.com/spf13/viper v1.9.0 h1:yR6EXjTp0y0cLN8OZg1CRZmOBdI88UcGkhgyJhu6nZk=
|
||||
github.com/spf13/viper v1.9.0/go.mod h1:+i6ajR7OX2XaiBkrcZJFK21htRk7eDeLg7+O6bhUPP4=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE=
|
||||
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
|
||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
|
||||
@ -644,8 +641,9 @@ golang.org/x/net v0.0.0-20210428140749-89ef3d95e781/go.mod h1:OJAsFXCWl8Ukc7SiCT
|
||||
golang.org/x/net v0.0.0-20210503060351-7fd8e65b6420/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
|
||||
golang.org/x/net v0.0.0-20210726213435-c6fcb2dbf985/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
|
||||
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
|
||||
golang.org/x/net v0.0.0-20210917221730-978cfadd31cf h1:R150MpwJIv1MpS0N/pc+NhTM8ajzvlmxlY5OYsrevXQ=
|
||||
golang.org/x/net v0.0.0-20210917221730-978cfadd31cf/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
|
||||
golang.org/x/net v0.0.0-20211216030914-fe4d6282115f h1:hEYJvxw1lSnWIl8X9ofsYMklzaDs90JI2az5YMd4fPM=
|
||||
golang.org/x/net v0.0.0-20211216030914-fe4d6282115f/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
|
||||
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
|
||||
golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
|
||||
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
|
||||
|
Loading…
Reference in New Issue
Block a user