支持链接提取

This commit is contained in:
白茶清欢 2021-12-20 18:48:35 +08:00
parent dade855613
commit 336658850a
6 changed files with 125 additions and 7 deletions

25
crawler/collector.go Normal file
View File

@ -0,0 +1,25 @@
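// Package crawler wraps the colly scraping library behind a small handler interface.
//
// Description : starts an HTML collector and registers caller-supplied callbacks on it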
//
// Author : go_developer@163.com<白茶清欢>
//
// Date : 2021-12-20 4:46 PM
package crawler
import "github.com/gocolly/colly"
// StartHTMLCollector builds a colly collector, attaches the handler's request
// and HTML callbacks to it, and visits visitURL.
//
// domainList is assigned to the collector's AllowedDomains whitelist. The
// error returned by the visit is handed back to the caller unchanged.
//
// Author : go_developer@163.com<白茶清欢>
//
// Date : 4:47 PM 2021/12/20
func StartHTMLCollector(domainList []string, visitURL string, requestHandler IRequestHandler) error {
	collector := colly.NewCollector()
	// Domain whitelist; per the original note, leaving it unset allows every domain.
	collector.AllowedDomains = domainList

	// Hook supplied by the handler for outgoing requests.
	onRequest := requestHandler.OnRequest()
	collector.OnRequest(onRequest)

	// HTML handling: the handler supplies both the selector and its callback.
	selector, onHTML := requestHandler.OnHTML()
	collector.OnHTML(selector, onHTML)

	return collector.Visit(visitURL)
}

58
crawler/collector_test.go Normal file
View File

@ -0,0 +1,58 @@
// Package crawler ...
//
// Description : crawler ...
//
// Author : go_developer@163.com<白茶清欢>
//
// Date : 2021-12-20 5:58 PM
package crawler
import (
"fmt"
"testing"
"github.com/gocolly/colly"
)
// TestStartCollector runs StartHTMLCollector against a live site using the
// logging testHandler and fails the test if the visit returns an error.
//
// The target is a real external URL, so the test is skipped in -short mode
// to keep offline runs from failing on network availability.
//
// Author : go_developer@163.com<白茶清欢>
//
// Date : 5:59 PM 2021/12/20
func TestStartCollector(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping network-dependent crawler test in short mode")
	}
	if err := StartHTMLCollector([]string{}, "http://www.baidu.com", &testHandler{}); err != nil {
		// Report through the testing framework instead of panicking, so the
		// failure is attributed to this test and the remaining tests still run.
		t.Fatalf("出现异常 : %v", err)
	}
}
// testHandler is a minimal IRequestHandler used by the tests: it prints each
// request URL and every link found, and leaves the remaining hooks empty.
type testHandler struct{}

// OnRequest returns a callback that prints the URL of the request it is given.
func (h *testHandler) OnRequest() colly.RequestCallback {
	logRequest := func(req *colly.Request) {
		fmt.Println("开始请求 : ", req.URL)
	}
	return logRequest
}

// OnError prints a fixed marker line; it receives no error details.
func (h *testHandler) OnError() {
	fmt.Println("请求异常 : ")
}

// OnResponse does nothing.
func (h *testHandler) OnResponse() {}

// OnHTML returns the "a[href]" selector together with a callback that prints
// the matched element's text and its href attribute.
func (h *testHandler) OnHTML() (string, colly.HTMLCallback) {
	const selector = "a[href]"
	printLink := func(el *colly.HTMLElement) {
		fmt.Printf("Link found: %q -> %s\n", el.Text, el.Attr("href"))
	}
	return selector, printLink
}

// OnScraped does nothing.
func (h *testHandler) OnScraped() {}

8
crawler/define.go Normal file
View File

@ -0,0 +1,8 @@
// Package crawler ...
//
// Description : crawler ...
//
// Author : go_developer@163.com<白茶清欢>
//
// Date : 2021-12-20 4:41 PM
package crawler

30
crawler/i_handler.go Normal file
View File

@ -0,0 +1,30 @@
// Package crawler ...
//
// Description : crawler ...
//
// Author : go_developer@163.com<白茶清欢>
//
// Date : 2021-12-20 4:50 PM
package crawler
import (
"github.com/gocolly/colly"
)
// IRequestHandler groups the callbacks a caller supplies for handling the
// stages of a crawl request and its result.
//
// Only OnRequest and OnHTML return colly callbacks; OnError, OnResponse and
// OnScraped take no arguments and return nothing.
//
// NOTE(review): StartHTMLCollector registers only OnRequest and OnHTML on the
// collector; confirm where the other three hooks are expected to be invoked.
//
// Author : go_developer@163.com<白茶清欢>
//
// Date : 4:50 PM 2021/12/20
type IRequestHandler interface {
// OnRequest returns the callback to be called before a request is sent.
OnRequest() colly.RequestCallback
// OnError is meant to be called if an error occurs during the request.
OnError()
// OnResponse is meant to be called after a response has been received.
OnResponse()
// OnHTML returns a selector and the callback to be called after OnResponse
// when the received content is HTML.
OnHTML() (string, colly.HTMLCallback)
// OnScraped is meant to be called after OnHTML.
OnScraped()
}

5
go.mod
View File

@ -14,6 +14,7 @@ require (
github.com/gin-gonic/gin v1.7.6
github.com/go-redis/redis/v8 v8.11.4
github.com/go-redis/redis_rate/v9 v9.1.2
github.com/gocolly/colly v1.2.0
github.com/lestrrat-go/file-rotatelogs v2.4.0+incompatible
github.com/pkg/errors v0.9.1
github.com/shirou/gopsutil v3.21.10+incompatible
@ -57,7 +58,6 @@ require (
github.com/go-playground/validator/v10 v10.4.1 // indirect
github.com/go-sql-driver/mysql v1.6.0 // indirect
github.com/gobwas/glob v0.2.3 // indirect
github.com/gocolly/colly v1.2.0 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.2 // indirect
@ -69,7 +69,6 @@ require (
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 // indirect
github.com/hashicorp/go-uuid v1.0.2 // indirect
github.com/hashicorp/hcl v1.0.0 // indirect
github.com/jawher/mow.cli v1.2.0 // indirect
github.com/jcmturner/aescts/v2 v2.0.0 // indirect
github.com/jcmturner/dnsutils/v2 v2.0.0 // indirect
github.com/jcmturner/gofork v1.0.0 // indirect
@ -116,7 +115,7 @@ require (
go.uber.org/atomic v1.7.0 // indirect
go.uber.org/multierr v1.6.0 // indirect
golang.org/x/crypto v0.0.0-20210920023735-84f357641f63 // indirect
golang.org/x/net v0.0.0-20210917221730-978cfadd31cf // indirect
golang.org/x/net v0.0.0-20211216030914-fe4d6282115f // indirect
golang.org/x/sys v0.0.0-20211123173158-ef496fb156ab // indirect
golang.org/x/text v0.3.7 // indirect
golang.org/x/time v0.0.0-20211116232009-f0f3c7e86c11 // indirect

6
go.sum
View File

@ -292,8 +292,6 @@ github.com/hashicorp/serf v0.9.5/go.mod h1:UWDWwZeL5cuWDJdl0C6wrvrUwEqtQ4ZKBKKEN
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
github.com/jawher/mow.cli v1.2.0 h1:e6ViPPy+82A/NFF/cfbq3Lr6q4JHKT9tyHwTCcUQgQw=
github.com/jawher/mow.cli v1.2.0/go.mod h1:y+pcA3jBAdo/GIZx/0rFjw/K2bVEODP9rfZOfaiq8Ko=
github.com/jcmturner/aescts/v2 v2.0.0 h1:9YKLH6ey7H4eDBXW8khjYslgyqG2xZikXP0EQFKrle8=
github.com/jcmturner/aescts/v2 v2.0.0/go.mod h1:AiaICIRyfYg35RUkr8yESTqvSy7csK90qZ5xfvvsoNs=
github.com/jcmturner/dnsutils/v2 v2.0.0 h1:lltnkeZGL0wILNvrNiVCR6Ro5PGU/SeBvVO/8c/iPbo=
@ -474,7 +472,6 @@ github.com/spf13/viper v1.9.0 h1:yR6EXjTp0y0cLN8OZg1CRZmOBdI88UcGkhgyJhu6nZk=
github.com/spf13/viper v1.9.0/go.mod h1:+i6ajR7OX2XaiBkrcZJFK21htRk7eDeLg7+O6bhUPP4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
@ -644,8 +641,9 @@ golang.org/x/net v0.0.0-20210428140749-89ef3d95e781/go.mod h1:OJAsFXCWl8Ukc7SiCT
golang.org/x/net v0.0.0-20210503060351-7fd8e65b6420/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20210726213435-c6fcb2dbf985/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20210917221730-978cfadd31cf h1:R150MpwJIv1MpS0N/pc+NhTM8ajzvlmxlY5OYsrevXQ=
golang.org/x/net v0.0.0-20210917221730-978cfadd31cf/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20211216030914-fe4d6282115f h1:hEYJvxw1lSnWIl8X9ofsYMklzaDs90JI2az5YMd4fPM=
golang.org/x/net v0.0.0-20211216030914-fe4d6282115f/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=