diff --git a/crawler/collector.go b/crawler/collector.go
new file mode 100644
index 0000000..2b15206
--- /dev/null
+++ b/crawler/collector.go
@@ -0,0 +1,25 @@
+// Package crawler provides helpers built on github.com/gocolly/colly.
+//
+// Description : crawler ...
+//
+// Author : go_developer@163.com<白茶清欢>
+//
+// Date : 2021-12-20 4:46 PM
+package crawler
+
+import "github.com/gocolly/colly"
+
+// StartHTMLCollector creates a colly collector, sets its AllowedDomains to domainList, registers the OnRequest and OnHTML callbacks supplied by requestHandler, visits visitURL and returns the error from Visit. requestHandler must be non-nil (its methods are called unconditionally). NOTE(review): the handler's OnError, OnResponse and OnScraped are not registered here.
+//
+// Author : go_developer@163.com<白茶清欢>
+//
+// Date : 4:47 PM 2021/12/20
+func StartHTMLCollector(domainList []string, visitURL string, requestHandler IRequestHandler) error {
+	c := colly.NewCollector()
+	// Domain whitelist; when it is not set, every domain may be visited by default.
+	c.AllowedDomains = domainList
+	c.OnRequest(requestHandler.OnRequest())
+	// HTML handling: OnHTML supplies both the selector and the callback.
+	c.OnHTML(requestHandler.OnHTML())
+	return c.Visit(visitURL)
+}
diff --git a/crawler/collector_test.go b/crawler/collector_test.go
new file mode 100644
index 0000000..a8ec958
--- /dev/null
+++ b/crawler/collector_test.go
@@ -0,0 +1,58 @@
+// Package crawler ...
+//
+// Description : crawler ...
+//
+// Author : go_developer@163.com<白茶清欢>
+//
+// Date : 2021-12-20 5:58 PM
+package crawler
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/gocolly/colly"
+)
+
+// TestStartCollector runs StartHTMLCollector against http://www.baidu.com with a handler that prints what it sees.
+//
+// Author : go_developer@163.com<白茶清欢>
+//
+// Date : 5:59 PM 2021/12/20
+func TestStartCollector(t *testing.T) {
+	if err := StartHTMLCollector([]string{}, "http://www.baidu.com", &testHandler{}); nil != err { // empty domain list; NOTE(review): presumably needs network access to reach the URL
+		panic("出现异常 : " + err.Error()) // NOTE(review): panics on failure instead of reporting through t (e.g. t.Fatalf)
+	}
+}
+
+type testHandler struct { // handler passed to StartHTMLCollector by TestStartCollector; holds no state
+}
+
+func (t *testHandler) OnRequest() colly.RequestCallback { // returns a callback that prints the URL of the request
+	return func(r *colly.Request) {
+		fmt.Println("开始请求 : ", r.URL)
+	}
+}
+
+func (t *testHandler) OnError() { // prints a fixed message; receives no error value
+	fmt.Println("请求异常 : ")
+}
+
+func (t *testHandler) OnResponse() { // no-op
+
+}
+
+func (t *testHandler) OnHTML() (string, colly.HTMLCallback) { // returns the "a[href]" selector and a callback that prints each matched element's text and href
+	return "a[href]", func(e *colly.HTMLElement) {
+		link := e.Attr("href")
+
+		// Print link
+
+		fmt.Printf("Link found: %q -> %s\n", e.Text, link)
+
+	}
+}
+
+func (t *testHandler) OnScraped() { // no-op
+
+}
diff --git a/crawler/define.go b/crawler/define.go
new file mode 100644
index 0000000..d756c01
--- /dev/null
+++ b/crawler/define.go
@@ -0,0 +1,8 @@
+// Package crawler ...
+//
+// Description : crawler ...
+//
+// Author : go_developer@163.com<白茶清欢>
+//
+// Date : 2021-12-20 4:41 PM
+package crawler
diff --git a/crawler/i_handler.go b/crawler/i_handler.go
new file mode 100644
index 0000000..e75d771
--- /dev/null
+++ b/crawler/i_handler.go
@@ -0,0 +1,30 @@
+// Package crawler ...
+//
+// Description : crawler ...
+//
+// Author : go_developer@163.com<白茶清欢>
+//
+// Date : 2021-12-20 4:50 PM
+package crawler
+
+import (
+	"github.com/gocolly/colly"
+)
+
+// IRequestHandler defines the hooks for processing the results of a request. NOTE(review): StartHTMLCollector in collector.go registers only OnRequest and OnHTML; the other methods are not wired to the collector there.
+//
+// Author : go_developer@163.com<白茶清欢>
+//
+// Date : 4:50 PM 2021/12/20
+type IRequestHandler interface {
+	// OnRequest returns the callback to be invoked before a request is sent.
+	OnRequest() colly.RequestCallback
+	// OnError is meant to be called if an error occurs during the request.
+	OnError()
+	// OnResponse is meant to be called after a response has been received.
+	OnResponse()
+	// OnHTML returns the selector and the callback to be invoked after OnResponse when the received content is HTML.
+	OnHTML() (string, colly.HTMLCallback)
+	// OnScraped is meant to be called after OnHTML.
+	OnScraped()
+}
diff --git a/go.mod b/go.mod
index 2eea023..bae3d41 100644
--- a/go.mod
+++ b/go.mod
@@ -14,6 +14,7 @@ require (
 	github.com/gin-gonic/gin v1.7.6
 	github.com/go-redis/redis/v8 v8.11.4
 	github.com/go-redis/redis_rate/v9 v9.1.2
+	github.com/gocolly/colly v1.2.0
 	github.com/lestrrat-go/file-rotatelogs v2.4.0+incompatible
 	github.com/pkg/errors v0.9.1
 	github.com/shirou/gopsutil v3.21.10+incompatible
@@ -57,7 +58,6 @@ require (
 	github.com/go-playground/validator/v10 v10.4.1 // indirect
 	github.com/go-sql-driver/mysql v1.6.0 // indirect
 	github.com/gobwas/glob v0.2.3 // indirect
-	github.com/gocolly/colly v1.2.0 // indirect
 	github.com/gogo/protobuf v1.3.2 // indirect
 	github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
 	github.com/golang/protobuf v1.5.2 // indirect
@@ -69,7 +69,6 @@ require (
 	github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 // indirect
 	github.com/hashicorp/go-uuid v1.0.2 // indirect
 	github.com/hashicorp/hcl v1.0.0 // indirect
-	github.com/jawher/mow.cli v1.2.0 // indirect
 	github.com/jcmturner/aescts/v2 v2.0.0 // indirect
 	github.com/jcmturner/dnsutils/v2 v2.0.0 // indirect
 	github.com/jcmturner/gofork v1.0.0 // indirect
@@ -116,7 +115,7 @@ require (
 	go.uber.org/atomic v1.7.0 // indirect
 	go.uber.org/multierr v1.6.0 // indirect
 	golang.org/x/crypto v0.0.0-20210920023735-84f357641f63 // indirect
-	golang.org/x/net v0.0.0-20210917221730-978cfadd31cf // indirect
+	golang.org/x/net 
v0.0.0-20211216030914-fe4d6282115f // indirect golang.org/x/sys v0.0.0-20211123173158-ef496fb156ab // indirect golang.org/x/text v0.3.7 // indirect golang.org/x/time v0.0.0-20211116232009-f0f3c7e86c11 // indirect diff --git a/go.sum b/go.sum index 1e5ad25..6d8da63 100644 --- a/go.sum +++ b/go.sum @@ -292,8 +292,6 @@ github.com/hashicorp/serf v0.9.5/go.mod h1:UWDWwZeL5cuWDJdl0C6wrvrUwEqtQ4ZKBKKEN github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= -github.com/jawher/mow.cli v1.2.0 h1:e6ViPPy+82A/NFF/cfbq3Lr6q4JHKT9tyHwTCcUQgQw= -github.com/jawher/mow.cli v1.2.0/go.mod h1:y+pcA3jBAdo/GIZx/0rFjw/K2bVEODP9rfZOfaiq8Ko= github.com/jcmturner/aescts/v2 v2.0.0 h1:9YKLH6ey7H4eDBXW8khjYslgyqG2xZikXP0EQFKrle8= github.com/jcmturner/aescts/v2 v2.0.0/go.mod h1:AiaICIRyfYg35RUkr8yESTqvSy7csK90qZ5xfvvsoNs= github.com/jcmturner/dnsutils/v2 v2.0.0 h1:lltnkeZGL0wILNvrNiVCR6Ro5PGU/SeBvVO/8c/iPbo= @@ -474,7 +472,6 @@ github.com/spf13/viper v1.9.0 h1:yR6EXjTp0y0cLN8OZg1CRZmOBdI88UcGkhgyJhu6nZk= github.com/spf13/viper v1.9.0/go.mod h1:+i6ajR7OX2XaiBkrcZJFK21htRk7eDeLg7+O6bhUPP4= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= @@ -644,8 +641,9 @@ golang.org/x/net v0.0.0-20210428140749-89ef3d95e781/go.mod h1:OJAsFXCWl8Ukc7SiCT 
golang.org/x/net v0.0.0-20210503060351-7fd8e65b6420/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20210726213435-c6fcb2dbf985/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= -golang.org/x/net v0.0.0-20210917221730-978cfadd31cf h1:R150MpwJIv1MpS0N/pc+NhTM8ajzvlmxlY5OYsrevXQ= golang.org/x/net v0.0.0-20210917221730-978cfadd31cf/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.0.0-20211216030914-fe4d6282115f h1:hEYJvxw1lSnWIl8X9ofsYMklzaDs90JI2az5YMd4fPM= +golang.org/x/net v0.0.0-20211216030914-fe4d6282115f/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=