diff --git a/crawler/collector.go b/crawler/collector.go index 2b15206..7667e1b 100644 --- a/crawler/collector.go +++ b/crawler/collector.go @@ -19,7 +19,10 @@ func StartHTMLCollector(domainList []string, visitURL string, requestHandler IRe // 设置域名白名单, 不设置, 默认所有均可访问 c.AllowedDomains = domainList c.OnRequest(requestHandler.OnRequest()) + c.OnError(requestHandler.OnError()) // html处理 c.OnHTML(requestHandler.OnHTML()) + c.OnResponse(requestHandler.OnResponse()) + c.OnScraped(requestHandler.OnScraped()) return c.Visit(visitURL) } diff --git a/crawler/collector_test.go b/crawler/collector_test.go index a8ec958..bfebb43 100644 --- a/crawler/collector_test.go +++ b/crawler/collector_test.go @@ -20,7 +20,7 @@ import ( // // Date : 5:59 PM 2021/12/20 func TestStartCollector(t *testing.T) { - if err := StartHTMLCollector([]string{}, "http://www.baidu.com", &testHandler{}); nil != err { + if err := StartHTMLCollector([]string{}, "https://go.zhangdeman.cn", &testHandler{}); nil != err { panic("出现异常 : " + err.Error()) } } @@ -34,12 +34,16 @@ func (t *testHandler) OnRequest() colly.RequestCallback { } } -func (t *testHandler) OnError() { - fmt.Println("请求异常 : ") +func (t *testHandler) OnError() colly.ErrorCallback { + return func(response *colly.Response, err error) { + fmt.Println("请求异常 : " + err.Error()) + } } -func (t *testHandler) OnResponse() { - +func (t *testHandler) OnResponse() colly.ResponseCallback { + return func(response *colly.Response) { + fmt.Println("响应数据 : ", response.StatusCode) + } } func (t *testHandler) OnHTML() (string, colly.HTMLCallback) { @@ -53,6 +57,8 @@ func (t *testHandler) OnHTML() (string, colly.HTMLCallback) { } } -func (t *testHandler) OnScraped() { +func (t *testHandler) OnScraped() colly.ScrapedCallback { + return func(response *colly.Response) { + } } diff --git a/crawler/i_handler.go b/crawler/i_handler.go index e75d771..862a13a 100644 --- a/crawler/i_handler.go +++ b/crawler/i_handler.go @@ -20,11 +20,11 @@ type IRequestHandler interface { // OnRequest 在发起请求前被调用 OnRequest() colly.RequestCallback // OnError 请求过程中如果发生错误被调用 - OnError() + OnError() colly.ErrorCallback // OnResponse 收到回复后被调用 - OnResponse() + OnResponse() colly.ResponseCallback // OnHTML 在OnResponse之后被调用,如果收到的内容是HTML OnHTML() (string, colly.HTMLCallback) // OnScraped 在OnHTML之后被调用 - OnScraped() + OnScraped() colly.ScrapedCallback }