@@ -2,37 +2,9 @@ package octopus
22
33import (
44 "fmt"
5- "log"
65 "time"
76)
87
9- func (o * octopus ) setupOctopus () {
10- o .setupValidProtocolMap ()
11- o .setupTimeToQuit ()
12- o .setupMaxLinksCrawled ()
13- }
14-
15- func (o * octopus ) setupValidProtocolMap () {
16- o .isValidProtocol = make (map [string ]bool )
17- for _ , protocol := range o .ValidProtocols {
18- o .isValidProtocol [protocol ] = true
19- }
20- }
21-
22- func (o * octopus ) setupTimeToQuit () {
23- if o .TimeToQuit > 0 {
24- o .timeToQuit = time .Duration (o .TimeToQuit ) * time .Second
25- } else {
26- log .Fatalln ("TimeToQuit is not greater than 0" )
27- }
28- }
29-
30- func (o * octopus ) setupMaxLinksCrawled () {
31- if o .MaxCrawledUrls == 0 {
32- panic ("MaxCrawledUrls should either be negative or greater than 0." )
33- }
34- }
35-
368func (o * octopus ) SetupSystem () {
379 o .isReady = false
3810 o .setupOctopus ()
@@ -57,16 +29,12 @@ func (o *octopus) SetupSystem() {
5729 depthLimitChSet := o .makeCrawlDepthFilterPipe (pageParseChSet )
5830 maxDelayChSet := o .makeMaxDelayPipe (depthLimitChSet )
5931
60- var distributorChSet * NodeChSet
61- if o .MaxCrawledUrls < 0 {
62- distributorChSet = o .makeDistributorPipe (maxDelayChSet , outAdapterChSet )
63- } else {
64- maxLinksCrawledChSet := o .makeLimitCrawlPipe (outAdapterChSet )
65- distributorChSet = o .makeDistributorPipe (maxDelayChSet , maxLinksCrawledChSet )
66- }
32+ distributorChSet := o .handleDistributorPipeline (maxDelayChSet , outAdapterChSet )
6733
6834 pageReqChSet := o .makePageRequisitionPipe (distributorChSet )
69- invUrlFilterChSet := o .makeInvalidUrlFilterPipe (pageReqChSet )
35+
36+ invUrlFilterChSet := o .handleRateLimitingPipeline (pageReqChSet )
37+
7038 dupFilterChSet := o .makeDuplicateUrlFilterPipe (invUrlFilterChSet )
7139 protoFilterChSet := o .makeUrlProtocolFilterPipe (dupFilterChSet )
7240 linkAbsChSet := o .makeLinkAbsolutionPipe (protoFilterChSet )
@@ -77,6 +45,28 @@ func (o *octopus) SetupSystem() {
7745 o .isReady = true
7846}
7947
48+ func (o * octopus ) handleDistributorPipeline (maxDelayChSet , outAdapterChSet * NodeChSet ) * NodeChSet {
49+ var distributorChSet * NodeChSet
50+ if o .MaxCrawledUrls < 0 {
51+ distributorChSet = o .makeDistributorPipe (maxDelayChSet , outAdapterChSet )
52+ } else {
53+ maxLinksCrawledChSet := o .makeCrawlLinkCountLimitPipe (outAdapterChSet )
54+ distributorChSet = o .makeDistributorPipe (maxDelayChSet , maxLinksCrawledChSet )
55+ }
56+ return distributorChSet
57+ }
58+
59+ func (o * octopus ) handleRateLimitingPipeline (pageReqChSet * NodeChSet ) * NodeChSet {
60+ var invUrlFilterChSet * NodeChSet
61+ if o .rateLimiter != nil {
62+ rateLimitingChSet := o .makeRateLimitingPipe (pageReqChSet )
63+ invUrlFilterChSet = o .makeInvalidUrlFilterPipe (rateLimitingChSet )
64+ } else {
65+ invUrlFilterChSet = o .makeInvalidUrlFilterPipe (pageReqChSet )
66+ }
67+ return invUrlFilterChSet
68+ }
69+
8070func (o * octopus ) BeginCrawling (baseUrlStr string ) {
8171 if ! o .isReady {
8272 panic ("Call BuildSystem first to setup Octopus" )
0 commit comments