Skip to content

Commit c75e915

Browse files
committed
#a98h3 - working max links limit
1 parent 0962c36 commit c75e915

File tree

4 files changed

+16
-7
lines changed

4 files changed

+16
-7
lines changed

adapter/basicadapters.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,12 @@ func (s *StdOpAdapter) Consume() *oct.NodeChSet {
2222
},
2323
}
2424
go func() {
25+
i := 1
2526
for {
2627
select {
2728
case output := <-listenCh:
28-
fmt.Printf("%d - %s\n", output.Depth, output.UrlString)
29+
fmt.Printf("%d - %d - %s\n", i, output.Depth, output.UrlString)
30+
i++
2931
case <-quitCh:
3032
return
3133
}

main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ func main() {
99
opAdapter := &adapter.StdOpAdapter{}
1010
options := octopus.GetDefaultCrawlOptions()
1111
options.OpAdapter = opAdapter
12+
options.MaxCrawledUrls = 150
1213
crawler := octopus.New(options)
1314
crawler.SetupSystem()
1415
crawler.BeginCrawling("https://www.macrumors.com")

octopus/core.go

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,8 @@ func (o *octopus) setupTimeToQuit() {
2828
}
2929

3030
func (o *octopus) setupMaxLinksCrawled() {
31-
switch {
32-
case o.MaxCrawledUrls == 0:
31+
if o.MaxCrawledUrls == 0 {
3332
panic("MaxCrawledUrls should either be negative or greater than 0.")
34-
case o.MaxCrawledUrls > 0:
35-
o.MaxCrawledUrls++ // done for convenience.
3633
}
3734
}
3835

@@ -59,7 +56,15 @@ func (o *octopus) SetupSystem() {
5956
pageParseChSet := o.makeParseNodeFromHtmlPipe(ingestChSet)
6057
depthLimitChSet := o.makeCrawlDepthFilterPipe(pageParseChSet)
6158
maxDelayChSet := o.makeMaxDelayPipe(depthLimitChSet)
62-
distributorChSet := o.makeDistributorPipe(maxDelayChSet, outAdapterChSet)
59+
60+
var distributorChSet *NodeChSet
61+
if o.MaxCrawledUrls < 0 {
62+
distributorChSet = o.makeDistributorPipe(maxDelayChSet, outAdapterChSet)
63+
} else {
64+
maxLinksCrawledChSet := o.makeLimitCrawlPipe(outAdapterChSet)
65+
distributorChSet = o.makeDistributorPipe(maxDelayChSet, maxLinksCrawledChSet)
66+
}
67+
6368
pageReqChSet := o.makePageRequisitionPipe(distributorChSet)
6469
invUrlFilterChSet := o.makeInvalidUrlFilterPipe(pageReqChSet)
6570
dupFilterChSet := o.makeDuplicateUrlFilterPipe(invUrlFilterChSet)

octopus/pipe_ctrl_limitcrawl.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,10 @@ func (o *octopus) makeLimitCrawlPipe(inChSet *NodeChSet) *NodeChSet {
1010

1111
func (o *octopus) checkWithinLimit(node *Node, outChSet *NodeChSet) {
1212
if v := atomic.AddInt64(&o.crawledUrlCounter,
13-
1); v < o.MaxCrawledUrls {
13+
1); v <= o.MaxCrawledUrls {
1414
outChSet.NodeCh <- node
1515
} else {
1616
outChSet.QuitCh <- 1
17+
o.masterQuitCh <- 1
1718
}
1819
}

0 commit comments

Comments (0)