
Commit d175631

#a98h3 - base functions for limit
1 parent a232410 commit d175631

File tree

4 files changed: +41 additions, -12 deletions

octopus/core.go

Lines changed: 11 additions & 1 deletion

@@ -9,6 +9,7 @@ import (
 func (o *octopus) setupOctopus() {
 	o.setupValidProtocolMap()
 	o.setupTimeToQuit()
+	o.setupMaxLinksCrawled()
 }
 
 func (o *octopus) setupValidProtocolMap() {
@@ -26,6 +27,15 @@ func (o *octopus) setupTimeToQuit() {
 	}
 }
 
+func (o *octopus) setupMaxLinksCrawled() {
+	switch {
+	case o.MaxCrawledUrls == 0:
+		panic("MaxCrawledUrls should either be negative or greater than 0.")
+	case o.MaxCrawledUrls > 0:
+		o.MaxCrawledUrls++ // done for convenience.
+	}
+}
+
 func (o *octopus) SetupSystem() {
 	o.isReady = false
 	o.setupOctopus()
@@ -64,7 +74,7 @@ func (o *octopus) SetupSystem() {
 
 func (o *octopus) BeginCrawling(baseUrlStr string) {
 	if !o.isReady {
-		log.Fatal("Call BuildSystem first to setup Octopus")
+		panic("Call BuildSystem first to setup Octopus")
 	}
 	go func() {
 		o.inputUrlStrChan <- baseUrlStr
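
A note on the BeginCrawling change above: swapping log.Fatal for panic lets a caller recover from a mis-configured crawler instead of having the whole process exit. The sketch below is illustrative only; the crawler type and its method are stand-ins for the octopus struct in this commit, not a real import of the package.

package main

import "fmt"

// crawler is a stand-in for the octopus type, kept minimal for illustration.
type crawler struct{ isReady bool }

// BeginCrawling mirrors the post-change behaviour: panic instead of log.Fatal.
func (c *crawler) BeginCrawling(baseUrlStr string) {
	if !c.isReady {
		panic("Call BuildSystem first to setup Octopus")
	}
}

func main() {
	defer func() {
		if r := recover(); r != nil {
			// With log.Fatal, the process would have exited before reaching here.
			fmt.Println("recovered:", r)
		}
	}()
	(&crawler{}).BeginCrawling("https://example.com")
}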

octopus/modelfactory.go

Lines changed: 2 additions & 2 deletions

@@ -45,9 +45,9 @@ func createNode(parentUrlStr, urlStr string, depth int64) *Node {
 func GetDefaultCrawlOptions() *CrawlOptions {
 	return &CrawlOptions{
 		MaxCrawlDepth:      defaultMaxDepth,
-		MaxCrawlLinks:      defaultCrawlLimit,
+		MaxCrawledUrls:     defaultCrawlLimit,
 		StayWithinBaseHost: false,
-		CrawlRate: -1,
+		CrawlRate:          -1,
 		RespectRobots:      false,
 		IncludeBody:        true,
 		OpAdapter:          nil,

octopus/models.go

Lines changed: 10 additions & 9 deletions

@@ -12,13 +12,14 @@ import (
 // to an instance of the crawler.
 type octopus struct {
 	*CrawlOptions
-	visited         *sync.Map
-	isReady         bool
-	adapterChSet    *NodeChSet
-	isValidProtocol map[string]bool
-	timeToQuit      time.Duration
-	inputUrlStrChan chan string
-	masterQuitCh    chan int
+	visited           *sync.Map
+	isReady           bool
+	adapterChSet      *NodeChSet
+	isValidProtocol   map[string]bool
+	timeToQuit        time.Duration
+	inputUrlStrChan   chan string
+	masterQuitCh      chan int
+	crawledUrlCounter int64
 }
 
 // CrawlOptions is used to house options for crawling.
@@ -29,7 +30,7 @@ type octopus struct {
 // MaxCrawlDepth - Indicates the maximum depth that will be crawled,
 // for each new link.
 //
-// MaxCrawlLinks - Specifies the Maximum Number of Unique Links that will be crawled.
+// MaxCrawledUrls - Specifies the Maximum Number of Unique Links that will be crawled.
 // Note : When combined with DepthPerLink, it will combine both.
 // Use -1 to indicate infinite links to be crawled (only bounded by depth of traversal).
 //
@@ -54,7 +55,7 @@ type octopus struct {
 // generated before the crawler quits. This is in seconds.
 type CrawlOptions struct {
 	MaxCrawlDepth      int64
-	MaxCrawlLinks      int64
+	MaxCrawledUrls     int64
 	StayWithinBaseHost bool
 	CrawlRate          int64
 	RespectRobots      bool
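
For reference, a minimal configuration sketch using the renamed option. It is written inside the octopus package itself so no import path has to be guessed; GetDefaultCrawlOptions and the CrawlOptions fields are the ones shown in this commit, while exampleCrawlOptions is a hypothetical helper, not part of the codebase.

package octopus

// exampleCrawlOptions is a hypothetical helper showing how the renamed
// MaxCrawledUrls option might be set. Per the doc comment above, a positive
// value caps the number of unique URLs crawled, and -1 leaves the crawl
// unbounded (limited only by MaxCrawlDepth).
func exampleCrawlOptions() *CrawlOptions {
	opts := GetDefaultCrawlOptions()
	opts.MaxCrawlDepth = 3      // follow links at most three levels deep
	opts.MaxCrawledUrls = 100   // cap unique URLs crawled at 100; use -1 for no cap
	opts.StayWithinBaseHost = true
	return opts
}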

octopus/pipe_ctrl_limitcrawl.go

Lines changed: 18 additions & 0 deletions

@@ -0,0 +1,18 @@
+package octopus
+
+import (
+	"sync/atomic"
+)
+
+func (o *octopus) makeLimitCrawlPipe(inChSet *NodeChSet) *NodeChSet {
+	return stdLinearNodeFunc(o.checkWithinLimit, inChSet)
+}
+
+func (o *octopus) checkWithinLimit(node *Node, outChSet *NodeChSet) {
+	if v := atomic.AddInt64(&o.crawledUrlCounter,
+		1); v < o.MaxCrawledUrls {
+		outChSet.NodeCh <- node
+	} else {
+		outChSet.QuitCh <- 1
+	}
+}
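
One plausible reading of the "// done for convenience." comment in setupMaxLinksCrawled: atomic.AddInt64 returns the post-increment value, so storing a configured limit of N as N+1 makes the strict "<" check in checkWithinLimit admit exactly N URLs. The standalone sketch below (illustrative names only, not part of the package) demonstrates that the count stays exact even with concurrent callers.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// Standalone sketch of the counting scheme used by checkWithinLimit:
// with the limit stored as N+1 (mirroring the "++" in setupMaxLinksCrawled),
// exactly N callers observe a counter value below the limit, regardless of
// goroutine scheduling.
func main() {
	const configured = int64(5)
	storedLimit := configured + 1 // what setupMaxLinksCrawled would store for a limit of 5

	var counter, allowed int64 // counter mirrors o.crawledUrlCounter
	var wg sync.WaitGroup
	for i := 0; i < 20; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			if atomic.AddInt64(&counter, 1) < storedLimit {
				atomic.AddInt64(&allowed, 1) // this branch would forward the node downstream
			} // the else branch would send on QuitCh instead
		}()
	}
	wg.Wait()
	fmt.Println("urls allowed through:", allowed) // always prints 5
}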
