Skip to content

Commit fe83c08

Browse files
committed
design octopus v1.2
1 parent 442e91d commit fe83c08

File tree

5 files changed

+62
-16
lines changed

5 files changed

+62
-16
lines changed

octopus/core.go

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,33 @@
11
package octopus
22

33
import (
4+
"fmt"
45
"log"
56
"time"
67
)
78

9+
// setupOctopus runs the octopus's internal initialisation steps in order:
// it builds the valid-protocol lookup map and then validates and stores the
// quit timeout. SetupSystem calls it before creating any pipeline channels.
func (o *octopus) setupOctopus() {
10+
o.setupValidProtocolMap()
11+
o.setupTimeToQuit()
12+
}
13+
814
// setupValidProtocolMap (re)creates o.isValidProtocol as a fresh map and
// marks every entry of o.ValidProtocols as true, so that a protocol can be
// checked with a single map lookup instead of scanning the slice.
func (o *octopus) setupValidProtocolMap() {
915
o.isValidProtocol = make(map[string]bool)
1016
for _, protocol := range o.ValidProtocols {
1117
o.isValidProtocol[protocol] = true
1218
}
1319
}
1420

21+
// setupTimeToQuit validates the TimeToQuit option and copies it into
// o.timeToQuit. A value that is not greater than 0 is treated as a fatal
// configuration error.
func (o *octopus) setupTimeToQuit() {
22+
if o.TimeToQuit > 0 {
23+
// The conversion does not scale the value: the stored Duration is the raw
// count from the option, and channelConnector multiplies it by time.Second
// when it arms its timeout.
o.timeToQuit = time.Duration(o.TimeToQuit)
24+
} else {
25+
// log.Fatalln logs the message and then exits the whole process.
log.Fatalln("TimeToQuit is not greater than 0")
26+
}
27+
}
28+
1529
func (o *octopus) SetupSystem() {
16-
o.setupValidProtocolMap()
30+
o.setupOctopus()
1731

1832
ingestCh := make(chan *Node)
1933
ingestQuitCh := make(chan int, 1)
@@ -39,7 +53,7 @@ func (o *octopus) SetupSystem() {
3953
o.makeIngestPipe(inPipeChSet, linkAbsChSet)
4054

4155
o.inpUrlStrChan = ingestStrCh
42-
o.masterQuitCh = ingestQuitCh
56+
o.masterQuitCh = make(chan int, 1)
4357
o.isReady = true
4458
}
4559

@@ -56,9 +70,9 @@ func (o *octopus) BeginCrawling(baseUrlStr string) {
5670
{
5771
o.inpUrlStrChan <- urlStr
5872
}
59-
case <-time.After(10 * time.Second):
73+
case <-o.masterQuitCh:
6074
{
61-
o.masterQuitCh <- 1
75+
fmt.Println("Master Kill Switch Activated")
6276
return
6377
}
6478
}

octopus/modelfactory.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,10 @@ package octopus
33
import "sync"
44

55
const (
6-
defaultMaxDepth int64 = 2
7-
anchorTag = "a"
8-
anchorAttrb = "href"
6+
defaultMaxDepth int64 = 2
7+
anchorTag = "a"
8+
anchorAttrb = "href"
9+
defaultTimeToQuit = 5
910
)
1011

1112
// NewWithDefaultOptions - Create an Instance of the Octopus with the default CrawlOptions.
@@ -49,6 +50,7 @@ func GetDefaultCrawlOptions() *CrawlOptions {
4950
IncludeBody: true,
5051
OpAdapter: nil,
5152
ValidProtocols: []string{"http", "https"},
53+
TimeToQuit: defaultTimeToQuit,
5254
}
5355
}
5456

octopus/models.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package octopus
33
import (
44
"io"
55
"sync"
6+
"time"
67
)
78

89
// octopus is a concurrent web crawler.
@@ -15,6 +16,7 @@ type octopus struct {
1516
isReady bool
1617
adapterChSet *NodeChSet
1718
isValidProtocol map[string]bool
19+
timeToQuit time.Duration
1820
inpUrlStrChan chan string
1921
masterQuitCh chan int
2022
}
@@ -32,6 +34,10 @@ type octopus struct {
3234
// will pump output onto the implementation's channel returned by its Consume method.
3335
// CrawlRate is the rate at which requests will be made.
3436
// RespectRobots (unimplemented) choose whether to respect robots.txt or not.
37+
// ValidProtocols - This is a slice containing the list of URL protocols that
38+
// should be crawled.
39+
// TimeToQuit - the maximum time to wait between two consecutive new nodes
40+
// being generated before the crawler quits. The value is in seconds.
3541
type CrawlOptions struct {
3642
MaxCrawlDepth int64
3743
MaxCrawlLinks int64
@@ -41,6 +47,7 @@ type CrawlOptions struct {
4147
IncludeBody bool
4248
OpAdapter OutputAdapter
4349
ValidProtocols []string
50+
TimeToQuit int64
4451
}
4552

4653
// NodeInfo is used to represent each crawled link and its associated crawl depth.

octopus/pipe_spl_ingest.go

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,50 @@
11
package octopus
22

3+
import (
4+
"fmt"
5+
"time"
6+
)
7+
38
func (o *octopus) makeIngestPipe(inChSet *ingestPipeChSet, opChSet *NodeChSet) {
4-
go channelConnector(inChSet, opChSet)
5-
go setupStringIngestPipe(inChSet, opChSet)
9+
go channelConnector(inChSet, opChSet, o.timeToQuit, o.masterQuitCh)
10+
go setupStringIngestPipe(inChSet, opChSet, o.masterQuitCh)
611
}
712

8-
func setupStringIngestPipe(inChSet *ingestPipeChSet, nodeOpChSet *NodeChSet) {
13+
func setupStringIngestPipe(inChSet *ingestPipeChSet, nodeOpChSet *NodeChSet,
14+
masterQuitCh chan int) {
915
for {
1016
select {
1117
case str := <-inChSet.StrCh:
1218
{
1319
nodeOpChSet.NodeCh <- createNode("", str, 1)
1420
}
15-
case i := <-inChSet.QuitCh:
16-
{
17-
nodeOpChSet.QuitCh <- i
18-
}
21+
// case i := <-inChSet.QuitCh:
22+
// {
23+
// nodeOpChSet.QuitCh <- i
24+
// masterQuitCh <- i
25+
// }
1926
}
2027
}
2128
}
2229

23-
func channelConnector(inChSet *ingestPipeChSet, opChSet *NodeChSet) {
30+
func channelConnector(inChSet *ingestPipeChSet, opChSet *NodeChSet,
31+
timeOut time.Duration, masterQuitCh chan int) {
2432
for {
2533
select {
2634
case node := <-inChSet.NodeCh:
2735
opChSet.NodeCh <- node
2836
case i := <-inChSet.QuitCh:
29-
opChSet.QuitCh <- i
37+
{
38+
fmt.Println("Quit Received on Ingest Channel")
39+
opChSet.QuitCh <- i
40+
masterQuitCh <- i
41+
}
42+
case <-time.After(timeOut * time.Second):
43+
{
44+
fmt.Println("Timeout Triggered in Ingest Channel")
45+
opChSet.QuitCh <- 1
46+
return
47+
}
3048
}
3149
}
3250
}

octopus/stdpipefunc.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
package octopus
22

3+
import (
4+
"fmt"
5+
)
6+
37
type stdFunc func(*Node, *NodeChSet)
48

59
func stdLinearNodeFunc(stdFn stdFunc, outChSet *NodeChSet) *NodeChSet {
@@ -22,6 +26,7 @@ func stdLinearNodeFunc(stdFn stdFunc, outChSet *NodeChSet) *NodeChSet {
2226
}
2327
case <-listenQuitCh:
2428
{
29+
fmt.Println("Quit Received on Internal Channel")
2530
outChSet.QuitCh <- 1
2631
return
2732
}

0 commit comments

Comments
 (0)