Skip to content

Commit 82a2c21

Browse files
committed
m2o - refactoring methods
1 parent 12582e3 commit 82a2c21

11 files changed

+66
-15
lines changed

main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,5 +72,5 @@ func runPipelineWithOptions() {
7272
crawler := exp.NewMonsterWithOptions(opt)
7373
opAdapterPipe := exp.GetOutputAdapterPipe()
7474
crawler.BuildSystem(opAdapterPipe)
75-
crawler.StartCrawling(LessLinkUrl)
75+
crawler.StartCrawling(HomeUrl)
7676
}

octopus/core.go

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,22 @@
11
package octopus
22

3-
func (o *octopus) setup() {}
3+
func (o *octopus) BuildSystem(opAdapter *OutputAdapter) {
4+
//parsePipe, compPipeChan := m.MakeParsingPipe()
5+
//var reqPipe chan<- *Node
6+
//if opAdapterPipe == nil {
7+
// reqPipe = m.MakeRequisitionPipe(parsePipe, nil)
8+
//} else {
9+
// reqPipe = m.MakeRequisitionPipe(parsePipe, opAdapterPipe)
10+
//}
11+
//validationPipe := m.MakeUrlValidationPipe(reqPipe)
12+
//unduplPipe := m.MakeUnduplicationPipe(validationPipe)
13+
//cleanPipe := m.MakeLinkCleaningPipe(unduplPipe)
14+
//compPipe := m.MakeCompositionPipe(cleanPipe)
15+
//compPipeChan <- compPipe
16+
//m.compPipe = compPipe
17+
18+
}
19+
20+
func (o *octopus) BeginCrawling() {
21+
22+
}

octopus/modelfactory.go

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,42 @@
11
package octopus
22

3+
import "sync"
4+
35
const (
46
defaultMaxDepth int16 = 2
57
anchorTag = "a"
68
anchorAttrb = "href"
79
)
810

9-
// MakeNew - Creates an Instance of the Octopus Crawler with the given options.
10-
func MakeNew(opt CrawlOptions) *octopus {
11+
// NewWithDefaultOptions - Create an Instance of the Octopus with the default CrawlOptions.
12+
func NewWithDefaultOptions() *octopus {
1113
oct := &octopus{
12-
CrawlOptions: opt,
13-
visited: make(map[Node]bool),
14+
CrawlOptions: getDefaultCrawlOptions(),
15+
visited: new(sync.Map),
16+
isBuilt: false,
1417
}
1518
oct.setup()
1619
return oct
1720
}
21+
22+
// New - Create an Instance of the Octopus with the given CrawlOptions.
23+
func New(opt *CrawlOptions) *octopus {
24+
oct := &octopus{
25+
CrawlOptions: opt,
26+
visited: new(sync.Map),
27+
isBuilt: false,
28+
}
29+
return oct
30+
}
31+
32+
func getDefaultCrawlOptions() *CrawlOptions {
33+
return &CrawlOptions{
34+
MaxDepthCrawled: -1,
35+
MaxLinksCrawled: -1,
36+
StayWithinBaseHost: false,
37+
CrawlRatePerSec: -1,
38+
RespectRobots: false,
39+
IncludeBody: true,
40+
OpAdapter: nil,
41+
}
42+
}

octopus/models.go

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ package octopus
22

33
import (
44
"io"
5-
"time"
5+
"sync"
66
)
77

88
// Node is used to represent each crawled link and its associated depth of crawl.
@@ -17,8 +17,9 @@ type Node struct {
1717
// It also has a CrawlOptions structure to initialize settings specific
1818
// to an instance of the crawler.
1919
type octopus struct {
20-
CrawlOptions
21-
visited map[Node]bool
20+
*CrawlOptions
21+
visited *sync.Map
22+
isBuilt bool
2223
}
2324

2425
// CrawlOptions is used to house options for crawling.
@@ -33,18 +34,17 @@ type octopus struct {
3334
// CrawlRatePerSec is the rate at which requests will be made.
3435
// RespectRobots (unimplemented) chooses whether to respect robots.txt or not.
3536
type CrawlOptions struct {
36-
DepthPerLink int16
37+
MaxDepthCrawled int64
3738
MaxLinksCrawled int64
3839
StayWithinBaseHost bool
39-
BaseURLString string
40-
CrawlRate time.Duration
40+
CrawlRatePerSec int64
4141
RespectRobots bool
4242
IncludeBody bool
43-
OpAdapter OutputAdapter
43+
OpAdapter *OutputAdapter
4444
}
4545

4646
type CrawlOutput struct {
47-
Node
47+
*Node
4848
Body io.ReadCloser
4949
}
5050

@@ -56,5 +56,5 @@ type CrawlOutput struct {
5656
// Implementers of the interface should listen on this channel for output from
5757
// the crawler.
5858
type OutputAdapter interface {
59-
Consume(quitCh <-chan bool) chan<- CrawlOutput
59+
Consume(quitCh <-chan bool) chan<- *CrawlOutput
6060
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
package octopus

octopus/pipe_composition.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
package octopus

octopus/pipe_filter_duplication.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
package octopus
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
package octopus

octopus/pipe_htmlparsing.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
package octopus

octopus/pipe_pagerequisition.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
package octopus

0 commit comments

Comments
 (0)