File tree Expand file tree Collapse file tree 11 files changed +66
-15
lines changed
Expand file tree Collapse file tree 11 files changed +66
-15
lines changed Original file line number Diff line number Diff line change @@ -72,5 +72,5 @@ func runPipelineWithOptions() {
7272 crawler := exp .NewMonsterWithOptions (opt )
7373 opAdapterPipe := exp .GetOutputAdapterPipe ()
7474 crawler .BuildSystem (opAdapterPipe )
75- crawler .StartCrawling (LessLinkUrl )
75+ crawler .StartCrawling (HomeUrl )
7676}
Original file line number Diff line number Diff line change 11package octopus
22
3- func (o * octopus ) setup () {}
3+ func (o * octopus ) BuildSystem (opAdapter * OutputAdapter ) {
4+ //parsePipe, compPipeChan := m.MakeParsingPipe()
5+ //var reqPipe chan<- *Node
6+ //if opAdapterPipe == nil {
7+ // reqPipe = m.MakeRequisitionPipe(parsePipe, nil)
8+ //} else {
9+ // reqPipe = m.MakeRequisitionPipe(parsePipe, opAdapterPipe)
10+ //}
11+ //validationPipe := m.MakeUrlValidationPipe(reqPipe)
12+ //unduplPipe := m.MakeUnduplicationPipe(validationPipe)
13+ //cleanPipe := m.MakeLinkCleaningPipe(unduplPipe)
14+ //compPipe := m.MakeCompositionPipe(cleanPipe)
15+ //compPipeChan <- compPipe
16+ //m.compPipe = compPipe
17+
18+ }
19+
20+ func (o * octopus ) BeginCrawling () {
21+
22+ }
Original file line number Diff line number Diff line change 11package octopus
22
3+ import "sync"
4+
35const (
46 defaultMaxDepth int16 = 2
57 anchorTag = "a"
68 anchorAttrb = "href"
79)
810
9- // MakeNew - Creates an Instance of the Octopus Crawler with the given options .
10- func MakeNew ( opt CrawlOptions ) * octopus {
11+ // NewWithDefaultOptions - Creates an instance of the Octopus crawler with the default CrawlOptions.
12+ func NewWithDefaultOptions ( ) * octopus {
1113 oct := & octopus {
12- CrawlOptions : opt ,
13- visited : make (map [Node ]bool ),
14+ CrawlOptions : getDefaultCrawlOptions (),
15+ visited : new (sync.Map ),
16+ isBuilt : false ,
1417 }
1518 oct .setup ()
1619 return oct
1720}
21+
22+ // New - Creates an instance of the Octopus crawler with the given CrawlOptions.
23+ func New (opt * CrawlOptions ) * octopus {
24+ oct := & octopus {
25+ CrawlOptions : opt ,
26+ visited : new (sync.Map ),
27+ isBuilt : false ,
28+ }
29+ return oct
30+ }
31+
32+ func getDefaultCrawlOptions () * CrawlOptions {
33+ return & CrawlOptions {
34+ MaxDepthCrawled : - 1 ,
35+ MaxLinksCrawled : - 1 ,
36+ StayWithinBaseHost : false ,
37+ CrawlRatePerSec : - 1 ,
38+ RespectRobots : false ,
39+ IncludeBody : true ,
40+ OpAdapter : nil ,
41+ }
42+ }
Original file line number Diff line number Diff line change @@ -2,7 +2,7 @@ package octopus
22
33import (
44 "io"
5- "time "
5+ "sync "
66)
77
88// Node is used to represent each crawled link and its associated depth of crawl.
@@ -17,8 +17,9 @@ type Node struct {
1717// It also has a CrawlOptions structure to initialize setting specific
1818// to an instance of the crawler.
1919type octopus struct {
20- CrawlOptions
21- visited map [Node ]bool
20+ * CrawlOptions
21+ visited * sync.Map
22+ isBuilt bool
2223}
2324
2425// CrawlOptions is used to house options for crawling.
@@ -33,18 +34,17 @@ type octopus struct {
3334// CrawlRatePerSec is the rate, per second, at which requests will be made.
3435// RespectRobots (unimplemented) chooses whether to respect robots.txt or not.
3536type CrawlOptions struct {
36- DepthPerLink int16
37+ MaxDepthCrawled int64
3738 MaxLinksCrawled int64
3839 StayWithinBaseHost bool
39- BaseURLString string
40- CrawlRate time.Duration
40+ CrawlRatePerSec int64
4141 RespectRobots bool
4242 IncludeBody bool
43- OpAdapter OutputAdapter
43+ OpAdapter * OutputAdapter
4444}
4545
4646type CrawlOutput struct {
47- Node
47+ * Node
4848 Body io.ReadCloser
4949}
5050
@@ -56,5 +56,5 @@ type CrawlOutput struct {
5656// Implementers of the interface should listen on this channel for output from
5757// the crawler.
5858type OutputAdapter interface {
59- Consume (quitCh <- chan bool ) chan <- CrawlOutput
59+ Consume (quitCh <- chan bool ) chan <- * CrawlOutput
6060}
Original file line number Diff line number Diff line change 1+ package octopus
Original file line number Diff line number Diff line change 1+ package octopus
Original file line number Diff line number Diff line change 1+ package octopus
Original file line number Diff line number Diff line change 1+ package octopus
Original file line number Diff line number Diff line change 1+ package octopus
Original file line number Diff line number Diff line change 1+ package octopus
You can’t perform that action at this time.
0 commit comments