Skip to content

Commit 8c66325

Browse files
committed
redesign octopus v1
1 parent 82a2c21 commit 8c66325

15 files changed

+301
-53
lines changed

adapter/basicadapters.go

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,33 +12,48 @@ import (
1212
// StdOpAdapter is an output adapter that just prints the output onto the screen.
1313
type StdOpAdapter struct{}
1414

15-
func (s *StdOpAdapter) Consume(quitCh <-chan bool) chan<- oct.CrawlOutput {
16-
listenCh := make(chan oct.CrawlOutput)
15+
func (s *StdOpAdapter) Consume() *oct.NodeChSet {
16+
listenCh := make(chan *oct.Node)
17+
quitCh := make(chan int, 1)
18+
listenChSet := &oct.NodeChSet{
19+
NodeCh: listenCh,
20+
StdChannels: &oct.StdChannels{
21+
QuitCh: quitCh,
22+
},
23+
}
1724
go func() {
1825
for {
1926
select {
2027
case output := <-listenCh:
21-
fmt.Printf("%d - %s\n", output.Depth, output.URLString)
28+
fmt.Printf("%d - %s\n", output.Depth, output.UrlString)
2229
case <-quitCh:
2330
return
2431
}
2532
}
2633
}()
27-
return listenCh
34+
return listenChSet
2835
}
2936

3037
// FileWriterAdapter is an output adapter that writes the output to a specified file.
3138
type FileWriterAdapter struct {
3239
FilePath string
3340
}
3441

35-
func (fw *FileWriterAdapter) Consume(quitCh <-chan bool) chan<- oct.CrawlOutput {
36-
listenCh := make(chan oct.CrawlOutput)
37-
fw.writeToFile(quitCh, listenCh)
38-
return listenCh
42+
func (fw *FileWriterAdapter) Consume() *oct.NodeChSet {
43+
listenCh := make(chan *oct.Node)
44+
quitCh := make(chan int, 1)
45+
listenChSet := &oct.NodeChSet{
46+
NodeCh: listenCh,
47+
StdChannels: &oct.StdChannels{
48+
QuitCh: quitCh,
49+
},
50+
}
51+
fw.writeToFile(listenCh, quitCh)
52+
return listenChSet
3953
}
4054

41-
func (fw *FileWriterAdapter) writeToFile(quitCh <-chan bool, ch <-chan oct.CrawlOutput) {
55+
func (fw *FileWriterAdapter) writeToFile(listenCh chan *oct.Node,
56+
quitCh chan int) {
4257
fp, err := fw.getFilePointer()
4358
if err != nil {
4459
fp.Close()
@@ -48,8 +63,8 @@ func (fw *FileWriterAdapter) writeToFile(quitCh <-chan bool, ch <-chan oct.Crawl
4863
defer fp.Close()
4964
for {
5065
select {
51-
case output := <-ch:
52-
fmt.Fprintf(fp, "%d - %s\n", output.Depth, output.URLString)
66+
case output := <-listenCh:
67+
fmt.Fprintf(fp, "%d - %s\n", output.Depth, output.UrlString)
5368
case <-quitCh:
5469
return
5570
}

experimental/PIPELINE_ZDESIGN.txt

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11

22
Pipeline
33

4-
url => structure => absolutify links => remove duplicates => validate urls => make request => parse page for urls ||
5-
^ => output adapter ||
6-
^=====================================================================================================
4+
url => ingest
5+
v |=> output adapter
6+
structure => absolutify links => remove duplicates => validate urls => make request |=> parse page for urls ||
7+
^========================================================================================================
8+
79

810

911
1. Composition

octopus/core.go

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
11
package octopus
22

3-
func (o *octopus) BuildSystem(opAdapter *OutputAdapter) {
4-
//parsePipe, compPipeChan := m.MakeParsingPipe()
5-
//var reqPipe chan<- *Node
6-
//if opAdapterPipe == nil {
7-
// reqPipe = m.MakeRequisitionPipe(parsePipe, nil)
8-
//} else {
9-
// reqPipe = m.MakeRequisitionPipe(parsePipe, opAdapterPipe)
10-
//}
11-
//validationPipe := m.MakeUrlValidationPipe(reqPipe)
12-
//unduplPipe := m.MakeUnduplicationPipe(validationPipe)
13-
//cleanPipe := m.MakeLinkCleaningPipe(unduplPipe)
14-
//compPipe := m.MakeCompositionPipe(cleanPipe)
15-
//compPipeChan <- compPipe
16-
//m.compPipe = compPipe
3+
func (o *octopus) BuildSystem() {
4+
// parsePipe, compPipeChan := m.MakeParsingPipe()
5+
// var reqPipe chan<- *NodeInfo
6+
// if opAdapterPipe == nil {
7+
// reqPipe = m.MakeRequisitionPipe(parsePipe, nil)
8+
// } else {
9+
// reqPipe = m.MakeRequisitionPipe(parsePipe, opAdapterPipe)
10+
// }
11+
// validationPipe := m.MakeUrlValidationPipe(reqPipe)
12+
// unduplPipe := m.MakeUnduplicationPipe(validationPipe)
13+
// cleanPipe := m.MakeLinkCleaningPipe(unduplPipe)
14+
// compPipe := m.MakeCompositionPipe(cleanPipe)
15+
// compPipeChan <- compPipe
16+
// m.compPipe = compPipe
1717

1818
}
1919

octopus/modelfactory.go

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,10 @@ const (
1111
// NewWithDefaultOptions - Create an Instance of the Octopus with the default CrawlOptions.
1212
func NewWithDefaultOptions() *octopus {
1313
oct := &octopus{
14-
CrawlOptions: getDefaultCrawlOptions(),
14+
CrawlOptions: GetDefaultCrawlOptions(),
1515
visited: new(sync.Map),
16-
isBuilt: false,
16+
isReady: false,
1717
}
18-
oct.setup()
1918
return oct
2019
}
2120

@@ -24,15 +23,15 @@ func New(opt *CrawlOptions) *octopus {
2423
oct := &octopus{
2524
CrawlOptions: opt,
2625
visited: new(sync.Map),
27-
isBuilt: false,
26+
isReady: false,
2827
}
2928
return oct
3029
}
3130

32-
func getDefaultCrawlOptions() *CrawlOptions {
31+
func GetDefaultCrawlOptions() *CrawlOptions {
3332
return &CrawlOptions{
34-
MaxDepthCrawled: -1,
35-
MaxLinksCrawled: -1,
33+
MaxCrawlDepth: -1,
34+
MaxCrawlLinks: -1,
3635
StayWithinBaseHost: false,
3736
CrawlRatePerSec: -1,
3837
RespectRobots: false,

octopus/models.go

Lines changed: 41 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,49 +5,74 @@ import (
55
"sync"
66
)
77

8-
// Node is used to represent each crawled link and its associated depth of crawl.
9-
type Node struct {
10-
ParentUrlString string
11-
UrlString string
12-
Depth int
13-
}
14-
158
// octopus is a concurrent web crawler.
169
// It has an inbuilt parser based on html.NewTokenizer to collect all links in a web-page.
1710
// It also has a CrawlOptions structure to initialize setting specific
1811
// to an instance of the crawler.
1912
type octopus struct {
2013
*CrawlOptions
21-
visited *sync.Map
22-
isBuilt bool
14+
visited *sync.Map
15+
isReady bool
16+
adapterChSet *NodeChSet
2317
}
2418

2519
// CrawlOptions is used to house options for crawling.
2620
// You can specify depth of exploration for each link,
2721
// if crawler should ignore other hostnames (except from base host).
28-
// MaxLinksCrawled - Specifies the Maximum Number of Unique Links that will be crawled.
22+
// MaxCrawlDepth - Indicates the maximum depth that will be crawled,
23+
// for each new link.
24+
// MaxCrawlLinks - Specifies the Maximum Number of Unique Links that will be crawled.
2925
// Note : When combined with DepthPerLink, it will combine both.
3026
// Use -1 to indicate infinite links to be crawled (only bounded by depth of traversal).
31-
// IncludeBody - Include the response Body in the crawled Node (for further processing).
27+
// IncludeBody - Include the response Body in the crawled NodeInfo (for further processing).
3228
// OpAdapter is a user specified concrete implementation of an Output Adapter. The crawler
3329
// will pump output onto the implementation's channel returned by its Consume method.
3430
// CrawlRate is the rate at which requests will be made.
3531
// RespectRobots (unimplemented) choose whether to respect robots.txt or not.
3632
type CrawlOptions struct {
37-
MaxDepthCrawled int64
38-
MaxLinksCrawled int64
33+
MaxCrawlDepth int64
34+
MaxCrawlLinks int64
3935
StayWithinBaseHost bool
4036
CrawlRatePerSec int64
4137
RespectRobots bool
4238
IncludeBody bool
4339
OpAdapter *OutputAdapter
4440
}
4541

46-
type CrawlOutput struct {
47-
*Node
42+
// NodeInfo is used to represent each crawled link and its associated crawl depth.
type NodeInfo struct {
	ParentUrlString string
	UrlString       string
	Depth           int64
}

// Node encloses a NodeInfo and its Body (HTML) Content.
type Node struct {
	*NodeInfo
	Body io.ReadCloser
}

// StdChannels groups the control channels shared by pipeline stages.
type StdChannels struct {
	QuitCh chan<- int
	// logCh chan<- string
	// errorCh chan<- string
}

// NodeChSet pairs a send-only Node channel with its control channels.
type NodeChSet struct {
	NodeCh chan<- *Node
	*StdChannels
}

// StringChSet pairs a send-only string channel with its control channels.
type StringChSet struct {
	strCh chan<- string
	*StdChannels
}

// NodeInfoChSet pairs a send-only NodeInfo channel with its control channels.
type NodeInfoChSet struct {
	nodeInfoCh chan<- *NodeInfo
	*StdChannels
}
75+
5176
// OutputAdapter is the interface for the Adapter that is used to handle
5277
// output from the Octopus Crawler.
5378
// The contract stipulates that the crawler provides the channel
@@ -56,5 +81,5 @@ type CrawlOutput struct {
5681
// Implementers of the interface should listen on this channel for output from
5782
// the crawler.
5883
type OutputAdapter interface {
59-
Consume(quitCh <-chan bool) chan<- *CrawlOutput
84+
Consume() *NodeChSet
6085
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,27 @@
11
package octopus
2+
3+
import (
4+
"net/url"
5+
)
6+
7+
func (o *octopus) makeLinkAbsolutionPipe(outChSet *NodeChSet) *NodeChSet {
8+
return stdLinearNodeFunc(makeLinkAbsolute, outChSet)
9+
}
10+
11+
func makeLinkAbsolute(node *Node, outChSet *NodeChSet) {
12+
if node.ParentUrlString != "" {
13+
linkUrl, err := url.Parse(node.UrlString)
14+
if err != nil {
15+
return
16+
}
17+
if !linkUrl.IsAbs() {
18+
baseUrl, err := url.Parse(node.ParentUrlString)
19+
if err != nil {
20+
return
21+
}
22+
absLinkUrl := baseUrl.ResolveReference(linkUrl)
23+
node.UrlString = absLinkUrl.String()
24+
}
25+
}
26+
outChSet.NodeCh <- node
27+
}

octopus/pipe_filter_crawldepth.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
package octopus
2+
3+
func (o *octopus) makeFilterCrawlDepthPipe(outChSet *NodeChSet) *NodeChSet {
4+
return stdLinearNodeFunc(o.filterByUrlDepth, outChSet)
5+
}
6+
7+
func (o *octopus) filterByUrlDepth(node *Node, outChSet *NodeChSet) {
8+
if node.Depth < o.MaxCrawlDepth {
9+
outChSet.NodeCh <- node
10+
}
11+
}

octopus/pipe_filter_duplication.go

Lines changed: 0 additions & 1 deletion
This file was deleted.
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
package octopus
2+
3+
func (o *octopus) makeFilterUnduplicationPipe(outChSet *NodeChSet) *NodeChSet {
4+
return stdLinearNodeFunc(o.filterDuplicates, outChSet)
5+
}
6+
7+
func (o *octopus) filterDuplicates(node *Node, outChSet *NodeChSet) {
8+
if _, visited := o.visited.Load(node.UrlString); !visited {
9+
o.visited.Store(node.UrlString, true)
10+
outChSet.NodeCh <- node
11+
}
12+
}

0 commit comments

Comments
 (0)