Skip to content

Commit 7ab08ee

Browse files
committed
v 0.0.1 - Models, Output Adapters
- Basic Structure of Models - StdOutput adapter and File adapter complete Signed-off-by: Rahul Thomas <thomas.rah@husky.neu.edu>
1 parent e34acbb commit 7ab08ee

File tree

5 files changed

+141
-0
lines changed

5 files changed

+141
-0
lines changed

.gitignore

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,11 @@
1010

1111
# Output of the go coverage tool, specifically when used with LiteIDE
1212
*.out
13+
14+
# osx stuff
15+
.DS_Store
16+
17+
# IDE stuff
18+
.vscode
19+
.idea
20+
*.iml

octopus/modelfactory.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
package octopus
2+
3+
const (
	// defaultMaxDepth is a crawl-depth default, typed int16.
	// NOTE(review): nothing in the visible code reads it yet; presumably it is
	// the fallback for CrawlOptions.DepthPerLink (also int16) - confirm when
	// it gets wired in.
	defaultMaxDepth int16 = 2

	// anchorTag is the name of the HTML element used for hyperlinks.
	anchorTag = "a"

	// anchorAttrb is the attribute of an anchor element that carries the
	// link target.
	anchorAttrb = "href"
)
8+
9+
// New - Creates an Instance of the Octopus Crawler with the given options.
10+
func New(opt CrawlOptions) *webOctopus {
11+
oct := &webOctopus{
12+
CrawlOptions: opt,
13+
visited: nil,
14+
}
15+
oct.setup()
16+
return oct
17+
}

octopus/models.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
package octopus
2+
3+
import "io"
4+
5+
// Node represents a single crawled link together with the crawl depth
// associated with it.
type Node struct {
	URLString string // the link, kept as a plain string
	Depth     int    // depth of crawl associated with this link
}
10+
11+
// webOctopus is a concurrent version of webSpider.
12+
// It has an inbuilt parser based of htmlparser.Parser to collect all links in a web-page.
13+
// It also has a CrawlOptions structure to initialize setting specific
14+
// to an instance of the crawler.
15+
type webOctopus struct {
16+
CrawlOptions
17+
visited map[Node]bool
18+
}
19+
20+
// CrawlOptions is used to house options for crawling.
21+
// You can specify depth of exploration for each link,
22+
// if crawler should ignore other hostnames (except from base host).
23+
// MaxLinksCrawled - Specifies the Maximum Number of Unique Links that will be crawled.
24+
// Note : When combined with DepthPerLink, it will combine both.
25+
// Use -1 to indicate infinite links to be crawled (only bounded by depth of traversal).
26+
// IncludeBody - Include the response Body in the crawled Node (for further processing).
27+
// OpAdapter is a user specified concrete implementation of an Output Adapter. The crawler
28+
// will pump output onto the implementation's channel returned by its Consume method.
29+
type CrawlOptions struct {
30+
DepthPerLink int16
31+
MaxLinksCrawled int64
32+
StayWithinBaseHost bool
33+
BaseURLString string
34+
IncludeBody bool
35+
OpAdapter OutputAdapter
36+
}
37+
38+
type CrawlOutput struct {
39+
Node
40+
Body io.ReadCloser
41+
}

octopus/octocore.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
package octopus
2+
3+
// setup is invoked by New right after the crawler struct has been built.
// It is currently a no-op.
func (o *webOctopus) setup() {
	// Nothing to initialise yet.
}

octopus/outputadapter.go

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
package octopus
2+
3+
import (
4+
"fmt"
5+
"io"
6+
"log"
7+
"os"
8+
)
9+
10+
// OutputAdapter is the interface for the Adapter that is used to handle
11+
// output from the Octopus Crawler.
12+
// The contract stipulates that the crawler provides the channel
13+
// to listen for a quit command.
14+
// The crawler pumps its output onto the returned channel of the Consume method.
15+
// Implementers of the interface should listen on this channel for output from
16+
// the crawler.
17+
type OutputAdapter interface {
18+
Consume(quitCh <-chan bool) chan<- CrawlOutput
19+
}
20+
21+
// StdOpAdapter is an output adapter that just prints the output onto the screen.
22+
type StdOpAdapter struct{}
23+
24+
func (s *StdOpAdapter) Consume(quitCh <-chan bool) chan<- CrawlOutput {
25+
listenCh := make(chan CrawlOutput)
26+
go func() {
27+
for {
28+
select {
29+
case output := <-listenCh:
30+
fmt.Printf("%d - %s\n", output.Depth, output.URLString)
31+
case <-quitCh:
32+
return
33+
}
34+
}
35+
}()
36+
return listenCh
37+
}
38+
39+
// FileWriterAdapter is an output adapter that writes the output to a specified file.
40+
type FileWriterAdapter struct {
41+
FilePath string
42+
}
43+
44+
func (fw *FileWriterAdapter) Consume(quitCh <-chan bool) chan<- CrawlOutput {
45+
listenCh := make(chan CrawlOutput)
46+
fw.writeToFile(quitCh, listenCh)
47+
return listenCh
48+
}
49+
50+
func (fw *FileWriterAdapter) writeToFile(quitCh <-chan bool, ch <-chan CrawlOutput) {
51+
fp, err := fw.getFilePointer()
52+
if err != nil {
53+
fp.Close()
54+
log.Fatal(err)
55+
}
56+
go func() {
57+
defer fp.Close()
58+
for {
59+
select {
60+
case output := <-ch:
61+
fmt.Fprintf(fp, "%d - %s\n", output.Depth, output.URLString)
62+
case <-quitCh:
63+
return
64+
}
65+
}
66+
}()
67+
}
68+
69+
func (fw *FileWriterAdapter) getFilePointer() (w io.WriteCloser, err error) {
70+
w, err = os.OpenFile(fw.FilePath, os.O_RDWR|os.O_CREATE, 0755)
71+
return
72+
}

0 commit comments

Comments
 (0)