v 0.0.1 - Models, Output Adapters

rapidclock · rapidclock · commit 7ab08eeee365 · 2018-06-18T17:31:14.000-07:00
- Basic Structure of Models
- StdOutput adapter and File adapter complete

Signed-off-by: Rahul Thomas <thomas.rah@husky.neu.edu>
diff --git a/.gitignore b/.gitignore
@@ -10,3 +10,11 @@
 
 # Output of the go coverage tool, specifically when used with LiteIDE
 *.out
+
+# osx stuff
+.DS_Store
+
+# IDE stuff
+.vscode
+.idea
+*.iml
diff --git a/octopus/modelfactory.go b/octopus/modelfactory.go
@@ -0,0 +1,17 @@
+package octopus
+
+const (
+	defaultMaxDepth int16 = 2
+	anchorTag             = "a"
+	anchorAttrb           = "href"
+)
+
+// New - Creates an Instance of the Octopus Crawler with the given options.
+func New(opt CrawlOptions) *webOctopus {
+	oct := &webOctopus{
+		CrawlOptions: opt,
+		visited:      nil,
+	}
+	oct.setup()
+	return oct
+}
diff --git a/octopus/models.go b/octopus/models.go
@@ -0,0 +1,41 @@
+package octopus
+
+import "io"
+
+// Node is used to represent each crawled link and its associated depth of crawl.
+type Node struct {
+	URLString string
+	Depth     int
+}
+
+// webOctopus is a concurrent version of webSpider.
+// It has an inbuilt parser based on htmlparser.Parser to collect all links in a web-page.
+// It also has a CrawlOptions structure to initialize settings specific
+// to an instance of the crawler.
+type webOctopus struct {
+	CrawlOptions
+	visited map[Node]bool
+}
+
+// CrawlOptions is used to house options for crawling.
+// You can specify the depth of exploration for each link and
+// whether the crawler should ignore hostnames other than the base host.
+// MaxLinksCrawled - Specifies the Maximum Number of Unique Links that will be crawled.
+// Note: When used together with DepthPerLink, both limits apply.
+// Use -1 to allow an unlimited number of links to be crawled (bounded only by depth of traversal).
+// IncludeBody - Include the response Body in the crawled Node (for further processing).
+// OpAdapter is a user specified concrete implementation of an Output Adapter. The crawler
+// will pump output onto the implementation's channel returned by its Consume method.
+type CrawlOptions struct {
+	DepthPerLink       int16
+	MaxLinksCrawled    int64
+	StayWithinBaseHost bool
+	BaseURLString      string
+	IncludeBody        bool
+	OpAdapter          OutputAdapter
+}
+
+type CrawlOutput struct {
+	Node
+	Body io.ReadCloser
+}
diff --git a/octopus/octocore.go b/octopus/octocore.go
@@ -0,0 +1,3 @@
+package octopus
+
+func (o *webOctopus) setup() {}
diff --git a/octopus/outputadapter.go b/octopus/outputadapter.go
@@ -0,0 +1,72 @@
+package octopus
+
+import (
+	"fmt"
+	"io"
+	"log"
+	"os"
+)
+
+// OutputAdapter is the interface for the Adapter that is used to handle
+// output from the Octopus Crawler.
+// The contract stipulates that the crawler provides the channel
+// to listen for a quit command.
+// The crawler pumps its output onto the returned channel of the Consume method.
+// Implementers of the interface should listen on this channel for output from
+// the crawler.
+type OutputAdapter interface {
+	Consume(quitCh <-chan bool) chan<- CrawlOutput
+}
+
+// StdOpAdapter is an output adapter that just prints the output onto the screen.
+type StdOpAdapter struct{}
+
+func (s *StdOpAdapter) Consume(quitCh <-chan bool) chan<- CrawlOutput {
+	listenCh := make(chan CrawlOutput)
+	go func() {
+		for {
+			select {
+			case output := <-listenCh:
+				fmt.Printf("%d - %s\n", output.Depth, output.URLString)
+			case <-quitCh:
+				return
+			}
+		}
+	}()
+	return listenCh
+}
+
+// FileWriterAdapter is an output adapter that writes the output to a specified file.
+type FileWriterAdapter struct {
+	FilePath string
+}
+
+func (fw *FileWriterAdapter) Consume(quitCh <-chan bool) chan<- CrawlOutput {
+	listenCh := make(chan CrawlOutput)
+	fw.writeToFile(quitCh, listenCh)
+	return listenCh
+}
+
+func (fw *FileWriterAdapter) writeToFile(quitCh <-chan bool, ch <-chan CrawlOutput) {
+	fp, err := fw.getFilePointer()
+	if err != nil {
+		fp.Close()
+		log.Fatal(err)
+	}
+	go func() {
+		defer fp.Close()
+		for {
+			select {
+			case output := <-ch:
+				fmt.Fprintf(fp, "%d - %s\n", output.Depth, output.URLString)
+			case <-quitCh:
+				return
+			}
+		}
+	}()
+}
+
+func (fw *FileWriterAdapter) getFilePointer() (w io.WriteCloser, err error) {
+	w, err = os.OpenFile(fw.FilePath, os.O_RDWR|os.O_CREATE, 0755)
+	return
+}

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+package octopus`
	`2`	`+`
	`3`	`+func (o *webOctopus) setup() {}`