Skip to content

Commit 8c66325

Browse files
committed
redesign octopus v1
1 parent 82a2c21 commit 8c66325

15 files changed

+301
-53
lines changed

adapter/basicadapters.go

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,33 +12,48 @@ import (
1212
// StdOpAdapter is an output adapter that just prints the output onto the screen.
1313
type StdOpAdapter struct{}
1414

15-
func (s *StdOpAdapter) Consume(quitCh <-chan bool) chan<- oct.CrawlOutput {
16-
listenCh := make(chan oct.CrawlOutput)
15+
func (s *StdOpAdapter) Consume() *oct.NodeChSet {
16+
listenCh := make(chan *oct.Node)
17+
quitCh := make(chan int, 1)
18+
listenChSet := &oct.NodeChSet{
19+
NodeCh: listenCh,
20+
StdChannels: &oct.StdChannels{
21+
QuitCh: quitCh,
22+
},
23+
}
1724
go func() {
1825
for {
1926
select {
2027
case output := <-listenCh:
21-
fmt.Printf("%d - %s\n", output.Depth, output.URLString)
28+
fmt.Printf("%d - %s\n", output.Depth, output.UrlString)
2229
case <-quitCh:
2330
return
2431
}
2532
}
2633
}()
27-
return listenCh
34+
return listenChSet
2835
}
2936

3037
// FileWriterAdapter is an output adapter that writes the output to a specified file.
3138
type FileWriterAdapter struct {
3239
FilePath string
3340
}
3441

35-
func (fw *FileWriterAdapter) Consume(quitCh <-chan bool) chan<- oct.CrawlOutput {
36-
listenCh := make(chan oct.CrawlOutput)
37-
fw.writeToFile(quitCh, listenCh)
38-
return listenCh
42+
func (fw *FileWriterAdapter) Consume() *oct.NodeChSet {
43+
listenCh := make(chan *oct.Node)
44+
quitCh := make(chan int, 1)
45+
listenChSet := &oct.NodeChSet{
46+
NodeCh: listenCh,
47+
StdChannels: &oct.StdChannels{
48+
QuitCh: quitCh,
49+
},
50+
}
51+
fw.writeToFile(listenCh, quitCh)
52+
return listenChSet
3953
}
4054

41-
func (fw *FileWriterAdapter) writeToFile(quitCh <-chan bool, ch <-chan oct.CrawlOutput) {
55+
func (fw *FileWriterAdapter) writeToFile(listenCh chan *oct.Node,
56+
quitCh chan int) {
4257
fp, err := fw.getFilePointer()
4358
if err != nil {
4459
fp.Close()
@@ -48,8 +63,8 @@ func (fw *FileWriterAdapter) writeToFile(quitCh <-chan bool, ch <-chan oct.Crawl
4863
defer fp.Close()
4964
for {
5065
select {
51-
case output := <-ch:
52-
fmt.Fprintf(fp, "%d - %s\n", output.Depth, output.URLString)
66+
case output := <-listenCh:
67+
fmt.Fprintf(fp, "%d - %s\n", output.Depth, output.UrlString)
5368
case <-quitCh:
5469
return
5570
}

experimental/PIPELINE_ZDESIGN.txt

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11

22
Pipeline
33

4-
url => structure => absolutify links => remove duplicates => validate urls => make request => parse page for urls ||
5-
^ => output adapter ||
6-
^=====================================================================================================
4+
url => ingest
5+
v |=> output adapter
6+
structure => absolutify links => remove duplicates => validate urls => make request |=> parse page for urls ||
7+
^========================================================================================================
8+
79

810

911
1. Composition

octopus/core.go

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
11
package octopus
22

3-
func (o *octopus) BuildSystem(opAdapter *OutputAdapter) {
4-
//parsePipe, compPipeChan := m.MakeParsingPipe()
5-
//var reqPipe chan<- *Node
6-
//if opAdapterPipe == nil {
7-
// reqPipe = m.MakeRequisitionPipe(parsePipe, nil)
8-
//} else {
9-
// reqPipe = m.MakeRequisitionPipe(parsePipe, opAdapterPipe)
10-
//}
11-
//validationPipe := m.MakeUrlValidationPipe(reqPipe)
12-
//unduplPipe := m.MakeUnduplicationPipe(validationPipe)
13-
//cleanPipe := m.MakeLinkCleaningPipe(unduplPipe)
14-
//compPipe := m.MakeCompositionPipe(cleanPipe)
15-
//compPipeChan <- compPipe
16-
//m.compPipe = compPipe
3+
func (o *octopus) BuildSystem() {
4+
// parsePipe, compPipeChan := m.MakeParsingPipe()
5+
// var reqPipe chan<- *NodeInfo
6+
// if opAdapterPipe == nil {
7+
// reqPipe = m.MakeRequisitionPipe(parsePipe, nil)
8+
// } else {
9+
// reqPipe = m.MakeRequisitionPipe(parsePipe, opAdapterPipe)
10+
// }
11+
// validationPipe := m.MakeUrlValidationPipe(reqPipe)
12+
// unduplPipe := m.MakeUnduplicationPipe(validationPipe)
13+
// cleanPipe := m.MakeLinkCleaningPipe(unduplPipe)
14+
// compPipe := m.MakeCompositionPipe(cleanPipe)
15+
// compPipeChan <- compPipe
16+
// m.compPipe = compPipe
1717

1818
}
1919

octopus/modelfactory.go

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,10 @@ const (
1111
// NewWithDefaultOptions - Create an Instance of the Octopus with the default CrawlOptions.
1212
func NewWithDefaultOptions() *octopus {
1313
oct := &octopus{
14-
CrawlOptions: getDefaultCrawlOptions(),
14+
CrawlOptions: GetDefaultCrawlOptions(),
1515
visited: new(sync.Map),
16-
isBuilt: false,
16+
isReady: false,
1717
}
18-
oct.setup()
1918
return oct
2019
}
2120

@@ -24,15 +23,15 @@ func New(opt *CrawlOptions) *octopus {
2423
oct := &octopus{
2524
CrawlOptions: opt,
2625
visited: new(sync.Map),
27-
isBuilt: false,
26+
isReady: false,
2827
}
2928
return oct
3029
}
3130

32-
func getDefaultCrawlOptions() *CrawlOptions {
31+
func GetDefaultCrawlOptions() *CrawlOptions {
3332
return &CrawlOptions{
34-
MaxDepthCrawled: -1,
35-
MaxLinksCrawled: -1,
33+
MaxCrawlDepth: -1,
34+
MaxCrawlLinks: -1,
3635
StayWithinBaseHost: false,
3736
CrawlRatePerSec: -1,
3837
RespectRobots: false,

octopus/models.go

Lines changed: 41 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,49 +5,74 @@ import (
55
"sync"
66
)
77

8-
// Node is used to represent each crawled link and its associated depth of crawl.
9-
type Node struct {
10-
ParentUrlString string
11-
UrlString string
12-
Depth int
13-
}
14-
158
// octopus is a concurrent web crawler.
169
// It has an inbuilt parser based on html.NewTokenizer to collect all links in a web-page.
1710
// It also has a CrawlOptions structure to initialize setting specific
1811
// to an instance of the crawler.
1912
type octopus struct {
2013
*CrawlOptions
21-
visited *sync.Map
22-
isBuilt bool
14+
visited *sync.Map
15+
isReady bool
16+
adapterChSet *NodeChSet
2317
}
2418

2519
// CrawlOptions is used to house options for crawling.
2620
// You can specify depth of exploration for each link,
2721
// if crawler should ignore other hostnames (except from base host).
28-
// MaxLinksCrawled - Specifies the Maximum Number of Unique Links that will be crawled.
22+
// MaxCrawlDepth - Indicates the maximum depth that will be crawled,
23+
// for each new link.
24+
// MaxCrawlLinks - Specifies the Maximum Number of Unique Links that will be crawled.
2925
// Note : When combined with DepthPerLink, it will combine both.
3026
// Use -1 to indicate infinite links to be crawled (only bounded by depth of traversal).
31-
// IncludeBody - Include the response Body in the crawled Node (for further processing).
27+
// IncludeBody - Include the response Body in the crawled NodeInfo (for further processing).
3228
// OpAdapter is a user specified concrete implementation of an Output Adapter. The crawler
3329
// will pump output onto the implementation's channel returned by its Consume method.
3430
// CrawlRate is the rate at which requests will be made.
3531
// RespectRobots (unimplemented) choose whether to respect robots.txt or not.
3632
type CrawlOptions struct {
37-
MaxDepthCrawled int64
38-
MaxLinksCrawled int64
33+
MaxCrawlDepth int64
34+
MaxCrawlLinks int64
3935
StayWithinBaseHost bool
4036
CrawlRatePerSec int64
4137
RespectRobots bool
4238
IncludeBody bool
4339
OpAdapter *OutputAdapter
4440
}
4541

46-
type CrawlOutput struct {
47-
*Node
42+
// NodeInfo is used to represent each crawled link and its associated crawl depth.
type NodeInfo struct {
	ParentUrlString string
	UrlString       string
	Depth           int64
}

// Node encloses a NodeInfo and its Body (HTML) Content.
type Node struct {
	*NodeInfo
	Body io.ReadCloser
}

// StdChannels groups the control channels shared by pipeline stages.
type StdChannels struct {
	QuitCh chan<- int
	// logCh chan<- string
	// errorCh chan<- string
}

// NodeChSet pairs a send-only Node channel with its control channels.
type NodeChSet struct {
	NodeCh chan<- *Node
	*StdChannels
}

// StringChSet pairs a send-only string channel with its control channels.
type StringChSet struct {
	strCh chan<- string
	*StdChannels
}

// NodeInfoChSet pairs a send-only NodeInfo channel with its control channels.
type NodeInfoChSet struct {
	nodeInfoCh chan<- *NodeInfo
	*StdChannels
}
75+
5176
// OutputAdapter is the interface for the Adapter that is used to handle
5277
// output from the Octopus Crawler.
5378
// The contract stipulates that the crawler provides the channel
@@ -56,5 +81,5 @@ type CrawlOutput struct {
5681
// Implementers of the interface should listen on this channel for output from
5782
// the crawler.
5883
type OutputAdapter interface {
59-
Consume(quitCh <-chan bool) chan<- *CrawlOutput
84+
Consume() *NodeChSet
6085
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,27 @@
11
package octopus
2+
3+
import (
4+
"net/url"
5+
)
6+
7+
func (o *octopus) makeLinkAbsolutionPipe(outChSet *NodeChSet) *NodeChSet {
8+
return stdLinearNodeFunc(makeLinkAbsolute, outChSet)
9+
}
10+
11+
func makeLinkAbsolute(node *Node, outChSet *NodeChSet) {
12+
if node.ParentUrlString != "" {
13+
linkUrl, err := url.Parse(node.UrlString)
14+
if err != nil {
15+
return
16+
}
17+
if !linkUrl.IsAbs() {
18+
baseUrl, err := url.Parse(node.ParentUrlString)
19+
if err != nil {
20+
return
21+
}
22+
absLinkUrl := baseUrl.ResolveReference(linkUrl)
23+
node.UrlString = absLinkUrl.String()
24+
}
25+
}
26+
outChSet.NodeCh <- node
27+
}

octopus/pipe_filter_crawldepth.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
package octopus
2+
3+
func (o *octopus) makeFilterCrawlDepthPipe(outChSet *NodeChSet) *NodeChSet {
4+
return stdLinearNodeFunc(o.filterByUrlDepth, outChSet)
5+
}
6+
7+
func (o *octopus) filterByUrlDepth(node *Node, outChSet *NodeChSet) {
8+
if node.Depth < o.MaxCrawlDepth {
9+
outChSet.NodeCh <- node
10+
}
11+
}

octopus/pipe_filter_duplication.go

Lines changed: 0 additions & 1 deletion
This file was deleted.
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
package octopus
2+
3+
func (o *octopus) makeFilterUnduplicationPipe(outChSet *NodeChSet) *NodeChSet {
4+
return stdLinearNodeFunc(o.filterDuplicates, outChSet)
5+
}
6+
7+
func (o *octopus) filterDuplicates(node *Node, outChSet *NodeChSet) {
8+
if _, visited := o.visited.Load(node.UrlString); !visited {
9+
o.visited.Store(node.UrlString, true)
10+
outChSet.NodeCh <- node
11+
}
12+
}

0 commit comments

Comments
 (0)