
Commit d1b9cbe

Merge pull request #5 from rapidclock/#a98g7-feature-add-rate-limiting
#a98g7 feature add rate limiting
2 parents da5d755 + f4f64ca commit d1b9cbe

18 files changed: +304 −83 lines

README.md

Lines changed: 119 additions & 3 deletions
@@ -7,8 +7,8 @@ A concurrent web crawler to crawl the web.
 - Depth Limited Crawling
 - User specified valid protocols
 - User buildable adapters that the crawler feeds output to.
-- Filter Duplicates.
-- Filter URLs that fail a HEAD request.
+- Filter Duplicates. (Default, Non-Customizable)
+- Filter URLs that fail a HEAD request. (Default, Non-Customizable)
 - User specifiable max timeout between two successive url requests.
 - Max Number of Links to be crawled.
 
@@ -35,4 +35,120 @@ func main() {
     crawler.SetupSystem()
     crawler.BeginCrawling("https://www.example.com")
 }
-```
+```
+
+### List of customizations
+
+Customizations can be made by supplying the crawler an instance of `CrawlOptions`. The basic structure is shown below, with a brief explanation of each option.
+
+```go
+type CrawlOptions struct {
+    MaxCrawlDepth         int64         // Max depth of the crawl; 0 is the initial link.
+    MaxCrawledUrls        int64         // Max number of links to be crawled in total.
+    StayWithinBaseHost    bool          // [Not-Implemented-Yet]
+    CrawlRatePerSec       int64         // Max rate at which requests can be made (req/sec).
+    CrawlBurstLimitPerSec int64         // Max burst capacity (should be at least the crawl rate).
+    RespectRobots         bool          // [Not-Implemented-Yet]
+    IncludeBody           bool          // Include the response body (contents of the web page) in the crawl result.
+    OpAdapter             OutputAdapter // A user-defined crawl output handler (see the next section).
+    ValidProtocols        []string      // Valid protocols to crawl (http, https, ftp, etc.)
+    TimeToQuit            int64         // Timeout (seconds) between two successive crawl outputs before the crawler quits.
+}
+```
+
+A default instance of `CrawlOptions` can be obtained by calling `octopus.GetDefaultCrawlOptions()`. This can be further customized by overriding individual properties, as in the sketch below.
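
For illustration only (an editorial sketch, not part of this diff): overriding individual defaults, including the rate-limiting fields this PR introduces. The import path is an assumption; adjust it to the actual module.

```go
package main

import "github.com/rapidclock/octopus/octopus" // assumed import path

func main() {
	opts := octopus.GetDefaultCrawlOptions()
	opts.MaxCrawlDepth = 3         // default is 2; 0 is the initial link
	opts.MaxCrawledUrls = 100      // default is -1 (no limit)
	opts.CrawlRatePerSec = 5       // default is -1 (rate limiting disabled)
	opts.CrawlBurstLimitPerSec = 5 // must be >= CrawlRatePerSec
	opts.TimeToQuit = 60           // default is 30 seconds
	_ = opts                       // attach to a crawler as in the usage example above
}
```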
+
+### Output Adapters
+
+An Output Adapter is the final destination of a crawler-processed request. The crawler's output is fed here, according to the customizations made through the `CrawlOptions` attached to the crawler before it starts.
+
+`OutputAdapter` is a Go interface that has to be implemented by your (user-defined) processor.
+
+```go
+type OutputAdapter interface {
+    Consume() *NodeChSet
+}
+```
+
+The user has to implement the `Consume()` method, which returns a __*pointer*__ to a `NodeChSet` (described below). The crawler sends the crawl output on the returned channel, and the user can listen on it for output.
+
+**Note**: If you implement a custom `OutputAdapter`, **REMEMBER** to listen for the output on another goroutine; otherwise you might block the crawler. At the very least, begin the crawling on another goroutine before you begin processing output. An end-to-end sketch follows the `StdOpAdapter` listing below.
+
+The structure of the `NodeChSet` is given below.
+
+```go
+type NodeChSet struct {
+    NodeCh chan<- *Node
+    *StdChannels
+}
+
+type StdChannels struct {
+    QuitCh chan<- int
+}
+
+type Node struct {
+    *NodeInfo
+    Body io.ReadCloser
+}
+
+type NodeInfo struct {
+    ParentUrlString string
+    UrlString       string
+    Depth           int64
+}
+```
+
+You can use the utility function `MakeDefaultNodeChSet()` to get a `NodeChSet` built for you. It also returns the `Node` and quit channels. Example given below:
+
+```go
+var opNodeChSet *NodeChSet
+var nodeCh chan *Node
+var quitCh chan int
+// The declarations above only demonstrate the types; in practice use Go's
+// short variable declaration (:=) and let the types be inferred.
+opNodeChSet, nodeCh, quitCh = MakeDefaultNodeChSet()
+```
+
+The user should supply the custom `OutputAdapter` to the crawler through the `OpAdapter` field of `CrawlOptions`.
+
+#### Default Output Adapters:
+
+We supply two default adapters for you to try out. They are not meant to be feature-rich, but you can still use them; their primary purpose is to demonstrate how to build and use an `OutputAdapter`.
+
+1. `adapter.StdOpAdapter` : Writes the crawled output (only links, not body) to the standard output.
+1. `adapter.FileWriterAdapter` : Writes the crawled output (only links, not body) to a supplied file.
+
+#### Implementation of the `adapter.StdOpAdapter`:
+
+We have supplied the implementation of `adapter.StdOpAdapter` below to give a rough idea of what goes into building your own adapter.
+
+```go
+// StdOpAdapter is an output adapter that just prints the output onto the
+// screen.
+//
+// Sample Output Format is:
+// LinkNum - Depth - Url
+type StdOpAdapter struct{}
+
+func (s *StdOpAdapter) Consume() *oct.NodeChSet {
+    listenCh := make(chan *oct.Node)
+    quitCh := make(chan int, 1)
+    listenChSet := &oct.NodeChSet{
+        NodeCh: listenCh,
+        StdChannels: &oct.StdChannels{
+            QuitCh: quitCh,
+        },
+    }
+    go func() {
+        i := 1
+        for {
+            select {
+            case output := <-listenCh:
+                fmt.Printf("%d - %d - %s\n", i, output.Depth, output.UrlString)
+                i++
+            case <-quitCh:
+                return
+            }
+        }
+    }()
+    return listenChSet
+}
+```
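
The end-to-end sketch promised above (editorial, not part of this diff): a custom `OutputAdapter` built on `MakeDefaultNodeChSet()`, following the pattern the Note prescribes — the listener runs on its own goroutine so `Consume()` returns immediately. The import path and the `countingAdapter` type are hypothetical; `oct` mirrors the alias the repo's adapter package uses.

```go
package main

import (
	"fmt"

	oct "github.com/rapidclock/octopus/octopus" // assumed import path
)

// countingAdapter is a hypothetical OutputAdapter that numbers each link.
type countingAdapter struct {
	total int64
}

func (c *countingAdapter) Consume() *oct.NodeChSet {
	// MakeDefaultNodeChSet returns the ChSet plus direct access to both channels.
	chSet, nodeCh, quitCh := oct.MakeDefaultNodeChSet()
	go func() { // listen on a separate goroutine so the crawler never blocks
		for {
			select {
			case node := <-nodeCh:
				c.total++
				fmt.Printf("%d - %d - %s\n", c.total, node.Depth, node.UrlString)
			case <-quitCh:
				return
			}
		}
	}()
	return chSet
}
```

The adapter would then be attached through the `OpAdapter` field of `CrawlOptions` before `SetupSystem()` is called.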

adapter/basicadapters.go

Lines changed: 5 additions & 1 deletion
@@ -73,7 +73,11 @@ func (fw *FileWriterAdapter) writeToFile(listenCh chan *oct.Node,
     for {
         select {
         case output := <-listenCh:
-            fmt.Fprintf(fp, "%d - %s\n", output.Depth, output.UrlString)
+            _, err = fmt.Fprintf(fp, "%d - %s\n", output.Depth,
+                output.UrlString)
+            if err != nil {
+                log.Println("File Error - ", err)
+            }
         case <-quitCh:
             return
         }

octopus/core.go

Lines changed: 26 additions & 36 deletions
@@ -2,37 +2,9 @@ package octopus
 
 import (
     "fmt"
-    "log"
     "time"
 )
 
-func (o *octopus) setupOctopus() {
-    o.setupValidProtocolMap()
-    o.setupTimeToQuit()
-    o.setupMaxLinksCrawled()
-}
-
-func (o *octopus) setupValidProtocolMap() {
-    o.isValidProtocol = make(map[string]bool)
-    for _, protocol := range o.ValidProtocols {
-        o.isValidProtocol[protocol] = true
-    }
-}
-
-func (o *octopus) setupTimeToQuit() {
-    if o.TimeToQuit > 0 {
-        o.timeToQuit = time.Duration(o.TimeToQuit) * time.Second
-    } else {
-        log.Fatalln("TimeToQuit is not greater than 0")
-    }
-}
-
-func (o *octopus) setupMaxLinksCrawled() {
-    if o.MaxCrawledUrls == 0 {
-        panic("MaxCrawledUrls should either be negative or greater than 0.")
-    }
-}
-
 func (o *octopus) SetupSystem() {
     o.isReady = false
     o.setupOctopus()

@@ -57,16 +29,12 @@ func (o *octopus) SetupSystem() {
     depthLimitChSet := o.makeCrawlDepthFilterPipe(pageParseChSet)
     maxDelayChSet := o.makeMaxDelayPipe(depthLimitChSet)
 
-    var distributorChSet *NodeChSet
-    if o.MaxCrawledUrls < 0 {
-        distributorChSet = o.makeDistributorPipe(maxDelayChSet, outAdapterChSet)
-    } else {
-        maxLinksCrawledChSet := o.makeLimitCrawlPipe(outAdapterChSet)
-        distributorChSet = o.makeDistributorPipe(maxDelayChSet, maxLinksCrawledChSet)
-    }
+    distributorChSet := o.handleDistributorPipeline(maxDelayChSet, outAdapterChSet)
 
     pageReqChSet := o.makePageRequisitionPipe(distributorChSet)
-    invUrlFilterChSet := o.makeInvalidUrlFilterPipe(pageReqChSet)
+
+    invUrlFilterChSet := o.handleRateLimitingPipeline(pageReqChSet)
+
     dupFilterChSet := o.makeDuplicateUrlFilterPipe(invUrlFilterChSet)
     protoFilterChSet := o.makeUrlProtocolFilterPipe(dupFilterChSet)
     linkAbsChSet := o.makeLinkAbsolutionPipe(protoFilterChSet)

@@ -77,6 +45,28 @@ func (o *octopus) SetupSystem() {
     o.isReady = true
 }
 
+func (o *octopus) handleDistributorPipeline(maxDelayChSet, outAdapterChSet *NodeChSet) *NodeChSet {
+    var distributorChSet *NodeChSet
+    if o.MaxCrawledUrls < 0 {
+        distributorChSet = o.makeDistributorPipe(maxDelayChSet, outAdapterChSet)
+    } else {
+        maxLinksCrawledChSet := o.makeCrawlLinkCountLimitPipe(outAdapterChSet)
+        distributorChSet = o.makeDistributorPipe(maxDelayChSet, maxLinksCrawledChSet)
+    }
+    return distributorChSet
+}
+
+func (o *octopus) handleRateLimitingPipeline(pageReqChSet *NodeChSet) *NodeChSet {
+    var invUrlFilterChSet *NodeChSet
+    if o.rateLimiter != nil {
+        rateLimitingChSet := o.makeRateLimitingPipe(pageReqChSet)
+        invUrlFilterChSet = o.makeInvalidUrlFilterPipe(rateLimitingChSet)
+    } else {
+        invUrlFilterChSet = o.makeInvalidUrlFilterPipe(pageReqChSet)
+    }
+    return invUrlFilterChSet
+}
+
 func (o *octopus) BeginCrawling(baseUrlStr string) {
     if !o.isReady {
         panic("Call BuildSystem first to setup Octopus")

octopus/doc.go

Lines changed: 4 additions & 3 deletions
@@ -24,14 +24,15 @@ The overview of the Pipeline is given below:
     3. Protocol Filter
     4. Duplicate Filter
     5. Invalid Url Filter (Urls whose HEAD request Fails)
-    6. Make GET Request
+    (5x) (Optional) Crawl Rate Limiter.
+    [6]. Make GET Request
     7a. Send to Output Adapter
     7b. Check for Timeout (gap between two outputs on this channel).
     8. Max Links Crawled Limit Filter
     9. Depth Limit Filter
     10. Parse Page for more URLs.
 
 Note: The output from 7b. is fed to 8.
-    1 -> 2 -> 3 -> 4 -> 5 -> 6 -> 7b -> 8 -> 9 -> 10 -> 1
-*/
+    1 -> 2 -> 3 -> 4 -> 5 -> (5x) -> [6] -> 7b -> 8 -> 9 -> 10 -> 1
+*/
 package octopus
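
Each numbered stage above is a channel-connected pipe. A hedged editorial sketch of the general shape of one linear stage (illustrative; the repo's actual helper is `stdLinearNodeFunc`, whose body is not shown in this diff):

```go
// linearStage returns the NodeChSet that the upstream stage writes into.
// Each incoming node is handed to fn, which decides whether (and in what
// form) to forward it to outChSet.
func linearStage(fn func(*Node, *NodeChSet), outChSet *NodeChSet) *NodeChSet {
	inCh := make(chan *Node)
	quitCh := make(chan int, 1)
	go func() {
		for {
			select {
			case node := <-inCh:
				fn(node, outChSet)
			case <-quitCh:
				return
			}
		}
	}()
	return MakeNodeChSet(inCh, quitCh)
}
```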

octopus/modelfactory.go

Lines changed: 24 additions & 14 deletions
@@ -3,11 +3,13 @@ package octopus
 import "sync"
 
 const (
-    defaultMaxDepth   int64 = 2
-    anchorTag               = "a"
-    anchorAttrb             = "href"
-    defaultTimeToQuit       = 5
-    defaultCrawlLimit int64 = -1
+    defaultMaxDepth       int64  = 2
+    anchorTag                    = "a"
+    anchorAttrb                  = "href"
+    defaultTimeToQuit            = 30
+    defaultLinkCrawlLimit int64  = -1
+    defaultCrawlRateLimit int64  = -1
+    defaultRequestTimeout uint64 = 15
 )
 
 // NewWithDefaultOptions - Create an Instance of the Octopus with the default CrawlOptions.

@@ -44,15 +46,16 @@ func createNode(parentUrlStr, urlStr string, depth int64) *Node {
 // Returns an instance of CrawlOptions with the values set to sensible defaults.
 func GetDefaultCrawlOptions() *CrawlOptions {
     return &CrawlOptions{
-        MaxCrawlDepth:      defaultMaxDepth,
-        MaxCrawledUrls:     defaultCrawlLimit,
-        StayWithinBaseHost: false,
-        CrawlRate:          -1,
-        RespectRobots:      false,
-        IncludeBody:        true,
-        OpAdapter:          nil,
-        ValidProtocols:     []string{"http", "https"},
-        TimeToQuit:         defaultTimeToQuit,
+        MaxCrawlDepth:         defaultMaxDepth,
+        MaxCrawledUrls:        defaultLinkCrawlLimit,
+        StayWithinBaseHost:    false,
+        CrawlRatePerSec:       defaultCrawlRateLimit,
+        CrawlBurstLimitPerSec: defaultCrawlRateLimit,
+        RespectRobots:         false,
+        IncludeBody:           true,
+        OpAdapter:             nil,
+        ValidProtocols:        []string{"http", "https"},
+        TimeToQuit:            defaultTimeToQuit,
     }
 }

@@ -65,3 +68,10 @@ func MakeNodeChSet(nodeCh chan<- *Node, quitCh chan<- int) *NodeChSet {
         },
     }
 }
+
+// Utility to create a NodeChSet and get full access to the Quit & Node Channel.
+func MakeDefaultNodeChSet() (*NodeChSet, chan *Node, chan int) {
+    nodeCh := make(chan *Node)
+    quitCh := make(chan int)
+    return MakeNodeChSet(nodeCh, quitCh), nodeCh, quitCh
+}

octopus/models.go

Lines changed: 19 additions & 11 deletions
@@ -4,6 +4,8 @@ import (
     "io"
     "sync"
     "time"
+
+    "golang.org/x/time/rate"
 )
 
 // octopus is a concurrent web crawler.

@@ -20,6 +22,8 @@ type octopus struct {
     inputUrlStrChan   chan string
     masterQuitCh      chan int
     crawledUrlCounter int64
+    rateLimiter       *rate.Limiter
+    requestTimeout    uint64
 }
 
 // CrawlOptions is used to house options for crawling.

@@ -37,8 +41,11 @@ type octopus struct {
 // StayWithinBaseHost - (unimplemented) Ensures crawler stays within the
 // level 1 link's hostname.
 //
-// CrawlRate (unimplemented) is the rate at which requests will be made.
-// In seconds
+// CrawlRatePerSec - is the rate at which requests will be made (per second).
+// If this is negative, rate limiting is disabled. Default is negative.
+//
+// CrawlBurstLimitPerSec - Represents the max burst capacity with which requests
+// can be made. This must be greater than or equal to the CrawlRatePerSec.
 //
 // RespectRobots (unimplemented) choose whether to respect robots.txt or not.
 //

@@ -54,15 +61,16 @@ type octopus struct {
 // TimeToQuit - represents the total time to wait between two new nodes to be
 // generated before the crawler quits. This is in seconds.
 type CrawlOptions struct {
-    MaxCrawlDepth      int64
-    MaxCrawledUrls     int64
-    StayWithinBaseHost bool
-    CrawlRate          int64
-    RespectRobots      bool
-    IncludeBody        bool
-    OpAdapter          OutputAdapter
-    ValidProtocols     []string
-    TimeToQuit         int64
+    MaxCrawlDepth         int64
+    MaxCrawledUrls        int64
+    StayWithinBaseHost    bool
+    CrawlRatePerSec       int64
+    CrawlBurstLimitPerSec int64
+    RespectRobots         bool
+    IncludeBody           bool
+    OpAdapter             OutputAdapter
+    ValidProtocols        []string
+    TimeToQuit            int64
 }
 
 // NodeInfo is used to represent each crawled link and its associated crawl depth.
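
The setup code that populates the new `rateLimiter` field lives in a file not shown in this excerpt. A hedged editorial sketch of how the renamed options could map onto `golang.org/x/time/rate` (illustrative, not the repo's actual setup):

```go
// Illustrative only: a negative CrawlRatePerSec leaves o.rateLimiter nil,
// which handleRateLimitingPipeline in core.go treats as "no rate limiting".
if o.CrawlRatePerSec > 0 {
	o.rateLimiter = rate.NewLimiter(
		rate.Limit(o.CrawlRatePerSec), // sustained requests per second
		int(o.CrawlBurstLimitPerSec),  // burst capacity (>= the rate)
	)
}
```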

octopus/pipe_augment_linkabsolution.go

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ import (
 )
 
 func (o *octopus) makeLinkAbsolutionPipe(outChSet *NodeChSet) *NodeChSet {
-    return stdLinearNodeFunc(makeLinkAbsolute, outChSet)
+    return stdLinearNodeFunc(makeLinkAbsolute, outChSet, "Link Absolution")
 }
 
 func makeLinkAbsolute(node *Node, outChSet *NodeChSet) {

octopus/pipe_ctrl_limitcrawl.go

Lines changed: 2 additions & 2 deletions
@@ -4,8 +4,8 @@ import (
     "sync/atomic"
 )
 
-func (o *octopus) makeLimitCrawlPipe(inChSet *NodeChSet) *NodeChSet {
-    return stdLinearNodeFunc(o.checkWithinLimit, inChSet)
+func (o *octopus) makeCrawlLinkCountLimitPipe(inChSet *NodeChSet) *NodeChSet {
+    return stdLinearNodeFunc(o.checkWithinLimit, inChSet, "Crawl Link Limit")
 }
 
 func (o *octopus) checkWithinLimit(node *Node, outChSet *NodeChSet) {
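
`checkWithinLimit` itself is unchanged by this diff, so its body is not shown. A hedged editorial sketch of what an atomic link-count guard can look like, using the `crawledUrlCounter` field from models.go (illustrative, not the repo's exact logic):

```go
// Illustrative only: atomically count nodes that pass and drop any beyond
// the configured MaxCrawledUrls.
func (o *octopus) withinLimit(node *Node, outChSet *NodeChSet) {
	if atomic.AddInt64(&o.crawledUrlCounter, 1) <= o.MaxCrawledUrls {
		outChSet.NodeCh <- node
	}
}
```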
