@@ -5,49 +5,74 @@ import (
55 "sync"
66)
77
-// Node is used to represent each crawled link and its associated depth of crawl.
-type Node struct {
-	ParentUrlString string
-	UrlString       string
-	Depth           int
-}
-
 // octopus is a concurrent web crawler.
 // It has an inbuilt parser based on html.NewTokenizer to collect all links in a web page.
 // It also has a CrawlOptions structure to initialize settings specific
 // to an instance of the crawler.
 type octopus struct {
 	*CrawlOptions
-	visited *sync.Map
-	isBuilt bool
+	visited      *sync.Map
+	isReady      bool
+	adapterChSet *NodeChSet
 }
 
 // CrawlOptions is used to house options for crawling.
 // You can specify the depth of exploration for each link,
 // and whether the crawler should ignore hostnames other than the base host.
-// MaxLinksCrawled - Specifies the Maximum Number of Unique Links that will be crawled.
+// MaxCrawlDepth - Indicates the maximum depth that will be crawled
+// for each new link.
+// MaxCrawlLinks - Specifies the maximum number of unique links that will be crawled.
 // Note: when combined with MaxCrawlDepth, both limits apply.
 // Use -1 to indicate that unlimited links may be crawled (bounded only by depth of traversal).
-// IncludeBody - Include the response Body in the crawled Node (for further processing).
 // IncludeBody - Include the response Body in the crawled NodeInfo (for further processing).
 // OpAdapter is a user-specified concrete implementation of an Output Adapter. The crawler
 // will pump output onto the implementation's channel returned by its Consume method.
 // CrawlRatePerSec is the rate at which requests will be made.
 // RespectRobots (unimplemented) chooses whether to respect robots.txt or not.
 type CrawlOptions struct {
-	MaxDepthCrawled    int64
-	MaxLinksCrawled    int64
+	MaxCrawlDepth      int64
+	MaxCrawlLinks      int64
 	StayWithinBaseHost bool
 	CrawlRatePerSec    int64
 	RespectRobots      bool
 	IncludeBody        bool
 	OpAdapter          *OutputAdapter
 }
 
-type CrawlOutput struct {
-	*Node
+// NodeInfo is used to represent each crawled link and its associated crawl depth.
+type NodeInfo struct {
+	ParentUrlString string
+	UrlString       string
+	Depth           int64
+}
+
+// Node encloses a NodeInfo and its Body (HTML) Content.
+type Node struct {
+	*NodeInfo
 	Body io.ReadCloser
 }
 
+type StdChannels struct {
+	QuitCh chan<- int
+	// logCh chan<- string
+	// errorCh chan<- string
+}
+
+type NodeChSet struct {
+	NodeCh chan<- *Node
+	*StdChannels
+}
+
+type StringChSet struct {
+	strCh chan<- string
+	*StdChannels
+}
+
+type NodeInfoChSet struct {
+	nodeInfoCh chan<- *NodeInfo
+	*StdChannels
+}
+
 // OutputAdapter is the interface for the Adapter that is used to handle
 // output from the Octopus Crawler.
 // The contract stipulates that the crawler provides the channel
@@ -56,5 +81,5 @@ type CrawlOutput struct {
 // Implementers of the interface should listen on this channel for output from
 // the crawler.
 type OutputAdapter interface {
-	Consume(quitCh <-chan bool) chan<- *CrawlOutput
+	Consume() *NodeChSet
 }
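
For context on the new Consume contract, the sketch below shows one minimal, hypothetical OutputAdapter that simply prints every crawled link. It is not part of this commit: the type name collectorAdapter, the buffer size, and the use of fmt (which would need to be added to the file's imports) are assumptions made only for illustration. The adapter keeps the receive ends of its channels and hands the crawler the send-only ends wrapped in a NodeChSet.

// collectorAdapter is a hypothetical OutputAdapter that prints each crawled link.
type collectorAdapter struct {
	nodeCh chan *Node
	quitCh chan int
}

// Consume creates the channels, starts a goroutine that drains them until a quit
// signal arrives, and returns their send-only ends wrapped in a NodeChSet.
func (c *collectorAdapter) Consume() *NodeChSet {
	c.nodeCh = make(chan *Node, 100) // buffer size is an arbitrary choice
	c.quitCh = make(chan int)
	go func() {
		for {
			select {
			case node := <-c.nodeCh:
				fmt.Println(node.ParentUrlString, "->", node.UrlString, "depth", node.Depth)
			case <-c.quitCh:
				return
			}
		}
	}()
	return &NodeChSet{
		NodeCh:      c.nodeCh,
		StdChannels: &StdChannels{QuitCh: c.quitCh},
	}
}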
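
Similarly, a hedged sketch of how the renamed CrawlOptions fields might be filled in. The field names come from the struct in this commit, but the values are arbitrary and exampleOptions is not a function defined in this repository.

// exampleOptions shows one plausible configuration using the fields above.
func exampleOptions() *CrawlOptions {
	var adapter OutputAdapter = &collectorAdapter{}
	return &CrawlOptions{
		MaxCrawlDepth:      3,     // stop following links more than 3 hops from the seed
		MaxCrawlLinks:      -1,    // -1: unlimited unique links, bounded by depth only
		StayWithinBaseHost: true,  // ignore links that leave the seed's hostname
		CrawlRatePerSec:    10,    // requests per second
		RespectRobots:      false, // documented as unimplemented
		IncludeBody:        false, // do not attach response bodies to emitted Nodes
		OpAdapter:          &adapter,
	}
}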