@@ -26,31 +26,37 @@ type octopus struct {
2626// You can specify depth of exploration for each link,
2727// if crawler should ignore other host names (except from base host).
2828//
29- // MaxCrawlDepth - Indicates the maximum depth that will be crawled,
30- // for each new link.
29+ // MaxCrawlDepth - Indicates the maximum depth that will be crawled,
30+ // for each new link.
3131//
32- // MaxCrawlLinks - Specifies the Maximum Number of Unique Links that will be crawled.
33- // Note : When combined with DepthPerLink, it will combine both.
34- // Use -1 to indicate infinite links to be crawled (only bounded by depth of traversal).
32+ // MaxCrawlLinks - Specifies the Maximum Number of Unique Links that will be crawled.
33+ // Note: When combined with MaxCrawlDepth, both limits apply.
34+ // Use -1 to indicate infinite links to be crawled (only bounded by depth of traversal).
3535//
36- // IncludeBody - Include the response Body in the crawled NodeInfo (for further processing).
37- // OpAdapter is a user specified concrete implementation of an Output Adapter. The crawler
38- // will pump output onto the implementation's channel returned by its Consume method.
36+ // StayWithinBaseHost - (unimplemented) Ensures crawler stays within the
37+ // level 1 link's hostname.
3938//
40- // CrawlRate (unimplemented) is the rate at which requests will be made.
39+ // CrawlRate (unimplemented) is the rate at which requests will be made.
40+ // In seconds
4141//
42- // RespectRobots (unimplemented) choose whether to respect robots.txt or not.
42+ // RespectRobots (unimplemented) choose whether to respect robots.txt or not.
4343//
44- // ValidProtocols - This is an array containing the list of url protocols that
45- // should be crawled .
44+ // IncludeBody - (unimplemented) Include the response Body in the crawled
45+ // NodeInfo (for further processing).
4646//
47- // TimeToQuit - represents the total time to wait between two new nodes to be
48- // generated before the crawler quits. This is in seconds.
47+ // OpAdapter is a user specified concrete implementation of an Output Adapter. The crawler
48+ // will pump output onto the implementation's channel returned by its Consume method.
49+ //
50+ // ValidProtocols - This is an array containing the list of url protocols that
51+ // should be crawled.
52+ //
53+ // TimeToQuit - represents the total time to wait between two new nodes to be
54+ // generated before the crawler quits. This is in seconds.
4955type CrawlOptions struct {
5056 MaxCrawlDepth int64
5157 MaxCrawlLinks int64
5258 StayWithinBaseHost bool
53- CrawlRatePerSec int64
59+ CrawlRate int64
5460 RespectRobots bool
5561 IncludeBody bool
5662 OpAdapter OutputAdapter
@@ -71,12 +77,17 @@ type Node struct {
7177 Body io.ReadCloser
7278}
7379
80+ // StdChannels holds the standard set of channels that are used for
81+ // special operations. It will include channels for logging, statistics,
82+ // etc. in the future.
7483type StdChannels struct {
7584 QuitCh chan <- int
7685 // logCh chan<- string
7786 // errorCh chan<- string
7887}
7988
89+ // NodeChSet is the standard set of channels used to build the concurrency
90+ // pipelines in the crawler.
8091type NodeChSet struct {
8192 NodeCh chan <- * Node
8293 * StdChannels
@@ -88,13 +99,19 @@ type ingestPipeChSet struct {
8899 QuitCh chan int
89100}
90101
91- // OutputAdapter is the interface for the Adapter that is used to handle
92- // output from the Octopus Crawler.
93- // The contract stipulates that the crawler provides the channel
94- // to listen for a quit command.
95- // The crawler pumps its output onto the returned channel of the Consume method.
96- // Implementers of the interface should listen on this channel for output from
97- // the crawler.
102+ // OutputAdapter is the interface that has to be implemented in order to
103+ // handle outputs from the octopus crawler.
104+ //
105+ // The octopus will call the OutputAdapter.Consume() method and deliver
106+ // all relevant output and quit signals on the channels included in the
107+ // received NodeChSet.
108+ //
109+ // This implies that it is the responsibility of the user who implements
110+ // OutputAdapter to handle processing the output of the crawler that is
111+ // delivered on the NodeChSet.NodeCh.
112+ //
113+ // Implementers of the interface should listen on the channels included in
114+ // the NodeChSet returned by Consume() for output from the crawler.
98115type OutputAdapter interface {
99116 Consume () * NodeChSet
100117}
0 commit comments