@@ -104,8 +104,15 @@ public function __construct() {

     public static function wp2staticCrawl( string $crawler_slug ) : void {
         if ( 'wp2static' === $crawler_slug ) {
+            $paths = CrawlQueue::getPathsIter();
             $crawler = new Crawler();
-            $crawler->crawlSite();
+            $crawled = $crawler->crawlIter( $paths );
+            $crawled = CrawlCache::remove404s( $crawled );
+            $crawled = CrawlCache::writeFilesIter( $crawled );
+            if ( $crawler->use_crawl_cache ) {
+                $crawled = CrawlCache::addPathsIter( $crawled );
+            }
+            foreach ( $crawled as $_ ) {} // drain the chain so each lazy stage runs
             $crawler->crawlComplete();
         }
     }
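
The replacement wires the crawl as a chain of lazy iterators: CrawlQueue::getPathsIter() yields paths, Crawler::crawlIter() fetches them, and the CrawlCache stages drop 404s, write files, and record cache entries, so the full URL list is never materialised in memory at once. A minimal standalone sketch of the pattern (the simplified stage bodies below are assumptions; only the chained-generator idea and the stage names come from this diff):

<?php
// Each stage is a generator, so one page flows through the whole
// chain before the next page is fetched.
function getPathsIter() : \Generator {
    // e.g. stream root-relative paths from the crawl queue
    yield '/';
    yield '/about/';
}

function crawlIter( iterable $paths ) : \Generator {
    foreach ( $paths as $path ) {
        // fetch $path here; yield a record for the next stage
        yield [ 'path' => $path, 'status' => 200 ];
    }
}

function remove404s( iterable $crawled ) : \Generator {
    foreach ( $crawled as $result ) {
        if ( $result['status'] !== 404 ) {
            yield $result;
        }
    }
}

$crawled = remove404s( crawlIter( getPathsIter() ) );

// Generators are pull-driven: nothing executes until the chain is
// drained, which is what the empty foreach in the diff does.
foreach ( $crawled as $_ ) {
}
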
@@ -123,154 +130,6 @@ public function crawlComplete() : void {
         do_action( 'wp2static_crawling_complete', $args );
     }

-    /**
-     * Crawls URLs in WordPressSite, saving them to StaticSite
-     */
-    public function crawlSite() : void {
-        $site_host = parse_url( $this->site_path, PHP_URL_HOST );
-        $site_port = parse_url( $this->site_path, PHP_URL_PORT );
-        $site_host = $site_port ? $site_host . ":$site_port" : $site_host;
-        $site_urls = [ "http://$site_host", "https://$site_host" ];
-
-        // TODO: use some Iterable or other performance optimisation here
-        // to help reduce resources for large URL sites
-
-        /**
-         * Calling a method that runs a database query inside a loop
-         * queries the database on every iteration. To avoid that,
-         * assign the result to a variable before the loop.
-         */
-
-        $crawlable_paths = CrawlQueue::getCrawlablePaths();
-        $urls = [];
-
-        foreach ( $crawlable_paths as $root_relative_path ) {
-            $absolute_uri = new URL( $this->site_path . $root_relative_path );
-            $urls[] = [
-                'url' => $absolute_uri->get(),
-                'path' => $root_relative_path,
-            ];
-        }
-
-        $requests = function ( $urls ) {
-            foreach ( $urls as $url ) {
-                yield new Request( 'GET', $url['url'] );
-            }
-        };
-
-        $concurrency = intval( CoreOptions::getValue( 'crawlConcurrency' ) );
-        $last_log_time = microtime( true );
-
-        $pool = new Pool(
-            $this->client,
-            $requests( $urls ),
-            [
-                'concurrency' => $concurrency,
-                'fulfilled' => function ( Response $response, $index ) use (
-                    $last_log_time, &$urls, $site_urls
-                ) {
-                    $root_relative_path = $urls[ $index ]['path'];
-                    $crawled_contents = (string) $response->getBody();
-                    $status_code = $response->getStatusCode();
-
-                    $is_cacheable = true;
-                    if ( $status_code === 404 ) {
-                        WsLog::l( '404 for URL ' . $root_relative_path );
-                        CrawlCache::rmUrl( $root_relative_path );
-                        // Remove from the crawl queue so missing URLs aren't recrawled forever.
-                        CrawlQueue::rmUrl( $root_relative_path );
-                        // Delete previously generated files from both the
-                        // crawled and the processed directories.
-                        array_map(
-                            function ( $dir ) use ( $root_relative_path ) {
-                                $transformed_path = StaticSite::transformPath( $root_relative_path );
-                                $suffix = ltrim( $transformed_path, '/' );
-                                $full_path = trailingslashit( $dir ) . $suffix;
-                                if ( file_exists( $full_path ) && ! is_dir( $full_path ) ) {
-                                    unlink( $full_path );
-                                }
-                            },
-                            [ StaticSite::getPath(), ProcessedSite::getPath() ]
-                        );
-                        $crawled_contents = null;
-                        $is_cacheable = false;
-                    } elseif ( in_array( $status_code, WP2STATIC_REDIRECT_CODES ) ) {
-                        $crawled_contents = null;
-                    }
-
-                    $redirect_to = null;
-
-                    if ( in_array( $status_code, WP2STATIC_REDIRECT_CODES ) ) {
-                        $effective_url = $urls[ $index ]['url'];
-
-                        // returned as a comma-separated string
-                        $redirect_history =
-                            $response->getHeaderLine( 'X-Guzzle-Redirect-History' );
-
-                        if ( $redirect_history ) {
-                            $redirects = explode( ', ', $redirect_history );
-                            $effective_url = end( $redirects );
-                        }
-
-                        $redirect_to =
-                            (string) str_replace( $site_urls, '', $effective_url );
-                        $page_hash = md5( $status_code . $redirect_to );
-                    } elseif ( ! is_null( $crawled_contents ) ) {
-                        $page_hash = md5( $crawled_contents );
-                    } else {
-                        $page_hash = md5( (string) $status_code );
-                    }
-
-                    $write_contents = true;
-
-                    if ( $this->use_crawl_cache ) {
-                        // skip rewriting when this page hash is already cached
-                        if ( CrawlCache::getUrl( $root_relative_path, $page_hash ) ) {
-                            $this->cache_hits++;
-                            $write_contents = false;
-                        }
-                    }
-
-                    $this->crawled++;
-
-                    if ( $crawled_contents && $write_contents ) {
-                        $static_path = StaticSite::transformPath( $root_relative_path );
-                        StaticSite::add( $static_path, $crawled_contents );
-                    }
-
-                    if ( $is_cacheable ) {
-                        CrawlCache::addUrl(
-                            $root_relative_path,
-                            $page_hash,
-                            $status_code,
-                            $redirect_to
-                        );
-                    }
-
-                    $now = microtime( true );
-
-                    if ( $now - $last_log_time >= 60 ) {
-                        WsLog::l( 'Crawled ' . $root_relative_path );
-                        $notice = "Crawling progress: $this->crawled crawled, " .
-                            "$this->cache_hits skipped (cached).";
-                        WsLog::l( $notice );
-                        $last_log_time = microtime( true );
-                    }
-                },
-                'rejected' => function ( RequestException $reason, $index ) use ( $urls ) {
-                    $root_relative_path = $urls[ $index ]['path'];
-                    WsLog::l( 'Failed ' . $root_relative_path );
-                },
-            ]
-        );
-
-        // Initiate the transfers and create a promise
-        $promise = $pool->promise();
-
-        // Force the pool of requests to complete.
-        $promise->wait();
-    }
-
     public function crawlPath( array $detected, array $site_urls ) : PromiseInterface {
         $filename = $detected['filename'] ?? null;
         $path = $detected['path'];
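
For contrast, the removed crawlSite() above drove the whole crawl through Guzzle's request Pool, handling every response inside one large fulfilled callback. A minimal sketch of that Pool pattern, with a placeholder URL and no-op handlers standing in for the plugin's real callbacks:

<?php
use GuzzleHttp\Client;
use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\Pool;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;

$client = new Client();

// A generator of requests keeps memory flat even for many URLs.
$requests = function ( array $urls ) {
    foreach ( $urls as $url ) {
        yield new Request( 'GET', $url );
    }
};

$pool = new Pool(
    $client,
    $requests( [ 'https://example.com/' ] ),
    [
        'concurrency' => 2,
        'fulfilled' => function ( Response $response, $index ) {
            // runs once per successful response
        },
        'rejected' => function ( RequestException $reason, $index ) {
            // runs once per failed request
        },
    ]
);

// Start the transfers and block until they all complete.
$pool->promise()->wait();

The iterator pipeline in the first hunk replaces this single callback with separate stages, which keeps 404 handling, file writes, and cache updates independently composable.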