Skip to content

Commit 4c2386f

Browse files
committed
Replace crawlSite with an iterator-based crawl
1 parent 30937ab commit 4c2386f

File tree

1 file changed

+8
-149
lines changed

1 file changed

+8
-149
lines changed

src/Crawler.php

Lines changed: 8 additions & 149 deletions
Original file line numberDiff line numberDiff line change
@@ -104,8 +104,15 @@ public function __construct() {
104104

105105
/**
 * Entry point for the built-in 'wp2static' crawler.
 *
 * Wires up the iterator-based crawl pipeline introduced by this commit:
 * paths are streamed from the CrawlQueue, crawled, filtered of 404s,
 * written out, and (when the crawl cache is enabled) recorded in the
 * cache. Each stage wraps the previous iterator, so the pipeline is lazy
 * and nothing executes until the drain loop at the end consumes it.
 *
 * @param string $crawler_slug slug of the crawler being invoked; anything
 *                             other than 'wp2static' is ignored here.
 */
public static function wp2staticCrawl( string $crawler_slug ) : void {
    if ( 'wp2static' === $crawler_slug ) {
        $paths = CrawlQueue::getPathsIter();
        $crawler = new Crawler();
        $crawled = $crawler->crawlIter( $paths );
        $crawled = CrawlCache::remove404s( $crawled );
        $crawled = CrawlCache::writeFilesIter( $crawled );
        if ( $crawler->use_crawl_cache ) {
            $crawled = CrawlCache::addPathsIter( $crawled );
        }
        // Drain the lazy pipeline so every stage actually runs. The
        // original wrote `foreach ( $crawled as $crawled )`, reusing the
        // iterator variable as the loop variable — that only works because
        // foreach evaluates its expression once, and it shadows the
        // iterator mid-loop. Use a distinct, clearly-unused name instead.
        foreach ( $crawled as $unused_item ) {
            // Intentionally empty: iteration itself performs the crawl.
        }
        $crawler->crawlComplete();
    }
}
@@ -123,154 +130,6 @@ public function crawlComplete() : void {
123130
do_action( 'wp2static_crawling_complete', $args );
124131
}
125132

126-
/**
 * Crawls URLs in WordPressSite, saving them to StaticSite.
 *
 * Builds an absolute URL for every crawlable path in the CrawlQueue, then
 * fetches them concurrently through a Guzzle Pool. Fulfilled responses are
 * written to the StaticSite unless the crawl cache already holds a page
 * with an identical hash; 404s are purged from the cache, the queue and
 * any previously generated static/processed files. Blocks until the whole
 * pool has completed.
 */
public function crawlSite() : void {
    // Derive the site host (with optional port) so redirect targets can
    // later be rewritten to root-relative form for both schemes.
    $site_host = parse_url( $this->site_path, PHP_URL_HOST );
    $site_port = parse_url( $this->site_path, PHP_URL_PORT );
    $site_host = $site_port ? $site_host . ":$site_port" : $site_host;
    $site_urls = [ "http://$site_host", "https://$site_host" ];

    // TODO: use some Iterable or other performance optimisation here
    // to help reduce resources for large URL sites

    /**
     * When you call a method that executes a database query inside a
     * loop, you query the database on every iteration. To avoid that,
     * assign the result to a variable once, before the loop.
     */

    $crawlable_paths = CrawlQueue::getCrawlablePaths();
    $urls = [];

    // Pair each absolute URL with its root-relative path so the pool
    // callbacks can map a response index back to the originating path.
    foreach ( $crawlable_paths as $root_relative_path ) {
        $absolute_uri = new URL( $this->site_path . $root_relative_path );
        $urls[] = [
            'url' => $absolute_uri->get(),
            'path' => $root_relative_path,
        ];
    }

    // Generator of GET requests, consumed lazily by the Pool.
    $requests = function ( $urls ) {
        foreach ( $urls as $url ) {
            yield new Request( 'GET', $url['url'] );
        }
    };

    $concurrency = intval( CoreOptions::getValue( 'crawlConcurrency' ) );
    $last_log_time = microtime( true );

    $pool = new Pool(
        $this->client,
        $requests( $urls ),
        [
            'concurrency' => $concurrency,
            // NOTE(review): $last_log_time is captured by value (no `&`),
            // so the `$last_log_time = microtime( true );` assignment at
            // the bottom of this closure never persists between responses.
            // After the first 60 seconds the progress log likely fires on
            // every single response — confirm before relying on the
            // once-per-minute throttle.
            'fulfilled' => function ( Response $response, $index ) use (
                $last_log_time, &$urls, $site_urls
            ) {
                $root_relative_path = $urls[ $index ]['path'];
                $crawled_contents = (string) $response->getBody();
                $status_code = $response->getStatusCode();

                $is_cacheable = true;
                if ( $status_code === 404 ) {
                    WsLog::l( '404 for URL ' . $root_relative_path );
                    CrawlCache::rmUrl( $root_relative_path );
                    // Delete crawl queue to prevent crawling not found urls forever.
                    CrawlQueue::rmUrl( $root_relative_path );
                    // Delete previously generated files under the directories,
                    // both the crawled and the processed.
                    array_map(
                        function( $dir ) use ( $root_relative_path ) {
                            $transformed_path = StaticSite::transformPath( $root_relative_path );
                            $suffix = ltrim( $transformed_path, '/' );
                            $full_path = trailingslashit( $dir ) . $suffix;
                            if ( file_exists( $full_path ) && ! is_dir( $full_path ) ) {
                                unlink( $full_path );
                            }
                        },
                        [ StaticSite::getPath(), ProcessedSite::getPath() ]
                    );
                    $crawled_contents = null;
                    $is_cacheable = false;
                } elseif ( in_array( $status_code, WP2STATIC_REDIRECT_CODES ) ) {
                    // Redirect bodies are not written to the static site.
                    $crawled_contents = null;
                }

                $redirect_to = null;

                if ( in_array( $status_code, WP2STATIC_REDIRECT_CODES ) ) {
                    $effective_url = $urls[ $index ]['url'];

                    // returns as string
                    $redirect_history =
                        $response->getHeaderLine( 'X-Guzzle-Redirect-History' );

                    if ( $redirect_history ) {
                        // The final entry is the ultimate redirect target.
                        $redirects = explode( ', ', $redirect_history );
                        $effective_url = end( $redirects );
                    }

                    // Strip scheme+host (either http or https origin) so
                    // the stored redirect target is root-relative.
                    $redirect_to =
                        (string) str_replace( $site_urls, '', $effective_url );
                    $page_hash = md5( $status_code . $redirect_to );
                } elseif ( ! is_null( $crawled_contents ) ) {
                    $page_hash = md5( $crawled_contents );
                } else {
                    $page_hash = md5( (string) $status_code );
                }

                $write_contents = true;

                if ( $this->use_crawl_cache ) {
                    // Already cached with an identical page hash: count a
                    // cache hit and skip rewriting the file.
                    if ( CrawlCache::getUrl( $root_relative_path, $page_hash ) ) {
                        $this->cache_hits++;
                        $write_contents = false;
                    }
                }

                $this->crawled++;

                if ( $crawled_contents && $write_contents ) {
                    $static_path = StaticSite::transformPath( $root_relative_path );
                    StaticSite::add( $static_path, $crawled_contents );
                }

                if ( $is_cacheable ) {
                    CrawlCache::addUrl(
                        $root_relative_path,
                        $page_hash,
                        $status_code,
                        $redirect_to
                    );
                }

                $now = microtime( true );

                // Periodic progress logging (see NOTE(review) above about
                // the by-value capture of $last_log_time).
                if ( $now - $last_log_time >= 60 ) {
                    WsLog::l( 'Crawled ' . $root_relative_path );
                    $notice = "Crawling progress: $this->crawled crawled," .
                        " $this->cache_hits skipped (cached).";
                    WsLog::l( $notice );
                    $last_log_time = microtime( true );
                }
            },
            'rejected' => function ( RequestException $reason, $index ) use ( $urls ) {
                // Transport-level failure (connection error, etc.); log
                // and move on — the path stays in the queue.
                $root_relative_path = $urls[ $index ]['path'];
                WsLog::l( 'Failed ' . $root_relative_path );
            },
        ]
    );

    // Initiate the transfers and create a promise
    $promise = $pool->promise();

    // Force the pool of requests to complete.
    $promise->wait();
}
273-
274133
public function crawlPath(array $detected, array $site_urls) : PromiseInterface {
275134
$filename = $detected['filename'] ?? null;
276135
$path = $detected['path'];

0 commit comments

Comments
 (0)