@@ -17,6 +17,7 @@ const WACZ_VERSION = "1.1.1";
1717const SPLIT_REQUEST_Q_RX = / ( .* ?) [ ? & ] (?: _ _ w b _ m e t h o d = | _ _ w b _ p o s t = ) [ ^ & ] + & ( .* ) / ;
1818
1919const LINES_PER_BLOCK = 1024 ;
20+ const RESOURCE_BATCH_SIZE = LINES_PER_BLOCK * 8 ;
2021
2122const DEFAULT_UUID_NAMESPACE = "f9ec3936-7f66-4461-bec4-34f4495ea242" ;
2223
@@ -91,8 +92,9 @@ class Downloader
9192 }
9293
9394 this . offset = 0 ;
94- this . resources = [ ] ;
95+ this . firstResources = [ ] ;
9596 this . textResources = [ ] ;
97+ this . cdxjLines = [ ] ;
9698
9799 // compressed index (idx) entries
98100 this . indexLines = [ ] ;
@@ -142,34 +144,33 @@ class Downloader
142144 return resp ;
143145 }
144146
145- async loadResources ( ) {
146- if ( this . pageList ) {
147- for await ( const resource of this . db . resourcesByPages ( this . pageList ) ) {
148- this . resources . push ( resource ) ;
149- }
150- } else {
151- this . resources = await this . db . db . getAll ( "resources" ) ;
152- }
147+ async loadResourcesBlock ( start = [ ] ) {
148+ return await this . db . db . getAll ( "resources" , IDBKeyRange . lowerBound ( start , true ) , RESOURCE_BATCH_SIZE ) ;
149+ }
153150
154- this . resources . sort ( ( a , b ) => {
155- if ( ! a . surt ) {
156- a . surt = getSurt ( a . url ) ;
157- }
151+ async * iterResources ( resources ) {
152+ let start = [ ] ;
153+ let count = 0 ;
158154
159- if ( ! b . surt ) {
160- b . surt = getSurt ( b . url ) ;
161- }
155+ while ( resources . length ) {
156+ const last = resources [ resources . length - 1 ] ;
162157
163- if ( a . surt == b . surt ) {
164- return 0 ;
158+ if ( this . pageList ) {
159+ resources = resources . filter ( ( res ) => this . pageList . includes ( res . pageId ) ) ;
165160 }
161+ count += resources . length ;
162+ yield * resources ;
166163
167- return a . surt < b . surt ? - 1 : 1 ;
168- } ) ;
164+ start = [ last . url , last . ts ] ;
165+ resources = await this . loadResourcesBlock ( start ) ;
166+ }
167+ if ( count !== this . numResources ) {
168+ console . warn ( `Iterated ${ count } , but expected ${ this . numResources } ` ) ;
169+ }
169170 }
170171
171172 async queueWARC ( controller , filename , sizeCallback ) {
172- await this . loadResources ( ) ;
173+ this . firstResources = await this . loadResourcesBlock ( ) ;
173174
174175 for await ( const chunk of this . generateWARC ( filename ) ) {
175176 controller . enqueue ( chunk ) ;
@@ -215,20 +216,20 @@ class Downloader
215216 async downloadWACZ ( filename , sizeCallback ) {
216217 filename = ( filename || "webarchive" ) . split ( "." ) [ 0 ] + ".wacz" ;
217218
218- await this . loadResources ( ) ;
219-
220219 this . fileHasher = await createSHA256 ( ) ;
221220 this . recordHasher = await createSHA256 ( ) ;
222221 this . hashType = "sha256" ;
223222
224223 const zip = [ ] ;
225224
225+ this . firstResources = await this . loadResourcesBlock ( ) ;
226+
226227 this . addFile ( zip , "pages/pages.jsonl" , this . generatePages ( ) , sizeCallback , true ) ;
227228 this . addFile ( zip , "archive/data.warc.gz" , this . generateWARC ( filename + "#/archive/data.warc.gz" , true ) , sizeCallback , false ) ;
228229 //this.addFile(zip, "archive/text.warc", this.generateTextWARC(filename + "#/archive/text.warc"), false);
229230
230231 // don't use compressed index if we'll have a single block, need to have at least enough for 2 blocks
231- if ( this . resources . length < ( 2 * LINES_PER_BLOCK ) ) {
232+ if ( this . firstResources . length < ( 2 * LINES_PER_BLOCK ) ) {
232233 this . addFile ( zip , "indexes/index.cdx" , this . generateCDX ( ) , sizeCallback , true ) ;
233234 } else {
234235 this . addFile ( zip , "indexes/index.cdx.gz" , this . generateCompressedCDX ( "index.cdx.gz" ) , sizeCallback , false ) ;
@@ -250,7 +251,7 @@ class Downloader
250251 return response ;
251252 }
252253
253- async * generateWARC ( filename , digestRecord = false ) {
254+ async * generateWARC ( filename , digestRecordAndCDX = false ) {
254255 try {
255256 let offset = 0 ;
256257
@@ -261,7 +262,7 @@ class Downloader
261262 offset += warcinfo . length ;
262263 }
263264
264- for ( const resource of this . resources ) {
265+ for await ( const resource of this . iterResources ( this . firstResources ) ) {
265266 resource . offset = offset ;
266267 const records = await this . createWARCRecord ( resource ) ;
267268 if ( ! records ) {
@@ -273,7 +274,7 @@ class Downloader
273274 yield records [ 0 ] ;
274275 offset += records [ 0 ] . length ;
275276 resource . length = records [ 0 ] . length ;
276- if ( digestRecord ) {
277+ if ( digestRecordAndCDX ) {
277278 resource . recordDigest = this . recordDigest ( records [ 0 ] ) ;
278279 }
279280
@@ -282,6 +283,10 @@ class Downloader
282283 yield records [ 1 ] ;
283284 offset += records [ 1 ] . length ;
284285 }
286+
287+ if ( digestRecordAndCDX ) {
288+ this . cdxjLines . push ( this . getCDXJ ( resource , "data.warc.gz" ) ) ;
289+ }
285290 }
286291 } catch ( e ) {
287292 console . warn ( e ) ;
@@ -311,62 +316,41 @@ class Downloader
311316 }
312317 }
313318
314- async * generateCDX ( raw = false ) {
315- const getCDX = ( resource , filename , raw ) => {
316-
317- const data = {
318- url : resource . url ,
319- digest : resource . digest ,
320- mime : resource . mime ,
321- offset : resource . offset ,
322- length : resource . length ,
323- recordDigest : resource . recordDigest ,
324- status : resource . status
325- } ;
326-
327- if ( filename ) {
328- data . filename = filename ;
329- }
330-
331- if ( resource . method && resource . method !== "GET" ) {
332- const m = resource . url . match ( SPLIT_REQUEST_Q_RX ) ;
333- if ( m ) {
334- data . url = m [ 1 ] ;
335- // resource.requestBody is the raw payload, use the converted one from the url for the cdx
336- data . requestBody = m [ 2 ] ;
337- }
338- data . method = resource . method ;
339- }
319+ getCDXJ ( resource , filename ) {
320+ const data = {
321+ url : resource . url ,
322+ digest : resource . digest ,
323+ mime : resource . mime ,
324+ offset : resource . offset ,
325+ length : resource . length ,
326+ recordDigest : resource . recordDigest ,
327+ status : resource . status
328+ } ;
340329
341- const cdx = `${ resource . surt } ${ resource . timestamp } ${ JSON . stringify ( data ) } \n` ;
330+ if ( filename ) {
331+ data . filename = filename ;
332+ }
342333
343- if ( ! raw ) {
344- return cdx ;
345- } else {
346- return [ resource , cdx ] ;
334+ if ( resource . method && resource . method !== "GET" ) {
335+ const m = resource . url . match ( SPLIT_REQUEST_Q_RX ) ;
336+ if ( m ) {
337+ data . url = m [ 1 ] ;
338+ // resource.requestBody is the raw payload, use the converted one from the url for the cdx
339+ data . requestBody = m [ 2 ] ;
347340 }
348- } ;
341+ data . method = resource . method ;
342+ }
349343
350- try {
351- for await ( const resource of this . resources ) {
352- if ( resource . skipped ) {
353- continue ;
354- }
355- yield getCDX ( resource , "data.warc.gz" , raw ) ;
356- }
344+ return `${ getSurt ( resource . url ) } ${ resource . timestamp } ${ JSON . stringify ( data ) } \n` ;
345+ }
357346
358- // for await (const resource of this.textResources) {
359- // resource.mime = "text/plain";
360- // resource.status = 200;
361- // yield getCDX(resource, "text.warc", raw);
362- // }
347+ * generateCDX ( ) {
348+ this . cdxjLines . sort ( ) ;
363349
364- } catch ( e ) {
365- console . warn ( e ) ;
366- }
350+ yield * this . cdxjLines ;
367351 }
368352
369- async * generateCompressedCDX ( filename ) {
353+ * generateCompressedCDX ( filename ) {
370354 let offset = 0 ;
371355
372356 let chunkDeflater = null ;
@@ -393,7 +377,7 @@ class Downloader
393377 return data ;
394378 } ;
395379
396- for await ( const [ /*resource*/ , cdx ] of this . generateCDX ( true ) ) {
380+ for ( const cdx of this . generateCDX ( ) ) {
397381 if ( ! chunkDeflater ) {
398382 chunkDeflater = new Deflate ( { gzip : true } ) ;
399383 }
0 commit comments