Skip to content

Commit 6b5cfe2

Browse files
authored
Download Electron UI + Optimizations (#95)
Download UI for App: - show a download progress modal when using the app. - automatically select a new, unused download path in the downloads folder to mimic browser behavior. Download Optimization: Improved download performance for very large archives: - don't load all resources at once! - iterate over resources in chunks (8192 at a time). - for WACZ download, build the CDXJ list while iterating over records and sort it at the end. Additional fixes: - don't autoupdate the page list while downloading. - don't autoupdate the page list at all if there are >=100 pages. - use an inline index.html to support the <replay-web-page> embed. - use the replaywebpage 1.6.1 release.
1 parent 47e982a commit 6b5cfe2

File tree

9 files changed

+240
-114
lines changed

9 files changed

+240
-114
lines changed

package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,9 @@
2222
"node-fetch": "2.6.7",
2323
"pretty-bytes": "^5.6.0",
2424
"querystring-es3": "^0.2.1",
25-
"replaywebpage": "github:webrecorder/replayweb.page#rec-embed-custom",
25+
"replaywebpage": "^1.6.1",
2626
"stream-browserify": "^3.0.0",
27+
"unused-filename": "^4.0.1",
2728
"url": "^0.11.0",
2829
"uuid": "^8.3.2",
2930
"warcio": "^1.5.1"

src/downloader.js

Lines changed: 60 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ const WACZ_VERSION = "1.1.1";
1717
const SPLIT_REQUEST_Q_RX = /(.*?)[?&](?:__wb_method=|__wb_post=)[^&]+&(.*)/;
1818

1919
const LINES_PER_BLOCK = 1024;
20+
const RESOURCE_BATCH_SIZE = LINES_PER_BLOCK * 8;
2021

2122
const DEFAULT_UUID_NAMESPACE = "f9ec3936-7f66-4461-bec4-34f4495ea242";
2223

@@ -91,8 +92,9 @@ class Downloader
9192
}
9293

9394
this.offset = 0;
94-
this.resources = [];
95+
this.firstResources = [];
9596
this.textResources = [];
97+
this.cdxjLines = [];
9698

9799
// compressed index (idx) entries
98100
this.indexLines = [];
@@ -142,34 +144,33 @@ class Downloader
142144
return resp;
143145
}
144146

145-
async loadResources() {
146-
if (this.pageList) {
147-
for await (const resource of this.db.resourcesByPages(this.pageList)) {
148-
this.resources.push(resource);
149-
}
150-
} else {
151-
this.resources = await this.db.db.getAll("resources");
152-
}
147+
async loadResourcesBlock(start = []) {
148+
return await this.db.db.getAll("resources", IDBKeyRange.lowerBound(start, true), RESOURCE_BATCH_SIZE);
149+
}
153150

154-
this.resources.sort((a, b) => {
155-
if (!a.surt) {
156-
a.surt = getSurt(a.url);
157-
}
151+
async* iterResources(resources) {
152+
let start = [];
153+
let count = 0;
158154

159-
if (!b.surt) {
160-
b.surt = getSurt(b.url);
161-
}
155+
while (resources.length) {
156+
const last = resources[resources.length - 1];
162157

163-
if (a.surt == b.surt) {
164-
return 0;
158+
if (this.pageList) {
159+
resources = resources.filter((res) => this.pageList.includes(res.pageId));
165160
}
161+
count += resources.length;
162+
yield* resources;
166163

167-
return a.surt < b.surt ? -1 : 1;
168-
});
164+
start = [last.url, last.ts];
165+
resources = await this.loadResourcesBlock(start);
166+
}
167+
if (count !== this.numResources) {
168+
console.warn(`Iterated ${count}, but expected ${this.numResources}`);
169+
}
169170
}
170171

171172
async queueWARC(controller, filename, sizeCallback) {
172-
await this.loadResources();
173+
this.firstResources = await this.loadResourcesBlock();
173174

174175
for await (const chunk of this.generateWARC(filename)) {
175176
controller.enqueue(chunk);
@@ -215,20 +216,20 @@ class Downloader
215216
async downloadWACZ(filename, sizeCallback) {
216217
filename = (filename || "webarchive").split(".")[0] + ".wacz";
217218

218-
await this.loadResources();
219-
220219
this.fileHasher = await createSHA256();
221220
this.recordHasher = await createSHA256();
222221
this.hashType = "sha256";
223222

224223
const zip = [];
225224

225+
this.firstResources = await this.loadResourcesBlock();
226+
226227
this.addFile(zip, "pages/pages.jsonl", this.generatePages(), sizeCallback, true);
227228
this.addFile(zip, "archive/data.warc.gz", this.generateWARC(filename + "#/archive/data.warc.gz", true), sizeCallback, false);
228229
//this.addFile(zip, "archive/text.warc", this.generateTextWARC(filename + "#/archive/text.warc"), false);
229230

230231
// don't use compressed index if we'll have a single block, need to have at least enough for 2 blocks
231-
if (this.resources.length < (2 * LINES_PER_BLOCK)) {
232+
if (this.firstResources.length < (2 * LINES_PER_BLOCK)) {
232233
this.addFile(zip, "indexes/index.cdx", this.generateCDX(), sizeCallback, true);
233234
} else {
234235
this.addFile(zip, "indexes/index.cdx.gz", this.generateCompressedCDX("index.cdx.gz"), sizeCallback, false);
@@ -250,7 +251,7 @@ class Downloader
250251
return response;
251252
}
252253

253-
async* generateWARC(filename, digestRecord = false) {
254+
async* generateWARC(filename, digestRecordAndCDX = false) {
254255
try {
255256
let offset = 0;
256257

@@ -261,7 +262,7 @@ class Downloader
261262
offset += warcinfo.length;
262263
}
263264

264-
for (const resource of this.resources) {
265+
for await (const resource of this.iterResources(this.firstResources)) {
265266
resource.offset = offset;
266267
const records = await this.createWARCRecord(resource);
267268
if (!records) {
@@ -273,7 +274,7 @@ class Downloader
273274
yield records[0];
274275
offset += records[0].length;
275276
resource.length = records[0].length;
276-
if (digestRecord) {
277+
if (digestRecordAndCDX) {
277278
resource.recordDigest = this.recordDigest(records[0]);
278279
}
279280

@@ -282,6 +283,10 @@ class Downloader
282283
yield records[1];
283284
offset += records[1].length;
284285
}
286+
287+
if (digestRecordAndCDX) {
288+
this.cdxjLines.push(this.getCDXJ(resource, "data.warc.gz"));
289+
}
285290
}
286291
} catch (e) {
287292
console.warn(e);
@@ -311,62 +316,41 @@ class Downloader
311316
}
312317
}
313318

314-
async* generateCDX(raw = false) {
315-
const getCDX = (resource, filename, raw) => {
316-
317-
const data = {
318-
url: resource.url,
319-
digest: resource.digest,
320-
mime: resource.mime,
321-
offset: resource.offset,
322-
length: resource.length,
323-
recordDigest: resource.recordDigest,
324-
status: resource.status
325-
};
326-
327-
if (filename) {
328-
data.filename = filename;
329-
}
330-
331-
if (resource.method && resource.method !== "GET") {
332-
const m = resource.url.match(SPLIT_REQUEST_Q_RX);
333-
if (m) {
334-
data.url = m[1];
335-
// resource.requestBody is the raw payload, use the converted one from the url for the cdx
336-
data.requestBody = m[2];
337-
}
338-
data.method = resource.method;
339-
}
319+
getCDXJ(resource, filename) {
320+
const data = {
321+
url: resource.url,
322+
digest: resource.digest,
323+
mime: resource.mime,
324+
offset: resource.offset,
325+
length: resource.length,
326+
recordDigest: resource.recordDigest,
327+
status: resource.status
328+
};
340329

341-
const cdx = `${resource.surt} ${resource.timestamp} ${JSON.stringify(data)}\n`;
330+
if (filename) {
331+
data.filename = filename;
332+
}
342333

343-
if (!raw) {
344-
return cdx;
345-
} else {
346-
return [resource, cdx];
334+
if (resource.method && resource.method !== "GET") {
335+
const m = resource.url.match(SPLIT_REQUEST_Q_RX);
336+
if (m) {
337+
data.url = m[1];
338+
// resource.requestBody is the raw payload, use the converted one from the url for the cdx
339+
data.requestBody = m[2];
347340
}
348-
};
341+
data.method = resource.method;
342+
}
349343

350-
try {
351-
for await (const resource of this.resources) {
352-
if (resource.skipped) {
353-
continue;
354-
}
355-
yield getCDX(resource, "data.warc.gz", raw);
356-
}
344+
return `${getSurt(resource.url)} ${resource.timestamp} ${JSON.stringify(data)}\n`;
345+
}
357346

358-
// for await (const resource of this.textResources) {
359-
// resource.mime = "text/plain";
360-
// resource.status = 200;
361-
// yield getCDX(resource, "text.warc", raw);
362-
// }
347+
*generateCDX() {
348+
this.cdxjLines.sort();
363349

364-
} catch (e) {
365-
console.warn(e);
366-
}
350+
yield* this.cdxjLines;
367351
}
368352

369-
async* generateCompressedCDX(filename) {
353+
*generateCompressedCDX(filename) {
370354
let offset = 0;
371355

372356
let chunkDeflater = null;
@@ -393,7 +377,7 @@ class Downloader
393377
return data;
394378
};
395379

396-
for await (const [/*resource*/, cdx] of this.generateCDX(true)) {
380+
for (const cdx of this.generateCDX()) {
397381
if (!chunkDeflater) {
398382
chunkDeflater = new Deflate({gzip: true});
399383
}

src/electron/electron-rec-main.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ global.btoa = btoa;
1010
// ===========================================================================
1111
const recorderApp = new ElectronRecorderApp({
1212
staticPath: path.join(__dirname, "./"),
13-
profileName: "archivewebpage"
13+
profileName: process.env.AWP_PROFILE_NAME || "archivewebpage"
1414
});
1515

1616
recorderApp.init();

src/electron/electron-rec-preload.js

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ import { Downloader } from "../downloader";
1010

1111
const { ipcRenderer, contextBridge } = require("electron");
1212

13+
let downloadCallback;
14+
1315

1416
// ===========================================================================
1517
contextBridge.exposeInMainWorld("archivewebpage", {
@@ -24,6 +26,15 @@ contextBridge.exposeInMainWorld("archivewebpage", {
2426
ipfsUnpin: (collId) => {
2527
return handleIpfsUnpin(collId);
2628
},
29+
30+
setDownloadCallback: (callback) => {
31+
downloadCallback = callback;
32+
},
33+
34+
downloadCancel: (dlprogress) => {
35+
ipcRenderer.send("dlcancel:" + dlprogress.origFilename);
36+
}
37+
2738
});
2839

2940

@@ -72,6 +83,14 @@ ipcRenderer.on("inc-sizes", async (event, totalSize, writtenSize, collId) => {
7283
});
7384

7485

86+
// ===========================================================================
87+
ipcRenderer.on("download-progress", async (event, progress) => {
88+
if (downloadCallback) {
89+
downloadCallback(progress);
90+
}
91+
});
92+
93+
7594
// ===========================================================================
7695
async function handleIpfsPin(collId, callback) {
7796
const reqId = "pin-" + collId + (100 * Math.random());

src/electron/electron-recorder-app.js

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import { PassThrough } from "stream";
1010

1111
import fs from "fs";
1212
import util from "util";
13-
13+
import { unusedFilenameSync } from 'unused-filename';
1414

1515
import { checkPins, ipfsAddWithReplay, ipfsUnpinAll } from "../utils";
1616

@@ -64,6 +64,51 @@ class ElectronRecorderApp extends ElectronReplayApp
6464
this.ipfsUnpin(event, reqId, pinList);
6565
});
6666

67+
sesh.on("will-download", (event, item, webContents) => {
68+
const origFilename = item.getFilename();
69+
70+
console.log(`will-download: ${origFilename}`);
71+
72+
item.setSavePath(unusedFilenameSync(path.join(app.getPath("downloads"), origFilename)));
73+
74+
ipcMain.on("dlcancel:" + origFilename, () => {
75+
console.log(`Canceled download for ${origFilename} to ${item.getSavePath()}`);
76+
item.cancel();
77+
});
78+
79+
item.on("updated", (_, state) => {
80+
const filename = item.getSavePath();
81+
82+
const dlprogress = {
83+
filename,
84+
origFilename,
85+
currSize: item.getReceivedBytes(),
86+
totalSize: item.getTotalBytes(),
87+
startTime: item.getStartTime(),
88+
state,
89+
};
90+
91+
try {
92+
webContents.send("download-progress", dlprogress);
93+
} catch (e) {
94+
console.log("download update failed", e);
95+
}
96+
});
97+
98+
item.once("done", (event, state) => {
99+
const dlprogress = {
100+
origFilename,
101+
state
102+
};
103+
try {
104+
webContents.send("download-progress", dlprogress);
105+
} catch (e) {
106+
console.log("download update failed", e);
107+
}
108+
});
109+
110+
});
111+
67112
//require('@electron/remote/main').initialize();
68113

69114
super.onAppReady();

src/sw/main.js

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,21 @@ import { ExtAPI } from "./api";
55
import { RecordingCollections } from "./recproxy";
66

77
import REC_INDEX_HTML from "../static/replay/index.html";
8-
import RWP_INDEX_HTML from "replaywebpage/index.html";
98
import { WorkerLoader } from "@webrecorder/wabac/src/loaders";
109

10+
const RWP_INDEX_HTML = `
11+
<!doctype html>
12+
<html class="no-overflow">
13+
<head><title>ReplayWeb.page</title>
14+
<meta name="viewport" content="width=device-width, initial-scale=1">
15+
<script src="./ui.js"></script>
16+
</head>
17+
<body>
18+
<replay-app-main></replay-app-main>
19+
</body>
20+
</html>
21+
`;
22+
1123
if (self.registration) {
1224
const defaultConfig = {
1325
injectScripts: ["/ruffle/ruffle.js"],

0 commit comments

Comments
 (0)