Download Electron UI + Optimizations (#95)

ikreymer · web-flow · commit 6b5cfe26e349 · 2022-07-05T11:10:33.000-07:00
Download UI for App:
- show download progress modal when using app.
- select new download path automatically in downloads folder to mimic browser functionality.

Download Optimization: Improved download performance for very large archives:
- don't load all resources at once!
- Iterate over resources in chunks (8192 at a time)
- For WACZ download, build the CDXJ list while iterating over records, then sort it at the end.

Additional fixes:
- don't autoupdate page list while downloading
- don't autoupdate page list at all if >=100 pages
- use inline index.html to support <replay-web-page> embed
- use replaywebpage 1.6.1 release.
diff --git a/package.json b/package.json
@@ -22,8 +22,9 @@
     "node-fetch": "2.6.7",
     "pretty-bytes": "^5.6.0",
     "querystring-es3": "^0.2.1",
-    "replaywebpage": "github:webrecorder/replayweb.page#rec-embed-custom",
+    "replaywebpage": "^1.6.1",
     "stream-browserify": "^3.0.0",
+    "unused-filename": "^4.0.1",
     "url": "^0.11.0",
     "uuid": "^8.3.2",
     "warcio": "^1.5.1"
diff --git a/src/downloader.js b/src/downloader.js
@@ -17,6 +17,7 @@ const WACZ_VERSION = "1.1.1";
 const SPLIT_REQUEST_Q_RX = /(.*?)[?&](?:__wb_method=|__wb_post=)[^&]+&(.*)/;
 
 const LINES_PER_BLOCK = 1024;
+const RESOURCE_BATCH_SIZE = LINES_PER_BLOCK * 8;
 
 const DEFAULT_UUID_NAMESPACE = "f9ec3936-7f66-4461-bec4-34f4495ea242";
 
@@ -91,8 +92,9 @@ class Downloader
     }
 
     this.offset = 0;
-    this.resources = [];
+    this.firstResources = [];
     this.textResources = [];
+    this.cdxjLines = [];
 
     // compressed index (idx) entries
     this.indexLines = [];
@@ -142,34 +144,33 @@ class Downloader
     return resp;
   }
 
-  async loadResources() {
-    if (this.pageList) {
-      for await (const resource of this.db.resourcesByPages(this.pageList)) {
-        this.resources.push(resource);
-      }
-    } else {
-      this.resources = await this.db.db.getAll("resources");  
-    }
+  async loadResourcesBlock(start = []) {
+    return await this.db.db.getAll("resources", IDBKeyRange.lowerBound(start, true), RESOURCE_BATCH_SIZE);
+  }
 
-    this.resources.sort((a, b) => {
-      if (!a.surt) {
-        a.surt = getSurt(a.url);
-      }
+  async* iterResources(resources) {
+    let start = [];
+    let count = 0;
 
-      if (!b.surt) {
-        b.surt = getSurt(b.url);
-      }
+    while (resources.length) {
+      const last = resources[resources.length - 1];
 
-      if (a.surt == b.surt) {
-        return 0;
+      if (this.pageList) {
+        resources = resources.filter((res) => this.pageList.includes(res.pageId));
       }
+      count += resources.length;
+      yield* resources;
 
-      return a.surt < b.surt ? -1 : 1;
-    });
+      start = [last.url, last.ts];
+      resources = await this.loadResourcesBlock(start);
+    }
+    if (count !== this.numResources) {
+      console.warn(`Iterated ${count}, but expected ${this.numResources}`);
+    }
   }
 
   async queueWARC(controller, filename, sizeCallback) {
-    await this.loadResources();
+    this.firstResources = await this.loadResourcesBlock();
 
     for await (const chunk of this.generateWARC(filename)) {
       controller.enqueue(chunk);
@@ -215,20 +216,20 @@ class Downloader
   async downloadWACZ(filename, sizeCallback) {
     filename = (filename || "webarchive").split(".")[0] + ".wacz";
 
-    await this.loadResources();
-
     this.fileHasher = await createSHA256();
     this.recordHasher = await createSHA256();
     this.hashType = "sha256";
 
     const zip = [];
 
+    this.firstResources = await this.loadResourcesBlock();
+
     this.addFile(zip, "pages/pages.jsonl", this.generatePages(), sizeCallback, true);
     this.addFile(zip, "archive/data.warc.gz", this.generateWARC(filename + "#/archive/data.warc.gz", true), sizeCallback, false);
     //this.addFile(zip, "archive/text.warc", this.generateTextWARC(filename + "#/archive/text.warc"), false);
 
     // don't use compressed index if we'll have a single block, need to have at least enough for 2 blocks
-    if (this.resources.length < (2 * LINES_PER_BLOCK)) {
+    if (this.firstResources.length < (2 * LINES_PER_BLOCK)) {
       this.addFile(zip, "indexes/index.cdx", this.generateCDX(), sizeCallback, true);
     } else {
       this.addFile(zip, "indexes/index.cdx.gz", this.generateCompressedCDX("index.cdx.gz"), sizeCallback, false);
@@ -250,7 +251,7 @@ class Downloader
     return response;
   }
 
-  async* generateWARC(filename, digestRecord = false)  {
+  async* generateWARC(filename, digestRecordAndCDX = false)  {
     try {
       let offset = 0;
 
@@ -261,7 +262,7 @@ class Downloader
         offset += warcinfo.length;
       }
 
-      for (const resource of this.resources) {
+      for await (const resource of this.iterResources(this.firstResources)) {
         resource.offset = offset;
         const records = await this.createWARCRecord(resource);
         if (!records) {
@@ -273,7 +274,7 @@ class Downloader
         yield records[0];
         offset += records[0].length;
         resource.length = records[0].length;
-        if (digestRecord) {
+        if (digestRecordAndCDX) {
           resource.recordDigest = this.recordDigest(records[0]);
         }
 
@@ -282,6 +283,10 @@ class Downloader
           yield records[1];
           offset += records[1].length;
         }
+
+        if (digestRecordAndCDX) {
+          this.cdxjLines.push(this.getCDXJ(resource, "data.warc.gz"));
+        }
       }
     } catch (e) {
       console.warn(e);
@@ -311,62 +316,41 @@ class Downloader
     }
   }
 
-  async* generateCDX(raw = false) {
-    const getCDX = (resource, filename, raw) => {
-
-      const data = {
-        url: resource.url,
-        digest: resource.digest,
-        mime: resource.mime,
-        offset: resource.offset,
-        length: resource.length,
-        recordDigest: resource.recordDigest,
-        status: resource.status
-      };
-
-      if (filename) {
-        data.filename = filename;
-      }
-
-      if (resource.method && resource.method !== "GET") {
-        const m = resource.url.match(SPLIT_REQUEST_Q_RX);
-        if (m) {
-          data.url = m[1];
-          // resource.requestBody is the raw payload, use the converted one from the url for the cdx
-          data.requestBody = m[2];
-        }
-        data.method = resource.method;
-      }
+  getCDXJ(resource, filename) {
+    const data = {
+      url: resource.url,
+      digest: resource.digest,
+      mime: resource.mime,
+      offset: resource.offset,
+      length: resource.length,
+      recordDigest: resource.recordDigest,
+      status: resource.status
+    };
 
-      const cdx = `${resource.surt} ${resource.timestamp} ${JSON.stringify(data)}\n`;
+    if (filename) {
+      data.filename = filename;
+    }
 
-      if (!raw) {
-        return cdx;
-      } else {
-        return [resource, cdx];
+    if (resource.method && resource.method !== "GET") {
+      const m = resource.url.match(SPLIT_REQUEST_Q_RX);
+      if (m) {
+        data.url = m[1];
+        // resource.requestBody is the raw payload, use the converted one from the url for the cdx
+        data.requestBody = m[2];
       }
-    };
+      data.method = resource.method;
+    }
 
-    try {
-      for await (const resource of this.resources) {
-        if (resource.skipped) {
-          continue;
-        }
-        yield getCDX(resource, "data.warc.gz", raw);
-      }
+    return `${getSurt(resource.url)} ${resource.timestamp} ${JSON.stringify(data)}\n`;
+  }
 
-      // for await (const resource of this.textResources) {
-      //   resource.mime = "text/plain";
-      //   resource.status = 200;
-      //   yield getCDX(resource, "text.warc", raw);
-      // }
+  *generateCDX() {
+    this.cdxjLines.sort();
 
-    } catch (e) {
-      console.warn(e);
-    }
+    yield* this.cdxjLines;
   }
 
-  async* generateCompressedCDX(filename) {
+  *generateCompressedCDX(filename) {
     let offset = 0;
 
     let chunkDeflater = null;
@@ -393,7 +377,7 @@ class Downloader
       return data;
     };
 
-    for await (const [/*resource*/, cdx] of this.generateCDX(true)) {
+    for (const cdx of this.generateCDX()) {
       if (!chunkDeflater) {
         chunkDeflater = new Deflate({gzip: true});
       }
diff --git a/src/electron/electron-rec-main.js b/src/electron/electron-rec-main.js
@@ -10,7 +10,7 @@ global.btoa = btoa;
 // ===========================================================================
 const recorderApp = new ElectronRecorderApp({
   staticPath: path.join(__dirname, "./"),
-  profileName: "archivewebpage"
+  profileName: process.env.AWP_PROFILE_NAME || "archivewebpage"
 });
 
 recorderApp.init();
diff --git a/src/electron/electron-rec-preload.js b/src/electron/electron-rec-preload.js
@@ -10,6 +10,8 @@ import { Downloader } from "../downloader";
 
 const { ipcRenderer, contextBridge } = require("electron");
 
+let downloadCallback;
+
 
 // ===========================================================================
 contextBridge.exposeInMainWorld("archivewebpage", {
@@ -24,6 +26,15 @@ contextBridge.exposeInMainWorld("archivewebpage", {
   ipfsUnpin: (collId) => {
     return handleIpfsUnpin(collId);
   },
+
+  setDownloadCallback: (callback) => {
+    downloadCallback = callback;
+  },
+
+  downloadCancel: (dlprogress) => {
+    ipcRenderer.send("dlcancel:" + dlprogress.origFilename);
+  }
+
 });
 
 
@@ -72,6 +83,14 @@ ipcRenderer.on("inc-sizes", async (event, totalSize, writtenSize, collId) => {
 });
 
 
+// ===========================================================================
+ipcRenderer.on("download-progress", async (event, progress) => {
+  if (downloadCallback) {
+    downloadCallback(progress);
+  }
+});
+
+
 // ===========================================================================
 async function handleIpfsPin(collId, callback) {
   const reqId = "pin-" + collId + (100 * Math.random());
diff --git a/src/electron/electron-recorder-app.js b/src/electron/electron-recorder-app.js
@@ -10,7 +10,7 @@ import { PassThrough } from "stream";
 
 import fs from "fs";
 import util from "util";
-
+import { unusedFilenameSync } from 'unused-filename';
 
 import { checkPins, ipfsAddWithReplay, ipfsUnpinAll } from "../utils";
 
@@ -64,6 +64,51 @@ class ElectronRecorderApp extends ElectronReplayApp
       this.ipfsUnpin(event, reqId, pinList);
     });
 
+    sesh.on("will-download", (event, item, webContents) => {
+      const origFilename = item.getFilename();
+
+      console.log(`will-download: ${origFilename}`);
+
+      item.setSavePath(unusedFilenameSync(path.join(app.getPath("downloads"), origFilename)));
+
+      ipcMain.on("dlcancel:" + origFilename, () => {
+        console.log(`Canceled download for ${origFilename} to ${item.getSavePath()}`);
+        item.cancel();
+      });
+
+      item.on("updated", (_, state) => {
+        const filename = item.getSavePath();
+
+        const dlprogress = {
+          filename,
+          origFilename,
+          currSize: item.getReceivedBytes(),
+          totalSize: item.getTotalBytes(),
+          startTime: item.getStartTime(),
+          state,
+        };
+
+        try {
+          webContents.send("download-progress", dlprogress);
+        } catch (e) {
+          console.log("download update failed", e);
+        }
+      });
+
+      item.once("done", (event, state) => {
+        const dlprogress = {
+          origFilename,
+          state
+        };
+        try {
+          webContents.send("download-progress", dlprogress);
+        } catch (e) {
+          console.log("download update failed", e);
+        }
+      });
+
+    });
+
     //require('@electron/remote/main').initialize();
 
     super.onAppReady();
diff --git a/src/sw/main.js b/src/sw/main.js
@@ -5,9 +5,21 @@ import { ExtAPI } from "./api";
 import { RecordingCollections } from "./recproxy";
 
 import REC_INDEX_HTML from "../static/replay/index.html";
-import RWP_INDEX_HTML from "replaywebpage/index.html";
 import { WorkerLoader } from "@webrecorder/wabac/src/loaders";
 
+const RWP_INDEX_HTML = `
+<!doctype html>
+<html class="no-overflow">
+<head><title>ReplayWeb.page</title>
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+  <script src="./ui.js"></script>
+</head>
+<body>
+  <replay-app-main></replay-app-main>
+</body>
+</html>
+`;
+
 if (self.registration) {
   const defaultConfig = {
     injectScripts: ["/ruffle/ruffle.js"],
diff --git a/src/ui/app.js b/src/ui/app.js
diff --git a/src/ui/coll-info.js b/src/ui/coll-info.js
diff --git a/yarn.lock b/yarn.lock