33// found in the LICENSE file.
44
55/**
6- * Mozilla Readability library bundled as a string for runtime injection.
7- * Version: 0.6.0
6+ * Bundled Mozilla Readability library (main branch)
87 * Source: https://github.com/mozilla/readability
9- * License: Apache-2.0
8+ * This file contains the Readability.js source code as a string constant
9+ * for injection into web pages for content extraction.
1010 */
1111
12- // @ts -nocheck - Large bundled library source
13- export const READABILITY_SOURCE = ` /*
12+ export const READABILITY_SOURCE = `
13+ /*
1414 * Copyright (c) 2010 Arc90 Inc
1515 *
1616 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -151,7 +151,8 @@ Readability.prototype = {
151151 // Readability-readerable.js. Please keep both copies in sync.
152152 unlikelyCandidates:
153153 /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
154- okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i,
154+ okMaybeItsACandidate:
155+ /and|article|body|column|content|main|mathjax|shadow/i,
155156
156157 positive:
157158 /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
@@ -163,7 +164,7 @@ Readability.prototype = {
163164 replaceFonts: /<(\\/?)font[^>]*>/gi,
164165 normalize: /\\s{2,}/g,
165166 videos:
166- /\\/\\/(www\\.)?((dailymotion|youtube|youtube-nocookie|player\\.vimeo|v\\.qq)\\.com|(archive|upload\\.wikimedia)\\.org|player\\.twitch\\.tv)/i,
167+ /\\/\\/(www\\.)?((dailymotion|youtube|youtube-nocookie|player\\.vimeo|v\\.qq|bilibili|live.bilibili )\\.com|(archive|upload\\.wikimedia)\\.org|player\\.twitch\\.tv)/i,
167168 shareElements: /(\\b|_)(share|sharedaddy)(\\b|_)/i,
168169 nextLink: /(next|weiter|continue|>([^\\|]|$)|»([^\\|]|$))/i,
169170 prevLink: /(prev|earl|old|new|<|«)/i,
@@ -605,14 +606,20 @@ Readability.prototype = {
605606 }
606607
607608 // If there's a separator in the title, first remove the final part
608- if (/ [\\|\\-\\\\\\/>»] /.test(curTitle)) {
609- titleHadHierarchicalSeparators = / [\\\\\\/>»] /.test(curTitle);
610- let allSeparators = Array.from(origTitle.matchAll(/ [\\|\\-\\\\\\/>»] /gi));
609+ const titleSeparators = /\\|\\-–—\\\\\\/>»/.source;
610+ if (new RegExp(\`\\\\s[\${titleSeparators}]\\\\s\`).test(curTitle)) {
611+ titleHadHierarchicalSeparators = /\\s[\\\\\\/>»]\\s/.test(curTitle);
612+ let allSeparators = Array.from(
613+ origTitle.matchAll(new RegExp(\`\\\\s[\${titleSeparators}]\\\\s\`, "gi"))
614+ );
611615 curTitle = origTitle.substring(0, allSeparators.pop().index);
612616
613617 // If the resulting title is too short, remove the first part instead:
614618 if (wordCount(curTitle) < 3) {
615- curTitle = origTitle.replace(/^[^\\|\\-\\\\\\/>»]*[\\|\\-\\\\\\/>»]/gi, "");
619+ curTitle = origTitle.replace(
620+ new RegExp(\`^[^\${titleSeparators}]*[\${titleSeparators}]\`, "gi"),
621+ ""
622+ );
616623 }
617624 } else if (curTitle.includes(": ")) {
618625 // Check if we have an heading containing this exact string, so we
@@ -654,7 +661,10 @@ Readability.prototype = {
654661 curTitleWordCount <= 4 &&
655662 (!titleHadHierarchicalSeparators ||
656663 curTitleWordCount !=
657- wordCount(origTitle.replace(/[\\|\\-\\\\\\/>»]+/g, "")) - 1)
664+ wordCount(
665+ origTitle.replace(new RegExp(\`\\\\s[\${titleSeparators}]\\\\s\`, "g"), "")
666+ ) -
667+ 1)
658668 ) {
659669 curTitle = origTitle;
660670 }
@@ -1176,23 +1186,39 @@ Readability.prototype = {
11761186 // Turn all divs that don't have children block level elements into p's
11771187 if (node.tagName === "DIV") {
11781188 // Put phrasing content into paragraphs.
1179- var p = null;
11801189 var childNode = node.firstChild;
11811190 while (childNode) {
11821191 var nextSibling = childNode.nextSibling;
11831192 if (this._isPhrasingContent(childNode)) {
1184- if (p !== null) {
1185- p.appendChild(childNode);
1186- } else if (!this._isWhitespace(childNode)) {
1187- p = doc.createElement("p");
1188- node.replaceChild(p, childNode);
1189- p.appendChild(childNode);
1193+ var fragment = doc.createDocumentFragment();
1194+ // Collect all consecutive phrasing content into a fragment.
1195+ do {
1196+ nextSibling = childNode.nextSibling;
1197+ fragment.appendChild(childNode);
1198+ childNode = nextSibling;
1199+ } while (childNode && this._isPhrasingContent(childNode));
1200+
1201+ // Trim leading and trailing whitespace from the fragment.
1202+ while (
1203+ fragment.firstChild &&
1204+ this._isWhitespace(fragment.firstChild)
1205+ ) {
1206+ fragment.firstChild.remove();
11901207 }
1191- } else if (p !== null) {
1192- while (p.lastChild && this._isWhitespace(p.lastChild)) {
1193- p.lastChild.remove();
1208+ while (
1209+ fragment.lastChild &&
1210+ this._isWhitespace(fragment.lastChild)
1211+ ) {
1212+ fragment.lastChild.remove();
1213+ }
1214+
1215+ // If the fragment contains anything, wrap it in a paragraph and
1216+ // insert it before the next non-phrasing node.
1217+ if (fragment.firstChild) {
1218+ var p = doc.createElement("p");
1219+ p.appendChild(fragment);
1220+ node.insertBefore(p, nextSibling);
11941221 }
1195- p = null;
11961222 }
11971223 childNode = nextSibling;
11981224 }
@@ -2796,4 +2822,5 @@ if (typeof module === "object") {
27962822 /* global module */
27972823 module.exports = Readability;
27982824}
2825+
27992826` ;
0 commit comments