diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index e0347c6f29..505e42c9a8 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -376,6 +376,8 @@ class RawCrawlConfig(BaseModel):
 
     saveStorage: Optional[bool] = False
 
+    robots: Optional[bool] = False
+
 
 # ============================================================================
 class CrawlConfigIn(BaseModel):
diff --git a/frontend/docs/docs/user-guide/workflow-setup.md b/frontend/docs/docs/user-guide/workflow-setup.md
index 3f86223c5f..a63b08a468 100644
--- a/frontend/docs/docs/user-guide/workflow-setup.md
+++ b/frontend/docs/docs/user-guide/workflow-setup.md
@@ -97,6 +97,10 @@ Refer to a specific [_Crawl Scope_ option](#crawl-scope-options) for details on
 
 **These credentials WILL BE WRITTEN into the archive.** We recommend exercising caution and only archiving with dedicated archival accounts, changing your password or deleting the account when finished.
 
+### Skip Pages Disallowed By Robots.txt
+
+When enabled, the crawler will check each host encountered during crawling for a [Robots Exclusion Protocol](https://www.rfc-editor.org/rfc/rfc9309.html) file at `/robots.txt` and skip any pages disallowed by its rules.
+
 ### Include Any Linked Page
 
 When enabled, the crawler will visit all the links it finds within each URL defined in the [URL input field](#crawl-start-url-urls-to-crawl) under _Crawl Scope_.
diff --git a/frontend/src/components/ui/config-details.ts b/frontend/src/components/ui/config-details.ts
index 63f6e76991..f9f58035ac 100644
--- a/frontend/src/components/ui/config-details.ts
+++ b/frontend/src/components/ui/config-details.ts
@@ -462,6 +462,10 @@ export class ConfigDetails extends BtrixElement {
           msg("Include Any Linked Page (“one hop out”)"),
           Boolean(config.extraHops),
         )}
+        ${this.renderSetting(
+          msg("Skip Pages Disallowed By Robots.txt"),
+          Boolean(config.robots),
+        )}
         ${this.renderSetting(
           msg("Fail Crawl If Not Logged In"),
           Boolean(config.failOnContentCheck),
@@ -536,6 +540,10 @@ export class ConfigDetails extends BtrixElement {
           msg("Include Any Linked Page (“one hop out”)"),
           Boolean(primarySeedConfig?.extraHops ?? config.extraHops),
         )}
+        ${this.renderSetting(
+          msg("Skip Pages Disallowed By Robots.txt"),
+          Boolean(config.robots),
+        )}
         ${this.renderSetting(
           msg("Check For Sitemap"),
           Boolean(config.useSitemap),
diff --git a/frontend/src/features/crawl-workflows/workflow-editor.ts b/frontend/src/features/crawl-workflows/workflow-editor.ts
index 4236b0ff13..5163356e32 100644
--- a/frontend/src/features/crawl-workflows/workflow-editor.ts
+++ b/frontend/src/features/crawl-workflows/workflow-editor.ts
@@ -1036,6 +1036,12 @@ export class WorkflowEditor extends BtrixElement {
         `)}
         ${this.renderHelpTextCol(infoTextFor["includeLinkedPages"], false)}
+        ${inputCol(html`
+          <sl-checkbox name="robots" ?checked=${this.formState.robots}>
+            ${msg("Skip pages disallowed by robots.txt")}
+          </sl-checkbox>
+        `)}
+        ${this.renderHelpTextCol(infoTextFor["robots"], false)}
         ${inputCol(html`
         `)}
         ${this.renderHelpTextCol(infoTextFor["includeLinkedPages"], false)}
+        ${inputCol(html`
+          <sl-checkbox name="robots" ?checked=${this.formState.robots}>
+            ${msg("Skip pages disallowed by robots.txt")}
+          </sl-checkbox>
+        `)}
+        ${this.renderHelpTextCol(infoTextFor["robots"], false)}
         ${inputCol(html`
             ${msg("Check for sitemap")}
@@ -3263,6 +3275,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
     | "failOnFailedSeed"
     | "failOnContentCheck"
     | "saveStorage"
+    | "robots"
   > {
     const jsonSeeds = this.formState.seedListFormat === SeedListFormat.JSON;
@@ -3282,6 +3295,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
       failOnFailedSeed: this.formState.failOnFailedSeed,
      failOnContentCheck: this.formState.failOnContentCheck,
       saveStorage: this.formState.saveStorage,
+      robots: this.formState.robots,
     };
 
     return config;
@@ -3295,6 +3309,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
     | "failOnFailedSeed"
     | "failOnContentCheck"
     | "saveStorage"
+    | "robots"
   > {
     const primarySeedUrl = this.formState.primarySeedUrl;
     const includeUrlList = this.formState.customIncludeUrlList
@@ -3327,6 +3342,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
       failOnFailedSeed: false,
       failOnContentCheck: this.formState.failOnContentCheck,
       saveStorage: this.formState.saveStorage,
+      robots: this.formState.robots,
     };
     return config;
   }
diff --git a/frontend/src/strings/crawl-workflows/infoText.ts b/frontend/src/strings/crawl-workflows/infoText.ts
index 042aa12c84..741366d46b 100644
--- a/frontend/src/strings/crawl-workflows/infoText.ts
+++ b/frontend/src/strings/crawl-workflows/infoText.ts
@@ -85,6 +85,9 @@ export const infoTextFor = {
   saveStorage: msg(
     `Include data from the browser's local and session storage in the web archive.`,
   ),
+  robots: msg(
+    `Check for a /robots.txt file on each host and skip any disallowed pages.`,
+  ),
 } as const satisfies Partial>;
 
 export default infoTextFor;
diff --git a/frontend/src/types/crawler.ts b/frontend/src/types/crawler.ts
index 72191f350e..6c8b1cade7 100644
--- a/frontend/src/types/crawler.ts
+++ b/frontend/src/types/crawler.ts
@@ -55,6 +55,7 @@ export type SeedConfig = Expand<
     customBehaviors: string[];
     clickSelector: string;
     saveStorage?: boolean;
+    robots?: boolean;
   }
 >;
diff --git a/frontend/src/utils/workflow.ts b/frontend/src/utils/workflow.ts
index f2cbb7ebd1..0cae4d371e 100644
--- a/frontend/src/utils/workflow.ts
+++ b/frontend/src/utils/workflow.ts
@@ -184,6 +184,7 @@ export type FormState = {
   selectLinks: string[];
   clickSelector: string;
   saveStorage: WorkflowParams["config"]["saveStorage"];
+  robots: WorkflowParams["config"]["robots"];
 };
 
 export type FormStateField = keyof FormState;
@@ -246,6 +247,7 @@ export const getDefaultFormState = (): FormState => ({
   clickSelector: DEFAULT_AUTOCLICK_SELECTOR,
   customBehavior: false,
   saveStorage: false,
+  robots: false,
 });
 
 export const mapSeedToUrl = (arr: Seed[]) =>
@@ -416,6 +418,7 @@ export function getInitialFormState(params: {
       params.initialWorkflow.crawlerChannel || defaultFormState.crawlerChannel,
     proxyId: params.initialWorkflow.proxyId || defaultFormState.proxyId,
     saveStorage: params.initialWorkflow.config.saveStorage,
+    robots: params.initialWorkflow.config.robots,
     ...formState,
   };
 }
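Note that this diff only threads the new `robots` flag from the workflow editor form through the frontend types into the stored crawl config (`RawCrawlConfig.robots`); the robots.txt fetching and matching itself is not part of this change and presumably happens in the crawler. As a rough, hedged sketch of the kind of Robots Exclusion Protocol check the flag enables (not Browsertrix's actual implementation), the TypeScript below fetches `/robots.txt` once for a host and skips URLs whose paths match a `Disallow` prefix in the `User-agent: *` group. It deliberately ignores `Allow` precedence, wildcards, and agent-specific groups from RFC 9309, and the helper names `fetchDisallowRules` and `isDisallowed` are illustrative only.

```ts
// Simplified robots.txt check: only the `User-agent: *` group and plain
// Disallow prefixes are honored. Requires Node 18+ for the global fetch.

// Fetch /robots.txt for an origin and collect Disallow rules for `User-agent: *`.
async function fetchDisallowRules(origin: string): Promise<string[]> {
  const res = await fetch(new URL("/robots.txt", origin));
  if (!res.ok) return []; // treat a missing robots.txt as "nothing disallowed"

  const rules: string[] = [];
  let inStarGroup = false;

  for (const rawLine of (await res.text()).split("\n")) {
    const line = rawLine.split("#")[0].trim(); // drop comments
    const sep = line.indexOf(":");
    if (sep === -1) continue;
    const field = line.slice(0, sep).trim().toLowerCase();
    const value = line.slice(sep + 1).trim();

    if (field === "user-agent") {
      inStarGroup = value === "*";
    } else if (field === "disallow" && inStarGroup && value) {
      rules.push(value); // an empty Disallow means "allow all", so skip it
    }
  }
  return rules;
}

// True if the URL's path starts with any disallowed prefix.
function isDisallowed(url: string, rules: string[]): boolean {
  const path = new URL(url).pathname;
  return rules.some((prefix) => path.startsWith(prefix));
}

async function main() {
  const rules = await fetchDisallowRules("https://example.com");
  const page = "https://example.com/private/page.html";
  if (isDisallowed(page, rules)) {
    console.log(`skipping ${page}: disallowed by robots.txt`);
  }
}

void main();
```

A production implementation would also cache the parsed rules per host and distinguish an unavailable robots.txt (4xx, treated as allow-all) from an unreachable one (5xx, treated as disallow-all), as RFC 9309 describes.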