diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index e0347c6f29..505e42c9a8 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -376,6 +376,8 @@ class RawCrawlConfig(BaseModel):
saveStorage: Optional[bool] = False
+ robots: Optional[bool] = False
+
# ============================================================================
class CrawlConfigIn(BaseModel):
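
The backend change above only adds an optional `robots` flag (default `False`) to the raw crawl config model. For context, here is a minimal client-side sketch of how a workflow payload could carry the new flag; the endpoint path, auth header, and every field other than `robots` are assumptions for illustration, not part of this diff.

```ts
// Illustration only: passing the new `robots` flag when creating a workflow.
// Endpoint path and surrounding fields are assumed; only `robots` comes from the diff.
interface CrawlConfigPayload {
  name: string;
  config: {
    seeds: { url: string }[];
    scopeType: string;
    robots?: boolean; // new: skip pages disallowed by robots.txt (default false)
  };
}

async function createRobotsAwareWorkflow(orgId: string, token: string) {
  const payload: CrawlConfigPayload = {
    name: "Robots-aware crawl",
    config: {
      seeds: [{ url: "https://example.com/" }],
      scopeType: "prefix",
      robots: true, // omit to keep the backend default of false
    },
  };
  const resp = await fetch(`/api/orgs/${orgId}/crawlconfigs/`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${token}`,
    },
    body: JSON.stringify(payload),
  });
  if (!resp.ok) throw new Error(`workflow creation failed: ${resp.status}`);
  return resp.json();
}
```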
diff --git a/frontend/docs/docs/user-guide/workflow-setup.md b/frontend/docs/docs/user-guide/workflow-setup.md
index 3f86223c5f..a63b08a468 100644
--- a/frontend/docs/docs/user-guide/workflow-setup.md
+++ b/frontend/docs/docs/user-guide/workflow-setup.md
@@ -97,6 +97,10 @@ Refer to a specific [_Crawl Scope_ option](#crawl-scope-options) for details on
**These credentials WILL BE WRITTEN into the archive.** We recommend exercising caution and only archiving with dedicated archival accounts, changing your password or deleting the account when finished.
+### Skip Pages Disallowed By Robots.txt
+
+When enabled, the crawler will check for a [Robots Exclusion Protocol](https://www.rfc-editor.org/rfc/rfc9309.html) file at `/robots.txt` on each host it encounters during crawling and skip any pages disallowed by that file's rules.
+
### Include Any Linked Page
When enabled, the crawler will visit all the links it finds within each URL defined in the [URL input field](#crawl-start-url-urls-to-crawl) under _Crawl Scope_.
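
To make the behavior described in the new docs section concrete, here is a minimal RFC 9309-style check sketched in TypeScript. This is not browsertrix-crawler's implementation: wildcard patterns, `$` anchors, and per-host caching are omitted, and the user-agent string is a stand-in.

```ts
// Minimal RFC 9309-style robots.txt check (sketch, not the crawler's code).
type Rule = { allow: boolean; path: string };

function parseRobots(txt: string, userAgent: string): Rule[] {
  const rules: Rule[] = [];
  let groupMatches = false;
  let prevWasUserAgent = false;
  for (const rawLine of txt.split(/\r?\n/)) {
    const line = rawLine.split("#")[0].trim();
    const sep = line.indexOf(":");
    if (sep < 0) continue;
    const field = line.slice(0, sep).trim().toLowerCase();
    const value = line.slice(sep + 1).trim();
    if (field === "user-agent") {
      // Consecutive user-agent lines form one group.
      const matches =
        value === "*" || userAgent.toLowerCase().includes(value.toLowerCase());
      groupMatches = prevWasUserAgent ? groupMatches || matches : matches;
      prevWasUserAgent = true;
    } else {
      if (groupMatches && (field === "allow" || field === "disallow") && value) {
        rules.push({ allow: field === "allow", path: value });
      }
      prevWasUserAgent = false;
    }
  }
  return rules;
}

function isAllowed(rules: Rule[], pathname: string): boolean {
  // Longest matching path wins; Allow wins a tie (RFC 9309 §2.2.2).
  let best: Rule | undefined;
  for (const rule of rules) {
    if (!pathname.startsWith(rule.path)) continue;
    if (
      !best ||
      rule.path.length > best.path.length ||
      (rule.path.length === best.path.length && rule.allow)
    ) {
      best = rule;
    }
  }
  return best ? best.allow : true; // no applicable rule: allowed
}

// Usage sketch: fetch the host's robots.txt once, then test page paths.
async function shouldCrawl(pageUrl: string, userAgent = "examplebot") {
  const url = new URL(pageUrl);
  const res = await fetch(new URL("/robots.txt", url.origin));
  const rules = res.ok ? parseRobots(await res.text(), userAgent) : [];
  return isAllowed(rules, url.pathname);
}
```

An unreachable or missing `/robots.txt` yields no rules, so everything is treated as allowed, which matches the protocol's default-permissive stance.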
diff --git a/frontend/src/components/ui/config-details.ts b/frontend/src/components/ui/config-details.ts
index 63f6e76991..f9f58035ac 100644
--- a/frontend/src/components/ui/config-details.ts
+++ b/frontend/src/components/ui/config-details.ts
@@ -462,6 +462,10 @@ export class ConfigDetails extends BtrixElement {
msg("Include Any Linked Page (“one hop out”)"),
Boolean(config.extraHops),
)}
+ ${this.renderSetting(
+ msg("Skip Pages Disallowed By Robots.txt"),
+ Boolean(config.robots),
+ )}
${this.renderSetting(
msg("Fail Crawl If Not Logged In"),
Boolean(config.failOnContentCheck),
@@ -536,6 +540,10 @@ export class ConfigDetails extends BtrixElement {
msg("Include Any Linked Page (“one hop out”)"),
Boolean(primarySeedConfig?.extraHops ?? config.extraHops),
)}
+ ${this.renderSetting(
+ msg("Skip Pages Disallowed By Robots.txt"),
+ Boolean(config.robots),
+ )}
${this.renderSetting(
msg("Check For Sitemap"),
Boolean(config.useSitemap),
diff --git a/frontend/src/features/crawl-workflows/workflow-editor.ts b/frontend/src/features/crawl-workflows/workflow-editor.ts
index 4236b0ff13..5163356e32 100644
--- a/frontend/src/features/crawl-workflows/workflow-editor.ts
+++ b/frontend/src/features/crawl-workflows/workflow-editor.ts
@@ -1036,6 +1036,12 @@ export class WorkflowEditor extends BtrixElement {
`)}
${this.renderHelpTextCol(infoTextFor["includeLinkedPages"], false)}
+ ${inputCol(html`
+ <sl-checkbox name="robots" ?checked=${this.formState.robots}>
+ ${msg("Skip pages disallowed by robots.txt")}
+ </sl-checkbox>
+ `)}
+ ${this.renderHelpTextCol(infoTextFor["robots"], false)}
${inputCol(html`
`)}
${this.renderHelpTextCol(infoTextFor["includeLinkedPages"], false)}
+ ${inputCol(html`
+ <sl-checkbox name="robots" ?checked=${this.formState.robots}>
+ ${msg("Skip pages disallowed by robots.txt")}
+ </sl-checkbox>
+ `)}
+ ${this.renderHelpTextCol(infoTextFor["robots"], false)}
${inputCol(html`
<sl-checkbox name="useSitemap" ?checked=${this.formState.useSitemap}>
${msg("Check for sitemap")}
@@ -3263,6 +3275,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
| "failOnFailedSeed"
| "failOnContentCheck"
| "saveStorage"
+ | "robots"
> {
const jsonSeeds = this.formState.seedListFormat === SeedListFormat.JSON;
@@ -3282,6 +3295,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
failOnFailedSeed: this.formState.failOnFailedSeed,
failOnContentCheck: this.formState.failOnContentCheck,
saveStorage: this.formState.saveStorage,
+ robots: this.formState.robots,
};
return config;
@@ -3295,6 +3309,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
| "failOnFailedSeed"
| "failOnContentCheck"
| "saveStorage"
+ | "robots"
> {
const primarySeedUrl = this.formState.primarySeedUrl;
const includeUrlList = this.formState.customIncludeUrlList
@@ -3327,6 +3342,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
failOnFailedSeed: false,
failOnContentCheck: this.formState.failOnContentCheck,
saveStorage: this.formState.saveStorage,
+ robots: this.formState.robots,
};
return config;
}
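
A side note on why the return-type `Pick<...>` unions above need the `"robots"` member in addition to the object literals: TypeScript rejects object literals that carry properties the picked type does not name. A small standalone sketch (the types are illustrative, not the editor's real `WorkflowParams`):

```ts
// Illustrative types only; not the editor's actual WorkflowParams.
type Config = { extraHops: number; saveStorage?: boolean; robots?: boolean };

// Without "robots" in the Pick, returning it is a compile error:
//   Object literal may only specify known properties...
function parseWithout(): Pick<Config, "extraHops" | "saveStorage"> {
  return { extraHops: 0, saveStorage: false /*, robots: true  <-- error */ };
}

// With "robots" added to the Pick, the literal type-checks.
function parseWith(): Pick<Config, "extraHops" | "saveStorage" | "robots"> {
  return { extraHops: 0, saveStorage: false, robots: true };
}
```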
diff --git a/frontend/src/strings/crawl-workflows/infoText.ts b/frontend/src/strings/crawl-workflows/infoText.ts
index 042aa12c84..741366d46b 100644
--- a/frontend/src/strings/crawl-workflows/infoText.ts
+++ b/frontend/src/strings/crawl-workflows/infoText.ts
@@ -85,6 +85,9 @@ export const infoTextFor = {
saveStorage: msg(
`Include data from the browser's local and session storage in the web archive.`,
),
+ robots: msg(
+ `Check for a /robots.txt file on each host and skip any disallowed pages.`,
+ ),
} as const satisfies Partial>;
export default infoTextFor;
diff --git a/frontend/src/types/crawler.ts b/frontend/src/types/crawler.ts
index 72191f350e..6c8b1cade7 100644
--- a/frontend/src/types/crawler.ts
+++ b/frontend/src/types/crawler.ts
@@ -55,6 +55,7 @@ export type SeedConfig = Expand<
customBehaviors: string[];
clickSelector: string;
saveStorage?: boolean;
+ robots?: boolean;
}
>;
diff --git a/frontend/src/utils/workflow.ts b/frontend/src/utils/workflow.ts
index f2cbb7ebd1..0cae4d371e 100644
--- a/frontend/src/utils/workflow.ts
+++ b/frontend/src/utils/workflow.ts
@@ -184,6 +184,7 @@ export type FormState = {
selectLinks: string[];
clickSelector: string;
saveStorage: WorkflowParams["config"]["saveStorage"];
+ robots: WorkflowParams["config"]["robots"];
};
export type FormStateField = keyof FormState;
@@ -246,6 +247,7 @@ export const getDefaultFormState = (): FormState => ({
clickSelector: DEFAULT_AUTOCLICK_SELECTOR,
customBehavior: false,
saveStorage: false,
+ robots: false,
});
export const mapSeedToUrl = (arr: Seed[]) =>
@@ -416,6 +418,7 @@ export function getInitialFormState(params: {
params.initialWorkflow.crawlerChannel || defaultFormState.crawlerChannel,
proxyId: params.initialWorkflow.proxyId || defaultFormState.proxyId,
saveStorage: params.initialWorkflow.config.saveStorage,
+ robots: params.initialWorkflow.config.robots,
...formState,
};
}
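
One behavioral detail worth confirming from the frontend changes: workflows saved before this change carry no `robots` value, so `config.robots` is `undefined`. The details view coerces it with `Boolean(...)` and the new form-state default is `false`, so legacy workflows display the setting as off, matching the backend default. A tiny sketch of that coercion (the helper is hypothetical, mirroring `Boolean(config.robots)` in config-details.ts):

```ts
// Hypothetical helper; legacy configs without the field resolve to false.
function effectiveRobots(config: { robots?: boolean }): boolean {
  return Boolean(config.robots);
}

console.log(effectiveRobots({}));                // false (pre-existing workflow)
console.log(effectiveRobots({ robots: true }));  // true  (opted in)
console.log(effectiveRobots({ robots: false })); // false (explicitly off)
```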