2 changes: 2 additions & 0 deletions backend/btrixcloud/models.py
@@ -376,6 +376,8 @@ class RawCrawlConfig(BaseModel):

saveStorage: Optional[bool] = False

robots: Optional[bool] = False


# ============================================================================
class CrawlConfigIn(BaseModel):
4 changes: 4 additions & 0 deletions frontend/docs/docs/user-guide/workflow-setup.md
@@ -97,6 +97,10 @@ Refer to a specific [_Crawl Scope_ option](#crawl-scope-options) for details on

**These credentials WILL BE WRITTEN into the archive.** We recommend exercising caution and only archiving with dedicated archival accounts, changing your password or deleting the account when finished.

### Skip Pages Disallowed By Robots.txt

When enabled, the crawler will check for a [Robots Exclusion Protocol](https://www.rfc-editor.org/rfc/rfc9309.html) file at `/robots.txt` on each host encountered during crawling and skip any pages disallowed by its rules.
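
For illustration only (not part of this change), here is a minimal sketch of how a Disallow rule might be matched against a page URL, assuming the host's robots.txt has already been parsed into a rule list; the crawler's actual matcher may differ:

```ts
// Illustrative only: naive prefix matching of a URL path against the
// Disallow rules of a single user-agent group. A full RFC 9309 matcher
// also honors Allow rules, "*" and "$" wildcards, and longest-match
// precedence.
const disallowRules = ["/private/", "/tmp"]; // hypothetical parsed rules

function isDisallowed(pageUrl: string, rules: string[]): boolean {
  const path = new URL(pageUrl).pathname;
  // A page is skipped if its path starts with any Disallow rule prefix.
  return rules.some((rule) => rule !== "" && path.startsWith(rule));
}

isDisallowed("https://example.com/private/report.html", disallowRules); // true  -> skipped
isDisallowed("https://example.com/blog/post", disallowRules);           // false -> crawled
```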

### Include Any Linked Page

When enabled, the crawler will visit all the links it finds within each URL defined in the [URL input field](#crawl-start-url-urls-to-crawl) under _Crawl Scope_.
8 changes: 8 additions & 0 deletions frontend/src/components/ui/config-details.ts
@@ -462,6 +462,10 @@ export class ConfigDetails extends BtrixElement {
msg("Include Any Linked Page (“one hop out”)"),
Boolean(config.extraHops),
)}
${this.renderSetting(
msg("Skip Pages Disallowed By Robots.txt"),
Boolean(config.robots),
)}
${this.renderSetting(
msg("Fail Crawl If Not Logged In"),
Boolean(config.failOnContentCheck),
@@ -536,6 +540,10 @@
msg("Include Any Linked Page (“one hop out”)"),
Boolean(primarySeedConfig?.extraHops ?? config.extraHops),
)}
${this.renderSetting(
msg("Skip Pages Disallowed By Robots.txt"),
Boolean(config.robots),
)}
${this.renderSetting(
msg("Check For Sitemap"),
Boolean(config.useSitemap),
16 changes: 16 additions & 0 deletions frontend/src/features/crawl-workflows/workflow-editor.ts
@@ -1036,6 +1036,12 @@ export class WorkflowEditor extends BtrixElement {
</sl-checkbox>
`)}
${this.renderHelpTextCol(infoTextFor["includeLinkedPages"], false)}
${inputCol(html`
<sl-checkbox name="robots" ?checked=${this.formState.robots}>
${msg("Skip pages disallowed by robots.txt")}
</sl-checkbox>
`)}
${this.renderHelpTextCol(infoTextFor["robots"], false)}
${inputCol(html`
<sl-checkbox
name="failOnContentCheck"
@@ -1553,6 +1559,12 @@ https://example.net`}
</sl-checkbox>
`)}
${this.renderHelpTextCol(infoTextFor["includeLinkedPages"], false)}
${inputCol(html`
<sl-checkbox name="robots" ?checked=${this.formState.robots}>
${msg("Skip pages disallowed by robots.txt")}
</sl-checkbox>
`)}
${this.renderHelpTextCol(infoTextFor["robots"], false)}
${inputCol(html`
<sl-checkbox name="useSitemap" ?checked=${this.formState.useSitemap}>
${msg("Check for sitemap")}
@@ -3263,6 +3275,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
| "failOnFailedSeed"
| "failOnContentCheck"
| "saveStorage"
| "robots"
> {
const jsonSeeds = this.formState.seedListFormat === SeedListFormat.JSON;

@@ -3282,6 +3295,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
failOnFailedSeed: this.formState.failOnFailedSeed,
failOnContentCheck: this.formState.failOnContentCheck,
saveStorage: this.formState.saveStorage,
robots: this.formState.robots,
};

return config;
@@ -3295,6 +3309,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
| "failOnFailedSeed"
| "failOnContentCheck"
| "saveStorage"
| "robots"
> {
const primarySeedUrl = this.formState.primarySeedUrl;
const includeUrlList = this.formState.customIncludeUrlList
@@ -3327,6 +3342,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
failOnFailedSeed: false,
failOnContentCheck: this.formState.failOnContentCheck,
saveStorage: this.formState.saveStorage,
robots: this.formState.robots,
};
return config;
}
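For context only (not part of the diff), the seed config the editor submits would now carry the new flag alongside the existing booleans; the values below are placeholders for a workflow where the option was checked:

```ts
// Hypothetical shape of the submitted config; only `robots` is new here.
const config = {
  extraHops: 0,
  useSitemap: false,
  failOnFailedSeed: false,
  failOnContentCheck: false,
  saveStorage: false,
  robots: true, // maps to RawCrawlConfig.robots, which defaults to false on the backend
};
```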
3 changes: 3 additions & 0 deletions frontend/src/strings/crawl-workflows/infoText.ts
@@ -85,6 +85,9 @@ export const infoTextFor = {
saveStorage: msg(
`Include data from the browser's local and session storage in the web archive.`,
),
robots: msg(
`Check for a /robots.txt file on each host and skip any disallowed pages.`,
),
} as const satisfies Partial<Record<Field, string | TemplateResult>>;

export default infoTextFor;
1 change: 1 addition & 0 deletions frontend/src/types/crawler.ts
@@ -55,6 +55,7 @@ export type SeedConfig = Expand<
customBehaviors: string[];
clickSelector: string;
saveStorage?: boolean;
robots?: boolean;
}
>;

3 changes: 3 additions & 0 deletions frontend/src/utils/workflow.ts
@@ -184,6 +184,7 @@ export type FormState = {
selectLinks: string[];
clickSelector: string;
saveStorage: WorkflowParams["config"]["saveStorage"];
robots: WorkflowParams["config"]["robots"];
};

export type FormStateField = keyof FormState;
@@ -246,6 +247,7 @@ export const getDefaultFormState = (): FormState => ({
clickSelector: DEFAULT_AUTOCLICK_SELECTOR,
customBehavior: false,
saveStorage: false,
robots: false,
});

export const mapSeedToUrl = (arr: Seed[]) =>
@@ -416,6 +418,7 @@ export function getInitialFormState(params: {
params.initialWorkflow.crawlerChannel || defaultFormState.crawlerChannel,
proxyId: params.initialWorkflow.proxyId || defaultFormState.proxyId,
saveStorage: params.initialWorkflow.config.saveStorage,
robots: params.initialWorkflow.config.robots,
...formState,
};
}