Skip to content

Commit 33f6217

Browse files
Firecrawl V1 migration / rehauling (#15834)
* pnpm
* Scrape Page migration + prop adjustments
* Get Crawl Status migration
* Crawl migration and prop adjustments
* Version bumps and adjustments
* Typo fix
* Add example to additional options desc
* Fixing description with broken backticks
* pnpm

---------

Co-authored-by: Leo Vu <vunguyenhung@outlook.com>
1 parent aa27714 commit 33f6217

File tree

11 files changed

+168
-412
lines changed

11 files changed

+168
-412
lines changed
Lines changed: 43 additions & 147 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1+
import { parseObjectEntries } from "../../common/utils.mjs";
12
import firecrawl from "../../firecrawl.app.mjs";
23

34
export default {
45
key: "firecrawl-crawl-url",
56
name: "Crawl URL",
6-
description: "Crawls a given input URL and returns the contents of sub-pages. [See the documentation](https://docs.firecrawl.dev/api-reference/endpoint/crawl)",
7-
version: "0.0.1",
7+
description: "Crawls a given URL and returns the contents of sub-pages. [See the documentation](https://docs.firecrawl.dev/api-reference/endpoint/crawl-post)",
8+
version: "1.0.0",
89
type: "action",
910
props: {
1011
firecrawl,
@@ -14,180 +15,75 @@ export default {
1415
"url",
1516
],
1617
},
17-
includes: {
18-
propDefinition: [
19-
firecrawl,
20-
"includes",
21-
],
18+
excludePaths: {
19+
type: "string[]",
20+
label: "Exclude Paths",
21+
description: "URL pathname regex patterns that exclude matching URLs from the crawl. For example, a value of `blog/.*` for the URL `firecrawl.dev` will exclude any results matching that pattern, such as `https://www.firecrawl.dev/blog/firecrawl-launch-week-1-recap`",
2222
optional: true,
2323
},
24-
excludes: {
25-
propDefinition: [
26-
firecrawl,
27-
"excludes",
28-
],
29-
optional: true,
30-
},
31-
generateImgAltText: {
32-
propDefinition: [
33-
firecrawl,
34-
"generateImgAltText",
35-
],
36-
optional: true,
37-
},
38-
returnOnlyUrls: {
39-
propDefinition: [
40-
firecrawl,
41-
"returnOnlyUrls",
42-
],
24+
includePaths: {
25+
type: "string[]",
26+
label: "Include Paths",
27+
description: "Similar to `Exclude Paths`, but if set, only the paths matching the specified patterns will be included",
4328
optional: true,
4429
},
4530
maxDepth: {
46-
propDefinition: [
47-
firecrawl,
48-
"maxDepth",
49-
],
50-
optional: true,
51-
},
52-
mode: {
53-
propDefinition: [
54-
firecrawl,
55-
"mode",
56-
],
31+
type: "integer",
32+
label: "Max Depth",
33+
description: "Maximum depth to crawl relative to the entered URL",
5734
optional: true,
5835
},
5936
ignoreSitemap: {
60-
propDefinition: [
61-
firecrawl,
62-
"ignoreSitemap",
63-
],
64-
optional: true,
65-
},
66-
limit: {
67-
propDefinition: [
68-
firecrawl,
69-
"limit",
70-
],
71-
optional: true,
72-
},
73-
allowBackwardCrawling: {
74-
propDefinition: [
75-
firecrawl,
76-
"allowBackwardCrawling",
77-
],
78-
optional: true,
79-
},
80-
allowExternalContentLinks: {
81-
propDefinition: [
82-
firecrawl,
83-
"allowExternalContentLinks",
84-
],
37+
type: "boolean",
38+
label: "Ignore Sitemap",
39+
description: "Ignore the website sitemap when crawling",
8540
optional: true,
8641
},
87-
headers: {
88-
propDefinition: [
89-
firecrawl,
90-
"headers",
91-
],
42+
ignoreQueryParameters: {
43+
type: "boolean",
44+
label: "Ignore Query Parameters",
45+
description: "Do not re-scrape the same path with different (or no) query parameters",
9246
optional: true,
9347
},
94-
includeHtml: {
95-
propDefinition: [
96-
firecrawl,
97-
"includeHtml",
98-
],
99-
optional: true,
100-
},
101-
includeRawHtml: {
102-
propDefinition: [
103-
firecrawl,
104-
"includeRawHtml",
105-
],
106-
optional: true,
107-
},
108-
onlyIncludeTags: {
109-
propDefinition: [
110-
firecrawl,
111-
"onlyIncludeTags",
112-
],
113-
optional: true,
114-
},
115-
onlyMainContent: {
116-
propDefinition: [
117-
firecrawl,
118-
"onlyMainContent",
119-
],
120-
optional: true,
121-
},
122-
removeTags: {
123-
propDefinition: [
124-
firecrawl,
125-
"removeTags",
126-
],
127-
optional: true,
128-
},
129-
replaceAllPathsWithAbsolutePaths: {
130-
propDefinition: [
131-
firecrawl,
132-
"replaceAllPathsWithAbsolutePaths",
133-
],
48+
limit: {
49+
type: "integer",
50+
label: "Limit",
51+
description: "Maximum number of pages to crawl",
13452
optional: true,
13553
},
136-
screenshot: {
137-
propDefinition: [
138-
firecrawl,
139-
"screenshot",
140-
],
54+
allowBackwardLinks: {
55+
type: "boolean",
56+
label: "Allow Backward Links",
57+
description: "Enables the crawler to navigate from a specific URL to previously linked pages",
14158
optional: true,
14259
},
143-
fullPageScreenshot: {
144-
propDefinition: [
145-
firecrawl,
146-
"fullPageScreenshot",
147-
],
60+
allowExternalLinks: {
61+
type: "boolean",
62+
label: "Allow External Links",
63+
description: "Allows the crawler to follow links to external websites",
14864
optional: true,
14965
},
150-
waitFor: {
66+
additionalOptions: {
15167
propDefinition: [
15268
firecrawl,
153-
"waitFor",
69+
"additionalOptions",
15470
],
155-
optional: true,
71+
description: "Additional parameters to send in the request. [See the documentation](https://docs.firecrawl.dev/api-reference/endpoint/crawl-post) for available parameters. Values will be parsed as JSON where applicable. For example, to add the `webhook` param, use the value `{\"webhook\": {\"url\": \"https://your-server-webhook-api.com\",\"headers\": {},\"metadata\": {},\"events\": [\"completed\"]}}`",
15672
},
15773
},
15874
async run({ $ }) {
159-
const response = await this.firecrawl.crawl({
75+
const {
76+
firecrawl, additionalOptions, ...data
77+
} = this;
78+
const response = await firecrawl.crawl({
16079
$,
16180
data: {
162-
url: this.url,
163-
crawlerOptions: {
164-
includes: this.includes,
165-
excludes: this.excludes,
166-
generateImgAltText: this.generateImgAltText,
167-
returnOnlyUrls: this.returnOnlyUrls,
168-
maxDepth: parseInt(this.maxDepth),
169-
mode: this.mode,
170-
ignoreSitemap: this.ignoreSitemap,
171-
limit: this.limit,
172-
allowBackwardCrawling: this.allowBackwardCrawling,
173-
allowExternalContentLinks: this.allowExternalContentLinks,
174-
},
175-
pageOptions: {
176-
headers: this.headers,
177-
includeHtml: this.includeHtml,
178-
includeRawHtml: this.includeRawHtml,
179-
onlyIncludeTags: this.onlyIncludeTags,
180-
onlyMainContent: this.onlyMainContent,
181-
removeTags: this.removeTags,
182-
replaceAllPathsWithAbsolutePaths: this.replaceAllPathsWithAbsolutePaths,
183-
screenshot: this.screenshot,
184-
fullPageScreenshot: this.fullPageScreenshot,
185-
waitFor: parseInt(this.waitFor),
186-
},
81+
...data,
82+
...(additionalOptions && parseObjectEntries(additionalOptions)),
18783
},
18884
});
18985

190-
$.export("$summary", `Crawl job started with jobId: ${response.jobId}`);
86+
$.export("$summary", `Crawl job started (ID: ${response.id})`);
19187
return response;
19288
},
19389
};

components/firecrawl/actions/get-crawl-status/get-crawl-status.mjs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@ import firecrawl from "../../firecrawl.app.mjs";
22

33
export default {
44
key: "firecrawl-get-crawl-status",
5-
name: "Get Crawl Status",
6-
description: "Obtains the status and data from a previous crawl operation. [See the documentation](https://docs.firecrawl.dev/api-reference/endpoint/status)",
7-
version: "0.0.1",
5+
name: "Get Crawl Data",
6+
description: "Obtains the status and data from a previous crawl operation. [See the documentation](https://docs.firecrawl.dev/api-reference/endpoint/crawl-get)",
7+
version: "0.0.2",
88
type: "action",
99
props: {
1010
firecrawl,
@@ -21,7 +21,7 @@ export default {
2121
crawlId: this.crawlId,
2222
});
2323

24-
$.export("$summary", `Successfully retrieved status for crawl ID: ${this.crawlId}`);
24+
$.export("$summary", `Successfully retrieved status for crawl (ID: ${this.crawlId})`);
2525
return response;
2626
},
2727
};

0 commit comments

Comments
 (0)