Skip to content

Commit b9441d1

Browse files
authored
New Components - firecrawl (#13574)
* firecrawl init * [Components] firecrawl #13524 Actions - Crawl URL - Get Crawl Data - Scrape Page * pnpm update * some adjusts
1 parent 5631a76 commit b9441d1

File tree

7 files changed

+587
-7
lines changed

7 files changed

+587
-7
lines changed
Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
import firecrawl from "../../firecrawl.app.mjs";
2+
3+
/**
 * Pipedream action "Crawl URL".
 *
 * Submits a crawl request through `this.firecrawl.crawl` with the start URL,
 * a `crawlerOptions` object and a `pageOptions` object built from the props
 * below, exports a summary containing `response.jobId`, and returns the raw
 * response.
 */
export default {
  key: "firecrawl-crawl-url",
  name: "Crawl URL",
  description: "Crawls a given input URL and returns the contents of sub-pages. [See the documentation](https://docs.firecrawl.dev/api-reference/endpoint/crawl)",
  version: "0.0.1",
  type: "action",
  props: {
    firecrawl,
    url: {
      propDefinition: [
        firecrawl,
        "url",
      ],
    },
    includes: {
      propDefinition: [
        firecrawl,
        "includes",
      ],
      optional: true,
    },
    excludes: {
      propDefinition: [
        firecrawl,
        "excludes",
      ],
      optional: true,
    },
    generateImgAltText: {
      propDefinition: [
        firecrawl,
        "generateImgAltText",
      ],
      optional: true,
    },
    returnOnlyUrls: {
      propDefinition: [
        firecrawl,
        "returnOnlyUrls",
      ],
      optional: true,
    },
    maxDepth: {
      propDefinition: [
        firecrawl,
        "maxDepth",
      ],
      optional: true,
    },
    mode: {
      propDefinition: [
        firecrawl,
        "mode",
      ],
      optional: true,
    },
    ignoreSitemap: {
      propDefinition: [
        firecrawl,
        "ignoreSitemap",
      ],
      optional: true,
    },
    limit: {
      propDefinition: [
        firecrawl,
        "limit",
      ],
      optional: true,
    },
    allowBackwardCrawling: {
      propDefinition: [
        firecrawl,
        "allowBackwardCrawling",
      ],
      optional: true,
    },
    allowExternalContentLinks: {
      propDefinition: [
        firecrawl,
        "allowExternalContentLinks",
      ],
      optional: true,
    },
    headers: {
      propDefinition: [
        firecrawl,
        "headers",
      ],
      optional: true,
    },
    includeHtml: {
      propDefinition: [
        firecrawl,
        "includeHtml",
      ],
      optional: true,
    },
    includeRawHtml: {
      propDefinition: [
        firecrawl,
        "includeRawHtml",
      ],
      optional: true,
    },
    onlyIncludeTags: {
      propDefinition: [
        firecrawl,
        "onlyIncludeTags",
      ],
      optional: true,
    },
    onlyMainContent: {
      propDefinition: [
        firecrawl,
        "onlyMainContent",
      ],
      optional: true,
    },
    removeTags: {
      propDefinition: [
        firecrawl,
        "removeTags",
      ],
      optional: true,
    },
    replaceAllPathsWithAbsolutePaths: {
      propDefinition: [
        firecrawl,
        "replaceAllPathsWithAbsolutePaths",
      ],
      optional: true,
    },
    screenshot: {
      propDefinition: [
        firecrawl,
        "screenshot",
      ],
      optional: true,
    },
    fullPageScreenshot: {
      propDefinition: [
        firecrawl,
        "fullPageScreenshot",
      ],
      optional: true,
    },
    waitFor: {
      propDefinition: [
        firecrawl,
        "waitFor",
      ],
      optional: true,
    },
  },
  /**
   * Starts the crawl job.
   *
   * @param {object} ctx - Run context; only `$` is read here.
   * @returns {Promise<*>} Whatever `this.firecrawl.crawl` resolves to.
   */
  async run({ $ }) {
    // `maxDepth` and `waitFor` are optional. parseInt(undefined) yields NaN,
    // and NaN becomes `null` under JSON serialization, so an unset prop would
    // be sent as an explicit null. Leaving the value undefined instead keeps
    // it in line with the other unset optional props in this payload.
    const toOptionalInt = (value) => {
      if (value === undefined || value === null || value === "") {
        return undefined;
      }
      return Number.parseInt(value, 10);
    };

    const response = await this.firecrawl.crawl({
      $,
      data: {
        url: this.url,
        crawlerOptions: {
          includes: this.includes,
          excludes: this.excludes,
          generateImgAltText: this.generateImgAltText,
          returnOnlyUrls: this.returnOnlyUrls,
          maxDepth: toOptionalInt(this.maxDepth),
          mode: this.mode,
          ignoreSitemap: this.ignoreSitemap,
          limit: this.limit,
          allowBackwardCrawling: this.allowBackwardCrawling,
          allowExternalContentLinks: this.allowExternalContentLinks,
        },
        pageOptions: {
          headers: this.headers,
          includeHtml: this.includeHtml,
          includeRawHtml: this.includeRawHtml,
          onlyIncludeTags: this.onlyIncludeTags,
          onlyMainContent: this.onlyMainContent,
          removeTags: this.removeTags,
          replaceAllPathsWithAbsolutePaths: this.replaceAllPathsWithAbsolutePaths,
          screenshot: this.screenshot,
          fullPageScreenshot: this.fullPageScreenshot,
          waitFor: toOptionalInt(this.waitFor),
        },
      },
    });

    $.export("$summary", `Crawl job started with jobId: ${response.jobId}`);
    return response;
  },
};
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import firecrawl from "../../firecrawl.app.mjs";
2+
3+
/**
 * Pipedream action "Get Crawl Status".
 *
 * Passes the configured `crawlId` to `this.firecrawl.getCrawlStatus`, exports
 * a summary naming that ID, and returns the raw response.
 */
export default {
  key: "firecrawl-get-crawl-status",
  name: "Get Crawl Status",
  description: "Obtains the status and data from a previous crawl operation. [See the documentation](https://docs.firecrawl.dev/api-reference/endpoint/status)",
  version: "0.0.1",
  type: "action",
  props: {
    firecrawl,
    crawlId: {
      propDefinition: [
        firecrawl,
        "crawlId",
      ],
    },
  },
  /**
   * Looks up the crawl job.
   *
   * @param {object} ctx - Run context; only `$` is read here.
   * @returns {Promise<*>} Whatever `this.firecrawl.getCrawlStatus` resolves to.
   */
  async run({ $ }) {
    const response = await this.firecrawl.getCrawlStatus({
      $,
      crawlId: this.crawlId,
    });

    $.export("$summary", `Successfully retrieved status for crawl ID: ${this.crawlId}`);
    return response;
  },
};
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
import { parseObject } from "../../common/utils.mjs";
2+
import firecrawl from "../../firecrawl.app.mjs";
3+
4+
/**
 * Pipedream action "Scrape Page".
 *
 * Builds an `extractorOptions` object from whichever extractor props are set,
 * sends it together with the URL, `pageOptions` and `timeout` through
 * `this.firecrawl.scrape`, exports a summary naming the URL, and returns the
 * raw response.
 */
export default {
  key: "firecrawl-scrape-page",
  name: "Scrape Page",
  description: "Scrapes a URL and returns content from that page. [See the documentation](https://docs.firecrawl.dev/api-reference/endpoint/scrape)",
  version: "0.0.1",
  type: "action",
  props: {
    firecrawl,
    url: {
      propDefinition: [
        firecrawl,
        "url",
      ],
      description: "The URL to start scraping from.",
    },
    extractorMode: {
      propDefinition: [
        firecrawl,
        "extractorMode",
      ],
      optional: true,
    },
    extractionPrompt: {
      propDefinition: [
        firecrawl,
        "extractionPrompt",
      ],
      optional: true,
    },
    extractionSchema: {
      propDefinition: [
        firecrawl,
        "extractionSchema",
      ],
      optional: true,
    },

    headers: {
      propDefinition: [
        firecrawl,
        "headers",
      ],
      optional: true,
    },
    includeHtml: {
      propDefinition: [
        firecrawl,
        "includeHtml",
      ],
      optional: true,
    },
    includeRawHtml: {
      propDefinition: [
        firecrawl,
        "includeRawHtml",
      ],
      optional: true,
    },
    onlyIncludeTags: {
      propDefinition: [
        firecrawl,
        "onlyIncludeTags",
      ],
      optional: true,
    },
    onlyMainContent: {
      propDefinition: [
        firecrawl,
        "onlyMainContent",
      ],
      optional: true,
    },
    removeTags: {
      propDefinition: [
        firecrawl,
        "removeTags",
      ],
      optional: true,
    },
    replaceAllPathsWithAbsolutePaths: {
      propDefinition: [
        firecrawl,
        "replaceAllPathsWithAbsolutePaths",
      ],
      optional: true,
    },
    screenshot: {
      propDefinition: [
        firecrawl,
        "screenshot",
      ],
      optional: true,
    },
    fullPageScreenshot: {
      propDefinition: [
        firecrawl,
        "fullPageScreenshot",
      ],
      optional: true,
    },
    waitFor: {
      propDefinition: [
        firecrawl,
        "waitFor",
      ],
      optional: true,
    },
    timeout: {
      propDefinition: [
        firecrawl,
        "timeout",
      ],
      optional: true,
    },
  },
  /**
   * Performs the scrape request.
   *
   * @param {object} ctx - Run context; only `$` is read here.
   * @returns {Promise<*>} Whatever `this.firecrawl.scrape` resolves to.
   */
  async run({ $ }) {
    // `waitFor` is optional. parseInt(undefined) yields NaN, and NaN becomes
    // `null` under JSON serialization, so an unset prop would be sent as an
    // explicit null. Leaving it undefined matches the other unset props.
    const toOptionalInt = (value) => {
      if (value === undefined || value === null || value === "") {
        return undefined;
      }
      return Number.parseInt(value, 10);
    };

    // Only extractor settings the user actually provided are included.
    // NOTE(review): the mode is sent under the key `extractorMode`; confirm
    // against the Firecrawl scrape endpoint that this is the expected key.
    const extractorOptions = {};
    if (this.extractorMode) {
      extractorOptions.extractorMode = this.extractorMode;
    }
    if (this.extractionPrompt) {
      extractorOptions.extractionPrompt = this.extractionPrompt;
    }
    if (this.extractionSchema) {
      extractorOptions.extractionSchema = parseObject(this.extractionSchema);
    }

    const response = await this.firecrawl.scrape({
      $,
      data: {
        url: this.url,
        pageOptions: {
          headers: this.headers,
          includeHtml: this.includeHtml,
          includeRawHtml: this.includeRawHtml,
          onlyIncludeTags: this.onlyIncludeTags,
          onlyMainContent: this.onlyMainContent,
          removeTags: this.removeTags,
          replaceAllPathsWithAbsolutePaths: this.replaceAllPathsWithAbsolutePaths,
          screenshot: this.screenshot,
          fullPageScreenshot: this.fullPageScreenshot,
          waitFor: toOptionalInt(this.waitFor),
        },
        extractorOptions,
        timeout: this.timeout,
      },
    });

    $.export("$summary", `Successfully scraped content from ${this.url}`);
    return response;
  },
};
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
/**
 * Best-effort JSON decoding of a value that may arrive as a JSON string.
 *
 * - Falsy input (`undefined`, `null`, `""`, `0`, `false`) returns `undefined`.
 * - An array is mapped element by element: string elements are passed through
 *   `JSON.parse`, and any element that is not a string or fails to parse is
 *   kept unchanged.
 * - A string is passed through `JSON.parse`; if parsing throws, the original
 *   string is returned.
 * - Any other value is returned unchanged.
 *
 * @param {*} obj - Value to decode.
 * @returns {*} The decoded value, the original value, or `undefined`.
 */
export const parseObject = (obj) => {
  if (!obj) return undefined;

  // Parse failures are intentionally swallowed: input that is not valid JSON
  // is treated as plain text rather than as an error.
  const parseIfString = (value) => {
    if (typeof value !== "string") {
      return value;
    }
    try {
      return JSON.parse(value);
    } catch (e) {
      return value;
    }
  };

  if (Array.isArray(obj)) {
    return obj.map((item) => parseIfString(item));
  }
  return parseIfString(obj);
};

0 commit comments

Comments
 (0)