Skip to content

Commit f1a3d24

Browse files
luancazarine, coderabbitai[bot], and GTFalcao
authored
New Components - scrapeninja (#15753)
* scrapeninja init * [Components] scrapeninja #15137 Actions - Non JS Scraping - Scraping With JS Rendering * pnpm update * some adjusts * Update components/scrapeninja/actions/scraping-with-js-rendering/scraping-with-js-rendering.mjs Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> * Update components/scrapeninja/scrapeninja.app.mjs Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> * Renaming action files to match their names --------- Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> Co-authored-by: GTFalcao <gtfalcao96@gmail.com>
1 parent 492fbe1 commit f1a3d24

File tree

8 files changed

+592
-20
lines changed

8 files changed

+592
-20
lines changed

components/scrapeninja/.gitignore

Lines changed: 0 additions & 3 deletions
This file was deleted.
Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
import { ConfigurationError } from "@pipedream/platform";
2+
import {
3+
clearObj,
4+
parseError, parseObject,
5+
} from "../../common/utils.mjs";
6+
import scrapeninja from "../../scrapeninja.app.mjs";
7+
8+
/**
 * Action definition "Scrape with JS Rendering".
 *
 * Collects the configured props into a request payload, sends it through the
 * app's `scrapeJs` method and returns the response unchanged.
 */
export default {
  key: "scrapeninja-scrape-with-js-rendering",
  name: "Scrape with JS Rendering",
  description: "Uses the ScrapeNinja real Chrome browser engine to scrape pages that require JS rendering. [See the documentation](https://scrapeninja.net/docs/api-reference/scrape-js/)",
  version: "0.0.1",
  type: "action",
  // Every prop reuses a definition declared on the scrapeninja app.
  // All of them except `url` are explicitly overridden to be optional here.
  props: {
    scrapeninja,
    url: {
      propDefinition: [
        scrapeninja,
        "url",
      ],
    },
    waitForSelector: {
      propDefinition: [
        scrapeninja,
        "waitForSelector",
      ],
      optional: true,
    },
    postWaitTime: {
      propDefinition: [
        scrapeninja,
        "postWaitTime",
      ],
      optional: true,
    },
    dumpIframe: {
      propDefinition: [
        scrapeninja,
        "dumpIframe",
      ],
      optional: true,
    },
    waitForSelectorIframe: {
      propDefinition: [
        scrapeninja,
        "waitForSelectorIframe",
      ],
      optional: true,
    },
    extractorTargetIframe: {
      propDefinition: [
        scrapeninja,
        "extractorTargetIframe",
      ],
      optional: true,
    },
    headers: {
      propDefinition: [
        scrapeninja,
        "headers",
      ],
      optional: true,
    },
    retryNum: {
      propDefinition: [
        scrapeninja,
        "retryNum",
      ],
      optional: true,
    },
    geo: {
      propDefinition: [
        scrapeninja,
        "geo",
      ],
      optional: true,
    },
    proxy: {
      propDefinition: [
        scrapeninja,
        "proxy",
      ],
      optional: true,
    },
    timeout: {
      propDefinition: [
        scrapeninja,
        "timeout",
      ],
      optional: true,
    },
    textNotExpected: {
      propDefinition: [
        scrapeninja,
        "textNotExpected",
      ],
      optional: true,
    },
    statusNotExpected: {
      propDefinition: [
        scrapeninja,
        "statusNotExpected",
      ],
      optional: true,
    },
    blockImages: {
      propDefinition: [
        scrapeninja,
        "blockImages",
      ],
      optional: true,
    },
    blockMedia: {
      propDefinition: [
        scrapeninja,
        "blockMedia",
      ],
      optional: true,
    },
    screenshot: {
      propDefinition: [
        scrapeninja,
        "screenshot",
      ],
      optional: true,
    },
    catchAjaxHeadersUrlMask: {
      propDefinition: [
        scrapeninja,
        "catchAjaxHeadersUrlMask",
      ],
      optional: true,
    },
    viewportWidth: {
      propDefinition: [
        scrapeninja,
        "viewportWidth",
      ],
      optional: true,
    },
    viewportHeight: {
      propDefinition: [
        scrapeninja,
        "viewportHeight",
      ],
      optional: true,
    },
    viewportDeviceScaleFactor: {
      propDefinition: [
        scrapeninja,
        "viewportDeviceScaleFactor",
      ],
      optional: true,
    },
    viewportHasTouch: {
      propDefinition: [
        scrapeninja,
        "viewportHasTouch",
      ],
      optional: true,
    },
    viewportIsMobile: {
      propDefinition: [
        scrapeninja,
        "viewportIsMobile",
      ],
      optional: true,
    },
    viewportIsLandscape: {
      propDefinition: [
        scrapeninja,
        "viewportIsLandscape",
      ],
      optional: true,
    },
    extractor: {
      propDefinition: [
        scrapeninja,
        "extractor",
      ],
      optional: true,
    },
  },
  /**
   * Builds the request payload from the props and calls `scrapeJs`.
   *
   * @param {object} ctx - Run context.
   * @param {object} ctx.$ - Forwarded to the app call and used to export `$summary`.
   * @returns {Promise<*>} Whatever `this.scrapeninja.scrapeJs` resolves to.
   * @throws {ConfigurationError} When the caught error carries a `response`;
   *   the message is produced by `parseError(response.data)`.
   * @throws {*} The original error, untouched, when it has no `response`.
   */
  async run({ $ }) {
    try {
      // The viewport settings are collected separately so that the `viewport`
      // key is only attached when at least one entry is left after clearObj.
      // NOTE(review): clearObj presumably strips unset values — confirm in common/utils.mjs.
      const viewport = clearObj({
        width: this.viewportWidth,
        height: this.viewportHeight,
        deviceScaleFactor: this.viewportDeviceScaleFactor,
        hasTouch: this.viewportHasTouch,
        isMobile: this.viewportIsMobile,
        isLandscape: this.viewportIsLandscape,
      });

      const data = clearObj({
        url: this.url,
        waitForSelector: this.waitForSelector,
        postWaitTime: this.postWaitTime,
        dumpIframe: this.dumpIframe,
        waitForSelectorIframe: this.waitForSelectorIframe,
        extractorTargetIframe: this.extractorTargetIframe,
        headers: parseObject(this.headers),
        retryNum: this.retryNum,
        geo: this.geo,
        proxy: this.proxy,
        timeout: this.timeout,
        textNotExpected: parseObject(this.textNotExpected),
        statusNotExpected: parseObject(this.statusNotExpected),
        blockImages: this.blockImages,
        blockMedia: this.blockMedia,
        screenshot: this.screenshot,
        catchAjaxHeadersUrlMask: this.catchAjaxHeadersUrlMask,
        extractor: this.extractor,
      });

      if (Object.entries(viewport).length) {
        data.viewport = viewport;
      }

      const response = await this.scrapeninja.scrapeJs({
        $,
        data,
      });

      $.export("$summary", `Successfully scraped ${this.url} with JS rendering`);
      return response;
    } catch (error) {
      // Destructuring `{ response: { data } }` directly in the catch clause
      // throws a TypeError for any error that has no `response`, which hides
      // the real failure. Such errors are rethrown as-is instead.
      if (!error || !error.response) {
        throw error;
      }
      throw new ConfigurationError(parseError(error.response.data));
    }
  },
};
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
import { ConfigurationError } from "@pipedream/platform";
2+
import { parseObject } from "../../common/utils.mjs";
3+
import scrapeninja from "../../scrapeninja.app.mjs";
4+
5+
/**
 * Action definition "Scrape without JS".
 *
 * Sends the configured props to the app's `scrapeNonJs` method and returns
 * the response unchanged.
 */
export default {
  key: "scrapeninja-scrape-without-js",
  name: "Scrape without JS",
  description: "Use high-performance web scraping endpoint with Chrome browser TLS fingerprint, but without JavaScript execution and real browser overhead. [See the documentation](https://scrapeninja.net/docs/api-reference/scrape/)",
  version: "0.0.1",
  type: "action",
  // Every prop reuses a definition declared on the scrapeninja app.
  // All of them except `url` are explicitly overridden to be optional here.
  props: {
    scrapeninja,
    url: {
      propDefinition: [
        scrapeninja,
        "url",
      ],
    },
    headers: {
      propDefinition: [
        scrapeninja,
        "headers",
      ],
      optional: true,
    },
    retryNum: {
      propDefinition: [
        scrapeninja,
        "retryNum",
      ],
      optional: true,
    },
    geo: {
      propDefinition: [
        scrapeninja,
        "geo",
      ],
      optional: true,
    },
    proxy: {
      propDefinition: [
        scrapeninja,
        "proxy",
      ],
      optional: true,
    },
    followRedirects: {
      propDefinition: [
        scrapeninja,
        "followRedirects",
      ],
      optional: true,
    },
    timeout: {
      propDefinition: [
        scrapeninja,
        "timeout",
      ],
      optional: true,
    },
    textNotExpected: {
      propDefinition: [
        scrapeninja,
        "textNotExpected",
      ],
      optional: true,
    },
    statusNotExpected: {
      propDefinition: [
        scrapeninja,
        "statusNotExpected",
      ],
      optional: true,
    },
    extractor: {
      propDefinition: [
        scrapeninja,
        "extractor",
      ],
      optional: true,
    },
  },
  /**
   * Calls `scrapeNonJs` with a payload assembled from the props.
   *
   * @param {object} ctx - Run context.
   * @param {object} ctx.$ - Forwarded to the app call and used to export `$summary`.
   * @returns {Promise<*>} Whatever `this.scrapeninja.scrapeNonJs` resolves to.
   * @throws {ConfigurationError} When the caught error carries a `response`;
   *   the message is `response.data.message`, else `response.data.stderr`,
   *   else the original error's own message.
   * @throws {*} The original error, untouched, when it has no `response`.
   */
  async run({ $ }) {
    try {
      const response = await this.scrapeninja.scrapeNonJs({
        $,
        data: {
          url: this.url,
          headers: parseObject(this.headers),
          retryNum: this.retryNum,
          geo: this.geo,
          proxy: this.proxy,
          followRedirects: this.followRedirects,
          timeout: this.timeout,
          textNotExpected: parseObject(this.textNotExpected),
          statusNotExpected: parseObject(this.statusNotExpected),
          extractor: this.extractor,
        },
      });
      $.export("$summary", "Successfully scraped the URL");
      return response;
    } catch (error) {
      // Destructuring `{ response: { data } }` directly in the catch clause
      // throws a TypeError for any error that has no `response`, which hides
      // the real failure. Such errors are rethrown as-is instead.
      if (!error || !error.response) {
        throw error;
      }
      const data = error.response.data;
      // Fall back to the original error's message so the ConfigurationError
      // is never built from `undefined` when the body has neither field.
      const message = (data && (data.message || data.stderr)) || error.message;
      throw new ConfigurationError(message);
    }
  },
};

components/scrapeninja/app/scrapeninja.app.ts

Lines changed: 0 additions & 13 deletions
This file was deleted.

0 commit comments

Comments
 (0)