Skip to content

Commit cdcb883

Browse files
authored
Merge pull request unclecode#1605 from Nstproxy/feat/nstproxy
feat: Add Nstproxy Proxies
2 parents b207ae2 + 8045216 commit cdcb883

17 files changed

+752
-6
lines changed

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,10 @@ async def test_news_crawl():
544544

545545
</details>
546546

547+
---
548+
549+
> **💡 Tip:** Some websites may use **CAPTCHA** based verification mechanisms to prevent automated access. If your workflow encounters such challenges, you may optionally integrate a third-party CAPTCHA-handling service such as <strong>[CapSolver](https://www.capsolver.com/blog/Partners/crawl4ai-capsolver/?utm_source=crawl4ai&utm_medium=github_pr&utm_campaign=crawl4ai_integration)</strong>. They support reCAPTCHA v2/v3, Cloudflare Turnstile, Challenge, AWS WAF, and more. Please ensure that your usage complies with the target website’s terms of service and applicable laws.
550+
547551
## ✨ Recent Updates
548552

549553
<details>

crawl4ai/async_configs.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import os
22
from typing import Union
33
import warnings
4+
import requests
45
from .config import (
56
DEFAULT_PROVIDER,
67
DEFAULT_PROVIDER_API_KEY,
@@ -649,6 +650,85 @@ def load(data: dict) -> "BrowserConfig":
649650
return config
650651
return BrowserConfig.from_kwargs(config)
651652

653+
def set_nstproxy(
    self,
    token: str,
    channel_id: str,
    country: str = "ANY",
    state: str = "",
    city: str = "",
    protocol: str = "http",
    session_duration: int = 10,
):
    """
    Fetch a proxy from the NSTProxy API and assign it to ``proxy_config``.

    Get your NSTProxy token from: https://app.nstproxy.com/profile

    Args:
        token (str): NSTProxy API token.
        channel_id (str): NSTProxy channel ID.
        country (str, optional): Country code (default: "ANY").
        state (str, optional): State code (default: "").
        city (str, optional): City name (default: "").
        protocol (str, optional): Proxy protocol ("http" or "socks5").
            Defaults to "http".
        session_duration (int, optional): Session duration in minutes
            (0 = rotate on each request). Defaults to 10.

    Raises:
        ValueError: If required arguments are missing, the protocol is
            unsupported, or the API response format is invalid.
        PermissionError: If the API returns an error message.
        requests.RequestException: If the HTTP request itself fails.
    """
    # Validate input early so we never issue a doomed network request.
    if not token or not channel_id:
        raise ValueError("[NSTProxy] token and channel_id are required")

    if protocol not in ("http", "socks5"):
        raise ValueError(f"[NSTProxy] Invalid protocol: {protocol}")

    # Build the NSTProxy API query. Optional filters are added only when
    # non-empty so the API applies its own defaults otherwise.
    params = {
        "fType": 2,
        "count": 1,
        "channelId": channel_id,
        "country": country,
        "protocol": protocol,
        "sessionDuration": session_duration,
        "token": token,
    }
    if state:
        params["state"] = state
    if city:
        params["city"] = city

    url = "https://api.nstproxy.com/api/v1/generate/apiproxies"

    # Keep the try narrow: only the network round-trip and JSON decode can
    # fail unexpectedly. (Previously the try also wrapped the deliberate
    # PermissionError/ValueError raises below, so intentional errors were
    # misleadingly printed as "Failed to set proxy" before re-raising.)
    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        print(f"[NSTProxy] ❌ Failed to set proxy: {e}")
        raise

    # The API signals errors as a dict of the form {"err": ..., "msg": ...}.
    if isinstance(data, dict) and data.get("err"):
        raise PermissionError(f"[NSTProxy] API Error: {data.get('msg', 'Unknown error')}")

    # A successful response is a non-empty list of proxy records.
    if not isinstance(data, list) or not data:
        raise ValueError("[NSTProxy] Invalid API response — expected a non-empty list")

    proxy_info = data[0]

    # Apply the first returned proxy to this browser configuration.
    self.proxy_config = ProxyConfig(
        server=f"{protocol}://{proxy_info['ip']}:{proxy_info['port']}",
        username=proxy_info["username"],
        password=proxy_info["password"],
    )
731+
652732
class VirtualScrollConfig:
653733
"""Configuration for virtual scroll handling.
654734
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import asyncio
import capsolver
from crawl4ai import *


# TODO: fill in your own configuration.
# Docs: https://docs.capsolver.com/guide/captcha/awsWaf/
api_key = "CAP-xxxxxxxxxxxxxxxxxxxxx"  # CapSolver API key
site_url = "https://nft.porsche.com/onboarding@6"  # target page protected by AWS WAF
cookie_domain = ".nft.porsche.com"  # domain the WAF cookie is applied to
captcha_type = "AntiAwsWafTaskProxyLess"  # CapSolver task type
capsolver.api_key = api_key


async def main():
    """Open the target page, solve the AWS WAF challenge, then re-crawl with the token cookie."""
    browser_cfg = BrowserConfig(
        verbose=True,
        headless=False,
        use_persistent_context=True,
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        # First visit: load the WAF-protected page inside a named session.
        await crawler.arun(
            url=site_url,
            cache_mode=CacheMode.BYPASS,
            session_id="session_captcha_test",
        )

        # Ask CapSolver for a valid aws-waf-token cookie.
        solution = capsolver.solve({
            "type": captcha_type,
            "websiteURL": site_url,
        })
        waf_cookie = solution["cookie"]
        print("aws waf cookie:", waf_cookie)

        # Inject the cookie into the page and reload it.
        inject_js = f"""
        document.cookie = 'aws-waf-token={waf_cookie};domain={cookie_domain};path=/';
        location.reload();
        """

        # The real page title only appears once the WAF check has passed.
        title_ready = """() => {
            return document.title === 'Join Porsche’s journey into Web3';
        }"""

        follow_up_cfg = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            session_id="session_captcha_test",
            js_code=inject_js,
            js_only=True,
            wait_for=f"js:{title_ready}",
        )

        page = await crawler.arun(
            url=site_url,
            config=follow_up_cfg,
        )
        print(page.markdown)


if __name__ == "__main__":
    asyncio.run(main())
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import asyncio
import capsolver
from crawl4ai import *


# TODO: fill in your own configuration.
# Docs: https://docs.capsolver.com/guide/captcha/cloudflare_challenge/
api_key = "CAP-xxxxxxxxxxxxxxxxxxxxx"  # CapSolver API key
site_url = "https://gitlab.com/users/sign_in"  # target page behind the Cloudflare challenge
captcha_type = "AntiCloudflareTask"  # CapSolver task type
# HTTP proxy used by CapSolver to solve the Cloudflare challenge
proxy_server = "proxy.example.com:8080"
proxy_username = "myuser"
proxy_password = "mypass"
capsolver.api_key = api_key


async def main():
    """Solve the Cloudflare challenge via CapSolver, then crawl with the issued cookies."""
    # CapSolver returns the challenge cookies plus the user agent they were issued for.
    solution = capsolver.solve({
        "type": captcha_type,
        "websiteURL": site_url,
        "proxy": f"{proxy_server}:{proxy_username}:{proxy_password}",
    })
    challenge_cookies = solution["cookies"]
    solved_user_agent = solution["userAgent"]
    print("challenge cookies:", challenge_cookies)

    # Convert the cookie mapping into the list-of-dicts shape BrowserConfig expects.
    cookie_entries = [
        {"name": cookie_name, "value": cookie_value, "url": site_url}
        for cookie_name, cookie_value in challenge_cookies.items()
    ]

    # The crawl must reuse the same proxy and user agent the challenge was solved with.
    browser_cfg = BrowserConfig(
        verbose=True,
        headless=False,
        use_persistent_context=True,
        user_agent=solved_user_agent,
        cookies=cookie_entries,
        proxy_config={
            "server": f"http://{proxy_server}",
            "username": proxy_username,
            "password": proxy_password,
        },
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        page = await crawler.arun(
            url=site_url,
            cache_mode=CacheMode.BYPASS,
            session_id="session_captcha_test",
        )
        print(page.markdown)


if __name__ == "__main__":
    asyncio.run(main())
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import asyncio
import capsolver
from crawl4ai import *


# TODO: fill in your own configuration.
# Docs: https://docs.capsolver.com/guide/captcha/cloudflare_turnstile/
api_key = "CAP-xxxxxxxxxxxxxxxxxxxxx"  # CapSolver API key
site_key = "0x4AAAAAAAGlwMzq_9z6S9Mh"  # Turnstile site key of the target page
site_url = "https://clifford.io/demo/cloudflare-turnstile"  # target page URL
captcha_type = "AntiTurnstileTaskProxyLess"  # CapSolver task type
capsolver.api_key = api_key


async def main():
    """Load the Turnstile demo, solve the widget via CapSolver, and submit the form."""
    browser_cfg = BrowserConfig(
        verbose=True,
        headless=False,
        use_persistent_context=True,
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        # First visit: render the page with the Turnstile widget in a named session.
        await crawler.arun(
            url=site_url,
            cache_mode=CacheMode.BYPASS,
            session_id="session_captcha_test",
        )

        # Obtain a Turnstile response token from CapSolver.
        solution = capsolver.solve({
            "type": captcha_type,
            "websiteURL": site_url,
            "websiteKey": site_key,
        })
        turnstile_token = solution["token"]
        print("turnstile token:", turnstile_token)

        # Fill the hidden response field with the token and submit the form.
        submit_js = f"""
        document.querySelector('input[name="cf-turnstile-response"]').value = '{turnstile_token}';
        document.querySelector('button[type="submit"]').click();
        """

        # The demo page drops its <h1> heading after a successful submit.
        success_check = """() => {
            const items = document.querySelectorAll('h1');
            return items.length === 0;
        }"""

        follow_up_cfg = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            session_id="session_captcha_test",
            js_code=submit_js,
            js_only=True,
            wait_for=f"js:{success_check}",
        )

        page = await crawler.arun(
            url=site_url,
            config=follow_up_cfg,
        )
        print(page.markdown)


if __name__ == "__main__":
    asyncio.run(main())
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import asyncio
import capsolver
from crawl4ai import *


# TODO: fill in your own configuration.
# Docs: https://docs.capsolver.com/guide/captcha/ReCaptchaV2/
api_key = "CAP-xxxxxxxxxxxxxxxxxxxxx"  # CapSolver API key
site_key = "6LfW6wATAAAAAHLqO2pb8bDBahxlMxNdo9g947u9"  # reCAPTCHA site key of the target page
site_url = "https://recaptcha-demo.appspot.com/recaptcha-v2-checkbox.php"  # target page URL
captcha_type = "ReCaptchaV2TaskProxyLess"  # CapSolver task type
capsolver.api_key = api_key


async def main():
    """Load the reCAPTCHA v2 demo, solve it via CapSolver, and submit the form."""
    browser_cfg = BrowserConfig(
        verbose=True,
        headless=False,
        use_persistent_context=True,
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        # First visit: render the page with the reCAPTCHA widget in a named session.
        await crawler.arun(
            url=site_url,
            cache_mode=CacheMode.BYPASS,
            session_id="session_captcha_test",
        )

        # Obtain a g-recaptcha-response token from CapSolver.
        solution = capsolver.solve({
            "type": captcha_type,
            "websiteURL": site_url,
            "websiteKey": site_key,
        })
        recaptcha_token = solution["gRecaptchaResponse"]
        print("recaptcha token:", recaptcha_token)

        # Fill the hidden response textarea with the token and submit the form.
        # (Plain concatenation: the JS body contains braces, so an f-string
        # would need them doubled.)
        submit_js = """
        const textarea = document.getElementById('g-recaptcha-response');
        if (textarea) {
            textarea.value = \"""" + recaptcha_token + """\";
            document.querySelector('button.form-field[type="submit"]').click();
        }
        """

        # The demo's result page adds extra <h2> headings after submission.
        success_check = """() => {
            const items = document.querySelectorAll('h2');
            return items.length > 1;
        }"""

        follow_up_cfg = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            session_id="session_captcha_test",
            js_code=submit_js,
            js_only=True,
            wait_for=f"js:{success_check}",
        )

        page = await crawler.arun(
            url=site_url,
            config=follow_up_cfg,
        )
        print(page.markdown)


if __name__ == "__main__":
    asyncio.run(main())

0 commit comments

Comments
 (0)