Skip to content

Commit 6a3b3e9

Browse files
Commit without API
1 parent 1eacea1 commit 6a3b3e9

File tree

3 files changed

+74
-14
lines changed

3 files changed

+74
-14
lines changed

deploy/docker/server.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -482,9 +482,14 @@ async def crawl(
482482
):
483483
"""
484484
Crawl a list of URLs and return the results as JSON.
485+
For streaming responses, use /crawl/stream endpoint.
485486
"""
486487
if not crawl_request.urls:
487488
raise HTTPException(400, "At least one URL required")
489+
# If the crawler config asks for streaming, delegate to the shared streaming handler (no HTTP redirect involved)
490+
crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
491+
if crawler_config.stream:
492+
return await stream_process(crawl_request=crawl_request)
488493
results = await handle_crawl_request(
489494
urls=crawl_request.urls,
490495
browser_config=crawl_request.browser_config,
@@ -506,12 +511,16 @@ async def crawl_stream(
506511
):
507512
if not crawl_request.urls:
508513
raise HTTPException(400, "At least one URL required")
514+
515+
return await stream_process(crawl_request=crawl_request)
516+
517+
async def stream_process(crawl_request: CrawlRequest):
509518
crawler, gen = await handle_stream_crawl_request(
510519
urls=crawl_request.urls,
511520
browser_config=crawl_request.browser_config,
512521
crawler_config=crawl_request.crawler_config,
513522
config=config,
514-
)
523+
)
515524
return StreamingResponse(
516525
stream_results(crawler, gen),
517526
media_type="application/x-ndjson",

deploy/docker/static/playground/index.html

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -182,8 +182,8 @@ <h1 class="text-lg font-medium flex items-center space-x-4">
182182
<div class="px-4 py-2 border-b border-border flex items-center">
183183
<h2 class="font-medium">Request Builder</h2>
184184
<select id="endpoint" class="ml-auto bg-dark border border-border rounded px-2 py-1 text-sm">
185-
<option value="crawl">/crawl (batch)</option>
186-
<option value="crawl_stream">/crawl/stream</option>
185+
<option value="crawl">/crawl (supports streaming)</option>
186+
<option value="crawl_stream">/crawl/stream (legacy)</option>
187187
<option value="md">/md</option>
188188
<option value="llm">/llm</option>
189189
</select>
@@ -371,7 +371,7 @@ <h2 class="font-medium text-accent">🔥 Stress Test</h2>
371371

372372
<div class="flex items-center">
373373
<input id="st-stream" type="checkbox" class="mr-2">
374-
<label for="st-stream" class="text-sm">Use /crawl/stream</label>
374+
<label for="st-stream" class="text-sm">Enable streaming mode</label>
375375
<button id="st-run"
376376
class="ml-auto bg-accent text-dark px-4 py-2 rounded hover:bg-opacity-90 font-medium">
377377
Run Stress Test
@@ -596,6 +596,14 @@ <h2 class="font-medium text-accent">🔥 Stress Test</h2>
596596
forceHighlightElement(curlCodeEl);
597597
}
598598

599+
// Detect if stream is requested inside payload
600+
/**
 * Decide whether a request payload asks for a streamed response.
 *
 * The flag is looked up in three places:
 *   - payload.crawler_config.params.stream  (serialized { type, params } config)
 *   - payload.crawler_config.stream         (flat config, e.g. a minimal
 *                                            { crawler_config: { stream: true } } payload)
 *   - payload.stream                        (top-level flag)
 *
 * @param {?Object} payload - Request body about to be sent; may be null/undefined.
 * @returns {boolean} true when any of the locations holds `true` or the
 *     string "true" (case-insensitive); false otherwise.
 */
function shouldUseStream(payload) {
    // Accept the boolean `true` as well as its string spelling in any letter case.
    const toBool = (v) => v === true || (typeof v === 'string' && v.toLowerCase() === 'true');

    if (!payload || typeof payload !== 'object') {
        return false;
    }

    const crawlerCfg = payload.crawler_config;
    const cfgIsObject = crawlerCfg !== null && typeof crawlerCfg === 'object';

    // Nested form: { crawler_config: { type: ..., params: { stream: true } } }
    const fromParams = cfgIsObject && crawlerCfg.params ? crawlerCfg.params.stream : undefined;
    // Flat form: { crawler_config: { stream: true } } — previously ignored, which
    // sent stream-only payloads down the non-streaming response path.
    const fromFlat = cfgIsObject ? crawlerCfg.stream : undefined;

    return toBool(fromParams) || toBool(fromFlat) || toBool(payload.stream);
}
606+
599607
// Main run function
600608
async function runCrawl() {
601609
const endpoint = document.getElementById('endpoint').value;
@@ -611,16 +619,24 @@ <h2 class="font-medium text-accent">🔥 Stress Test</h2>
611619
: { browser_config: cfgJson };
612620
}
613621
} catch (err) {
614-
updateStatus('error');
615-
document.querySelector('#response-content code').textContent =
616-
JSON.stringify({ error: err.message }, null, 2);
617-
forceHighlightElement(document.querySelector('#response-content code'));
618-
return; // stop run
622+
const codeText = cm.getValue();
623+
const streamFlag = /stream\s*=\s*True/i.test(codeText);
624+
const isCrawlEndpoint = document.getElementById('endpoint').value === 'crawl';
625+
if (isCrawlEndpoint && streamFlag) {
626+
// Fallback: the config could not be parsed, but the code clearly sets stream=True — continue with a minimal stream-only config
627+
advConfig = { crawler_config: { stream: true } };
628+
} else {
629+
updateStatus('error');
630+
document.querySelector('#response-content code').textContent =
631+
JSON.stringify({ error: err.message }, null, 2);
632+
forceHighlightElement(document.querySelector('#response-content code'));
633+
return; // stop run
634+
}
619635
}
620636

621637
const endpointMap = {
622638
crawl: '/crawl',
623-
// crawl_stream: '/crawl/stream',
639+
crawl_stream: '/crawl/stream', // Keep for backward compatibility
624640
md: '/md',
625641
llm: '/llm'
626642
};
@@ -647,7 +663,7 @@ <h2 class="font-medium text-accent">🔥 Stress Test</h2>
647663
// This will be handled directly in the fetch below
648664
payload = null;
649665
} else {
650-
// Default payload for /crawl and /crawl/stream
666+
// Default payload for /crawl (supports both streaming and batch modes)
651667
payload = {
652668
urls,
653669
...advConfig
@@ -659,6 +675,7 @@ <h2 class="font-medium text-accent">🔥 Stress Test</h2>
659675
try {
660676
const startTime = performance.now();
661677
let response, responseData;
678+
const useStreamOverride = (endpoint === 'crawl') && shouldUseStream(payload);
662679

663680
if (endpoint === 'llm') {
664681
// Special handling for LLM endpoint which uses URL pattern: /llm/{encoded_url}?q={query}
@@ -681,8 +698,8 @@ <h2 class="font-medium text-accent">🔥 Stress Test</h2>
681698
document.querySelector('#response-content code').textContent = JSON.stringify(responseData, null, 2);
682699
document.querySelector('#response-content code').className = 'json hljs';
683700
forceHighlightElement(document.querySelector('#response-content code'));
684-
} else if (endpoint === 'crawl_stream') {
685-
// Stream processing
701+
} else if (endpoint === 'crawl_stream' || useStreamOverride) {
702+
// Stream processing - now handled directly by /crawl endpoint
686703
response = await fetch(api, {
687704
method: 'POST',
688705
headers: { 'Content-Type': 'application/json' },
@@ -757,6 +774,7 @@ <h2 class="font-medium text-accent">🔥 Stress Test</h2>
757774
const question = document.getElementById('llm-question').value.trim() || "What is this page about?";
758775
generateSnippets(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, null, 'GET');
759776
} else {
777+
// Use the same API endpoint for both streaming and non-streaming
760778
generateSnippets(api, payload);
761779
}
762780
} catch (error) {
@@ -786,7 +804,7 @@ <h2 class="font-medium text-accent">🔥 Stress Test</h2>
786804
document.getElementById('stress-avg-time').textContent = '0';
787805
document.getElementById('stress-peak-mem').textContent = '0';
788806

789-
const api = useStream ? '/crawl/stream' : '/crawl';
807+
const api = '/crawl'; // Always use /crawl - backend handles streaming internally
790808
const urls = Array.from({ length: total }, (_, i) => `https://httpbin.org/anything/stress-${i}-${Date.now()}`);
791809
const chunks = [];
792810

tests/docker/test_server_requests.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,40 @@ async def test_simple_crawl_single_url(self, async_client: httpx.AsyncClient):
143143
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
144144
# We don't specify a markdown generator in this test, so don't make assumptions about markdown field
145145
# It might be null, missing, or populated depending on the server's default behavior
146+
async def test_crawl_with_stream_direct(self, async_client: httpx.AsyncClient):
    """POST a stream=True crawler config to /crawl and expect an NDJSON stream back.

    The request goes to /crawl (not /crawl/stream); the response must carry the
    streaming content type and status header and yield exactly one crawl result.
    """
    browser_settings = {
        "type": "BrowserConfig",
        "params": {"headless": True},
    }
    crawler_settings = {
        "type": "CrawlerRunConfig",
        "params": {
            "stream": True,  # the flag under test: ask /crawl itself to stream
            "screenshot": False,
            "cache_mode": CacheMode.BYPASS.value,
        },
    }
    payload = {
        "urls": [SIMPLE_HTML_URL],
        "browser_config": browser_settings,
        "crawler_config": crawler_settings,
    }

    # Hit /crawl directly; the streaming variant of the response is expected.
    async with async_client.stream("POST", "/crawl", json=payload) as response:
        assert response.status_code == 200
        assert response.headers["content-type"] == "application/x-ndjson"
        assert response.headers.get("x-stream-status") == "active"

        streamed = await process_streaming_response(response)

        assert len(streamed) == 1
        only_result = streamed[0]
        await assert_crawl_result_structure(only_result)
        assert only_result["success"] is True
        assert only_result["url"] == SIMPLE_HTML_URL
        assert "<h1>Herman Melville - Moby-Dick</h1>" in only_result["html"]
147180
async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient):
148181
"""Test /crawl/stream with a single URL and simple config values."""
149182
payload = {

0 commit comments

Comments
 (0)