From f8d03dbdb2c684f7a64fb9b3d7bf1ae3eff65999 Mon Sep 17 00:00:00 2001 From: nina-kollman <59646487+nina-kollman@users.noreply.github.com> Date: Thu, 20 Nov 2025 09:50:55 +0200 Subject: [PATCH 01/17] in github --- .../traceloop/sdk/experiment/experiment.py | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py index 6f0baf70d3..68cabf2a4d 100644 --- a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py +++ b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py @@ -1,6 +1,7 @@ import cuid import asyncio import json +import os from typing import Any, List, Callable, Optional, Tuple, Dict from traceloop.sdk.client.http import HTTPClient from traceloop.sdk.datasets.datasets import Datasets @@ -36,6 +37,7 @@ async def run( dataset_version: Optional[str] = None, evaluators: Optional[List[EvaluatorDetails]] = None, experiment_slug: Optional[str] = None, + experiment_metadata: Optional[Dict[str, Any]] = None, related_ref: Optional[Dict[str, str]] = None, aux: Optional[Dict[str, str]] = None, stop_on_error: bool = False, @@ -48,6 +50,7 @@ async def run( task: Function to run on each dataset row evaluators: List of evaluator slugs to run experiment_slug: Slug for this experiment run + experiment_metadata: Metadata for this experiment (an experiment holds all the experinent runs) related_ref: Related reference for this experiment run aux: Auxiliary information for this experiment run stop_on_error: Whether to stop on first error (default: False) @@ -82,6 +85,7 @@ async def run( evaluator_slugs=[slug for slug, _ in evaluator_details] if evaluator_details else None, + experiment_metadata=experiment_metadata, experiment_run_metadata=experiment_run_metadata, ) @@ -177,6 +181,73 @@ async def run_with_semaphore(row) -> TaskResponse: return results, errors + async def run_in_github ( + self, + task: Callable[[Optional[Dict[str, Any]]], Dict[str, Any]], + dataset_slug: Optional[str] = None, + dataset_version: Optional[str] = None, + evaluators: Optional[List[EvaluatorDetails]] = None, + experiment_slug: Optional[str] = None, + related_ref: Optional[Dict[str, str]] = None, + aux: Optional[Dict[str, str]] = None, + stop_on_error: bool = False, + wait_for_results: bool = True, + ) -> Tuple[List[TaskResponse], List[str]]: + """Run an experiment with the given task and evaluators + + Args: + dataset_slug: Slug of the dataset to use + task: Function to run on each dataset row + evaluators: List of evaluator slugs to run + experiment_slug: Slug for this experiment run + related_ref: Related reference for this experiment run + aux: Auxiliary information for this experiment run + stop_on_error: Whether to stop on first error (default: False) + wait_for_results: Whether to wait for async tasks to complete (default: True) + + Returns: + Tuple of (results, errors). Returns ([], []) if wait_for_results is False + """ + + # Construct PR URL from repository and PR number + repository = os.getenv("GITHUB_REPOSITORY") + server_url = os.getenv("GITHUB_SERVER_URL", "https://github.com") + + # Extract PR number from GITHUB_REF (format: "refs/pull/123/merge") + github_ref = os.getenv("GITHUB_REF", "") + pr_number = None + if github_ref.startswith("refs/pull/"): + pr_number = github_ref.split("/")[2] + pr_url = f"{server_url}/{repository}/pull/{pr_number}" if pr_number and repository else None + + github_context = { + "github_pr_url": pr_url, + "github_repository": repository, + "github_commit_hash": os.getenv("GITHUB_SHA", ""), + "github_actor": os.getenv("GITHUB_ACTOR", ""), + } + merged_related_ref = {**github_context, **(related_ref or {})} + + experiment_metadata = { + "created_from": "github", + } + + results, errors = await self.run( + task=task, + dataset_slug=dataset_slug, + dataset_version=dataset_version, + evaluators=evaluators, + experiment_slug=experiment_slug, + related_ref=merged_related_ref, + experiment_metadata=experiment_metadata, + aux=github_context, + stop_on_error=stop_on_error, + wait_for_results=wait_for_results, + ) + + return results, errors + + def _init_experiment( self, experiment_slug: str, From 28212d2845493bd8a51ce9b399b33bb4283dece1 Mon Sep 17 00:00:00 2001 From: nina-kollman <59646487+nina-kollman@users.noreply.github.com> Date: Thu, 20 Nov 2025 10:01:21 +0200 Subject: [PATCH 02/17] added yml --- .github/workflows/run-experiments.yml | 39 +++++++++ experiments/run_research_experiment.py | 86 +++++++++++++++++++ .../traceloop/sdk/experiment/experiment.py | 2 +- 3 files changed, 126 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/run-experiments.yml create mode 100644 experiments/run_research_experiment.py diff --git a/.github/workflows/run-experiments.yml b/.github/workflows/run-experiments.yml new file mode 100644 index 0000000000..8236413b30 --- /dev/null +++ b/.github/workflows/run-experiments.yml @@ -0,0 +1,39 @@ +name: Run Research Experiment in GitHub CI/CD + +on: + pull_request: + types: [opened, synchronize, reopened, ready_for_review] + branches: + - main + +# To enable this workflow, set EXPERIMENTS_ENABLED to 'true' in repository variables +env: + EXPERIMENTS_ENABLED: 'true' + +jobs: + run-experiments: + runs-on: ubuntu-latest + if: github.event.pull_request.draft == false && env.EXPERIMENTS_ENABLED == 'true' + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.13' + cache: 'pip' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt || pip install traceloop-sdk openai python-dotenv + + - name: Run Research Agent Experiment + env: + TRACELOOP_API_KEY: ${{ secrets.TRACELOOP_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + TRACELOOP_BASE_URL: ${{ secrets.TRACELOOP_BASE_URL }} + run: | + python experiments/run_research_experiment.py diff --git a/experiments/run_research_experiment.py b/experiments/run_research_experiment.py new file mode 100644 index 0000000000..9f7d444f33 --- /dev/null +++ b/experiments/run_research_experiment.py @@ -0,0 +1,86 @@ +""" +Example experiment script for CI/CD using run_in_github +""" + +import asyncio +import os +from openai import AsyncOpenAI +from traceloop.sdk import Traceloop + +# Initialize Traceloop client +client = Traceloop.init( + app_name="research-experiment-ci-cd", + api_key=os.getenv("TRACELOOP_API_KEY"), + api_endpoint=os.getenv("TRACELOOP_BASE_URL"), +) + + +async def generate_research_response(question: str) -> str: + """Generate a research response using OpenAI""" + openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) + + response = await openai_client.chat.completions.create( + model="gpt-4", + messages=[ + { + "role": "system", + "content": "You are a helpful research assistant. Provide accurate, well-researched answers.", + }, + {"role": "user", "content": question}, + ], + temperature=0.7, + max_tokens=500, + ) + + return response.choices[0].message.content + + +async def research_task(row): + """Task function that processes each dataset row""" + question = row.get("question", "") + answer = await generate_research_response(question) + + print(f"Question: {question}") + print(f"Answer: {answer[:100]}...") + + return { + "completion": answer, + "question": question, + } + + +async def main(): + """Run experiment in GitHub context""" + print("šŸš€ Starting research experiment in GitHub CI/CD...") + + # Run experiment using run_in_github which automatically captures GitHub context + results, errors = await client.experiment.run_in_github( + dataset_slug="research-questions", + dataset_version="v1", + task=research_task, + evaluators=["accuracy", "relevance"], + experiment_slug="research-exp", + stop_on_error=False, + wait_for_results=True, + ) + + # Print results + print(f"\nāœ… Experiment completed!") + print(f"Total results: {len(results)}") + + if results: + print(f"\nSample result:") + print(f" Task output: {results[0].task_result}") + print(f" Evaluations: {results[0].evaluations}") + + if errors: + print(f"\nāš ļø Errors encountered: {len(errors)}") + for error in errors[:5]: # Show first 5 errors + print(f" - {error}") + exit(1) + + print("\nšŸŽ‰ All tasks completed successfully!") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py index 68cabf2a4d..48aec5cd4f 100644 --- a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py +++ b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py @@ -240,7 +240,7 @@ async def run_in_github ( experiment_slug=experiment_slug, related_ref=merged_related_ref, experiment_metadata=experiment_metadata, - aux=github_context, + aux=aux, stop_on_error=stop_on_error, wait_for_results=wait_for_results, ) From 6558d72342e984d7f22b4ee0327caa85d8bafb62 Mon Sep 17 00:00:00 2001 From: nina-kollman <59646487+nina-kollman@users.noreply.github.com> Date: Thu, 20 Nov 2025 10:07:07 +0200 Subject: [PATCH 03/17] added raise --- .../traceloop/sdk/experiment/experiment.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py index 48aec5cd4f..5816f06daa 100644 --- a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py +++ b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py @@ -207,8 +207,18 @@ async def run_in_github ( Returns: Tuple of (results, errors). Returns ([], []) if wait_for_results is False + + Raises: + RuntimeError: If not running in GitHub Actions environment """ + # Check if running in GitHub Actions + if not os.getenv("GITHUB_ACTIONS"): + raise RuntimeError( + "run_in_github() can only be used in GitHub Actions CI/CD environment. " + "To run experiments locally, use the run() method instead." + ) + # Construct PR URL from repository and PR number repository = os.getenv("GITHUB_REPOSITORY") server_url = os.getenv("GITHUB_SERVER_URL", "https://github.com") From bcd66b3e25d51eb5662e88beb71530642f499a29 Mon Sep 17 00:00:00 2001 From: nina-kollman <59646487+nina-kollman@users.noreply.github.com> Date: Thu, 20 Nov 2025 10:43:21 +0200 Subject: [PATCH 04/17] no semaphore --- experiments/run_research_experiment.py | 38 ++--- .../traceloop/sdk/experiment/experiment.py | 147 +++++++++++++++--- .../traceloop/sdk/experiment/model.py | 39 +++++ 3 files changed, 180 insertions(+), 44 deletions(-) diff --git a/experiments/run_research_experiment.py b/experiments/run_research_experiment.py index 9f7d444f33..3b72ffde1e 100644 --- a/experiments/run_research_experiment.py +++ b/experiments/run_research_experiment.py @@ -1,5 +1,10 @@ """ Example experiment script for CI/CD using run_in_github + +This script: +1. Executes tasks locally on the dataset +2. Sends task results to the backend +3. Backend runs evaluators and posts PR comment with results """ import asyncio @@ -51,35 +56,30 @@ async def research_task(row): async def main(): """Run experiment in GitHub context""" - print("šŸš€ Starting research experiment in GitHub CI/CD...") + print("šŸš€ Running research experiment in GitHub CI/CD...") - # Run experiment using run_in_github which automatically captures GitHub context - results, errors = await client.experiment.run_in_github( + # Execute tasks locally and send results to backend + response = await client.experiment.run_in_github( + task=research_task, dataset_slug="research-questions", dataset_version="v1", - task=research_task, evaluators=["accuracy", "relevance"], experiment_slug="research-exp", stop_on_error=False, - wait_for_results=True, ) - # Print results - print(f"\nāœ… Experiment completed!") - print(f"Total results: {len(results)}") - - if results: - print(f"\nSample result:") - print(f" Task output: {results[0].task_result}") - print(f" Evaluations: {results[0].evaluations}") + # Print response + print("\nāœ… Experiment completed and submitted!") + print(f"Experiment ID: {response.experiment_id}") + print(f"Experiment Slug: {response.experiment_slug}") + print(f"Run ID: {response.run_id}") + print(f"Status: {response.status}") - if errors: - print(f"\nāš ļø Errors encountered: {len(errors)}") - for error in errors[:5]: # Show first 5 errors - print(f" - {error}") - exit(1) + if response.message: + print(f"Message: {response.message}") - print("\nšŸŽ‰ All tasks completed successfully!") + print("\nšŸ“ The backend will run evaluators and post results to your PR.") + print(" Check your GitHub PR for the results comment.") if __name__ == "__main__": diff --git a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py index 5816f06daa..763bbbf734 100644 --- a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py +++ b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py @@ -13,6 +13,10 @@ CreateTaskResponse, EvaluatorDetails, TaskResponse, + RunInGithubRequest, + RunInGithubResponse, + TaskResult, + GithubContext, ) import httpx @@ -181,7 +185,61 @@ async def run_with_semaphore(row) -> TaskResponse: return results, errors - async def run_in_github ( + async def _execute_tasks( + self, + rows: List[Dict[str, Any]], + task: Callable[[Optional[Dict[str, Any]]], Dict[str, Any]], + stop_on_error: bool = False, + ) -> List[TaskResult]: + """Execute tasks locally with concurrency control + + Args: + rows: List of dataset rows to process + task: Function to run on each row + stop_on_error: Whether to stop on first error + + Returns: + List of TaskResult objects with inputs, outputs, and errors + """ + task_results: List[TaskResult] = [] + + async def run_single_row(row) -> TaskResult: + try: + task_output = await task(row) + return TaskResult( + task_input=row, + task_output=task_output, + ) + except Exception as e: + if stop_on_error: + raise e + return TaskResult( + task_input=row, + error=str(e), + ) + + # Execute tasks with concurrency control + semaphore = asyncio.Semaphore(50) + + async def run_with_semaphore(row) -> TaskResult: + async with semaphore: + return await run_single_row(row) + + tasks = [asyncio.create_task(run_with_semaphore(row)) for row in rows] + + for completed_task in asyncio.as_completed(tasks): + try: + result = await completed_task + task_results.append(result) + if result.error and stop_on_error: + break + except Exception as e: + if stop_on_error: + raise e + + return task_results + + async def run_in_github( self, task: Callable[[Optional[Dict[str, Any]]], Dict[str, Any]], dataset_slug: Optional[str] = None, @@ -191,25 +249,31 @@ async def run_in_github ( related_ref: Optional[Dict[str, str]] = None, aux: Optional[Dict[str, str]] = None, stop_on_error: bool = False, - wait_for_results: bool = True, - ) -> Tuple[List[TaskResponse], List[str]]: - """Run an experiment with the given task and evaluators + ) -> RunInGithubResponse: + """Execute tasks locally and submit results to backend for GitHub CI/CD + + This method: + 1. Fetches the dataset + 2. Executes all tasks locally + 3. Sends task results to backend + 4. Backend runs evaluators and posts PR comment Args: - dataset_slug: Slug of the dataset to use task: Function to run on each dataset row - evaluators: List of evaluator slugs to run + dataset_slug: Slug of the dataset to use + dataset_version: Version of the dataset + evaluators: List of evaluator slugs or (slug, version) tuples to run experiment_slug: Slug for this experiment run - related_ref: Related reference for this experiment run - aux: Auxiliary information for this experiment run + related_ref: Additional reference information for this experiment run + aux: Auxiliary metadata for this experiment run stop_on_error: Whether to stop on first error (default: False) - wait_for_results: Whether to wait for async tasks to complete (default: True) Returns: - Tuple of (results, errors). Returns ([], []) if wait_for_results is False + RunInGithubResponse with experiment_id, run_id, and status Raises: RuntimeError: If not running in GitHub Actions environment + Exception: If the API request fails """ # Check if running in GitHub Actions @@ -219,7 +283,19 @@ async def run_in_github ( "To run experiments locally, use the run() method instead." ) - # Construct PR URL from repository and PR number + if not experiment_slug: + experiment_slug = self._experiment_slug or "exp-" + str(cuid.cuid())[:11] + + # Fetch dataset rows + rows = [] + if dataset_slug and dataset_version: + jsonl_data = self._datasets.get_version_jsonl(dataset_slug, dataset_version) + rows = self._parse_jsonl_to_rows(jsonl_data) + + # Execute all tasks locally + task_results = await self._execute_tasks(rows, task, stop_on_error) + + # Construct GitHub context repository = os.getenv("GITHUB_REPOSITORY") server_url = os.getenv("GITHUB_SERVER_URL", "https://github.com") @@ -230,32 +306,53 @@ async def run_in_github ( pr_number = github_ref.split("/")[2] pr_url = f"{server_url}/{repository}/pull/{pr_number}" if pr_number and repository else None - github_context = { - "github_pr_url": pr_url, - "github_repository": repository, - "github_commit_hash": os.getenv("GITHUB_SHA", ""), - "github_actor": os.getenv("GITHUB_ACTOR", ""), - } + + github_context = GithubContext( + github_pr_url=pr_url, + github_commit_hash=os.getenv("GITHUB_SHA", ""), + github_actor=os.getenv("GITHUB_ACTOR", ""), + ) + + # Merge user-provided related_ref with github_context merged_related_ref = {**github_context, **(related_ref or {})} experiment_metadata = { "created_from": "github", } - results, errors = await self.run( - task=task, + # Extract evaluator slugs + evaluator_slugs = None + if evaluators: + evaluator_slugs = [ + slug if isinstance(slug, str) else slug[0] + for slug in evaluators + ] + + # Prepare request payload + request_body = RunInGithubRequest( dataset_slug=dataset_slug, dataset_version=dataset_version, - evaluators=evaluators, - experiment_slug=experiment_slug, - related_ref=merged_related_ref, + evaluator_slugs=evaluator_slugs, + task_results=task_results, + github_context=github_context, experiment_metadata=experiment_metadata, + related_ref=merged_related_ref, aux=aux, stop_on_error=stop_on_error, - wait_for_results=wait_for_results, ) - return results, errors + # Send bulk request to backend + response = self._http_client.post( + f"/experiments/{experiment_slug}/run-in-github", + request_body.model_dump(mode="json", exclude_none=True), + ) + + if response is None: + raise Exception( + f"Failed to submit experiment '{experiment_slug}' for GitHub execution" + ) + + return RunInGithubResponse(**response) def _init_experiment( @@ -319,4 +416,4 @@ def _parse_jsonl_to_rows(self, jsonl_data: str) -> List[Dict[str, Any]]: # Skip invalid JSON lines continue - return rows + return rows \ No newline at end of file diff --git a/packages/traceloop-sdk/traceloop/sdk/experiment/model.py b/packages/traceloop-sdk/traceloop/sdk/experiment/model.py index 887bcd5743..e9b3caf905 100644 --- a/packages/traceloop-sdk/traceloop/sdk/experiment/model.py +++ b/packages/traceloop-sdk/traceloop/sdk/experiment/model.py @@ -66,3 +66,42 @@ class CreateTaskResponse(BaseModel): """Model for create task response""" id: str + + +class TaskResult(BaseModel): + """Model for a single task result""" + + task_input: Dict[str, Any] + task_output: Optional[Dict[str, Any]] = None + error: Optional[str] = None + + +class GithubContext(BaseModel): + """Model for GitHub context""" + + github_pr_url: Optional[str] = None + github_commit_hash: Optional[str] = None + github_actor: Optional[str] = None + +class RunInGithubRequest(BaseModel): + """Model for bulk GitHub experiment execution request""" + + dataset_slug: Optional[str] = None + dataset_version: Optional[str] = None + evaluator_slugs: Optional[List[str]] = None + task_results: List[TaskResult] + github_context: Dict[str, Any] + experiment_metadata: Optional[Dict[str, Any]] = None + related_ref: Optional[Dict[str, Any]] = None + aux: Optional[Dict[str, Any]] = None + stop_on_error: bool = False + + +class RunInGithubResponse(BaseModel): + """Model for bulk GitHub experiment execution response""" + + experiment_id: str + experiment_slug: str + run_id: str + status: str + message: Optional[str] = None From a6ab63038519ba0a744d4dfc5eb4391e39b23c8c Mon Sep 17 00:00:00 2001 From: nina-kollman <59646487+nina-kollman@users.noreply.github.com> Date: Thu, 20 Nov 2025 11:05:57 +0200 Subject: [PATCH 05/17] add --- .../traceloop/sdk/client/http.py | 3 +- .../traceloop/sdk/experiment/experiment.py | 131 ++++++++---------- .../traceloop/sdk/experiment/model.py | 14 +- 3 files changed, 67 insertions(+), 81 deletions(-) diff --git a/packages/traceloop-sdk/traceloop/sdk/client/http.py b/packages/traceloop-sdk/traceloop/sdk/client/http.py index 60287bf593..a9cb78a12c 100644 --- a/packages/traceloop-sdk/traceloop/sdk/client/http.py +++ b/packages/traceloop-sdk/traceloop/sdk/client/http.py @@ -33,7 +33,8 @@ def post(self, path: str, data: Dict[str, Any]) -> Any: response.raise_for_status() return response.json() except requests.exceptions.RequestException as e: - print(Fore.RED + f"Error making request to {path}: {str(e)}" + Fore.RESET) + status_code = e.response.status_code if hasattr(e, 'response') and e.response is not None else 'Unknown' + print(Fore.RED + f"Error making request to {path} (HTTP {status_code}): {str(e)}" + Fore.RESET) return None def get(self, path: str, params: Optional[Dict[str, Any]] = None) -> Any: diff --git a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py index 763bbbf734..a2ba1ff9ed 100644 --- a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py +++ b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py @@ -1,3 +1,4 @@ +import re import cuid import asyncio import json @@ -185,60 +186,6 @@ async def run_with_semaphore(row) -> TaskResponse: return results, errors - async def _execute_tasks( - self, - rows: List[Dict[str, Any]], - task: Callable[[Optional[Dict[str, Any]]], Dict[str, Any]], - stop_on_error: bool = False, - ) -> List[TaskResult]: - """Execute tasks locally with concurrency control - - Args: - rows: List of dataset rows to process - task: Function to run on each row - stop_on_error: Whether to stop on first error - - Returns: - List of TaskResult objects with inputs, outputs, and errors - """ - task_results: List[TaskResult] = [] - - async def run_single_row(row) -> TaskResult: - try: - task_output = await task(row) - return TaskResult( - task_input=row, - task_output=task_output, - ) - except Exception as e: - if stop_on_error: - raise e - return TaskResult( - task_input=row, - error=str(e), - ) - - # Execute tasks with concurrency control - semaphore = asyncio.Semaphore(50) - - async def run_with_semaphore(row) -> TaskResult: - async with semaphore: - return await run_single_row(row) - - tasks = [asyncio.create_task(run_with_semaphore(row)) for row in rows] - - for completed_task in asyncio.as_completed(tasks): - try: - result = await completed_task - task_results.append(result) - if result.error and stop_on_error: - break - except Exception as e: - if stop_on_error: - raise e - - return task_results - async def run_in_github( self, task: Callable[[Optional[Dict[str, Any]]], Dict[str, Any]], @@ -247,8 +194,6 @@ async def run_in_github( evaluators: Optional[List[EvaluatorDetails]] = None, experiment_slug: Optional[str] = None, related_ref: Optional[Dict[str, str]] = None, - aux: Optional[Dict[str, str]] = None, - stop_on_error: bool = False, ) -> RunInGithubResponse: """Execute tasks locally and submit results to backend for GitHub CI/CD @@ -265,8 +210,6 @@ async def run_in_github( evaluators: List of evaluator slugs or (slug, version) tuples to run experiment_slug: Slug for this experiment run related_ref: Additional reference information for this experiment run - aux: Auxiliary metadata for this experiment run - stop_on_error: Whether to stop on first error (default: False) Returns: RunInGithubResponse with experiment_id, run_id, and status @@ -293,7 +236,7 @@ async def run_in_github( rows = self._parse_jsonl_to_rows(jsonl_data) # Execute all tasks locally - task_results = await self._execute_tasks(rows, task, stop_on_error) + task_results = await self._execute_tasks(rows, task) # Construct GitHub context repository = os.getenv("GITHUB_REPOSITORY") @@ -308,14 +251,11 @@ async def run_in_github( github_context = GithubContext( - github_pr_url=pr_url, - github_commit_hash=os.getenv("GITHUB_SHA", ""), - github_actor=os.getenv("GITHUB_ACTOR", ""), + pr_url=pr_url, + commit_hash=os.getenv("GITHUB_SHA", ""), + actor=os.getenv("GITHUB_ACTOR", ""), ) - # Merge user-provided related_ref with github_context - merged_related_ref = {**github_context, **(related_ref or {})} - experiment_metadata = { "created_from": "github", } @@ -336,12 +276,9 @@ async def run_in_github( task_results=task_results, github_context=github_context, experiment_metadata=experiment_metadata, - related_ref=merged_related_ref, - aux=aux, - stop_on_error=stop_on_error, + experiment_run_metadata=related_ref, ) - # Send bulk request to backend response = self._http_client.post( f"/experiments/{experiment_slug}/run-in-github", request_body.model_dump(mode="json", exclude_none=True), @@ -349,7 +286,7 @@ async def run_in_github( if response is None: raise Exception( - f"Failed to submit experiment '{experiment_slug}' for GitHub execution" + f"Failed to submit experiment '{experiment_slug}' for GitHub execution. " ) return RunInGithubResponse(**response) @@ -416,4 +353,56 @@ def _parse_jsonl_to_rows(self, jsonl_data: str) -> List[Dict[str, Any]]: # Skip invalid JSON lines continue - return rows \ No newline at end of file + return rows + + async def _execute_tasks( + self, + rows: List[Dict[str, Any]], + task: Callable[[Optional[Dict[str, Any]]], Dict[str, Any]], + ) -> List[TaskResult]: + """Execute tasks locally with concurrency control + + Args: + rows: List of dataset rows to process + task: Function to run on each row + stop_on_error: Whether to stop on first error + + Returns: + List of TaskResult objects with inputs, outputs, and errors + """ + task_results: List[TaskResult] = [] + + async def run_single_row(row) -> TaskResult: + try: + task_output = await task(row) + return TaskResult( + task_input=row, + task_output=task_output, + ) + except Exception as e: + return TaskResult( + task_input=row, + error=str(e), + ) + + # Execute tasks with concurrency control + semaphore = asyncio.Semaphore(50) + + async def run_with_semaphore(row: Dict[str, Any]) -> TaskResult: + async with semaphore: + return await run_single_row(row) + + tasks = [asyncio.create_task(run_with_semaphore(row)) for row in rows] + + for completed_task in asyncio.as_completed(tasks): + try: + result = await completed_task + task_results.append(result) + except Exception as e: + task_results.append(TaskResult( + task_input=completed_task.task_input, + error=str(e), + )) + continue + + return task_results \ No newline at end of file diff --git a/packages/traceloop-sdk/traceloop/sdk/experiment/model.py b/packages/traceloop-sdk/traceloop/sdk/experiment/model.py index e9b3caf905..6d3e8ca2eb 100644 --- a/packages/traceloop-sdk/traceloop/sdk/experiment/model.py +++ b/packages/traceloop-sdk/traceloop/sdk/experiment/model.py @@ -79,9 +79,9 @@ class TaskResult(BaseModel): class GithubContext(BaseModel): """Model for GitHub context""" - github_pr_url: Optional[str] = None - github_commit_hash: Optional[str] = None - github_actor: Optional[str] = None + pr_url: Optional[str] = None + commit_hash: Optional[str] = None + actor: Optional[str] = None class RunInGithubRequest(BaseModel): """Model for bulk GitHub experiment execution request""" @@ -90,11 +90,9 @@ class RunInGithubRequest(BaseModel): dataset_version: Optional[str] = None evaluator_slugs: Optional[List[str]] = None task_results: List[TaskResult] - github_context: Dict[str, Any] + github_context: GithubContext experiment_metadata: Optional[Dict[str, Any]] = None - related_ref: Optional[Dict[str, Any]] = None - aux: Optional[Dict[str, Any]] = None - stop_on_error: bool = False + experiment_run_metadata: Optional[Dict[str, Any]] = None class RunInGithubResponse(BaseModel): @@ -103,5 +101,3 @@ class RunInGithubResponse(BaseModel): experiment_id: str experiment_slug: str run_id: str - status: str - message: Optional[str] = None From a0b21566a5aba1c3eae1a2a8dd93ae5b95f79bd5 Mon Sep 17 00:00:00 2001 From: nina-kollman <59646487+nina-kollman@users.noreply.github.com> Date: Sun, 23 Nov 2025 08:33:01 +0200 Subject: [PATCH 06/17] wip --- .../traceloop/sdk/experiment/experiment.py | 11 ++++++----- .../traceloop-sdk/traceloop/sdk/experiment/model.py | 5 +++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py index a2ba1ff9ed..3e9d362591 100644 --- a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py +++ b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py @@ -270,6 +270,7 @@ async def run_in_github( # Prepare request payload request_body = RunInGithubRequest( + experiment_slug=experiment_slug, dataset_slug=dataset_slug, dataset_version=dataset_version, evaluator_slugs=evaluator_slugs, @@ -280,7 +281,7 @@ async def run_in_github( ) response = self._http_client.post( - f"/experiments/{experiment_slug}/run-in-github", + "/experiments/run-in-github", request_body.model_dump(mode="json", exclude_none=True), ) @@ -376,12 +377,12 @@ async def run_single_row(row) -> TaskResult: try: task_output = await task(row) return TaskResult( - task_input=row, - task_output=task_output, + input=row, + output=task_output, ) except Exception as e: return TaskResult( - task_input=row, + input=row, error=str(e), ) @@ -400,7 +401,7 @@ async def run_with_semaphore(row: Dict[str, Any]) -> TaskResult: task_results.append(result) except Exception as e: task_results.append(TaskResult( - task_input=completed_task.task_input, + input=completed_task.task_input, error=str(e), )) continue diff --git a/packages/traceloop-sdk/traceloop/sdk/experiment/model.py b/packages/traceloop-sdk/traceloop/sdk/experiment/model.py index 6d3e8ca2eb..3b8ef56900 100644 --- a/packages/traceloop-sdk/traceloop/sdk/experiment/model.py +++ b/packages/traceloop-sdk/traceloop/sdk/experiment/model.py @@ -71,8 +71,8 @@ class CreateTaskResponse(BaseModel): class TaskResult(BaseModel): """Model for a single task result""" - task_input: Dict[str, Any] - task_output: Optional[Dict[str, Any]] = None + input: Dict[str, Any] + output: Optional[Dict[str, Any]] = None error: Optional[str] = None @@ -86,6 +86,7 @@ class GithubContext(BaseModel): class RunInGithubRequest(BaseModel): """Model for bulk GitHub experiment execution request""" + experiment_slug: str dataset_slug: Optional[str] = None dataset_version: Optional[str] = None evaluator_slugs: Optional[List[str]] = None From 00928514309d1b3d336fedb3092837ad2be6605b Mon Sep 17 00:00:00 2001 From: nina-kollman <59646487+nina-kollman@users.noreply.github.com> Date: Sun, 23 Nov 2025 09:54:02 +0200 Subject: [PATCH 07/17] move exp --- .../sample-app/sample_app/experiment}/run_research_experiment.py | 1 - 1 file changed, 1 deletion(-) rename {experiments => packages/sample-app/sample_app/experiment}/run_research_experiment.py (98%) diff --git a/experiments/run_research_experiment.py b/packages/sample-app/sample_app/experiment/run_research_experiment.py similarity index 98% rename from experiments/run_research_experiment.py rename to packages/sample-app/sample_app/experiment/run_research_experiment.py index 3b72ffde1e..85aba21e82 100644 --- a/experiments/run_research_experiment.py +++ b/packages/sample-app/sample_app/experiment/run_research_experiment.py @@ -65,7 +65,6 @@ async def main(): dataset_version="v1", evaluators=["accuracy", "relevance"], experiment_slug="research-exp", - stop_on_error=False, ) # Print response From 3b763b246d806f6adb4915e6de93ed2924d26dad Mon Sep 17 00:00:00 2001 From: nina-kollman <59646487+nina-kollman@users.noreply.github.com> Date: Mon, 24 Nov 2025 12:38:00 +0200 Subject: [PATCH 08/17] add' --- .github/workflows/run-experiments.yml | 39 ------------------- .../experiment/run_research_experiment.py | 21 ++++------ .../traceloop-sdk/traceloop/sdk/__init__.py | 4 +- .../traceloop/sdk/client/http.py | 3 +- .../traceloop/sdk/experiment/experiment.py | 9 ++++- .../traceloop/sdk/experiment/model.py | 7 ++-- 6 files changed, 22 insertions(+), 61 deletions(-) delete mode 100644 .github/workflows/run-experiments.yml diff --git a/.github/workflows/run-experiments.yml b/.github/workflows/run-experiments.yml deleted file mode 100644 index 8236413b30..0000000000 --- a/.github/workflows/run-experiments.yml +++ /dev/null @@ -1,39 +0,0 @@ -name: Run Research Experiment in GitHub CI/CD - -on: - pull_request: - types: [opened, synchronize, reopened, ready_for_review] - branches: - - main - -# To enable this workflow, set EXPERIMENTS_ENABLED to 'true' in repository variables -env: - EXPERIMENTS_ENABLED: 'true' - -jobs: - run-experiments: - runs-on: ubuntu-latest - if: github.event.pull_request.draft == false && env.EXPERIMENTS_ENABLED == 'true' - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.13' - cache: 'pip' - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements.txt || pip install traceloop-sdk openai python-dotenv - - - name: Run Research Agent Experiment - env: - TRACELOOP_API_KEY: ${{ secrets.TRACELOOP_API_KEY }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - TRACELOOP_BASE_URL: ${{ secrets.TRACELOOP_BASE_URL }} - run: | - python experiments/run_research_experiment.py diff --git a/packages/sample-app/sample_app/experiment/run_research_experiment.py b/packages/sample-app/sample_app/experiment/run_research_experiment.py index 85aba21e82..1e15aec569 100644 --- a/packages/sample-app/sample_app/experiment/run_research_experiment.py +++ b/packages/sample-app/sample_app/experiment/run_research_experiment.py @@ -42,15 +42,13 @@ async def generate_research_response(question: str) -> str: async def research_task(row): """Task function that processes each dataset row""" - question = row.get("question", "") - answer = await generate_research_response(question) - - print(f"Question: {question}") - print(f"Answer: {answer[:100]}...") + query = row.get("query", "") + answer = await generate_research_response(query) return { "completion": answer, - "question": question, + "question": query, + "sentenece": answer } @@ -61,21 +59,16 @@ async def main(): # Execute tasks locally and send results to backend response = await client.experiment.run_in_github( task=research_task, - dataset_slug="research-questions", - dataset_version="v1", - evaluators=["accuracy", "relevance"], + dataset_slug="research-queries", + dataset_version="v2", + evaluators=["research-relevancy", "categories", "research-facts-counter"], experiment_slug="research-exp", ) # Print response print("\nāœ… Experiment completed and submitted!") - print(f"Experiment ID: {response.experiment_id}") print(f"Experiment Slug: {response.experiment_slug}") print(f"Run ID: {response.run_id}") - print(f"Status: {response.status}") - - if response.message: - print(f"Message: {response.message}") print("\nšŸ“ The backend will run evaluators and post results to your PR.") print(" Check your GitHub PR for the results comment.") diff --git a/packages/traceloop-sdk/traceloop/sdk/__init__.py b/packages/traceloop-sdk/traceloop/sdk/__init__.py index 1b91cb9ff6..9015a1e96d 100644 --- a/packages/traceloop-sdk/traceloop/sdk/__init__.py +++ b/packages/traceloop-sdk/traceloop/sdk/__init__.py @@ -187,8 +187,8 @@ def init( Traceloop.__logger_wrapper = LoggerWrapper(exporter=logging_exporter) if ( - api_endpoint.find("traceloop.com") != -1 - and api_key + # api_endpoint.find("traceloop.com") != -1 + api_key and (exporter is None) and (processor is None) ): diff --git a/packages/traceloop-sdk/traceloop/sdk/client/http.py b/packages/traceloop-sdk/traceloop/sdk/client/http.py index a9cb78a12c..60287bf593 100644 --- a/packages/traceloop-sdk/traceloop/sdk/client/http.py +++ b/packages/traceloop-sdk/traceloop/sdk/client/http.py @@ -33,8 +33,7 @@ def post(self, path: str, data: Dict[str, Any]) -> Any: response.raise_for_status() return response.json() except requests.exceptions.RequestException as e: - status_code = e.response.status_code if hasattr(e, 'response') and e.response is not None else 'Unknown' - print(Fore.RED + f"Error making request to {path} (HTTP {status_code}): {str(e)}" + Fore.RESET) + print(Fore.RED + f"Error making request to {path}: {str(e)}" + Fore.RESET) return None def get(self, path: str, params: Optional[Dict[str, Any]] = None) -> Any: diff --git a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py index 3e9d362591..7de2c2b136 100644 --- a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py +++ b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py @@ -235,7 +235,6 @@ async def run_in_github( jsonl_data = self._datasets.get_version_jsonl(dataset_slug, dataset_version) rows = self._parse_jsonl_to_rows(jsonl_data) - # Execute all tasks locally task_results = await self._execute_tasks(rows, task) # Construct GitHub context @@ -244,6 +243,13 @@ async def run_in_github( # Extract PR number from GITHUB_REF (format: "refs/pull/123/merge") github_ref = os.getenv("GITHUB_REF", "") + + if not repository or not github_ref: + raise RuntimeError( + "GITHUB_REPOSITORY and GITHUB_REF must be set in the environment. " + "To run experiments locally, use the run() method instead." + ) + pr_number = None if github_ref.startswith("refs/pull/"): pr_number = github_ref.split("/")[2] @@ -251,6 +257,7 @@ async def run_in_github( github_context = GithubContext( + repository=repository, pr_url=pr_url, commit_hash=os.getenv("GITHUB_SHA", ""), actor=os.getenv("GITHUB_ACTOR", ""), diff --git a/packages/traceloop-sdk/traceloop/sdk/experiment/model.py b/packages/traceloop-sdk/traceloop/sdk/experiment/model.py index 3b8ef56900..01fc2d1639 100644 --- a/packages/traceloop-sdk/traceloop/sdk/experiment/model.py +++ b/packages/traceloop-sdk/traceloop/sdk/experiment/model.py @@ -79,9 +79,10 @@ class TaskResult(BaseModel): class GithubContext(BaseModel): """Model for GitHub context""" - pr_url: Optional[str] = None - commit_hash: Optional[str] = None - actor: Optional[str] = None + repository: str + pr_url: str + commit_hash: str + actor: str class RunInGithubRequest(BaseModel): """Model for bulk GitHub experiment execution request""" From c0138f52b3c89fcd7ed93c5e74b62ed8808602b2 Mon Sep 17 00:00:00 2001 From: nina-kollman <59646487+nina-kollman@users.noreply.github.com> Date: Mon, 24 Nov 2025 12:49:49 +0200 Subject: [PATCH 09/17] lint --- packages/traceloop-sdk/traceloop/sdk/__init__.py | 4 ++-- .../traceloop-sdk/traceloop/sdk/experiment/experiment.py | 7 ++----- packages/traceloop-sdk/traceloop/sdk/experiment/model.py | 1 + 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/packages/traceloop-sdk/traceloop/sdk/__init__.py b/packages/traceloop-sdk/traceloop/sdk/__init__.py index 9015a1e96d..1b91cb9ff6 100644 --- a/packages/traceloop-sdk/traceloop/sdk/__init__.py +++ b/packages/traceloop-sdk/traceloop/sdk/__init__.py @@ -187,8 +187,8 @@ def init( Traceloop.__logger_wrapper = LoggerWrapper(exporter=logging_exporter) if ( - # api_endpoint.find("traceloop.com") != -1 - api_key + api_endpoint.find("traceloop.com") != -1 + and api_key and (exporter is None) and (processor is None) ): diff --git a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py index 7de2c2b136..1a5b2ab494 100644 --- a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py +++ b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py @@ -1,4 +1,3 @@ -import re import cuid import asyncio import json @@ -255,7 +254,6 @@ async def run_in_github( pr_number = github_ref.split("/")[2] pr_url = f"{server_url}/{repository}/pull/{pr_number}" if pr_number and repository else None - github_context = GithubContext( repository=repository, pr_url=pr_url, @@ -299,7 +297,6 @@ async def run_in_github( return RunInGithubResponse(**response) - def _init_experiment( self, experiment_slug: str, @@ -362,7 +359,7 @@ def _parse_jsonl_to_rows(self, jsonl_data: str) -> List[Dict[str, Any]]: continue return rows - + async def _execute_tasks( self, rows: List[Dict[str, Any]], @@ -413,4 +410,4 @@ async def run_with_semaphore(row: Dict[str, Any]) -> TaskResult: )) continue - return task_results \ No newline at end of file + return task_results diff --git a/packages/traceloop-sdk/traceloop/sdk/experiment/model.py b/packages/traceloop-sdk/traceloop/sdk/experiment/model.py index 01fc2d1639..f55ed0b487 100644 --- a/packages/traceloop-sdk/traceloop/sdk/experiment/model.py +++ b/packages/traceloop-sdk/traceloop/sdk/experiment/model.py @@ -84,6 +84,7 @@ class GithubContext(BaseModel): commit_hash: str actor: str + class RunInGithubRequest(BaseModel): """Model for bulk GitHub experiment execution request""" From 05d6b44927e5dca13fa0c01c18c3acf42302ac45 Mon Sep 17 00:00:00 2001 From: nina-kollman <59646487+nina-kollman@users.noreply.github.com> Date: Mon, 24 Nov 2025 13:05:56 +0200 Subject: [PATCH 10/17] fixes --- .../experiment/run_research_experiment.py | 2 +- .../traceloop/sdk/experiment/experiment.py | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/packages/sample-app/sample_app/experiment/run_research_experiment.py b/packages/sample-app/sample_app/experiment/run_research_experiment.py index 1e15aec569..240f6c5326 100644 --- a/packages/sample-app/sample_app/experiment/run_research_experiment.py +++ b/packages/sample-app/sample_app/experiment/run_research_experiment.py @@ -48,7 +48,7 @@ async def research_task(row): return { "completion": answer, "question": query, - "sentenece": answer + "sentence": answer } diff --git a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py index 1a5b2ab494..b5cd490753 100644 --- a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py +++ b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py @@ -54,7 +54,7 @@ async def run( task: Function to run on each dataset row evaluators: List of evaluator slugs to run experiment_slug: Slug for this experiment run - experiment_metadata: Metadata for this experiment (an experiment holds all the experinent runs) + experiment_metadata: Metadata for this experiment (an experiment holds all the experiment runs) related_ref: Related reference for this experiment run aux: Auxiliary information for this experiment run stop_on_error: Whether to stop on first error (default: False) @@ -242,17 +242,17 @@ async def run_in_github( # Extract PR number from GITHUB_REF (format: "refs/pull/123/merge") github_ref = os.getenv("GITHUB_REF", "") + pr_number = None + if github_ref.startswith("refs/pull/"): + pr_number = github_ref.split("/")[2] - if not repository or not github_ref: + if not repository or not github_ref or not pr_number: raise RuntimeError( "GITHUB_REPOSITORY and GITHUB_REF must be set in the environment. " "To run experiments locally, use the run() method instead." ) - pr_number = None - if github_ref.startswith("refs/pull/"): - pr_number = github_ref.split("/")[2] - pr_url = f"{server_url}/{repository}/pull/{pr_number}" if pr_number and repository else None + pr_url = f"{server_url}/{repository}/pull/{pr_number}" github_context = GithubContext( repository=repository, @@ -405,7 +405,6 @@ async def run_with_semaphore(row: Dict[str, Any]) -> TaskResult: task_results.append(result) except Exception as e: task_results.append(TaskResult( - input=completed_task.task_input, error=str(e), )) continue From 4048c6c648777ccfe2f2f394277d5d0a4ede0327 Mon Sep 17 00:00:00 2001 From: nina-kollman <59646487+nina-kollman@users.noreply.github.com> Date: Mon, 24 Nov 2025 13:18:46 +0200 Subject: [PATCH 11/17] fix comments --- .../traceloop/sdk/experiment/experiment.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py index b5cd490753..7c37895a07 100644 --- a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py +++ b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py @@ -239,6 +239,15 @@ async def run_in_github( # Construct GitHub context repository = os.getenv("GITHUB_REPOSITORY") server_url = os.getenv("GITHUB_SERVER_URL", "https://github.com") + github_event_name = os.getenv("GITHUB_EVENT_NAME", "") + + # Verify this is running in a pull request context + if github_event_name != "pull_request": + raise RuntimeError( + f"run_in_github() can only be used in pull_request workflow. " + f"Current event: {github_event_name}. " + "To run experiments locally, use the run() method instead." + ) # Extract PR number from GITHUB_REF (format: "refs/pull/123/merge") github_ref = os.getenv("GITHUB_REF", "") @@ -400,13 +409,7 @@ async def run_with_semaphore(row: Dict[str, Any]) -> TaskResult: tasks = [asyncio.create_task(run_with_semaphore(row)) for row in rows] for completed_task in asyncio.as_completed(tasks): - try: - result = await completed_task - task_results.append(result) - except Exception as e: - task_results.append(TaskResult( - error=str(e), - )) - continue + result = await completed_task + task_results.append(result) return task_results From 676478111050e359688b19789d4c7006e0ebd29e Mon Sep 17 00:00:00 2001 From: nina-kollman <59646487+nina-kollman@users.noreply.github.com> Date: Tue, 25 Nov 2025 15:04:32 +0200 Subject: [PATCH 12/17] to private methods --- .../experiment/run_research_experiment.py | 2 +- .../traceloop/sdk/experiment/experiment.py | 54 ++++++++++++++++++- 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/packages/sample-app/sample_app/experiment/run_research_experiment.py b/packages/sample-app/sample_app/experiment/run_research_experiment.py index 240f6c5326..f8d04634ea 100644 --- a/packages/sample-app/sample_app/experiment/run_research_experiment.py +++ b/packages/sample-app/sample_app/experiment/run_research_experiment.py @@ -57,7 +57,7 @@ async def main(): print("šŸš€ Running research experiment in GitHub CI/CD...") # Execute tasks locally and send results to backend - response = await client.experiment.run_in_github( + response = await client.experiment.run( task=research_task, dataset_slug="research-queries", dataset_version="v2", diff --git a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py index 7c37895a07..022933c616 100644 --- a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py +++ b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py @@ -48,6 +48,58 @@ async def run( wait_for_results: bool = True, ) -> Tuple[List[TaskResponse], List[str]]: """Run an experiment with the given task and evaluators + If running in GitHub Actions, will run the experiment in GitHub context. + Otherwise, will run the experiment locally. + + Args: + task: Function to run on each dataset row + dataset_slug: Slug of the dataset to use + dataset_version: Version of the dataset to use + evaluators: List of evaluator slugs to run + experiment_slug: Slug for this experiment run + experiment_metadata: Metadata for this experiment (an experiment holds all the experiment runs) + + Returns: + Tuple of (results, errors). Returns ([], []) if wait_for_results is False + """ + if os.getenv("GITHUB_ACTIONS"): + return await self._run_in_github( + task=task, + dataset_slug=dataset_slug, + dataset_version=dataset_version, + evaluators=evaluators, + experiment_slug=experiment_slug, + related_ref=related_ref, + ) + else: + return await self._run_locally( + task=task, + dataset_slug=dataset_slug, + dataset_version=dataset_version, + evaluators=evaluators, + experiment_slug=experiment_slug, + experiment_metadata=experiment_metadata, + related_ref=related_ref, + aux=aux, + stop_on_error=stop_on_error, + wait_for_results=wait_for_results, + ) + + + async def _run_locally( + self, + task: Callable[[Optional[Dict[str, Any]]], Dict[str, Any]], + dataset_slug: Optional[str] = None, + dataset_version: Optional[str] = None, + evaluators: Optional[List[EvaluatorDetails]] = None, + experiment_slug: Optional[str] = None, + experiment_metadata: Optional[Dict[str, Any]] = None, + related_ref: Optional[Dict[str, str]] = None, + aux: Optional[Dict[str, str]] = None, + stop_on_error: bool = False, + wait_for_results: bool = True, + ) -> Tuple[List[TaskResponse], List[str]]: + """Run an experiment with the given task and evaluators Args: dataset_slug: Slug of the dataset to use @@ -185,7 +237,7 @@ async def run_with_semaphore(row) -> TaskResponse: return results, errors - async def run_in_github( + async def _run_in_github( self, task: Callable[[Optional[Dict[str, Any]]], Dict[str, Any]], dataset_slug: Optional[str] = None, From 5b7817edea0ee7ec09ee6e6b482d1608747d2ca4 Mon Sep 17 00:00:00 2001 From: nina-kollman <59646487+nina-kollman@users.noreply.github.com> Date: Tue, 25 Nov 2025 15:27:01 +0200 Subject: [PATCH 13/17] merge --- .../experiment/run_research_experiment.py | 9 +++++++-- .../traceloop/sdk/experiment/experiment.py | 16 +++++++++++----- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/packages/sample-app/sample_app/experiment/run_research_experiment.py b/packages/sample-app/sample_app/experiment/run_research_experiment.py index f8d04634ea..974f690d34 100644 --- a/packages/sample-app/sample_app/experiment/run_research_experiment.py +++ b/packages/sample-app/sample_app/experiment/run_research_experiment.py @@ -11,6 +11,7 @@ import os from openai import AsyncOpenAI from traceloop.sdk import Traceloop +from traceloop.sdk.experiment.model import RunInGithubResponse # Initialize Traceloop client client = Traceloop.init( @@ -67,8 +68,12 @@ async def main(): # Print response print("\nāœ… Experiment completed and submitted!") - print(f"Experiment Slug: {response.experiment_slug}") - print(f"Run ID: {response.run_id}") + + if isinstance(response, RunInGithubResponse): + print(f"Experiment Slug: {response.experiment_slug}") + print(f"Run ID: {response.run_id}") + else: + print(f"Results: {response}") print("\nšŸ“ The backend will run evaluators and post results to your PR.") print(" Check your GitHub PR for the results comment.") diff --git a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py index 14d778d35e..0f241189b4 100644 --- a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py +++ b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py @@ -46,7 +46,7 @@ async def run( aux: Optional[Dict[str, str]] = None, stop_on_error: bool = False, wait_for_results: bool = True, - ) -> Tuple[List[TaskResponse], List[str]]: + ) -> Tuple[List[TaskResponse], List[str]] | RunInGithubResponse: """Run an experiment with the given task and evaluators If running in GitHub Actions, will run the experiment in GitHub context. Otherwise, will run the experiment locally. @@ -58,7 +58,10 @@ async def run( evaluators: List of evaluator slugs to run experiment_slug: Slug for this experiment run experiment_metadata: Metadata for this experiment (an experiment holds all the experiment runs) - + related_ref: Related reference for this experiment run + aux: Auxiliary information for this experiment run + stop_on_error: Whether to stop on first error (default: False) + wait_for_results: Whether to wait for async tasks to complete (default: True) Returns: Tuple of (results, errors). Returns ([], []) if wait_for_results is False """ @@ -247,6 +250,7 @@ async def _run_in_github( dataset_version: Optional[str] = None, evaluators: Optional[List[EvaluatorDetails]] = None, experiment_slug: Optional[str] = None, + experiment_metadata: Optional[Dict[str, Any]] = None, related_ref: Optional[Dict[str, str]] = None, ) -> RunInGithubResponse: """Execute tasks locally and submit results to backend for GitHub CI/CD @@ -263,6 +267,7 @@ async def _run_in_github( dataset_version: Version of the dataset evaluators: List of evaluator slugs or (slug, version) tuples to run experiment_slug: Slug for this experiment run + experiment_metadata: Metadata for this experiment (an experiment holds all the experiment runs) related_ref: Additional reference information for this experiment run Returns: @@ -325,9 +330,10 @@ async def _run_in_github( actor=os.getenv("GITHUB_ACTOR", ""), ) - experiment_metadata = { - "created_from": "github", - } + experiment_metadata = dict( + experiment_metadata or {}, + created_from="github" + ) # Extract evaluator slugs evaluator_slugs = None From d36d7128a10062deab8df89cb15fbff652861cb9 Mon Sep 17 00:00:00 2001 From: nina-kollman <59646487+nina-kollman@users.noreply.github.com> Date: Tue, 25 Nov 2025 15:54:49 +0200 Subject: [PATCH 14/17] my py --- .../traceloop/sdk/experiment/experiment.py | 37 ++++++++----------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py index 0f241189b4..5619720d6d 100644 --- a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py +++ b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py @@ -2,7 +2,7 @@ import asyncio import json import os -from typing import Any, List, Callable, Optional, Tuple, Dict +from typing import Any, List, Callable, Optional, Tuple, Dict, Awaitable, Union from traceloop.sdk.client.http import HTTPClient from traceloop.sdk.datasets.datasets import Datasets from traceloop.sdk.evaluator.evaluator import Evaluator @@ -36,7 +36,7 @@ def __init__(self, http_client: HTTPClient, async_http_client: httpx.AsyncClient async def run( self, - task: Callable[[Optional[Dict[str, Any]]], Dict[str, Any]], + task: Callable[[Optional[Dict[str, Any]]], Awaitable[Dict[str, Any]]], dataset_slug: Optional[str] = None, dataset_version: Optional[str] = None, evaluators: Optional[List[EvaluatorDetails]] = None, @@ -52,7 +52,7 @@ async def run( Otherwise, will run the experiment locally. Args: - task: Function to run on each dataset row + task: Async function to run on each dataset row dataset_slug: Slug of the dataset to use dataset_version: Version of the dataset to use evaluators: List of evaluator slugs to run @@ -87,11 +87,10 @@ async def run( stop_on_error=stop_on_error, wait_for_results=wait_for_results, ) - - + async def _run_locally( self, - task: Callable[[Optional[Dict[str, Any]]], Dict[str, Any]], + task: Callable[[Optional[Dict[str, Any]]], Awaitable[Dict[str, Any]]], dataset_slug: Optional[str] = None, dataset_version: Optional[str] = None, evaluators: Optional[List[EvaluatorDetails]] = None, @@ -106,7 +105,7 @@ async def _run_locally( Args: dataset_slug: Slug of the dataset to use - task: Function to run on each dataset row + task: Async function to run on each dataset row evaluators: List of evaluator slugs to run experiment_slug: Slug for this experiment run experiment_metadata: Metadata for this experiment (an experiment holds all the experiment runs) @@ -160,17 +159,15 @@ async def _run_locally( async def run_single_row(row: Optional[Dict[str, Any]]) -> TaskResponse: try: - # TODO: Fix type annotation - task should return Awaitable, not dict - task_result = await task(row) # type: ignore[misc] - # TODO: Fix type - task_input should accept Optional[Dict] + task_result = await task(row) task_id = self._create_task( experiment_slug=experiment_slug, experiment_run_id=run_id, - task_input=row, # type: ignore[arg-type] + task_input=row, task_output=task_result, ).id - eval_results = {} + eval_results: Dict[str, Union[Dict[str, Any], str]] = {} if evaluator_details: for evaluator_slug, evaluator_version in evaluator_details: try: @@ -197,13 +194,11 @@ async def run_single_row(row: Optional[Dict[str, Any]]) -> TaskResponse: input=task_result, ) - # TODO: Fix type - eval_results should accept Union[Dict, str] msg = f"Triggered execution of {evaluator_slug}" - eval_results[evaluator_slug] = msg # type: ignore[assignment] + eval_results[evaluator_slug] = msg except Exception as e: - # TODO: Fix type - eval_results should accept Union[Dict, str] - eval_results[evaluator_slug] = f"Error: {str(e)}" # type: ignore[assignment] + eval_results[evaluator_slug] = f"Error: {str(e)}" return TaskResponse( task_result=task_result, @@ -245,7 +240,7 @@ async def run_with_semaphore(row: Optional[Dict[str, Any]]) -> TaskResponse: async def _run_in_github( self, - task: Callable[[Optional[Dict[str, Any]]], Dict[str, Any]], + task: Callable[[Optional[Dict[str, Any]]], Awaitable[Dict[str, Any]]], dataset_slug: Optional[str] = None, dataset_version: Optional[str] = None, evaluators: Optional[List[EvaluatorDetails]] = None, @@ -262,7 +257,7 @@ async def _run_in_github( 4. Backend runs evaluators and posts PR comment Args: - task: Function to run on each dataset row + task: Async function to run on each dataset row dataset_slug: Slug of the dataset to use dataset_version: Version of the dataset evaluators: List of evaluator slugs or (slug, version) tuples to run @@ -398,7 +393,7 @@ def _create_task( self, experiment_slug: str, experiment_run_id: str, - task_input: Dict[str, Any], + task_input: Optional[Dict[str, Any]], task_output: Dict[str, Any], ) -> CreateTaskResponse: body = CreateTaskRequest( @@ -433,7 +428,7 @@ def _parse_jsonl_to_rows(self, jsonl_data: str) -> List[Dict[str, Any]]: async def _execute_tasks( self, rows: List[Dict[str, Any]], - task: Callable[[Optional[Dict[str, Any]]], Dict[str, Any]], + task: Callable[[Optional[Dict[str, Any]]], Awaitable[Dict[str, Any]]], ) -> List[TaskResult]: """Execute tasks locally with concurrency control @@ -447,7 +442,7 @@ async def _execute_tasks( """ task_results: List[TaskResult] = [] - async def run_single_row(row) -> TaskResult: + async def run_single_row(row: Optional[Dict[str, Any]]) -> TaskResult: try: task_output = await task(row) return TaskResult( From 49e92f2e2e3f9d24a7b147385545d172f4d9ce2a Mon Sep 17 00:00:00 2001 From: nina-kollman <59646487+nina-kollman@users.noreply.github.com> Date: Tue, 25 Nov 2025 16:44:49 +0200 Subject: [PATCH 15/17] comments --- .../traceloop/sdk/experiment/experiment.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py index 5619720d6d..cf9e5bc04f 100644 --- a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py +++ b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py @@ -73,6 +73,7 @@ async def run( evaluators=evaluators, experiment_slug=experiment_slug, related_ref=related_ref, + aux=aux, ) else: return await self._run_locally( @@ -247,6 +248,7 @@ async def _run_in_github( experiment_slug: Optional[str] = None, experiment_metadata: Optional[Dict[str, Any]] = None, related_ref: Optional[Dict[str, str]] = None, + aux: Optional[Dict[str, str]] = None, ) -> RunInGithubResponse: """Execute tasks locally and submit results to backend for GitHub CI/CD @@ -264,6 +266,7 @@ async def _run_in_github( experiment_slug: Slug for this experiment run experiment_metadata: Metadata for this experiment (an experiment holds all the experiment runs) related_ref: Additional reference information for this experiment run + aux: Auxiliary information for this experiment run Returns: RunInGithubResponse with experiment_id, run_id, and status @@ -330,6 +333,12 @@ async def _run_in_github( created_from="github" ) + experiment_run_metadata = { + key: value + for key, value in [("related_ref", related_ref), ("aux", aux)] + if value is not None + } + # Extract evaluator slugs evaluator_slugs = None if evaluators: @@ -347,7 +356,7 @@ async def _run_in_github( task_results=task_results, github_context=github_context, experiment_metadata=experiment_metadata, - experiment_run_metadata=related_ref, + experiment_run_metadata=experiment_run_metadata, ) response = self._http_client.post( From 724f0d56dc84aea279f6d347e113ef5f6ce94fd8 Mon Sep 17 00:00:00 2001 From: nina-kollman <59646487+nina-kollman@users.noreply.github.com> Date: Tue, 25 Nov 2025 16:53:36 +0200 Subject: [PATCH 16/17] doc change --- packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py index cf9e5bc04f..fd0011dd39 100644 --- a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py +++ b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py @@ -444,7 +444,6 @@ async def _execute_tasks( Args: rows: List of dataset rows to process task: Function to run on each row - stop_on_error: Whether to stop on first error Returns: List of TaskResult objects with inputs, outputs, and errors From 7ad43b694d298694444596911e0f022d1d0b4ede Mon Sep 17 00:00:00 2001 From: nina-kollman <59646487+nina-kollman@users.noreply.github.com> Date: Thu, 27 Nov 2025 10:58:08 +0200 Subject: [PATCH 17/17] cooment --- packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py index fd0011dd39..453c6f7e8d 100644 --- a/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py +++ b/packages/traceloop-sdk/traceloop/sdk/experiment/experiment.py @@ -48,8 +48,6 @@ async def run( wait_for_results: bool = True, ) -> Tuple[List[TaskResponse], List[str]] | RunInGithubResponse: """Run an experiment with the given task and evaluators - If running in GitHub Actions, will run the experiment in GitHub context. - Otherwise, will run the experiment locally. Args: task: Async function to run on each dataset row