-
Notifications
You must be signed in to change notification settings - Fork 839
fix(exp): Add run in github experiment #3459
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 10 commits
Commits
Show all changes
21 commits
Select commit
Hold shift + click to select a range
f8d03db
in github
nina-kollman 28212d2
added yml
nina-kollman 6558d72
added raise
nina-kollman bcd66b3
no semaphore
nina-kollman a6ab630
add
nina-kollman a0b2156
wip
nina-kollman 0092851
move exp
nina-kollman c373783
Merge branch 'main' of https://github.com/traceloop/openllmetry into …
nina-kollman 3b763b2
add'
nina-kollman c0138f5
lint
nina-kollman 05d6b44
fixes
nina-kollman 4048c6c
fix comments
nina-kollman 6764781
to private methods
nina-kollman 265b089
Merge branch 'main' of https://github.com/traceloop/openllmetry into …
nina-kollman 5b7817e
merge
nina-kollman d36d712
my py
nina-kollman 49e92f2
comments
nina-kollman 724f0d5
doc change
nina-kollman 4531017
Merge branch 'main' of https://github.com/traceloop/openllmetry into …
nina-kollman 61d919f
Merge branch 'main' of https://github.com/traceloop/openllmetry into …
nina-kollman 7ad43b6
cooment
nina-kollman File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
78 changes: 78 additions & 0 deletions
78
packages/sample-app/sample_app/experiment/run_research_experiment.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,78 @@ | ||
| """ | ||
| Example experiment script for CI/CD using run_in_github | ||
| This script: | ||
| 1. Executes tasks locally on the dataset | ||
| 2. Sends task results to the backend | ||
| 3. Backend runs evaluators and posts PR comment with results | ||
| """ | ||
|
|
||
| import asyncio | ||
| import os | ||
| from openai import AsyncOpenAI | ||
| from traceloop.sdk import Traceloop | ||
|
|
||
# Initialize Traceloop client
# This runs at import time. The API key and endpoint are read from the
# TRACELOOP_API_KEY and TRACELOOP_BASE_URL environment variables; os.getenv
# yields None for whichever one is unset.
client = Traceloop.init(
    app_name="research-experiment-ci-cd",
    api_key=os.getenv("TRACELOOP_API_KEY"),
    api_endpoint=os.getenv("TRACELOOP_BASE_URL"),
)
|
|
||
|
|
||
async def generate_research_response(question: str) -> str:
    """Ask the OpenAI chat completions API to answer a research question.

    A new ``AsyncOpenAI`` client is created on every call, authenticated with
    the ``OPENAI_API_KEY`` environment variable.

    Args:
        question: Text sent as the user message.

    Returns:
        The message content of the first choice in the completion.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful research assistant. Provide accurate, well-researched answers.",
    }
    user_message = {"role": "user", "content": question}

    openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    completion = await openai_client.chat.completions.create(
        model="gpt-4",
        messages=[system_message, user_message],
        temperature=0.7,
        max_tokens=500,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content
|
|
||
|
|
||
async def research_task(row):
    """Produce the task output for a single dataset row.

    Reads the row's ``"query"`` field (empty string when absent), generates an
    answer for it, and returns a dict carrying the answer and the query.
    """
    question = row.get("query", "")
    completion = await generate_research_response(question)

    result = {"completion": completion, "question": question}
    # NOTE(review): "sentenece" looks like a misspelling of "sentence". It is
    # kept as-is because evaluators may read this exact key; confirm against
    # the evaluator input schema before renaming.
    result["sentenece"] = completion
    return result
nina-kollman marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
|
|
||
async def main():
    """Run the research experiment through ``run_in_github`` and report it.

    Prints a start banner, submits the experiment, then prints the experiment
    slug and run ID taken from the response.
    """
    print("🚀 Running research experiment in GitHub CI/CD...")

    # Tasks are executed locally; their results are sent to the backend.
    run_options = {
        "task": research_task,
        "dataset_slug": "research-queries",
        "dataset_version": "v2",
        "evaluators": ["research-relevancy", "categories", "research-facts-counter"],
        "experiment_slug": "research-exp",
    }
    response = await client.experiment.run_in_github(**run_options)

    # Summarize the submission
    print("\n✅ Experiment completed and submitted!")
    print(f"Experiment Slug: {response.experiment_slug}")
    print(f"Run ID: {response.run_id}")

    print("\n📝 The backend will run evaluators and post results to your PR.")
    print(" Check your GitHub PR for the results comment.")
|
|
||
|
|
||
# Script entry point: when executed directly (not imported), run the async
# main() coroutine to completion via asyncio.run.
if __name__ == "__main__":
    asyncio.run(main())
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,7 @@ | ||
| import cuid | ||
| import asyncio | ||
| import json | ||
| import os | ||
| from typing import Any, List, Callable, Optional, Tuple, Dict | ||
| from traceloop.sdk.client.http import HTTPClient | ||
| from traceloop.sdk.datasets.datasets import Datasets | ||
|
|
@@ -12,6 +13,10 @@ | |
| CreateTaskResponse, | ||
| EvaluatorDetails, | ||
| TaskResponse, | ||
| RunInGithubRequest, | ||
| RunInGithubResponse, | ||
| TaskResult, | ||
| GithubContext, | ||
| ) | ||
| import httpx | ||
|
|
||
|
|
@@ -36,6 +41,7 @@ async def run( | |
| dataset_version: Optional[str] = None, | ||
| evaluators: Optional[List[EvaluatorDetails]] = None, | ||
| experiment_slug: Optional[str] = None, | ||
| experiment_metadata: Optional[Dict[str, Any]] = None, | ||
| related_ref: Optional[Dict[str, str]] = None, | ||
| aux: Optional[Dict[str, str]] = None, | ||
| stop_on_error: bool = False, | ||
|
|
@@ -48,6 +54,7 @@ async def run( | |
| task: Function to run on each dataset row | ||
| evaluators: List of evaluator slugs to run | ||
| experiment_slug: Slug for this experiment run | ||
| experiment_metadata: Metadata for this experiment (an experiment holds all the experiment runs) | ||
nina-kollman marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| related_ref: Related reference for this experiment run | ||
| aux: Auxiliary information for this experiment run | ||
| stop_on_error: Whether to stop on first error (default: False) | ||
|
|
@@ -82,6 +89,7 @@ async def run( | |
| evaluator_slugs=[slug for slug, _ in evaluator_details] | ||
| if evaluator_details | ||
| else None, | ||
| experiment_metadata=experiment_metadata, | ||
| experiment_run_metadata=experiment_run_metadata, | ||
| ) | ||
|
|
||
|
|
@@ -177,6 +185,118 @@ async def run_with_semaphore(row) -> TaskResponse: | |
|
|
||
| return results, errors | ||
|
|
||
async def run_in_github(
    self,
    task: Callable[[Optional[Dict[str, Any]]], Dict[str, Any]],
    dataset_slug: Optional[str] = None,
    dataset_version: Optional[str] = None,
    evaluators: Optional[List[EvaluatorDetails]] = None,
    experiment_slug: Optional[str] = None,
    related_ref: Optional[Dict[str, str]] = None,
) -> RunInGithubResponse:
    """Execute tasks locally and submit results to backend for GitHub CI/CD

    This method:
    1. Validates the GitHub Actions environment
    2. Fetches the dataset (only when both slug and version are given)
    3. Executes all tasks locally
    4. Sends task results to backend
    5. Backend runs evaluators and posts PR comment

    Args:
        task: Function to run on each dataset row
        dataset_slug: Slug of the dataset to use
        dataset_version: Version of the dataset
        evaluators: List of evaluator slugs or (slug, version) tuples to run;
            only the slug of each entry is sent to the backend
        experiment_slug: Slug for this experiment run. When omitted, falls
            back to the instance's configured slug, else a generated
            "exp-<id>" slug
        related_ref: Additional reference information for this experiment run
            (sent as the experiment run metadata)

    Returns:
        RunInGithubResponse built from the backend's response payload

    Raises:
        RuntimeError: If not running in GitHub Actions environment, or if
            GITHUB_REPOSITORY / GITHUB_REF are not set
        Exception: If the API request fails
    """
    # Check if running in GitHub Actions
    if not os.getenv("GITHUB_ACTIONS"):
        raise RuntimeError(
            "run_in_github() can only be used in GitHub Actions CI/CD environment. "
            "To run experiments locally, use the run() method instead."
        )

    # Validate the GitHub context before doing any work, so a misconfigured
    # job fails immediately instead of after every task has been executed.
    repository = os.getenv("GITHUB_REPOSITORY")
    github_ref = os.getenv("GITHUB_REF", "")
    if not repository or not github_ref:
        raise RuntimeError(
            "GITHUB_REPOSITORY and GITHUB_REF must be set in the environment. "
            "To run experiments locally, use the run() method instead."
        )

    if not experiment_slug:
        experiment_slug = self._experiment_slug or "exp-" + str(cuid.cuid())[:11]

    # Fetch dataset rows; without both slug and version no rows are loaded
    # and no tasks run.
    rows = []
    if dataset_slug and dataset_version:
        jsonl_data = self._datasets.get_version_jsonl(dataset_slug, dataset_version)
        rows = self._parse_jsonl_to_rows(jsonl_data)

    task_results = await self._execute_tasks(rows, task)

    # Extract PR number from GITHUB_REF (format: "refs/pull/123/merge").
    # Refs that are not pull-request refs leave pr_url as None.
    server_url = os.getenv("GITHUB_SERVER_URL", "https://github.com")
    pr_url = None
    if github_ref.startswith("refs/pull/"):
        pr_number = github_ref.split("/")[2]
        if pr_number:
            pr_url = f"{server_url}/{repository}/pull/{pr_number}"

    github_context = GithubContext(
        repository=repository,
        pr_url=pr_url,
        commit_hash=os.getenv("GITHUB_SHA", ""),
        actor=os.getenv("GITHUB_ACTOR", ""),
    )

    experiment_metadata = {
        "created_from": "github",
    }

    # Extract evaluator slugs: entries are either a bare slug string or a
    # sequence whose first element is the slug.
    evaluator_slugs = None
    if evaluators:
        evaluator_slugs = [
            evaluator if isinstance(evaluator, str) else evaluator[0]
            for evaluator in evaluators
        ]

    # Prepare request payload
    request_body = RunInGithubRequest(
        experiment_slug=experiment_slug,
        dataset_slug=dataset_slug,
        dataset_version=dataset_version,
        evaluator_slugs=evaluator_slugs,
        task_results=task_results,
        github_context=github_context,
        experiment_metadata=experiment_metadata,
        experiment_run_metadata=related_ref,
    )

    response = self._http_client.post(
        "/experiments/run-in-github",
        request_body.model_dump(mode="json", exclude_none=True),
    )

    # A None response is treated as a failed submission.
    if response is None:
        raise Exception(
            f"Failed to submit experiment '{experiment_slug}' for GitHub execution. "
        )

    return RunInGithubResponse(**response)
|
|
||
| def _init_experiment( | ||
| self, | ||
| experiment_slug: str, | ||
|
|
@@ -239,3 +359,55 @@ def _parse_jsonl_to_rows(self, jsonl_data: str) -> List[Dict[str, Any]]: | |
| continue | ||
|
|
||
| return rows | ||
|
|
||
async def _execute_tasks(
    self,
    rows: List[Dict[str, Any]],
    task: Callable[[Optional[Dict[str, Any]]], Dict[str, Any]],
) -> List[TaskResult]:
    """Execute tasks locally with concurrency control

    Each row produces one TaskResult. An exception raised while running the
    task (or while building its result) is captured as that row's ``error``
    rather than propagated, so one failing row does not abort the others.

    Args:
        rows: List of dataset rows to process
        task: Function to run on each row; its return value is awaited

    Returns:
        List of TaskResult objects (input plus either output or error), in
        the order the tasks finished, which may differ from the order of
        ``rows``
    """
    # Cap on how many rows are processed concurrently.
    semaphore = asyncio.Semaphore(50)

    async def run_row(row: Dict[str, Any]) -> TaskResult:
        async with semaphore:
            try:
                task_output = await task(row)
                return TaskResult(
                    input=row,
                    output=task_output,
                )
            except Exception as e:
                return TaskResult(
                    input=row,
                    error=str(e),
                )

    pending = [asyncio.create_task(run_row(row)) for row in rows]

    # run_row already converts exceptions into error results, so no extra
    # handler is needed here. The previous fallback read a nonexistent
    # ``task_input`` attribute on the awaitable yielded by as_completed and
    # would itself have raised AttributeError if it had ever been reached.
    task_results: List[TaskResult] = []
    for finished in asyncio.as_completed(pending):
        task_results.append(await finished)

    return task_results
nina-kollman marked this conversation as resolved.
Show resolved
Hide resolved
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.