diff --git a/README.md b/README.md
index fda3a6eb..2445b8f0 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ client.prompts.log(
messages=[{"role": "user", "content": "What really happened at Roswell?"}],
inputs={"person": "Trump"},
created_at=datetime.datetime.fromisoformat(
- "2024-07-19 00:29:35.178000+00:00",
+ "2024-07-18 23:29:35.178000+00:00",
),
provider_latency=6.5931549072265625,
output_message={
@@ -88,7 +88,7 @@ async def main() -> None:
],
inputs={"person": "Trump"},
created_at=datetime.datetime.fromisoformat(
- "2024-07-19 00:29:35.178000+00:00",
+ "2024-07-18 23:29:35.178000+00:00",
),
provider_latency=6.5931549072265625,
output_message={
@@ -165,7 +165,6 @@ response = client.prompts.call_stream(
),
source_datapoint_id="string",
trace_parent_id="string",
- batch_id="string",
user="string",
prompts_call_stream_request_environment="string",
save=True,
diff --git a/pyproject.toml b/pyproject.toml
index 279764ab..a62db965 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "humanloop"
-version = "0.8.8"
+version = "0.8.9"
description = ""
readme = "README.md"
authors = []
diff --git a/reference.md b/reference.md
index aae70be1..27cd7ce4 100644
--- a/reference.md
+++ b/reference.md
@@ -56,7 +56,7 @@ client.prompts.log(
messages=[{"role": "user", "content": "What really happened at Roswell?"}],
inputs={"person": "Trump"},
created_at=datetime.datetime.fromisoformat(
- "2024-07-19 00:29:35.178000+00:00",
+ "2024-07-18 23:29:35.178000+00:00",
),
provider_latency=6.5931549072265625,
output_message={
@@ -100,7 +100,7 @@ client.prompts.log(
-
-**evaluation_id:** `typing.Optional[str]` — Unique identifier for the Evaluation Report to associate the Log to.
+**run_id:** `typing.Optional[str]` — Unique identifier for the Run to associate the Log to.
@@ -314,14 +314,6 @@ Controls how the model uses tools. The following options are supported:
-
-**batch_id:** `typing.Optional[str]` — Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist.
-
-
-
-
-
--
-
**user:** `typing.Optional[str]` — End-user ID related to the Log.
@@ -682,7 +674,6 @@ response = client.prompts.call_stream(
),
source_datapoint_id="string",
trace_parent_id="string",
- batch_id="string",
user="string",
prompts_call_stream_request_environment="string",
save=True,
@@ -836,14 +827,6 @@ Controls how the model uses tools. The following options are supported:
-
-**batch_id:** `typing.Optional[str]` — Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist.
-
-
-
-
-
--
-
**user:** `typing.Optional[str]` — End-user ID related to the Log.
@@ -1102,14 +1085,6 @@ Controls how the model uses tools. The following options are supported:
-
-**batch_id:** `typing.Optional[str]` — Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist.
-
-
-
-
-
--
-
**user:** `typing.Optional[str]` — End-user ID related to the Log.
@@ -2525,14 +2500,6 @@ client.tools.log(
-
-**batch_id:** `typing.Optional[str]` — Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist.
-
-
-
-
-
--
-
**user:** `typing.Optional[str]` — End-user ID related to the Log.
@@ -4497,6 +4464,14 @@ client.datasets.list_versions(
-
+**include_datapoints:** `typing.Optional[typing.Literal["latest_committed"]]` — If set to 'latest_committed', include the Datapoints for the latest committed version. Defaults to `None`.
+
+
+
+
+
+-
+
**request_options:** `typing.Optional[RequestOptions]` — Request-specific configuration.
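+
+An illustrative call using the new parameter (the Dataset ID is a placeholder):
+
+```python
+from humanloop import Humanloop
+
+client = Humanloop(
+    api_key="YOUR_API_KEY",
+)
+
+# Include the Datapoints of the latest committed version in the response.
+client.datasets.list_versions(
+    id="ds_456def",
+    include_datapoints="latest_committed",
+)
+```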
@@ -5157,14 +5132,6 @@ client.evaluators.log(
-
-**batch_id:** `typing.Optional[str]` — Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist.
-
-
-
-
-
--
-
**user:** `typing.Optional[str]` — End-user ID related to the Log.
@@ -6258,10 +6225,10 @@ client.flows.log(
output="The patient is likely experiencing a myocardial infarction. Immediate medical attention is required.",
trace_status="incomplete",
start_time=datetime.datetime.fromisoformat(
- "2024-07-08 22:40:35+00:00",
+ "2024-07-08 21:40:35+00:00",
),
end_time=datetime.datetime.fromisoformat(
- "2024-07-08 22:40:39+00:00",
+ "2024-07-08 21:40:39+00:00",
),
)
@@ -6295,7 +6262,7 @@ client.flows.log(
-
-**evaluation_id:** `typing.Optional[str]` — Unique identifier for the Evaluation Report to associate the Log to.
+**run_id:** `typing.Optional[str]` — Unique identifier for the Run to associate the Log to.
@@ -6431,14 +6398,6 @@ client.flows.log(
-
-**batch_id:** `typing.Optional[str]` — Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist.
-
-
-
-
-
--
-
**user:** `typing.Optional[str]` — End-user ID related to the Log.
@@ -8212,16 +8171,10 @@ for page in response.iter_pages():
Create an Evaluation.
-Create a new Evaluation by specifying the Dataset, versions to be
-evaluated (Evaluatees), and which Evaluators to provide judgments.
+Create an Evaluation by specifying the File to evaluate, and a name
+for the Evaluation.
-Humanloop will automatically start generating Logs and running Evaluators where
-`orchestrated=true`. If you own the runtime for the Evaluatee or Evaluator, you
-can set `orchestrated=false` and then generate and submit the required logs using
-your runtime.
-
-To keep updated on the progress of the Evaluation, you can poll the Evaluation using
-the `GET /evaluations/:id` endpoint and check its status.
+You can then add Runs to this Evaluation using the `POST /evaluations/{id}/runs` endpoint.
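+
+As an illustrative sketch (the File ID and Evaluator path are placeholders), creating an Evaluation against an existing Prompt and its Evaluators might look like this:
+
+```python
+from humanloop import Humanloop
+
+client = Humanloop(
+    api_key="YOUR_API_KEY",
+)
+
+# Associate the Evaluation with the Prompt whose Logs will be evaluated.
+client.evaluations.create(
+    name="Accuracy check",
+    file={"id": "pr_789ghi"},
+    evaluators=[{"path": "Shared Evaluators/exact_match"}],
+)
+```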
@@ -8242,11 +8195,7 @@ client = Humanloop(
api_key="YOUR_API_KEY",
)
client.evaluations.create(
- dataset={"version_id": "dsv_6L78pqrdFi2xa"},
- evaluatees=[
- {"version_id": "prv_7ZlQREDScH0xkhUwtXruN", "orchestrated": False}
- ],
- evaluators=[{"version_id": "evv_012def", "orchestrated": False}],
+ evaluators=[{}],
)
```
@@ -8263,7 +8212,7 @@ client.evaluations.create(
-
-**dataset:** `EvaluationsDatasetRequestParams` — Dataset to use in this Evaluation.
+**evaluators:** `typing.Sequence[EvaluationsRequestParams]` — The Evaluators used to evaluate.
@@ -8271,7 +8220,7 @@ client.evaluations.create(
-
-**evaluators:** `typing.Sequence[EvaluationsRequestParams]` — The Evaluators used to evaluate.
+**file:** `typing.Optional[FileRequestParams]` — The File to associate with the Evaluation. This File contains the Logs you're evaluating.
@@ -8279,7 +8228,7 @@ client.evaluations.create(
-
-**evaluatees:** `typing.Optional[typing.Sequence[EvaluateeRequestParams]]` — Unique identifiers for the Prompt/Tool Versions to include in the Evaluation. Can be left unpopulated if you wish to add Evaluatees to this Evaluation by specifying `evaluation_id` in Log calls.
+**name:** `typing.Optional[str]` — Name of the Evaluation to help identify it. Must be unique within the associated File.
@@ -8287,15 +8236,163 @@ client.evaluations.create(
-
-**name:** `typing.Optional[str]` — Name of the Evaluation to help identify it. Must be unique within the associated File.
+**request_options:** `typing.Optional[RequestOptions]` — Request-specific configuration.
+
+
+
+
+
+
+
+
+
+
+client.evaluations.add_evaluators(...)
+
+-
+
+#### 📝 Description
+
+
+-
+
+
+-
+
+Add Evaluators to an Evaluation.
+
+Add new Evaluators to an Evaluation. The Evaluators will be run on the Logs
+generated for the Evaluation.
+
+
+#### 🔌 Usage
+
-
-**file:** `typing.Optional[FileRequestParams]` — The File to associate with the Evaluation.
+
+-
+
+```python
+from humanloop import Humanloop
+
+client = Humanloop(
+ api_key="YOUR_API_KEY",
+)
+client.evaluations.add_evaluators(
+ id="id",
+ evaluators=[{}],
+)
+
+```
+
+
+
+
+
+#### ⚙️ Parameters
+
+
+-
+
+
+-
+
+**id:** `str` — Unique identifier for Evaluation.
+
+
+
+
+
+-
+
+**evaluators:** `typing.Sequence[EvaluationsRequestParams]` — The Evaluators to add to this Evaluation.
+
+
+
+
+
+-
+
+**request_options:** `typing.Optional[RequestOptions]` — Request-specific configuration.
+
+
+
+
+
+
+
+
+
+
+
+client.evaluations.remove_evaluator(...)
+
+-
+
+#### 📝 Description
+
+
+-
+
+
+-
+
+Remove an Evaluator from an Evaluation.
+
+Remove an Evaluator from an Evaluation. The Evaluator will no longer be run on the Logs
+generated for the Evaluation.
+
+
+
+
+
+#### 🔌 Usage
+
+
+-
+
+
+-
+
+```python
+from humanloop import Humanloop
+
+client = Humanloop(
+ api_key="YOUR_API_KEY",
+)
+client.evaluations.remove_evaluator(
+ id="id",
+ evaluator_version_id="evaluator_version_id",
+)
+
+```
+
+
+
+
+
+#### ⚙️ Parameters
+
+
+-
+
+
+-
+
+**id:** `str` — Unique identifier for Evaluation.
+
+
+
+
+
+-
+
+**evaluator_version_id:** `str` — Unique identifier for Evaluator Version.
@@ -8458,7 +8555,7 @@ client.evaluations.delete(
-client.evaluations.update_setup(...)
+client.evaluations.list_runs_for_evaluation(...)
-
@@ -8470,10 +8567,7 @@ client.evaluations.delete(
-
-Update an Evaluation.
-
-Update the setup of an Evaluation by specifying the Dataset, versions to be
-evaluated (Evaluatees), and which Evaluators to provide judgments.
+List all Runs for an Evaluation.
@@ -8493,13 +8587,8 @@ from humanloop import Humanloop
client = Humanloop(
api_key="YOUR_API_KEY",
)
-client.evaluations.update_setup(
- id="ev_567yza",
- dataset={"version_id": "dsv_6L78pqrdFi2xa"},
- evaluatees=[
- {"version_id": "prv_7ZlQREDScH0xkhUwtXruN", "orchestrated": False}
- ],
- evaluators=[{"version_id": "evv_012def", "orchestrated": False}],
+client.evaluations.list_runs_for_evaluation(
+ id="id",
)
```
@@ -8524,7 +8613,87 @@ client.evaluations.update_setup(
-
-**dataset:** `typing.Optional[EvaluationsDatasetRequestParams]` — Dataset to use in this Evaluation.
+**request_options:** `typing.Optional[RequestOptions]` — Request-specific configuration.
+
+
+
+
+
+
+
+
+
+
+
+client.evaluations.create_run(...)
+
+-
+
+#### 📝 Description
+
+
+-
+
+
+-
+
+Create an Evaluation Run.
+
+Create a new Evaluation Run. Optionally specify the Dataset and version to be
+evaluated.
+
+Humanloop will automatically start generating Logs and running Evaluators where
+`orchestrated=true`. If you are generating Logs yourself, you can set `orchestrated=false`
+and then generate and submit the required Logs via the API.
+
+The `logs` parameter controls which Logs are associated with the Run. Defaults to `dynamic`
+if `dataset` and `version` are provided. This means that Logs will automatically be retrieved
+if they're associated with the specified Version and have `source_datapoint_id` referencing
+a Datapoint in the specified Dataset.
+If `logs` is set to `fixed`, no existing Logs will be automatically associated with the Run.
+You can then add Logs to the Run using the `POST /evaluations/{id}/runs/{run_id}/logs` endpoint,
+or by adding `run_id` to your `POST /prompts/logs` requests.
+
+To keep updated on the progress of the Run, you can poll the Run using
+the `GET /evaluations/{id}/runs` endpoint and check its status.
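+
+As a rough sketch (all IDs below are placeholders), a self-orchestrated Run with a fixed set of Logs might look like this:
+
+```python
+from humanloop import Humanloop
+
+client = Humanloop(
+    api_key="YOUR_API_KEY",
+)
+
+# Create a Run whose Logs you will attach yourself.
+run = client.evaluations.create_run(
+    id="ev_123abc",
+    dataset={"file_id": "ds_456def"},
+    logs="fixed",
+    orchestrated=False,
+)
+
+# Attach a Log to the Run by passing `run_id` when logging.
+client.prompts.log(
+    id="pr_789ghi",
+    run_id=run.id,
+    inputs={"person": "Trump"},
+    output_message={"role": "assistant", "content": "..."},
+)
+```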
+
+
+
+
+
+#### 🔌 Usage
+
+
+-
+
+
+-
+
+```python
+from humanloop import Humanloop
+
+client = Humanloop(
+ api_key="YOUR_API_KEY",
+)
+client.evaluations.create_run(
+ id="id",
+)
+
+```
+
+
+
+
+
+#### ⚙️ Parameters
+
+
+-
+
+
+-
+
+**id:** `str` — Unique identifier for Evaluation.
@@ -8532,7 +8701,7 @@ client.evaluations.update_setup(
-
-**evaluatees:** `typing.Optional[typing.Sequence[EvaluateeRequestParams]]` — Unique identifiers for the Prompt/Tool Versions to include in the Evaluation. Can be left unpopulated if you wish to add evaluatees to this Evaluation by specifying `evaluation_id` in Log calls.
+**dataset:** `typing.Optional[EvaluationsDatasetRequestParams]` — Dataset to use in this Run.
@@ -8540,7 +8709,7 @@ client.evaluations.update_setup(
-
-**evaluators:** `typing.Optional[typing.Sequence[EvaluationsRequestParams]]` — The Evaluators used to evaluate.
+**version:** `typing.Optional[VersionSpecificationParams]` — Version to use in this Run.
@@ -8548,7 +8717,7 @@ client.evaluations.update_setup(
-
-**name:** `typing.Optional[str]` — Name of the Evaluation to help identify it. Must be unique within the associated File.
+**orchestrated:** `typing.Optional[bool]` — Whether the Run is orchestrated by Humanloop. If `True`, Humanloop will generate Logs for the Run; `dataset` and `version` must be provided. If `False`, a log for the Prompt/Tool should be submitted by the user via the API.
@@ -8556,7 +8725,7 @@ client.evaluations.update_setup(
-
-**file:** `typing.Optional[FileRequestParams]` — The File to associate with the Evaluation.
+**logs:** `typing.Optional[LogsAssociationType]` — How the Logs are associated with the Run. If `dynamic`, the latest relevant Logs will be inferred from the Dataset and Version. If `fixed`, the Logs will be explicitly associated. You can provide a list of Log IDs to associate with the Run, or add them to the Run later. Defaults to `dynamic` if `dataset` and `version` are provided; otherwise, defaults to `fixed`.
@@ -8576,7 +8745,7 @@ client.evaluations.update_setup(
-client.evaluations.update_status(...)
+client.evaluations.add_existing_run(...)
-
@@ -8588,10 +8757,7 @@ client.evaluations.update_setup(
-
-Update the status of an Evaluation.
-
-Can be used to cancel a running Evaluation, or mark an Evaluation that uses
-external or human evaluators as completed.
+Add an existing Run to an Evaluation.
@@ -8611,9 +8777,9 @@ from humanloop import Humanloop
client = Humanloop(
api_key="YOUR_API_KEY",
)
-client.evaluations.update_status(
+client.evaluations.add_existing_run(
id="id",
- status="pending",
+ run_id="run_id",
)
```
@@ -8638,7 +8804,7 @@ client.evaluations.update_status(
-
-**status:** `EvaluationStatus`
+**run_id:** `str` — Unique identifier for Run.
@@ -8658,7 +8824,7 @@ client.evaluations.update_status(
-client.evaluations.get_stats(...)
+client.evaluations.remove_run_from_evaluation(...)
-
@@ -8670,11 +8836,10 @@ client.evaluations.update_status(
-
-Get Evaluation Stats.
+Remove a Run from an Evaluation.
-Retrieve aggregate stats for the specified Evaluation.
-This includes the number of generated Logs for each evaluated version and the
-corresponding Evaluator statistics (such as the mean and percentiles).
+Remove a Run from an Evaluation. The Logs and Versions used in the Run will not be deleted.
+If this Run is used in any other Evaluations, it will still be available in those Evaluations.
@@ -8694,8 +8859,9 @@ from humanloop import Humanloop
client = Humanloop(
api_key="YOUR_API_KEY",
)
-client.evaluations.get_stats(
+client.evaluations.remove_run_from_evaluation(
id="id",
+ run_id="run_id",
)
```
@@ -8720,6 +8886,14 @@ client.evaluations.get_stats(
-
+**run_id:** `str` — Unique identifier for Run.
+
+
+
+
+
+-
+
**request_options:** `typing.Optional[RequestOptions]` — Request-specific configuration.
@@ -8732,7 +8906,7 @@ client.evaluations.get_stats(
-client.evaluations.get_logs(...)
+client.evaluations.update_evaluation_run(...)
-
@@ -8744,10 +8918,9 @@ client.evaluations.get_stats(
-
-Get the Logs associated to a specific Evaluation.
+Update an Evaluation Run.
-Each Datapoint in your Dataset will have a corresponding Log for each File version evaluated.
-e.g. If you have 50 Datapoints and are evaluating 2 Prompts, there will be 100 Logs associated with the Evaluation.
+Update the Dataset and version to be evaluated for an existing Run.
@@ -8767,8 +8940,10 @@ from humanloop import Humanloop
client = Humanloop(
api_key="YOUR_API_KEY",
)
-client.evaluations.get_logs(
+client.evaluations.update_evaluation_run(
id="id",
+ run_id="run_id",
+ control=True,
)
```
@@ -8785,7 +8960,7 @@ client.evaluations.get_logs(
-
-**id:** `str` — String ID of evaluation. Starts with `ev_` or `evr_`.
+**id:** `str` — Unique identifier for Evaluation.
@@ -8793,7 +8968,7 @@ client.evaluations.get_logs(
-
-**page:** `typing.Optional[int]` — Page number for pagination.
+**run_id:** `str` — Unique identifier for Run.
@@ -8801,7 +8976,7 @@ client.evaluations.get_logs(
-
-**size:** `typing.Optional[int]` — Page size for pagination. Number of Logs to fetch.
+**control:** `bool` — If `True`, this Run will be used as the control in the Evaluation. Stats for other Runs will be compared to this Run. This will replace any existing control Run.
@@ -8821,7 +8996,7 @@ client.evaluations.get_logs(
-client.evaluations.pin_evaluatee(...)
+client.evaluations.add_logs_to_run(...)
-
@@ -8833,10 +9008,11 @@ client.evaluations.get_logs(
-
-Pin the specified Evaluatee.
+Add Logs to an Evaluation Run.
-Pinned Evaluatees are always displayed in the Evaluation Overview,
-and serve as the baseline for comparison with other Evaluatees.
+This is supported only for Runs that have a fixed set of Logs.
+(Runs can either have a fixed set of Logs, or can be set to dynamically retrieve the latest Logs
+if a Dataset and Version are provided.)
@@ -8856,8 +9032,10 @@ from humanloop import Humanloop
client = Humanloop(
api_key="YOUR_API_KEY",
)
-client.evaluations.pin_evaluatee(
+client.evaluations.add_logs_to_run(
id="id",
+ run_id="run_id",
+ log_ids=["log_ids"],
)
```
@@ -8882,7 +9060,7 @@ client.evaluations.pin_evaluatee(
-
-**version_id:** `typing.Optional[str]` — Unique identifier for the File Version. If provided, none of the other fields should be specified.
+**run_id:** `str` — Unique identifier for Run.
@@ -8890,7 +9068,7 @@ client.evaluations.pin_evaluatee(
-
-**path:** `typing.Optional[str]` — Path identifying a File. Provide either this or `file_id` if you want to specify a File.
+**log_ids:** `typing.Sequence[str]` — The IDs of the Logs to add to the Run.
@@ -8898,15 +9076,74 @@ client.evaluations.pin_evaluatee(
-
-**file_id:** `typing.Optional[str]` — Unique identifier for the File. Provide either this or `path` if you want to specify a File.
+**request_options:** `typing.Optional[RequestOptions]` — Request-specific configuration.
+
+
+
+
+
+
+
+
+client.evaluations.get_stats(...)
+
+-
+
+#### 📝 Description
+
+
+-
-
-**environment:** `typing.Optional[str]` — Name of the Environment a Version is deployed to. Only provide this when specifying a File. If not provided (and a File is specified), the default Environment is used.
+Get Evaluation Stats.
+
+Retrieve aggregate stats for the specified Evaluation.
+
+This includes the number of generated Logs for each Run and the
+corresponding Evaluator statistics (such as the mean and percentiles).
+
+
+
+
+
+#### 🔌 Usage
+
+
+-
+
+
+-
+
+```python
+from humanloop import Humanloop
+
+client = Humanloop(
+ api_key="YOUR_API_KEY",
+)
+client.evaluations.get_stats(
+ id="id",
+)
+
+```
+
+
+
+
+
+#### ⚙️ Parameters
+
+
+-
+
+
+-
+
+**id:** `str` — Unique identifier for Evaluation.
@@ -8914,15 +9151,85 @@ client.evaluations.pin_evaluatee(
-
-**batch_id:** `typing.Optional[str]` — Unique identifier for the batch of Logs to include in the Evaluation Report.
+**request_options:** `typing.Optional[RequestOptions]` — Request-specific configuration.
+
+
+
+
+
+
+
+client.evaluations.get_logs(...)
-
-**orchestrated:** `typing.Optional[bool]` — Whether the Prompt/Tool is orchestrated by Humanloop. Default is `True`. If `False`, a log for the Prompt/Tool should be submitted by the user via the API.
+#### 📝 Description
+
+
+-
+
+
+-
+
+Get the Logs associated to a specific Evaluation.
+
+
+
+
+
+#### 🔌 Usage
+
+
+-
+
+
+-
+
+```python
+from humanloop import Humanloop
+
+client = Humanloop(
+ api_key="YOUR_API_KEY",
+)
+client.evaluations.get_logs(
+ id="id",
+)
+
+```
+
+
+
+
+
+#### ⚙️ Parameters
+
+
+-
+
+
+-
+
+**id:** `str` — String ID of evaluation. Starts with `ev_` or `evr_`.
+
+
+
+
+
+-
+
+**page:** `typing.Optional[int]` — Page number for pagination.
+
+
+
+
+
+-
+
+**size:** `typing.Optional[int]` — Page size for pagination. Number of Logs to fetch.
diff --git a/src/humanloop/__init__.py b/src/humanloop/__init__.py
index f782b29d..ac2e2567 100644
--- a/src/humanloop/__init__.py
+++ b/src/humanloop/__init__.py
@@ -27,12 +27,13 @@
DirectoryWithParentsAndChildrenResponseFilesItem,
EnvironmentResponse,
EnvironmentTag,
- EvaluatedVersionResponse,
EvaluateeRequest,
EvaluateeResponse,
EvaluationEvaluatorResponse,
- EvaluationReportLogResponse,
+ EvaluationLogResponse,
EvaluationResponse,
+ EvaluationRunResponse,
+ EvaluationRunsResponse,
EvaluationStats,
EvaluationStatus,
EvaluationsDatasetRequest,
@@ -77,6 +78,7 @@
ListTools,
LlmEvaluatorRequest,
LogResponse,
+ LogsAssociationType,
ModelEndpoints,
ModelProviders,
MonitoringEvaluatorEnvironmentRequest,
@@ -86,7 +88,7 @@
NumericEvaluatorStatsResponse,
ObservabilityStatus,
OverallStats,
- PaginatedDataEvaluationReportLogResponse,
+ PaginatedDataEvaluationLogResponse,
PaginatedDataEvaluatorResponse,
PaginatedDataFlowResponse,
PaginatedDataLogResponse,
@@ -116,6 +118,9 @@
ProviderApiKeys,
ResponseFormat,
ResponseFormatType,
+ RunStatsResponse,
+ RunStatsResponseEvaluatorStatsItem,
+ RunVersionResponse,
SelectEvaluatorStatsResponse,
SortOrder,
TextChatContent,
@@ -139,6 +144,7 @@
VersionIdResponse,
VersionIdResponseVersion,
VersionReferenceResponse,
+ VersionSpecification,
VersionStatsResponse,
VersionStatsResponseEvaluatorVersionStatsItem,
VersionStatus,
@@ -191,12 +197,13 @@
DirectoryWithParentsAndChildrenResponseFilesItemParams,
DirectoryWithParentsAndChildrenResponseParams,
EnvironmentResponseParams,
- EvaluatedVersionResponseParams,
EvaluateeRequestParams,
EvaluateeResponseParams,
EvaluationEvaluatorResponseParams,
- EvaluationReportLogResponseParams,
+ EvaluationLogResponseParams,
EvaluationResponseParams,
+ EvaluationRunResponseParams,
+ EvaluationRunsResponseParams,
EvaluationStatsParams,
EvaluationsDatasetRequestParams,
EvaluationsRequestParams,
@@ -238,7 +245,7 @@
MonitoringEvaluatorVersionRequestParams,
NumericEvaluatorStatsResponseParams,
OverallStatsParams,
- PaginatedDataEvaluationReportLogResponseParams,
+ PaginatedDataEvaluationLogResponseParams,
PaginatedDataEvaluatorResponseParams,
PaginatedDataFlowResponseParams,
PaginatedDataLogResponseParams,
@@ -263,6 +270,9 @@
PromptResponseTemplateParams,
ProviderApiKeysParams,
ResponseFormatParams,
+ RunStatsResponseEvaluatorStatsItemParams,
+ RunStatsResponseParams,
+ RunVersionResponseParams,
SelectEvaluatorStatsResponseParams,
TextChatContentParams,
TextEvaluatorStatsResponseParams,
@@ -279,6 +289,7 @@
VersionIdResponseParams,
VersionIdResponseVersionParams,
VersionReferenceResponseParams,
+ VersionSpecificationParams,
VersionStatsResponseEvaluatorVersionStatsItemParams,
VersionStatsResponseParams,
)
@@ -337,18 +348,20 @@
"EnvironmentResponse",
"EnvironmentResponseParams",
"EnvironmentTag",
- "EvaluatedVersionResponse",
- "EvaluatedVersionResponseParams",
"EvaluateeRequest",
"EvaluateeRequestParams",
"EvaluateeResponse",
"EvaluateeResponseParams",
"EvaluationEvaluatorResponse",
"EvaluationEvaluatorResponseParams",
- "EvaluationReportLogResponse",
- "EvaluationReportLogResponseParams",
+ "EvaluationLogResponse",
+ "EvaluationLogResponseParams",
"EvaluationResponse",
"EvaluationResponseParams",
+ "EvaluationRunResponse",
+ "EvaluationRunResponseParams",
+ "EvaluationRunsResponse",
+ "EvaluationRunsResponseParams",
"EvaluationStats",
"EvaluationStatsParams",
"EvaluationStatus",
@@ -431,6 +444,7 @@
"LlmEvaluatorRequestParams",
"LogResponse",
"LogResponseParams",
+ "LogsAssociationType",
"ModelEndpoints",
"ModelProviders",
"MonitoringEvaluatorEnvironmentRequest",
@@ -445,8 +459,8 @@
"ObservabilityStatus",
"OverallStats",
"OverallStatsParams",
- "PaginatedDataEvaluationReportLogResponse",
- "PaginatedDataEvaluationReportLogResponseParams",
+ "PaginatedDataEvaluationLogResponse",
+ "PaginatedDataEvaluationLogResponseParams",
"PaginatedDataEvaluatorResponse",
"PaginatedDataEvaluatorResponseParams",
"PaginatedDataFlowResponse",
@@ -512,6 +526,12 @@
"ResponseFormat",
"ResponseFormatParams",
"ResponseFormatType",
+ "RunStatsResponse",
+ "RunStatsResponseEvaluatorStatsItem",
+ "RunStatsResponseEvaluatorStatsItemParams",
+ "RunStatsResponseParams",
+ "RunVersionResponse",
+ "RunVersionResponseParams",
"SelectEvaluatorStatsResponse",
"SelectEvaluatorStatsResponseParams",
"SortOrder",
@@ -554,6 +574,8 @@
"VersionIdResponseVersionParams",
"VersionReferenceResponse",
"VersionReferenceResponseParams",
+ "VersionSpecification",
+ "VersionSpecificationParams",
"VersionStatsResponse",
"VersionStatsResponseEvaluatorVersionStatsItem",
"VersionStatsResponseEvaluatorVersionStatsItemParams",
diff --git a/src/humanloop/core/client_wrapper.py b/src/humanloop/core/client_wrapper.py
index 04653533..4282222b 100644
--- a/src/humanloop/core/client_wrapper.py
+++ b/src/humanloop/core/client_wrapper.py
@@ -16,7 +16,7 @@ def get_headers(self) -> typing.Dict[str, str]:
headers: typing.Dict[str, str] = {
"X-Fern-Language": "Python",
"X-Fern-SDK-Name": "humanloop",
- "X-Fern-SDK-Version": "0.8.8",
+ "X-Fern-SDK-Version": "0.8.9",
}
headers["X-API-KEY"] = self.api_key
return headers
diff --git a/src/humanloop/datasets/client.py b/src/humanloop/datasets/client.py
index a4e5dd99..ddfede4d 100644
--- a/src/humanloop/datasets/client.py
+++ b/src/humanloop/datasets/client.py
@@ -602,6 +602,7 @@ def list_versions(
id: str,
*,
status: typing.Optional[VersionStatus] = None,
+ include_datapoints: typing.Optional[typing.Literal["latest_committed"]] = None,
request_options: typing.Optional[RequestOptions] = None,
) -> ListDatasets:
"""
@@ -615,6 +616,9 @@ def list_versions(
status : typing.Optional[VersionStatus]
Filter versions by status: 'uncommitted', 'committed'. If no status is provided, all versions are returned.
+ include_datapoints : typing.Optional[typing.Literal["latest_committed"]]
+ If set to 'latest_committed', include the Datapoints for the latest committed version. Defaults to `None`.
+
request_options : typing.Optional[RequestOptions]
Request-specific configuration.
@@ -640,6 +644,7 @@ def list_versions(
method="GET",
params={
"status": status,
+ "include_datapoints": include_datapoints,
},
request_options=request_options,
)
@@ -1647,6 +1652,7 @@ async def list_versions(
id: str,
*,
status: typing.Optional[VersionStatus] = None,
+ include_datapoints: typing.Optional[typing.Literal["latest_committed"]] = None,
request_options: typing.Optional[RequestOptions] = None,
) -> ListDatasets:
"""
@@ -1660,6 +1666,9 @@ async def list_versions(
status : typing.Optional[VersionStatus]
Filter versions by status: 'uncommitted', 'committed'. If no status is provided, all versions are returned.
+ include_datapoints : typing.Optional[typing.Literal["latest_committed"]]
+ If set to 'latest_committed', include the Datapoints for the latest committed version. Defaults to `None`.
+
request_options : typing.Optional[RequestOptions]
Request-specific configuration.
@@ -1693,6 +1702,7 @@ async def main() -> None:
method="GET",
params={
"status": status,
+ "include_datapoints": include_datapoints,
},
request_options=request_options,
)
diff --git a/src/humanloop/eval_utils.py b/src/humanloop/eval_utils.py
index e5112d19..c6bc3b98 100644
--- a/src/humanloop/eval_utils.py
+++ b/src/humanloop/eval_utils.py
@@ -18,7 +18,6 @@
from typing_extensions import NotRequired, TypedDict
import time
import sys
-import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed
from .client import BaseHumanloop
@@ -41,11 +40,13 @@
from .types import ToolKernelRequest as Tool
from .types import BooleanEvaluatorStatsResponse as BooleanStats
from .types import NumericEvaluatorStatsResponse as NumericStats
-from .types import UpdateDatesetAction as UpdateDatasetAction # TODO: fix original type typo
+from .types import (
+ UpdateDatesetAction as UpdateDatasetAction,
+) # TODO: fix original type typo
from .types import DatapointResponse as Datapoint
from .types import (
EvaluationStats,
- VersionStatsResponse,
+ RunStatsResponse,
EvaluatorArgumentsType,
EvaluatorReturnTypeEnum,
EvaluationResponse,
@@ -61,7 +62,9 @@
if not logger.hasHandlers():
logger.addHandler(console_handler)
-EvaluatorDict = Union[CodeEvaluatorDict, LLMEvaluatorDict, HumanEvaluatorDict, ExternalEvaluator]
+EvaluatorDict = Union[
+ CodeEvaluatorDict, LLMEvaluatorDict, HumanEvaluatorDict, ExternalEvaluator
+]
Version = Union[FlowDict, PromptDict, ToolDict, EvaluatorDict]
FileType = Literal["flow", "prompt", "tool", "evaluator"]
@@ -202,9 +205,13 @@ def _run_eval(
function_ = file.pop("callable")
except KeyError as _:
if type_ == "flow":
- raise ValueError("You must provide a `callable` for your Flow `file` to run a local eval.")
+ raise ValueError(
+ "You must provide a `callable` for your Flow `file` to run a local eval."
+ )
else:
- logger.info(f"No `callable` provided for your {type_} file - will attempt to generate logs on Humanloop.")
+ logger.info(
+ f"No `callable` provided for your {type_} file - will attempt to generate logs on Humanloop."
+ )
custom_logger = file.pop("custom_logger", None)
file_dict = {**file, **version}
@@ -222,7 +229,9 @@ def _run_eval(
try:
_ = Prompt.parse_obj(version)
except ValidationError as error_:
- logger.error(msg=f"Invalid Prompt `version` in your `file` request. \n\nValidation error: \n)")
+ logger.error(
+            msg=f"Invalid Prompt `version` in your `file` request. \n\nValidation error: \n{error_}"
+ )
raise error_
hl_file = client.prompts.upsert(**file_dict)
@@ -230,7 +239,9 @@ def _run_eval(
try:
_ = Tool.parse_obj(version)
except ValidationError as error_:
- logger.error(msg=f"Invalid Tool `version` in your `file` request. \n\nValidation error: \n)")
+ logger.error(
+            msg=f"Invalid Tool `version` in your `file` request. \n\nValidation error: \n{error_}"
+ )
raise error_
hl_file = client.tools.upsert(**file_dict)
@@ -263,7 +274,9 @@ def _run_eval(
attributes={"code": inspect.getsource(eval_function)},
evaluator_type="external",
)
- _ = client.evaluators.upsert(id=evaluator.get("id"), path=evaluator.get("path"), spec=spec)
+ _ = client.evaluators.upsert(
+ id=evaluator.get("id"), path=evaluator.get("path"), spec=spec
+ )
# Validate upfront that the local Evaluators and Dataset fit
requires_target = False
@@ -286,7 +299,6 @@ def _run_eval(
try:
evaluation = client.evaluations.create(
name=name,
- dataset={"file_id": hl_dataset.id},
evaluators=[{"path": e["path"]} for e in evaluators],
file={"id": hl_file.id},
)
@@ -301,15 +313,22 @@ def _run_eval(
if not evaluation:
raise ValueError(f"Evaluation with name {name} not found.")
- # Every run will generate a new batch of logs
- batch_id = uuid.uuid4().hex[:10] # ignore risk of collision
+ # Create a new Run
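+    # Logs will be attached explicitly via `run_id`, so the Run uses a fixed Log set and is not orchestrated by Humanloop.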
+ run = client.evaluations.create_run(
+ id=evaluation.id,
+ dataset={"file_id": hl_dataset.id},
+ logs="fixed",
+ orchestrated=False,
+ )
+
+ # Every Run will generate a new batch of Logs
+ run_id = run.id
log_func = _get_log_func(
client=client,
type_=type_,
file_id=hl_file.id,
version_id=hl_file.version_id,
- evaluation_id=evaluation.id,
- batch_id=batch_id,
+ run_id=run_id,
)
# Define the function to execute your function in parallel and Log to Humanloop
@@ -318,7 +337,9 @@ def process_datapoint(datapoint: Datapoint):
datapoint_dict = datapoint.dict()
try:
if "messages" in datapoint_dict:
- output = function_(**datapoint_dict["inputs"], messages=datapoint_dict["messages"])
+ output = function_(
+ **datapoint_dict["inputs"], messages=datapoint_dict["messages"]
+ )
else:
output = function_(**datapoint_dict["inputs"])
if custom_logger:
@@ -343,7 +364,9 @@ def process_datapoint(datapoint: Datapoint):
start_time=start_time,
end_time=datetime.now(),
)
- logger.warning(msg=f"\nYour {type_}'s `callable` failed for Datapoint: {datapoint.id}. \n Error: {str(e)}")
+ logger.warning(
+ msg=f"\nYour {type_}'s `callable` failed for Datapoint: {datapoint.id}. \n Error: {str(e)}"
+ )
# Apply local Evaluators
for local_evaluator in local_evaluators:
@@ -376,28 +399,35 @@ def process_datapoint(datapoint: Datapoint):
start_time=start_time,
end_time=datetime.now(),
)
- logger.warning(f"\nEvaluator {local_evaluator['path']} failed with error {str(e)}")
+ logger.warning(
+ f"\nEvaluator {local_evaluator['path']} failed with error {str(e)}"
+ )
# Execute the function and send the logs to Humanloop in parallel
total_datapoints = len(hl_dataset.datapoints)
logger.info(f"\n{CYAN}Navigate to your Evaluation:{RESET}\n{evaluation.url}\n")
logger.info(f"{CYAN}{type_.capitalize()} Version ID: {hl_file.version_id}{RESET}")
- logger.info(f"{CYAN}Run ID: {batch_id}{RESET}")
+ logger.info(f"{CYAN}Run ID: {run_id}{RESET}")
# Generate locally if a file `callable` is provided
if function_:
logger.info(
- f"{CYAN}\nRunning {hl_file.name} over the Dataset {hl_dataset.name} using {workers} workers{RESET} "
+ f"{CYAN}\nRunning '{hl_file.name}' over the Dataset '{hl_dataset.name}' using {workers} workers{RESET} "
)
completed_tasks = 0
with ThreadPoolExecutor(max_workers=workers) as executor:
- futures = [executor.submit(process_datapoint, datapoint) for datapoint in hl_dataset.datapoints]
+ futures = [
+ executor.submit(process_datapoint, datapoint)
+ for datapoint in hl_dataset.datapoints
+ ]
for _ in as_completed(futures):
completed_tasks += 1
_progress_bar(total_datapoints, completed_tasks)
else:
# TODO: trigger run when updated API is available
- logger.info(f"{CYAN}\nRunning {hl_file.name} over the Dataset {hl_dataset.name}{RESET}")
+ logger.info(
+ f"{CYAN}\nRunning '{hl_file.name}' over the Dataset '{hl_dataset.name}'{RESET}"
+ )
# Wait for the Evaluation to complete then print the results
complete = False
@@ -413,39 +443,43 @@ def process_datapoint(datapoint: Datapoint):
logger.info(stats.report)
checks: List[EvaluatorCheck] = []
- if all(evaluator.get("threshold") is None for evaluator in evaluators) and len(stats.version_stats) == 1:
- # Skip `check_evaluation_improvement` if no thresholds were provided and there is only one run.
- # (Or the logs would not be helpful)
- return checks
- for evaluator in evaluators:
- _, score, delta = check_evaluation_improvement(
- evaluation=evaluation,
- stats=stats,
- evaluator_path=evaluator["path"],
- batch_id=batch_id,
- )
- threshold_check = None
- threshold = evaluator.get("threshold")
- if threshold is not None:
- threshold_check = check_evaluation_threshold(
+
+ # Skip `check_evaluation_improvement` if no thresholds were provided and there is only one run.
+ # (Or the logs would not be helpful)
+ if (
+ any(evaluator.get("threshold") is not None for evaluator in evaluators)
+ or len(stats.run_stats) > 1
+ ):
+ for evaluator in evaluators:
+ _, score, delta = check_evaluation_improvement(
evaluation=evaluation,
stats=stats,
evaluator_path=evaluator["path"],
- threshold=threshold,
- batch_id=batch_id,
+ run_id=run_id,
)
- checks.append(
- EvaluatorCheck(
- path=evaluator["path"],
- # TODO: Add back in with number valence on Evaluators
- # improvement_check=improvement_check,
- score=score,
- delta=delta,
- threshold=threshold,
- threshold_check=threshold_check,
- evaluation_id=evaluation.id,
+ threshold_check = None
+ threshold = evaluator.get("threshold")
+ if threshold is not None:
+ threshold_check = check_evaluation_threshold(
+ evaluation=evaluation,
+ stats=stats,
+ evaluator_path=evaluator["path"],
+ threshold=threshold,
+ run_id=run_id,
+ )
+ checks.append(
+ EvaluatorCheck(
+ path=evaluator["path"],
+ # TODO: Add back in with number valence on Evaluators
+ # improvement_check=improvement_check,
+ score=score,
+ delta=delta,
+ threshold=threshold,
+ threshold_check=threshold_check,
+ evaluation_id=evaluation.id,
+ )
)
- )
+
logger.info(f"\n{CYAN}View your Evaluation:{RESET}\n{evaluation.url}\n")
return checks
@@ -455,8 +489,7 @@ def _get_log_func(
type_: FileType,
file_id: str,
version_id: str,
- evaluation_id: str,
- batch_id: str,
+ run_id: str,
) -> Callable:
"""Returns the appropriate log function pre-filled with common parameters."""
log_request = {
@@ -464,8 +497,7 @@ def _get_log_func(
# Why are both `id` and `version_id` needed in the API?
"id": file_id,
"version_id": version_id,
- "evaluation_id": evaluation_id,
- "batch_id": batch_id,
+ "run_id": run_id,
}
if type_ == "flow":
return partial(client.flows.log, **log_request, trace_status="complete")
@@ -479,7 +511,9 @@ def _get_log_func(
raise NotImplementedError(f"Unsupported File version: {type_}")
-def get_score_from_evaluator_stat(stat: Union[NumericStats, BooleanStats]) -> Union[float, None]:
+def get_score_from_evaluator_stat(
+ stat: Union[NumericStats, BooleanStats],
+) -> Union[float, None]:
"""Get the score from an Evaluator Stat."""
score = None
if isinstance(stat, BooleanStats):
@@ -526,14 +560,18 @@ def _progress_bar(total: int, progress: int):
def get_evaluator_stats_by_path(
- stat: VersionStatsResponse, evaluation: EvaluationResponse
+ stat: RunStatsResponse, evaluation: EvaluationResponse
) -> Dict[str, Union[NumericStats, BooleanStats]]:
"""Get the Evaluator stats by path."""
# TODO: Update the API so this is not necessary
- evaluators_by_id = {evaluator.version.version_id: evaluator for evaluator in evaluation.evaluators}
+ evaluators_by_id = {
+ evaluator.version.version_id: evaluator for evaluator in evaluation.evaluators
+ }
evaluator_stats_by_path = {
- evaluators_by_id[evaluator_stat.evaluator_version_id].version.path: evaluator_stat
- for evaluator_stat in stat.evaluator_version_stats
+ evaluators_by_id[
+ evaluator_stat.evaluator_version_id
+ ].version.path: evaluator_stat
+ for evaluator_stat in stat.evaluator_stats
}
return evaluator_stats_by_path
@@ -543,12 +581,13 @@ def check_evaluation_threshold(
stats: EvaluationStats,
evaluator_path: str,
threshold: float,
- batch_id: str,
+ run_id: str,
) -> bool:
"""Checks if the latest version has an average Evaluator result above a threshold."""
# TODO: Update the API so this is not necessary
evaluator_stats_by_path = get_evaluator_stats_by_path(
- stat=next((stat for stat in stats.version_stats if stat.batch_id == batch_id), None), evaluation=evaluation
+ stat=next((stat for stat in stats.run_stats if stat.run_id == run_id), None),
+ evaluation=evaluation,
)
if evaluator_path in evaluator_stats_by_path:
evaluator_stat = evaluator_stats_by_path[evaluator_path]
@@ -571,7 +610,7 @@ def check_evaluation_improvement(
evaluation: EvaluationResponse,
evaluator_path: str,
stats: EvaluationStats,
- batch_id: str,
+ run_id: str,
) -> Tuple[bool, float, float]:
"""
Check the latest version has improved across for a specific Evaluator.
@@ -581,24 +620,34 @@ def check_evaluation_improvement(
# TODO: Update the API so this is not necessary
latest_evaluator_stats_by_path = get_evaluator_stats_by_path(
- stat=next((stat for stat in stats.version_stats if stat.batch_id == batch_id), None), evaluation=evaluation
+ stat=next((stat for stat in stats.run_stats if stat.run_id == run_id), None),
+ evaluation=evaluation,
)
- if len(stats.version_stats) == 1:
+ if len(stats.run_stats) == 1:
logger.info(f"{YELLOW}⚠️ No previous versions to compare with.{RESET}")
return True, 0, 0
- previous_evaluator_stats_by_path = get_evaluator_stats_by_path(stat=stats.version_stats[-2], evaluation=evaluation)
- if evaluator_path in latest_evaluator_stats_by_path and evaluator_path in previous_evaluator_stats_by_path:
+ previous_evaluator_stats_by_path = get_evaluator_stats_by_path(
+ stat=stats.run_stats[-2], evaluation=evaluation
+ )
+ if (
+ evaluator_path in latest_evaluator_stats_by_path
+ and evaluator_path in previous_evaluator_stats_by_path
+ ):
latest_evaluator_stat = latest_evaluator_stats_by_path[evaluator_path]
previous_evaluator_stat = previous_evaluator_stats_by_path[evaluator_path]
latest_score = get_score_from_evaluator_stat(stat=latest_evaluator_stat)
previous_score = get_score_from_evaluator_stat(stat=previous_evaluator_stat)
diff = round(latest_score - previous_score, 2)
if diff >= 0:
- logger.info(f"{CYAN}Change of [{diff}] for Evaluator {evaluator_path}{RESET}")
+ logger.info(
+ f"{CYAN}Change of [{diff}] for Evaluator {evaluator_path}{RESET}"
+ )
return True, latest_score, diff
else:
- logger.info(f"{CYAN}Change of [{diff}] for Evaluator {evaluator_path}{RESET}")
+ logger.info(
+ f"{CYAN}Change of [{diff}] for Evaluator {evaluator_path}{RESET}"
+ )
return False, latest_score, diff
else:
raise ValueError(f"Evaluator {evaluator_path} not found in the stats.")
diff --git a/src/humanloop/evaluations/client.py b/src/humanloop/evaluations/client.py
index 92cf4033..c2190762 100644
--- a/src/humanloop/evaluations/client.py
+++ b/src/humanloop/evaluations/client.py
@@ -11,15 +11,17 @@
from ..types.http_validation_error import HttpValidationError
from json.decoder import JSONDecodeError
from ..core.api_error import ApiError
-from ..requests.evaluations_dataset_request import EvaluationsDatasetRequestParams
from ..requests.evaluations_request import EvaluationsRequestParams
-from ..requests.evaluatee_request import EvaluateeRequestParams
from ..requests.file_request import FileRequestParams
from ..core.serialization import convert_and_respect_annotation_metadata
from ..core.jsonable_encoder import jsonable_encoder
-from ..types.evaluation_status import EvaluationStatus
+from ..types.evaluation_runs_response import EvaluationRunsResponse
+from ..requests.evaluations_dataset_request import EvaluationsDatasetRequestParams
+from ..requests.version_specification import VersionSpecificationParams
+from ..types.logs_association_type import LogsAssociationType
+from ..types.evaluation_run_response import EvaluationRunResponse
from ..types.evaluation_stats import EvaluationStats
-from ..types.paginated_data_evaluation_report_log_response import PaginatedDataEvaluationReportLogResponse
+from ..types.paginated_data_evaluation_log_response import PaginatedDataEvaluationLogResponse
from ..core.client_wrapper import AsyncClientWrapper
from ..core.pagination import AsyncPager
@@ -127,44 +129,30 @@ def list(
def create(
self,
*,
- dataset: EvaluationsDatasetRequestParams,
evaluators: typing.Sequence[EvaluationsRequestParams],
- evaluatees: typing.Optional[typing.Sequence[EvaluateeRequestParams]] = OMIT,
- name: typing.Optional[str] = OMIT,
file: typing.Optional[FileRequestParams] = OMIT,
+ name: typing.Optional[str] = OMIT,
request_options: typing.Optional[RequestOptions] = None,
) -> EvaluationResponse:
"""
Create an Evaluation.
- Create a new Evaluation by specifying the Dataset, versions to be
- evaluated (Evaluatees), and which Evaluators to provide judgments.
+ Create an Evaluation by specifying the File to evaluate, and a name
+ for the Evaluation.
- Humanloop will automatically start generating Logs and running Evaluators where
- `orchestrated=true`. If you own the runtime for the Evaluatee or Evaluator, you
- can set `orchestrated=false` and then generate and submit the required logs using
- your runtime.
-
- To keep updated on the progress of the Evaluation, you can poll the Evaluation using
- the `GET /evaluations/:id` endpoint and check its status.
+ You can then add Runs to this Evaluation using the `POST /evaluations/{id}/runs` endpoint.
Parameters
----------
- dataset : EvaluationsDatasetRequestParams
- Dataset to use in this Evaluation.
-
evaluators : typing.Sequence[EvaluationsRequestParams]
The Evaluators used to evaluate.
- evaluatees : typing.Optional[typing.Sequence[EvaluateeRequestParams]]
- Unique identifiers for the Prompt/Tool Versions to include in the Evaluation. Can be left unpopulated if you wish to add Evaluatees to this Evaluation by specifying `evaluation_id` in Log calls.
+ file : typing.Optional[FileRequestParams]
+ The File to associate with the Evaluation. This File contains the Logs you're evaluating.
name : typing.Optional[str]
Name of the Evaluation to help identify it. Must be unique within the associated File.
- file : typing.Optional[FileRequestParams]
- The File to associate with the Evaluation.
-
request_options : typing.Optional[RequestOptions]
Request-specific configuration.
@@ -181,30 +169,20 @@ def create(
api_key="YOUR_API_KEY",
)
client.evaluations.create(
- dataset={"version_id": "dsv_6L78pqrdFi2xa"},
- evaluatees=[
- {"version_id": "prv_7ZlQREDScH0xkhUwtXruN", "orchestrated": False}
- ],
- evaluators=[{"version_id": "evv_012def", "orchestrated": False}],
+ evaluators=[{}],
)
"""
_response = self._client_wrapper.httpx_client.request(
"evaluations",
method="POST",
json={
- "dataset": convert_and_respect_annotation_metadata(
- object_=dataset, annotation=EvaluationsDatasetRequestParams, direction="write"
- ),
- "evaluatees": convert_and_respect_annotation_metadata(
- object_=evaluatees, annotation=typing.Sequence[EvaluateeRequestParams], direction="write"
+ "file": convert_and_respect_annotation_metadata(
+ object_=file, annotation=FileRequestParams, direction="write"
),
+ "name": name,
"evaluators": convert_and_respect_annotation_metadata(
object_=evaluators, annotation=typing.Sequence[EvaluationsRequestParams], direction="write"
),
- "name": name,
- "file": convert_and_respect_annotation_metadata(
- object_=file, annotation=FileRequestParams, direction="write"
- ),
},
request_options=request_options,
omit=OMIT,
@@ -233,15 +211,27 @@ def create(
raise ApiError(status_code=_response.status_code, body=_response.text)
raise ApiError(status_code=_response.status_code, body=_response_json)
- def get(self, id: str, *, request_options: typing.Optional[RequestOptions] = None) -> EvaluationResponse:
+ def add_evaluators(
+ self,
+ id: str,
+ *,
+ evaluators: typing.Sequence[EvaluationsRequestParams],
+ request_options: typing.Optional[RequestOptions] = None,
+ ) -> EvaluationResponse:
"""
- Get an Evaluation.
+ Add Evaluators to an Evaluation.
+
+ Add new Evaluators to an Evaluation. The Evaluators will be run on the Logs
+ generated for the Evaluation.
Parameters
----------
id : str
Unique identifier for Evaluation.
+ evaluators : typing.Sequence[EvaluationsRequestParams]
+ The Evaluators to add to this Evaluation.
+
request_options : typing.Optional[RequestOptions]
Request-specific configuration.
@@ -257,14 +247,21 @@ def get(self, id: str, *, request_options: typing.Optional[RequestOptions] = Non
client = Humanloop(
api_key="YOUR_API_KEY",
)
- client.evaluations.get(
- id="ev_567yza",
+ client.evaluations.add_evaluators(
+ id="id",
+ evaluators=[{}],
)
"""
_response = self._client_wrapper.httpx_client.request(
- f"evaluations/{jsonable_encoder(id)}",
- method="GET",
+ f"evaluations/{jsonable_encoder(id)}/evaluators",
+ method="POST",
+ json={
+ "evaluators": convert_and_respect_annotation_metadata(
+ object_=evaluators, annotation=typing.Sequence[EvaluationsRequestParams], direction="write"
+ ),
+ },
request_options=request_options,
+ omit=OMIT,
)
try:
if 200 <= _response.status_code < 300:
@@ -290,24 +287,30 @@ def get(self, id: str, *, request_options: typing.Optional[RequestOptions] = Non
raise ApiError(status_code=_response.status_code, body=_response.text)
raise ApiError(status_code=_response.status_code, body=_response_json)
- def delete(self, id: str, *, request_options: typing.Optional[RequestOptions] = None) -> None:
+ def remove_evaluator(
+ self, id: str, evaluator_version_id: str, *, request_options: typing.Optional[RequestOptions] = None
+ ) -> EvaluationResponse:
"""
- Delete an Evaluation.
+ Remove an Evaluator from an Evaluation.
- Remove an Evaluation from Humanloop. The Logs and Versions used in the Evaluation
- will not be deleted.
+ Remove an Evaluator from an Evaluation. The Evaluator will no longer be run on the Logs
+ generated for the Evaluation.
Parameters
----------
id : str
Unique identifier for Evaluation.
+ evaluator_version_id : str
+ Unique identifier for Evaluator Version.
+
request_options : typing.Optional[RequestOptions]
Request-specific configuration.
Returns
-------
- None
+ EvaluationResponse
+ Successful Response
Examples
--------
@@ -316,18 +319,25 @@ def delete(self, id: str, *, request_options: typing.Optional[RequestOptions] =
client = Humanloop(
api_key="YOUR_API_KEY",
)
- client.evaluations.delete(
- id="ev_567yza",
+ client.evaluations.remove_evaluator(
+ id="id",
+ evaluator_version_id="evaluator_version_id",
)
"""
_response = self._client_wrapper.httpx_client.request(
- f"evaluations/{jsonable_encoder(id)}",
+ f"evaluations/{jsonable_encoder(id)}/evaluators/{jsonable_encoder(evaluator_version_id)}",
method="DELETE",
request_options=request_options,
)
try:
if 200 <= _response.status_code < 300:
- return
+ return typing.cast(
+ EvaluationResponse,
+ construct_type(
+ type_=EvaluationResponse, # type: ignore
+ object_=_response.json(),
+ ),
+ )
if _response.status_code == 422:
raise UnprocessableEntityError(
typing.cast(
@@ -343,43 +353,15 @@ def delete(self, id: str, *, request_options: typing.Optional[RequestOptions] =
raise ApiError(status_code=_response.status_code, body=_response.text)
raise ApiError(status_code=_response.status_code, body=_response_json)
- def update_setup(
- self,
- id: str,
- *,
- dataset: typing.Optional[EvaluationsDatasetRequestParams] = OMIT,
- evaluatees: typing.Optional[typing.Sequence[EvaluateeRequestParams]] = OMIT,
- evaluators: typing.Optional[typing.Sequence[EvaluationsRequestParams]] = OMIT,
- name: typing.Optional[str] = OMIT,
- file: typing.Optional[FileRequestParams] = OMIT,
- request_options: typing.Optional[RequestOptions] = None,
- ) -> EvaluationResponse:
+ def get(self, id: str, *, request_options: typing.Optional[RequestOptions] = None) -> EvaluationResponse:
"""
- Update an Evaluation.
-
- Update the setup of an Evaluation by specifying the Dataset, versions to be
- evaluated (Evaluatees), and which Evaluators to provide judgments.
+ Get an Evaluation.
Parameters
----------
id : str
Unique identifier for Evaluation.
- dataset : typing.Optional[EvaluationsDatasetRequestParams]
- Dataset to use in this Evaluation.
-
- evaluatees : typing.Optional[typing.Sequence[EvaluateeRequestParams]]
- Unique identifiers for the Prompt/Tool Versions to include in the Evaluation. Can be left unpopulated if you wish to add evaluatees to this Evaluation by specifying `evaluation_id` in Log calls.
-
- evaluators : typing.Optional[typing.Sequence[EvaluationsRequestParams]]
- The Evaluators used to evaluate.
-
- name : typing.Optional[str]
- Name of the Evaluation to help identify it. Must be unique within the associated File.
-
- file : typing.Optional[FileRequestParams]
- The File to associate with the Evaluation.
-
request_options : typing.Optional[RequestOptions]
Request-specific configuration.
@@ -395,35 +377,14 @@ def update_setup(
client = Humanloop(
api_key="YOUR_API_KEY",
)
- client.evaluations.update_setup(
+ client.evaluations.get(
id="ev_567yza",
- dataset={"version_id": "dsv_6L78pqrdFi2xa"},
- evaluatees=[
- {"version_id": "prv_7ZlQREDScH0xkhUwtXruN", "orchestrated": False}
- ],
- evaluators=[{"version_id": "evv_012def", "orchestrated": False}],
)
"""
_response = self._client_wrapper.httpx_client.request(
f"evaluations/{jsonable_encoder(id)}",
- method="PATCH",
- json={
- "dataset": convert_and_respect_annotation_metadata(
- object_=dataset, annotation=EvaluationsDatasetRequestParams, direction="write"
- ),
- "evaluatees": convert_and_respect_annotation_metadata(
- object_=evaluatees, annotation=typing.Sequence[EvaluateeRequestParams], direction="write"
- ),
- "evaluators": convert_and_respect_annotation_metadata(
- object_=evaluators, annotation=typing.Sequence[EvaluationsRequestParams], direction="write"
- ),
- "name": name,
- "file": convert_and_respect_annotation_metadata(
- object_=file, annotation=FileRequestParams, direction="write"
- ),
- },
+ method="GET",
request_options=request_options,
- omit=OMIT,
)
try:
if 200 <= _response.status_code < 300:
@@ -449,29 +410,24 @@ def update_setup(
raise ApiError(status_code=_response.status_code, body=_response.text)
raise ApiError(status_code=_response.status_code, body=_response_json)
- def update_status(
- self, id: str, *, status: EvaluationStatus, request_options: typing.Optional[RequestOptions] = None
- ) -> EvaluationResponse:
+ def delete(self, id: str, *, request_options: typing.Optional[RequestOptions] = None) -> None:
"""
- Update the status of an Evaluation.
+ Delete an Evaluation.
- Can be used to cancel a running Evaluation, or mark an Evaluation that uses
- external or human evaluators as completed.
+ Remove an Evaluation from Humanloop. The Logs and Versions used in the Evaluation
+ will not be deleted.
Parameters
----------
id : str
Unique identifier for Evaluation.
- status : EvaluationStatus
-
request_options : typing.Optional[RequestOptions]
Request-specific configuration.
Returns
-------
- EvaluationResponse
- Successful Response
+ None
Examples
--------
@@ -480,29 +436,18 @@ def update_status(
client = Humanloop(
api_key="YOUR_API_KEY",
)
- client.evaluations.update_status(
- id="id",
- status="pending",
+ client.evaluations.delete(
+ id="ev_567yza",
)
"""
_response = self._client_wrapper.httpx_client.request(
- f"evaluations/{jsonable_encoder(id)}/status",
- method="PATCH",
- json={
- "status": status,
- },
+ f"evaluations/{jsonable_encoder(id)}",
+ method="DELETE",
request_options=request_options,
- omit=OMIT,
)
try:
if 200 <= _response.status_code < 300:
- return typing.cast(
- EvaluationResponse,
- construct_type(
- type_=EvaluationResponse, # type: ignore
- object_=_response.json(),
- ),
- )
+ return
if _response.status_code == 422:
raise UnprocessableEntityError(
typing.cast(
@@ -518,13 +463,11 @@ def update_status(
raise ApiError(status_code=_response.status_code, body=_response.text)
raise ApiError(status_code=_response.status_code, body=_response_json)
- def get_stats(self, id: str, *, request_options: typing.Optional[RequestOptions] = None) -> EvaluationStats:
+ def list_runs_for_evaluation(
+ self, id: str, *, request_options: typing.Optional[RequestOptions] = None
+ ) -> EvaluationRunsResponse:
"""
- Get Evaluation Stats.
-
- Retrieve aggregate stats for the specified Evaluation.
- This includes the number of generated Logs for each evaluated version and the
- corresponding Evaluator statistics (such as the mean and percentiles).
+ List all Runs for an Evaluation.
Parameters
----------
@@ -536,7 +479,7 @@ def get_stats(self, id: str, *, request_options: typing.Optional[RequestOptions]
Returns
-------
- EvaluationStats
+ EvaluationRunsResponse
Successful Response
Examples
@@ -546,21 +489,21 @@ def get_stats(self, id: str, *, request_options: typing.Optional[RequestOptions]
client = Humanloop(
api_key="YOUR_API_KEY",
)
- client.evaluations.get_stats(
+ client.evaluations.list_runs_for_evaluation(
id="id",
)
"""
_response = self._client_wrapper.httpx_client.request(
- f"evaluations/{jsonable_encoder(id)}/stats",
+ f"evaluations/{jsonable_encoder(id)}/runs",
method="GET",
request_options=request_options,
)
try:
if 200 <= _response.status_code < 300:
return typing.cast(
- EvaluationStats,
+ EvaluationRunsResponse,
construct_type(
- type_=EvaluationStats, # type: ignore
+ type_=EvaluationRunsResponse, # type: ignore
object_=_response.json(),
),
)
@@ -579,37 +522,60 @@ def get_stats(self, id: str, *, request_options: typing.Optional[RequestOptions]
raise ApiError(status_code=_response.status_code, body=_response.text)
raise ApiError(status_code=_response.status_code, body=_response_json)
- def get_logs(
+ def create_run(
self,
id: str,
*,
- page: typing.Optional[int] = None,
- size: typing.Optional[int] = None,
+ dataset: typing.Optional[EvaluationsDatasetRequestParams] = OMIT,
+ version: typing.Optional[VersionSpecificationParams] = OMIT,
+ orchestrated: typing.Optional[bool] = OMIT,
+ logs: typing.Optional[LogsAssociationType] = OMIT,
request_options: typing.Optional[RequestOptions] = None,
- ) -> PaginatedDataEvaluationReportLogResponse:
+ ) -> EvaluationRunResponse:
"""
- Get the Logs associated to a specific Evaluation.
+ Create an Evaluation Run.
- Each Datapoint in your Dataset will have a corresponding Log for each File version evaluated.
- e.g. If you have 50 Datapoints and are evaluating 2 Prompts, there will be 100 Logs associated with the Evaluation.
+ Create a new Evaluation Run. Optionally specify the Dataset and version to be
+ evaluated.
+
+ Humanloop will automatically start generating Logs and running Evaluators where
+ `orchestrated=true`. If you are generating Logs yourself, you can set `orchestrated=false`
+ and then generate and submit the required Logs via the API.
+
+ The `logs` parameter controls which Logs are associated with the Run. Defaults to `dynamic`
+ if `dataset` and `version` are provided. This means that Logs will automatically be retrieved
+        if they're associated with the specified Version and have `source_datapoint_id` referencing
+        a Datapoint in the specified Dataset.
+ If `logs` is set to `fixed`, no existing Logs will be automatically associated with the Run.
+ You can then add Logs to the Run using the `POST /evaluations/{id}/runs/{run_id}/logs` endpoint,
+ or by adding `run_id` to your `POST /prompts/logs` requests.
+
+ To keep updated on the progress of the Run, you can poll the Run using
+ the `GET /evaluations/{id}/runs` endpoint and check its status.
Parameters
----------
id : str
- String ID of evaluation. Starts with `ev_` or `evr_`.
+ Unique identifier for Evaluation.
- page : typing.Optional[int]
- Page number for pagination.
+ dataset : typing.Optional[EvaluationsDatasetRequestParams]
+ Dataset to use in this Run.
- size : typing.Optional[int]
- Page size for pagination. Number of Logs to fetch.
+ version : typing.Optional[VersionSpecificationParams]
+ Version to use in this Run.
+
+ orchestrated : typing.Optional[bool]
+ Whether the Run is orchestrated by Humanloop. If `True`, Humanloop will generate Logs for the Run; `dataset` and `version` must be provided. If `False`, Logs for the evaluated Prompt/Tool should be submitted by the user via the API.
+
+ logs : typing.Optional[LogsAssociationType]
+ How the Logs are associated with the Run. If `dynamic`, the latest relevant Logs will be inferred from the Dataset and Version. If `fixed`, the Logs will be explicitly associated. You can provide a list of Log IDs to associate with the Run, or add them to the Run later. Defaults to `dynamic` if `dataset` and `version` are provided; otherwise, defaults to `fixed`.
request_options : typing.Optional[RequestOptions]
Request-specific configuration.
Returns
-------
- PaginatedDataEvaluationReportLogResponse
+ EvaluationRunResponse
Successful Response
Examples
@@ -619,25 +585,32 @@ def get_logs(
client = Humanloop(
api_key="YOUR_API_KEY",
)
- client.evaluations.get_logs(
+ client.evaluations.create_run(
id="id",
)
"""
_response = self._client_wrapper.httpx_client.request(
- f"evaluations/{jsonable_encoder(id)}/logs",
- method="GET",
- params={
- "page": page,
- "size": size,
+ f"evaluations/{jsonable_encoder(id)}/runs",
+ method="POST",
+ json={
+ "dataset": convert_and_respect_annotation_metadata(
+ object_=dataset, annotation=EvaluationsDatasetRequestParams, direction="write"
+ ),
+ "version": convert_and_respect_annotation_metadata(
+ object_=version, annotation=VersionSpecificationParams, direction="write"
+ ),
+ "orchestrated": orchestrated,
+ "logs": logs,
},
request_options=request_options,
+ omit=OMIT,
)
try:
if 200 <= _response.status_code < 300:
return typing.cast(
- PaginatedDataEvaluationReportLogResponse,
+ EvaluationRunResponse,
construct_type(
- type_=PaginatedDataEvaluationReportLogResponse, # type: ignore
+ type_=EvaluationRunResponse, # type: ignore
object_=_response.json(),
),
)
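A minimal sketch of the Run workflow described in the docstring above, using the synchronous client. IDs and the Prompt path are placeholders, and omitting `dataset`/`version` relies on the documented default of a fixed set of Logs when the Run is not orchestrated:

    from humanloop import Humanloop

    client = Humanloop(api_key="YOUR_API_KEY")

    # Self-managed Run: Humanloop does not generate Logs, and with no
    # Dataset/version the Run defaults to a fixed set of Logs.
    run = client.evaluations.create_run(
        id="ev_567yza",      # placeholder Evaluation ID
        orchestrated=False,
    )

    # Generate your own Logs and tie them to the Run via `run_id`
    # (see the prompts client changes later in this diff).
    client.prompts.log(
        path="folder/name",  # illustrative Prompt path
        run_id="run_id",     # placeholder; use the ID of the Run created above
        messages=[{"role": "user", "content": "What really happened at Roswell?"}],
        inputs={"person": "Trump"},
        output_message={"role": "assistant", "content": "..."},
    )

    # Poll the Runs endpoint to check the Run's status.
    runs = client.evaluations.list_runs_for_evaluation(id="ev_567yza")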
@@ -656,53 +629,26 @@ def get_logs(
raise ApiError(status_code=_response.status_code, body=_response.text)
raise ApiError(status_code=_response.status_code, body=_response_json)
- def pin_evaluatee(
- self,
- id: str,
- *,
- version_id: typing.Optional[str] = OMIT,
- path: typing.Optional[str] = OMIT,
- file_id: typing.Optional[str] = OMIT,
- environment: typing.Optional[str] = OMIT,
- batch_id: typing.Optional[str] = OMIT,
- orchestrated: typing.Optional[bool] = OMIT,
- request_options: typing.Optional[RequestOptions] = None,
- ) -> EvaluationResponse:
+ def add_existing_run(
+ self, id: str, run_id: str, *, request_options: typing.Optional[RequestOptions] = None
+ ) -> typing.Optional[typing.Any]:
"""
- Pin the specified Evaluatee.
-
- Pinned Evaluatees are always displayed in the Evaluation Overview,
- and serve as the baseline for comparison with other Evaluatees.
+ Add an existing Run to an Evaluation.
Parameters
----------
id : str
Unique identifier for Evaluation.
- version_id : typing.Optional[str]
- Unique identifier for the File Version. If provided, none of the other fields should be specified.
-
- path : typing.Optional[str]
- Path identifying a File. Provide either this or `file_id` if you want to specify a File.
-
- file_id : typing.Optional[str]
- Unique identifier for the File. Provide either this or `path` if you want to specify a File.
-
- environment : typing.Optional[str]
- Name of the Environment a Version is deployed to. Only provide this when specifying a File. If not provided (and a File is specified), the default Environment is used.
-
- batch_id : typing.Optional[str]
- Unique identifier for the batch of Logs to include in the Evaluation Report.
-
- orchestrated : typing.Optional[bool]
- Whether the Prompt/Tool is orchestrated by Humanloop. Default is `True`. If `False`, a log for the Prompt/Tool should be submitted by the user via the API.
+ run_id : str
+ Unique identifier for Run.
request_options : typing.Optional[RequestOptions]
Request-specific configuration.
Returns
-------
- EvaluationResponse
+ typing.Optional[typing.Any]
Successful Response
Examples
@@ -712,30 +658,22 @@ def pin_evaluatee(
client = Humanloop(
api_key="YOUR_API_KEY",
)
- client.evaluations.pin_evaluatee(
+ client.evaluations.add_existing_run(
id="id",
+ run_id="run_id",
)
"""
_response = self._client_wrapper.httpx_client.request(
- f"evaluations/{jsonable_encoder(id)}/pin-evaluatee",
+ f"evaluations/{jsonable_encoder(id)}/runs/{jsonable_encoder(run_id)}",
method="POST",
- json={
- "version_id": version_id,
- "path": path,
- "file_id": file_id,
- "environment": environment,
- "batch_id": batch_id,
- "orchestrated": orchestrated,
- },
request_options=request_options,
- omit=OMIT,
)
try:
if 200 <= _response.status_code < 300:
return typing.cast(
- EvaluationResponse,
+ typing.Optional[typing.Any],
construct_type(
- type_=EvaluationResponse, # type: ignore
+ type_=typing.Optional[typing.Any], # type: ignore
object_=_response.json(),
),
)
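For illustration only (placeholder IDs): Runs can be shared between Evaluations, so an existing Run created under one Evaluation can be attached to another for comparison:

    from humanloop import Humanloop

    client = Humanloop(api_key="YOUR_API_KEY")

    client.evaluations.add_existing_run(
        id="id",          # the Evaluation to attach the Run to
        run_id="run_id",  # a Run created under another Evaluation
    )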
@@ -754,97 +692,889 @@ def pin_evaluatee(
raise ApiError(status_code=_response.status_code, body=_response.text)
raise ApiError(status_code=_response.status_code, body=_response_json)
-
-class AsyncEvaluationsClient:
- def __init__(self, *, client_wrapper: AsyncClientWrapper):
- self._client_wrapper = client_wrapper
-
- async def list(
- self,
- *,
- file_id: str,
- page: typing.Optional[int] = None,
- size: typing.Optional[int] = None,
- request_options: typing.Optional[RequestOptions] = None,
- ) -> AsyncPager[EvaluationResponse]:
+ def remove_run_from_evaluation(
+ self, id: str, run_id: str, *, request_options: typing.Optional[RequestOptions] = None
+ ) -> None:
"""
- List all Evaluations for the specified `file_id`.
+ Remove a Run from an Evaluation.
- Retrieve a list of Evaluations that evaluate versions of the specified File.
+ Remove a Run from an Evaluation. The Logs and Versions used in the Run will not be deleted.
+ If this Run is used in any other Evaluations, it will still be available in those Evaluations.
Parameters
----------
- file_id : str
- Filter by File ID. Only Evaluations for the specified File will be returned.
-
- page : typing.Optional[int]
- Page number for pagination.
+ id : str
+ Unique identifier for Evaluation.
- size : typing.Optional[int]
- Page size for pagination. Number of Evaluations to fetch.
+ run_id : str
+ Unique identifier for Run.
request_options : typing.Optional[RequestOptions]
Request-specific configuration.
Returns
-------
- AsyncPager[EvaluationResponse]
- Successful Response
+ None
Examples
--------
- import asyncio
-
- from humanloop import AsyncHumanloop
+ from humanloop import Humanloop
- client = AsyncHumanloop(
+ client = Humanloop(
api_key="YOUR_API_KEY",
)
-
-
- async def main() -> None:
- response = await client.evaluations.list(
- file_id="pr_30gco7dx6JDq4200GVOHa",
- size=1,
- )
- async for item in response:
- yield item
- # alternatively, you can paginate page-by-page
- async for page in response.iter_pages():
- yield page
-
-
- asyncio.run(main())
+ client.evaluations.remove_run_from_evaluation(
+ id="id",
+ run_id="run_id",
+ )
"""
- page = page if page is not None else 1
- _response = await self._client_wrapper.httpx_client.request(
- "evaluations",
- method="GET",
- params={
- "file_id": file_id,
- "page": page,
- "size": size,
- },
+ _response = self._client_wrapper.httpx_client.request(
+ f"evaluations/{jsonable_encoder(id)}/runs/{jsonable_encoder(run_id)}",
+ method="DELETE",
request_options=request_options,
)
try:
if 200 <= _response.status_code < 300:
- _parsed_response = typing.cast(
- PaginatedEvaluationResponse,
+ return
+ if _response.status_code == 422:
+ raise UnprocessableEntityError(
+ typing.cast(
+ HttpValidationError,
+ construct_type(
+ type_=HttpValidationError, # type: ignore
+ object_=_response.json(),
+ ),
+ )
+ )
+ _response_json = _response.json()
+ except JSONDecodeError:
+ raise ApiError(status_code=_response.status_code, body=_response.text)
+ raise ApiError(status_code=_response.status_code, body=_response_json)
+
+ def update_evaluation_run(
+ self, id: str, run_id: str, *, control: bool, request_options: typing.Optional[RequestOptions] = None
+ ) -> EvaluationRunResponse:
+ """
+ Update an Evaluation Run.
+
+ Specify whether this Run should be used as the control for the Evaluation.
+
+ Parameters
+ ----------
+ id : str
+ Unique identifier for Evaluation.
+
+ run_id : str
+ Unique identifier for Run.
+
+ control : bool
+ If `True`, this Run will be used as the control in the Evaluation. Stats for other Runs will be compared to this Run. This will replace any existing control Run.
+
+ request_options : typing.Optional[RequestOptions]
+ Request-specific configuration.
+
+ Returns
+ -------
+ EvaluationRunResponse
+ Successful Response
+
+ Examples
+ --------
+ from humanloop import Humanloop
+
+ client = Humanloop(
+ api_key="YOUR_API_KEY",
+ )
+ client.evaluations.update_evaluation_run(
+ id="id",
+ run_id="run_id",
+ control=True,
+ )
+ """
+ _response = self._client_wrapper.httpx_client.request(
+ f"evaluations/{jsonable_encoder(id)}/runs/{jsonable_encoder(run_id)}",
+ method="PATCH",
+ json={
+ "control": control,
+ },
+ request_options=request_options,
+ omit=OMIT,
+ )
+ try:
+ if 200 <= _response.status_code < 300:
+ return typing.cast(
+ EvaluationRunResponse,
+ construct_type(
+ type_=EvaluationRunResponse, # type: ignore
+ object_=_response.json(),
+ ),
+ )
+ if _response.status_code == 422:
+ raise UnprocessableEntityError(
+ typing.cast(
+ HttpValidationError,
+ construct_type(
+ type_=HttpValidationError, # type: ignore
+ object_=_response.json(),
+ ),
+ )
+ )
+ _response_json = _response.json()
+ except JSONDecodeError:
+ raise ApiError(status_code=_response.status_code, body=_response.text)
+ raise ApiError(status_code=_response.status_code, body=_response_json)
+
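A small sketch of marking a Run as the control (placeholder IDs); per the parameter description, stats for the other Runs are then compared against it:

    from humanloop import Humanloop

    client = Humanloop(api_key="YOUR_API_KEY")

    client.evaluations.update_evaluation_run(
        id="id",
        run_id="run_id",
        control=True,  # replaces any existing control Run
    )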
+ def add_logs_to_run(
+ self,
+ id: str,
+ run_id: str,
+ *,
+ log_ids: typing.Sequence[str],
+ request_options: typing.Optional[RequestOptions] = None,
+ ) -> EvaluationRunResponse:
+ """
+ Add Logs to an Evaluation Run.
+
+ This is supported only for Runs that have a fixed set of Logs.
+ (Runs can either have a fixed set of Logs, or can be set to dynamically retrieve the latest Logs
+ if a Dataset and Version are provided.)
+
+ Parameters
+ ----------
+ id : str
+ Unique identifier for Evaluation.
+
+ run_id : str
+ Unique identifier for Run.
+
+ log_ids : typing.Sequence[str]
+ The IDs of the Logs to add to the Run.
+
+ request_options : typing.Optional[RequestOptions]
+ Request-specific configuration.
+
+ Returns
+ -------
+ EvaluationRunResponse
+ Successful Response
+
+ Examples
+ --------
+ from humanloop import Humanloop
+
+ client = Humanloop(
+ api_key="YOUR_API_KEY",
+ )
+ client.evaluations.add_logs_to_run(
+ id="id",
+ run_id="run_id",
+ log_ids=["log_ids"],
+ )
+ """
+ _response = self._client_wrapper.httpx_client.request(
+ f"evaluations/{jsonable_encoder(id)}/runs/{jsonable_encoder(run_id)}/logs",
+ method="POST",
+ json={
+ "log_ids": log_ids,
+ },
+ request_options=request_options,
+ omit=OMIT,
+ )
+ try:
+ if 200 <= _response.status_code < 300:
+ return typing.cast(
+ EvaluationRunResponse,
+ construct_type(
+ type_=EvaluationRunResponse, # type: ignore
+ object_=_response.json(),
+ ),
+ )
+ if _response.status_code == 422:
+ raise UnprocessableEntityError(
+ typing.cast(
+ HttpValidationError,
+ construct_type(
+ type_=HttpValidationError, # type: ignore
+ object_=_response.json(),
+ ),
+ )
+ )
+ _response_json = _response.json()
+ except JSONDecodeError:
+ raise ApiError(status_code=_response.status_code, body=_response.text)
+ raise ApiError(status_code=_response.status_code, body=_response_json)
+
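A sketch of attaching self-generated Logs to a Run with a fixed set of Logs; the Log IDs are placeholders for IDs returned by earlier log calls:

    from humanloop import Humanloop

    client = Humanloop(api_key="YOUR_API_KEY")

    client.evaluations.add_logs_to_run(
        id="id",
        run_id="run_id",
        log_ids=["log_1", "log_2"],  # placeholder Log IDs from your own log requests
    )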
+ def get_stats(self, id: str, *, request_options: typing.Optional[RequestOptions] = None) -> EvaluationStats:
+ """
+ Get Evaluation Stats.
+
+ Retrieve aggregate stats for the specified Evaluation.
+
+ This includes the number of generated Logs for each Run and the
+ corresponding Evaluator statistics (such as the mean and percentiles).
+
+ Parameters
+ ----------
+ id : str
+ Unique identifier for Evaluation.
+
+ request_options : typing.Optional[RequestOptions]
+ Request-specific configuration.
+
+ Returns
+ -------
+ EvaluationStats
+ Successful Response
+
+ Examples
+ --------
+ from humanloop import Humanloop
+
+ client = Humanloop(
+ api_key="YOUR_API_KEY",
+ )
+ client.evaluations.get_stats(
+ id="id",
+ )
+ """
+ _response = self._client_wrapper.httpx_client.request(
+ f"evaluations/{jsonable_encoder(id)}/stats",
+ method="GET",
+ request_options=request_options,
+ )
+ try:
+ if 200 <= _response.status_code < 300:
+ return typing.cast(
+ EvaluationStats,
+ construct_type(
+ type_=EvaluationStats, # type: ignore
+ object_=_response.json(),
+ ),
+ )
+ if _response.status_code == 422:
+ raise UnprocessableEntityError(
+ typing.cast(
+ HttpValidationError,
+ construct_type(
+ type_=HttpValidationError, # type: ignore
+ object_=_response.json(),
+ ),
+ )
+ )
+ _response_json = _response.json()
+ except JSONDecodeError:
+ raise ApiError(status_code=_response.status_code, body=_response.text)
+ raise ApiError(status_code=_response.status_code, body=_response_json)
+
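A minimal polling sketch around `get_stats`. The fields of `EvaluationStats` are not shown in this diff, so the loop simply re-fetches the aggregates on a timer rather than inspecting a completion flag:

    import time

    from humanloop import Humanloop

    client = Humanloop(api_key="YOUR_API_KEY")

    # Re-fetch aggregate stats periodically while Logs are still being generated.
    for _ in range(10):
        stats = client.evaluations.get_stats(id="id")
        # inspect `stats` here (per-Run Log counts and Evaluator aggregates)
        time.sleep(30)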
+ def get_logs(
+ self,
+ id: str,
+ *,
+ page: typing.Optional[int] = None,
+ size: typing.Optional[int] = None,
+ request_options: typing.Optional[RequestOptions] = None,
+ ) -> PaginatedDataEvaluationLogResponse:
+ """
+ Get the Logs associated to a specific Evaluation.
+
+ Parameters
+ ----------
+ id : str
+ String ID of Evaluation. Starts with `ev_` or `evr_`.
+
+ page : typing.Optional[int]
+ Page number for pagination.
+
+ size : typing.Optional[int]
+ Page size for pagination. Number of Logs to fetch.
+
+ request_options : typing.Optional[RequestOptions]
+ Request-specific configuration.
+
+ Returns
+ -------
+ PaginatedDataEvaluationLogResponse
+ Successful Response
+
+ Examples
+ --------
+ from humanloop import Humanloop
+
+ client = Humanloop(
+ api_key="YOUR_API_KEY",
+ )
+ client.evaluations.get_logs(
+ id="id",
+ )
+ """
+ _response = self._client_wrapper.httpx_client.request(
+ f"evaluations/{jsonable_encoder(id)}/logs",
+ method="GET",
+ params={
+ "page": page,
+ "size": size,
+ },
+ request_options=request_options,
+ )
+ try:
+ if 200 <= _response.status_code < 300:
+ return typing.cast(
+ PaginatedDataEvaluationLogResponse,
+ construct_type(
+ type_=PaginatedDataEvaluationLogResponse, # type: ignore
+ object_=_response.json(),
+ ),
+ )
+ if _response.status_code == 422:
+ raise UnprocessableEntityError(
+ typing.cast(
+ HttpValidationError,
+ construct_type(
+ type_=HttpValidationError, # type: ignore
+ object_=_response.json(),
+ ),
+ )
+ )
+ _response_json = _response.json()
+ except JSONDecodeError:
+ raise ApiError(status_code=_response.status_code, body=_response.text)
+ raise ApiError(status_code=_response.status_code, body=_response_json)
+
+
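A paging sketch over an Evaluation's Logs. Reading `.records` off the response is an assumption, by analogy with `PaginatedEvaluationResponse.records` used by `list` in this file; the actual field on `PaginatedDataEvaluationLogResponse` may differ:

    from humanloop import Humanloop

    client = Humanloop(api_key="YOUR_API_KEY")

    page = 1
    while True:
        logs_page = client.evaluations.get_logs(id="id", page=page, size=50)
        records = logs_page.records  # assumed attribute name
        if not records:
            break
        for log in records:
            ...  # inspect each Log
        page += 1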
+class AsyncEvaluationsClient:
+ def __init__(self, *, client_wrapper: AsyncClientWrapper):
+ self._client_wrapper = client_wrapper
+
+ async def list(
+ self,
+ *,
+ file_id: str,
+ page: typing.Optional[int] = None,
+ size: typing.Optional[int] = None,
+ request_options: typing.Optional[RequestOptions] = None,
+ ) -> AsyncPager[EvaluationResponse]:
+ """
+ List all Evaluations for the specified `file_id`.
+
+ Retrieve a list of Evaluations that evaluate versions of the specified File.
+
+ Parameters
+ ----------
+ file_id : str
+ Filter by File ID. Only Evaluations for the specified File will be returned.
+
+ page : typing.Optional[int]
+ Page number for pagination.
+
+ size : typing.Optional[int]
+ Page size for pagination. Number of Evaluations to fetch.
+
+ request_options : typing.Optional[RequestOptions]
+ Request-specific configuration.
+
+ Returns
+ -------
+ AsyncPager[EvaluationResponse]
+ Successful Response
+
+ Examples
+ --------
+ import asyncio
+
+ from humanloop import AsyncHumanloop
+
+ client = AsyncHumanloop(
+ api_key="YOUR_API_KEY",
+ )
+
+
+ async def main() -> None:
+ response = await client.evaluations.list(
+ file_id="pr_30gco7dx6JDq4200GVOHa",
+ size=1,
+ )
+ async for item in response:
+ yield item
+ # alternatively, you can paginate page-by-page
+ async for page in response.iter_pages():
+ yield page
+
+
+ asyncio.run(main())
+ """
+ page = page if page is not None else 1
+ _response = await self._client_wrapper.httpx_client.request(
+ "evaluations",
+ method="GET",
+ params={
+ "file_id": file_id,
+ "page": page,
+ "size": size,
+ },
+ request_options=request_options,
+ )
+ try:
+ if 200 <= _response.status_code < 300:
+ _parsed_response = typing.cast(
+ PaginatedEvaluationResponse,
+ construct_type(
+ type_=PaginatedEvaluationResponse, # type: ignore
+ object_=_response.json(),
+ ),
+ )
+ _has_next = True
+ _get_next = lambda: self.list(
+ file_id=file_id,
+ page=page + 1,
+ size=size,
+ request_options=request_options,
+ )
+ _items = _parsed_response.records
+ return AsyncPager(has_next=_has_next, items=_items, get_next=_get_next)
+ if _response.status_code == 422:
+ raise UnprocessableEntityError(
+ typing.cast(
+ HttpValidationError,
+ construct_type(
+ type_=HttpValidationError, # type: ignore
+ object_=_response.json(),
+ ),
+ )
+ )
+ _response_json = _response.json()
+ except JSONDecodeError:
+ raise ApiError(status_code=_response.status_code, body=_response.text)
+ raise ApiError(status_code=_response.status_code, body=_response_json)
+
+ async def create(
+ self,
+ *,
+ evaluators: typing.Sequence[EvaluationsRequestParams],
+ file: typing.Optional[FileRequestParams] = OMIT,
+ name: typing.Optional[str] = OMIT,
+ request_options: typing.Optional[RequestOptions] = None,
+ ) -> EvaluationResponse:
+ """
+ Create an Evaluation.
+
+ Create an Evaluation by specifying the File to evaluate, and a name
+ for the Evaluation.
+
+ You can then add Runs to this Evaluation using the `POST /evaluations/{id}/runs` endpoint.
+
+ Parameters
+ ----------
+ evaluators : typing.Sequence[EvaluationsRequestParams]
+ The Evaluators used to evaluate.
+
+ file : typing.Optional[FileRequestParams]
+ The File to associate with the Evaluation. This File contains the Logs you're evaluating.
+
+ name : typing.Optional[str]
+ Name of the Evaluation to help identify it. Must be unique within the associated File.
+
+ request_options : typing.Optional[RequestOptions]
+ Request-specific configuration.
+
+ Returns
+ -------
+ EvaluationResponse
+ Successful Response
+
+ Examples
+ --------
+ import asyncio
+
+ from humanloop import AsyncHumanloop
+
+ client = AsyncHumanloop(
+ api_key="YOUR_API_KEY",
+ )
+
+
+ async def main() -> None:
+ await client.evaluations.create(
+ evaluators=[{}],
+ )
+
+
+ asyncio.run(main())
+ """
+ _response = await self._client_wrapper.httpx_client.request(
+ "evaluations",
+ method="POST",
+ json={
+ "file": convert_and_respect_annotation_metadata(
+ object_=file, annotation=FileRequestParams, direction="write"
+ ),
+ "name": name,
+ "evaluators": convert_and_respect_annotation_metadata(
+ object_=evaluators, annotation=typing.Sequence[EvaluationsRequestParams], direction="write"
+ ),
+ },
+ request_options=request_options,
+ omit=OMIT,
+ )
+ try:
+ if 200 <= _response.status_code < 300:
+ return typing.cast(
+ EvaluationResponse,
+ construct_type(
+ type_=EvaluationResponse, # type: ignore
+ object_=_response.json(),
+ ),
+ )
+ if _response.status_code == 422:
+ raise UnprocessableEntityError(
+ typing.cast(
+ HttpValidationError,
+ construct_type(
+ type_=HttpValidationError, # type: ignore
+ object_=_response.json(),
+ ),
+ )
+ )
+ _response_json = _response.json()
+ except JSONDecodeError:
+ raise ApiError(status_code=_response.status_code, body=_response.text)
+ raise ApiError(status_code=_response.status_code, body=_response_json)
+
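An async sketch chaining `create` and `create_run`, as the docstring above suggests. The Evaluator `version_id` is the illustrative value used elsewhere in this diff, and reading `.id` off the returned `EvaluationResponse` is an assumption:

    import asyncio

    from humanloop import AsyncHumanloop

    client = AsyncHumanloop(api_key="YOUR_API_KEY")


    async def main() -> None:
        evaluation = await client.evaluations.create(
            evaluators=[{"version_id": "evv_012def"}],  # illustrative Evaluator version
            name="my-evaluation",
        )
        # `.id` on EvaluationResponse is assumed here for illustration.
        await client.evaluations.create_run(
            id=evaluation.id,
            orchestrated=False,
        )


    asyncio.run(main())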
+ async def add_evaluators(
+ self,
+ id: str,
+ *,
+ evaluators: typing.Sequence[EvaluationsRequestParams],
+ request_options: typing.Optional[RequestOptions] = None,
+ ) -> EvaluationResponse:
+ """
+ Add Evaluators to an Evaluation.
+
+ Add new Evaluators to an Evaluation. The Evaluators will be run on the Logs
+ generated for the Evaluation.
+
+ Parameters
+ ----------
+ id : str
+ Unique identifier for Evaluation.
+
+ evaluators : typing.Sequence[EvaluationsRequestParams]
+ The Evaluators to add to this Evaluation.
+
+ request_options : typing.Optional[RequestOptions]
+ Request-specific configuration.
+
+ Returns
+ -------
+ EvaluationResponse
+ Successful Response
+
+ Examples
+ --------
+ import asyncio
+
+ from humanloop import AsyncHumanloop
+
+ client = AsyncHumanloop(
+ api_key="YOUR_API_KEY",
+ )
+
+
+ async def main() -> None:
+ await client.evaluations.add_evaluators(
+ id="id",
+ evaluators=[{}],
+ )
+
+
+ asyncio.run(main())
+ """
+ _response = await self._client_wrapper.httpx_client.request(
+ f"evaluations/{jsonable_encoder(id)}/evaluators",
+ method="POST",
+ json={
+ "evaluators": convert_and_respect_annotation_metadata(
+ object_=evaluators, annotation=typing.Sequence[EvaluationsRequestParams], direction="write"
+ ),
+ },
+ request_options=request_options,
+ omit=OMIT,
+ )
+ try:
+ if 200 <= _response.status_code < 300:
+ return typing.cast(
+ EvaluationResponse,
+ construct_type(
+ type_=EvaluationResponse, # type: ignore
+ object_=_response.json(),
+ ),
+ )
+ if _response.status_code == 422:
+ raise UnprocessableEntityError(
+ typing.cast(
+ HttpValidationError,
+ construct_type(
+ type_=HttpValidationError, # type: ignore
+ object_=_response.json(),
+ ),
+ )
+ )
+ _response_json = _response.json()
+ except JSONDecodeError:
+ raise ApiError(status_code=_response.status_code, body=_response.text)
+ raise ApiError(status_code=_response.status_code, body=_response_json)
+
+ async def remove_evaluator(
+ self, id: str, evaluator_version_id: str, *, request_options: typing.Optional[RequestOptions] = None
+ ) -> EvaluationResponse:
+ """
+ Remove an Evaluator from an Evaluation.
+
+ Remove an Evaluator from an Evaluation. The Evaluator will no longer be run on the Logs
+ generated for the Evaluation.
+
+ Parameters
+ ----------
+ id : str
+ Unique identifier for Evaluation.
+
+ evaluator_version_id : str
+ Unique identifier for Evaluator Version.
+
+ request_options : typing.Optional[RequestOptions]
+ Request-specific configuration.
+
+ Returns
+ -------
+ EvaluationResponse
+ Successful Response
+
+ Examples
+ --------
+ import asyncio
+
+ from humanloop import AsyncHumanloop
+
+ client = AsyncHumanloop(
+ api_key="YOUR_API_KEY",
+ )
+
+
+ async def main() -> None:
+ await client.evaluations.remove_evaluator(
+ id="id",
+ evaluator_version_id="evaluator_version_id",
+ )
+
+
+ asyncio.run(main())
+ """
+ _response = await self._client_wrapper.httpx_client.request(
+ f"evaluations/{jsonable_encoder(id)}/evaluators/{jsonable_encoder(evaluator_version_id)}",
+ method="DELETE",
+ request_options=request_options,
+ )
+ try:
+ if 200 <= _response.status_code < 300:
+ return typing.cast(
+ EvaluationResponse,
+ construct_type(
+ type_=EvaluationResponse, # type: ignore
+ object_=_response.json(),
+ ),
+ )
+ if _response.status_code == 422:
+ raise UnprocessableEntityError(
+ typing.cast(
+ HttpValidationError,
+ construct_type(
+ type_=HttpValidationError, # type: ignore
+ object_=_response.json(),
+ ),
+ )
+ )
+ _response_json = _response.json()
+ except JSONDecodeError:
+ raise ApiError(status_code=_response.status_code, body=_response.text)
+ raise ApiError(status_code=_response.status_code, body=_response_json)
+
+ async def get(self, id: str, *, request_options: typing.Optional[RequestOptions] = None) -> EvaluationResponse:
+ """
+ Get an Evaluation.
+
+ Parameters
+ ----------
+ id : str
+ Unique identifier for Evaluation.
+
+ request_options : typing.Optional[RequestOptions]
+ Request-specific configuration.
+
+ Returns
+ -------
+ EvaluationResponse
+ Successful Response
+
+ Examples
+ --------
+ import asyncio
+
+ from humanloop import AsyncHumanloop
+
+ client = AsyncHumanloop(
+ api_key="YOUR_API_KEY",
+ )
+
+
+ async def main() -> None:
+ await client.evaluations.get(
+ id="ev_567yza",
+ )
+
+
+ asyncio.run(main())
+ """
+ _response = await self._client_wrapper.httpx_client.request(
+ f"evaluations/{jsonable_encoder(id)}",
+ method="GET",
+ request_options=request_options,
+ )
+ try:
+ if 200 <= _response.status_code < 300:
+ return typing.cast(
+ EvaluationResponse,
+ construct_type(
+ type_=EvaluationResponse, # type: ignore
+ object_=_response.json(),
+ ),
+ )
+ if _response.status_code == 422:
+ raise UnprocessableEntityError(
+ typing.cast(
+ HttpValidationError,
+ construct_type(
+ type_=HttpValidationError, # type: ignore
+ object_=_response.json(),
+ ),
+ )
+ )
+ _response_json = _response.json()
+ except JSONDecodeError:
+ raise ApiError(status_code=_response.status_code, body=_response.text)
+ raise ApiError(status_code=_response.status_code, body=_response_json)
+
+ async def delete(self, id: str, *, request_options: typing.Optional[RequestOptions] = None) -> None:
+ """
+ Delete an Evaluation.
+
+ Remove an Evaluation from Humanloop. The Logs and Versions used in the Evaluation
+ will not be deleted.
+
+ Parameters
+ ----------
+ id : str
+ Unique identifier for Evaluation.
+
+ request_options : typing.Optional[RequestOptions]
+ Request-specific configuration.
+
+ Returns
+ -------
+ None
+
+ Examples
+ --------
+ import asyncio
+
+ from humanloop import AsyncHumanloop
+
+ client = AsyncHumanloop(
+ api_key="YOUR_API_KEY",
+ )
+
+
+ async def main() -> None:
+ await client.evaluations.delete(
+ id="ev_567yza",
+ )
+
+
+ asyncio.run(main())
+ """
+ _response = await self._client_wrapper.httpx_client.request(
+ f"evaluations/{jsonable_encoder(id)}",
+ method="DELETE",
+ request_options=request_options,
+ )
+ try:
+ if 200 <= _response.status_code < 300:
+ return
+ if _response.status_code == 422:
+ raise UnprocessableEntityError(
+ typing.cast(
+ HttpValidationError,
+ construct_type(
+ type_=HttpValidationError, # type: ignore
+ object_=_response.json(),
+ ),
+ )
+ )
+ _response_json = _response.json()
+ except JSONDecodeError:
+ raise ApiError(status_code=_response.status_code, body=_response.text)
+ raise ApiError(status_code=_response.status_code, body=_response_json)
+
+ async def list_runs_for_evaluation(
+ self, id: str, *, request_options: typing.Optional[RequestOptions] = None
+ ) -> EvaluationRunsResponse:
+ """
+ List all Runs for an Evaluation.
+
+ Parameters
+ ----------
+ id : str
+ Unique identifier for Evaluation.
+
+ request_options : typing.Optional[RequestOptions]
+ Request-specific configuration.
+
+ Returns
+ -------
+ EvaluationRunsResponse
+ Successful Response
+
+ Examples
+ --------
+ import asyncio
+
+ from humanloop import AsyncHumanloop
+
+ client = AsyncHumanloop(
+ api_key="YOUR_API_KEY",
+ )
+
+
+ async def main() -> None:
+ await client.evaluations.list_runs_for_evaluation(
+ id="id",
+ )
+
+
+ asyncio.run(main())
+ """
+ _response = await self._client_wrapper.httpx_client.request(
+ f"evaluations/{jsonable_encoder(id)}/runs",
+ method="GET",
+ request_options=request_options,
+ )
+ try:
+ if 200 <= _response.status_code < 300:
+ return typing.cast(
+ EvaluationRunsResponse,
construct_type(
- type_=PaginatedEvaluationResponse, # type: ignore
+ type_=EvaluationRunsResponse, # type: ignore
object_=_response.json(),
),
)
- _has_next = True
- _get_next = lambda: self.list(
- file_id=file_id,
- page=page + 1,
- size=size,
- request_options=request_options,
- )
- _items = _parsed_response.records
- return AsyncPager(has_next=_has_next, items=_items, get_next=_get_next)
if _response.status_code == 422:
raise UnprocessableEntityError(
typing.cast(
@@ -860,53 +1590,60 @@ async def main() -> None:
raise ApiError(status_code=_response.status_code, body=_response.text)
raise ApiError(status_code=_response.status_code, body=_response_json)
- async def create(
+ async def create_run(
self,
+ id: str,
*,
- dataset: EvaluationsDatasetRequestParams,
- evaluators: typing.Sequence[EvaluationsRequestParams],
- evaluatees: typing.Optional[typing.Sequence[EvaluateeRequestParams]] = OMIT,
- name: typing.Optional[str] = OMIT,
- file: typing.Optional[FileRequestParams] = OMIT,
+ dataset: typing.Optional[EvaluationsDatasetRequestParams] = OMIT,
+ version: typing.Optional[VersionSpecificationParams] = OMIT,
+ orchestrated: typing.Optional[bool] = OMIT,
+ logs: typing.Optional[LogsAssociationType] = OMIT,
request_options: typing.Optional[RequestOptions] = None,
- ) -> EvaluationResponse:
+ ) -> EvaluationRunResponse:
"""
- Create an Evaluation.
+ Create an Evaluation Run.
- Create a new Evaluation by specifying the Dataset, versions to be
- evaluated (Evaluatees), and which Evaluators to provide judgments.
+ Create a new Evaluation Run. Optionally specify the Dataset and version to be
+ evaluated.
Humanloop will automatically start generating Logs and running Evaluators where
- `orchestrated=true`. If you own the runtime for the Evaluatee or Evaluator, you
- can set `orchestrated=false` and then generate and submit the required logs using
- your runtime.
+ `orchestrated=true`. If you are generating Logs yourself, you can set `orchestrated=false`
+ and then generate and submit the required Logs via the API.
+
+ The `logs` parameter controls which Logs are associated with the Run. Defaults to `dynamic`
+ if `dataset` and `version` are provided. This means that Logs will automatically be retrieved
+ if they're associated with the specified Version and have `source_datapoint_id` referencing
+ a Datapoint in the specified Dataset.
+ If `logs` is set to `fixed`, no existing Logs will be automatically associated with the Run.
+ You can then add Logs to the Run using the `POST /evaluations/{id}/runs/{run_id}/logs` endpoint,
+ or by adding `run_id` to your `POST /prompts/logs` requests.
- To keep updated on the progress of the Evaluation, you can poll the Evaluation using
- the `GET /evaluations/:id` endpoint and check its status.
+ To keep updated on the progress of the Run, you can poll the Run using
+ the `GET /evaluations/{id}/runs` endpoint and check its status.
Parameters
----------
- dataset : EvaluationsDatasetRequestParams
- Dataset to use in this Evaluation.
+ id : str
+ Unique identifier for Evaluation.
- evaluators : typing.Sequence[EvaluationsRequestParams]
- The Evaluators used to evaluate.
+ dataset : typing.Optional[EvaluationsDatasetRequestParams]
+ Dataset to use in this Run.
- evaluatees : typing.Optional[typing.Sequence[EvaluateeRequestParams]]
- Unique identifiers for the Prompt/Tool Versions to include in the Evaluation. Can be left unpopulated if you wish to add Evaluatees to this Evaluation by specifying `evaluation_id` in Log calls.
+ version : typing.Optional[VersionSpecificationParams]
+ Version to use in this Run.
- name : typing.Optional[str]
- Name of the Evaluation to help identify it. Must be unique within the associated File.
+ orchestrated : typing.Optional[bool]
+ Whether the Run is orchestrated by Humanloop. If `True`, Humanloop will generate Logs for the Run; `dataset` and `version` must be provided. If `False`, Logs for the evaluated Prompt/Tool should be submitted by the user via the API.
- file : typing.Optional[FileRequestParams]
- The File to associate with the Evaluation.
+ logs : typing.Optional[LogsAssociationType]
+ How the Logs are associated with the Run. If `dynamic`, the latest relevant Logs will be inferred from the Dataset and Version. If `fixed`, the Logs will be explicitly associated. You can provide a list of Log IDs to associate with the Run, or add them to the Run later. Defaults to `dynamic` if `dataset` and `version` are provided; otherwise, defaults to `fixed`.
request_options : typing.Optional[RequestOptions]
Request-specific configuration.
Returns
-------
- EvaluationResponse
+ EvaluationRunResponse
Successful Response
Examples
@@ -921,34 +1658,25 @@ async def create(
async def main() -> None:
- await client.evaluations.create(
- dataset={"version_id": "dsv_6L78pqrdFi2xa"},
- evaluatees=[
- {"version_id": "prv_7ZlQREDScH0xkhUwtXruN", "orchestrated": False}
- ],
- evaluators=[{"version_id": "evv_012def", "orchestrated": False}],
+ await client.evaluations.create_run(
+ id="id",
)
asyncio.run(main())
"""
_response = await self._client_wrapper.httpx_client.request(
- "evaluations",
+ f"evaluations/{jsonable_encoder(id)}/runs",
method="POST",
json={
"dataset": convert_and_respect_annotation_metadata(
object_=dataset, annotation=EvaluationsDatasetRequestParams, direction="write"
),
- "evaluatees": convert_and_respect_annotation_metadata(
- object_=evaluatees, annotation=typing.Sequence[EvaluateeRequestParams], direction="write"
- ),
- "evaluators": convert_and_respect_annotation_metadata(
- object_=evaluators, annotation=typing.Sequence[EvaluationsRequestParams], direction="write"
- ),
- "name": name,
- "file": convert_and_respect_annotation_metadata(
- object_=file, annotation=FileRequestParams, direction="write"
+ "version": convert_and_respect_annotation_metadata(
+ object_=version, annotation=VersionSpecificationParams, direction="write"
),
+ "orchestrated": orchestrated,
+ "logs": logs,
},
request_options=request_options,
omit=OMIT,
@@ -956,9 +1684,9 @@ async def main() -> None:
try:
if 200 <= _response.status_code < 300:
return typing.cast(
- EvaluationResponse,
+ EvaluationRunResponse,
construct_type(
- type_=EvaluationResponse, # type: ignore
+ type_=EvaluationRunResponse, # type: ignore
object_=_response.json(),
),
)
@@ -977,21 +1705,26 @@ async def main() -> None:
raise ApiError(status_code=_response.status_code, body=_response.text)
raise ApiError(status_code=_response.status_code, body=_response_json)
- async def get(self, id: str, *, request_options: typing.Optional[RequestOptions] = None) -> EvaluationResponse:
+ async def add_existing_run(
+ self, id: str, run_id: str, *, request_options: typing.Optional[RequestOptions] = None
+ ) -> typing.Optional[typing.Any]:
"""
- Get an Evaluation.
+ Add an existing Run to an Evaluation.
Parameters
----------
id : str
Unique identifier for Evaluation.
+ run_id : str
+ Unique identifier for Run.
+
request_options : typing.Optional[RequestOptions]
Request-specific configuration.
Returns
-------
- EvaluationResponse
+ typing.Optional[typing.Any]
Successful Response
Examples
@@ -1006,24 +1739,25 @@ async def get(self, id: str, *, request_options: typing.Optional[RequestOptions]
async def main() -> None:
- await client.evaluations.get(
- id="ev_567yza",
+ await client.evaluations.add_existing_run(
+ id="id",
+ run_id="run_id",
)
asyncio.run(main())
"""
_response = await self._client_wrapper.httpx_client.request(
- f"evaluations/{jsonable_encoder(id)}",
- method="GET",
+ f"evaluations/{jsonable_encoder(id)}/runs/{jsonable_encoder(run_id)}",
+ method="POST",
request_options=request_options,
)
try:
if 200 <= _response.status_code < 300:
return typing.cast(
- EvaluationResponse,
+ typing.Optional[typing.Any],
construct_type(
- type_=EvaluationResponse, # type: ignore
+ type_=typing.Optional[typing.Any], # type: ignore
object_=_response.json(),
),
)
@@ -1042,18 +1776,23 @@ async def main() -> None:
raise ApiError(status_code=_response.status_code, body=_response.text)
raise ApiError(status_code=_response.status_code, body=_response_json)
- async def delete(self, id: str, *, request_options: typing.Optional[RequestOptions] = None) -> None:
+ async def remove_run_from_evaluation(
+ self, id: str, run_id: str, *, request_options: typing.Optional[RequestOptions] = None
+ ) -> None:
"""
- Delete an Evaluation.
+ Remove a Run from an Evaluation.
- Remove an Evaluation from Humanloop. The Logs and Versions used in the Evaluation
- will not be deleted.
+ Remove a Run from an Evaluation. The Logs and Versions used in the Run will not be deleted.
+ If this Run is used in any other Evaluations, it will still be available in those Evaluations.
Parameters
----------
id : str
Unique identifier for Evaluation.
+ run_id : str
+ Unique identifier for Run.
+
request_options : typing.Optional[RequestOptions]
Request-specific configuration.
@@ -1073,15 +1812,16 @@ async def delete(self, id: str, *, request_options: typing.Optional[RequestOptio
async def main() -> None:
- await client.evaluations.delete(
- id="ev_567yza",
+ await client.evaluations.remove_run_from_evaluation(
+ id="id",
+ run_id="run_id",
)
asyncio.run(main())
"""
_response = await self._client_wrapper.httpx_client.request(
- f"evaluations/{jsonable_encoder(id)}",
+ f"evaluations/{jsonable_encoder(id)}/runs/{jsonable_encoder(run_id)}",
method="DELETE",
request_options=request_options,
)
@@ -1103,49 +1843,31 @@ async def main() -> None:
raise ApiError(status_code=_response.status_code, body=_response.text)
raise ApiError(status_code=_response.status_code, body=_response_json)
- async def update_setup(
- self,
- id: str,
- *,
- dataset: typing.Optional[EvaluationsDatasetRequestParams] = OMIT,
- evaluatees: typing.Optional[typing.Sequence[EvaluateeRequestParams]] = OMIT,
- evaluators: typing.Optional[typing.Sequence[EvaluationsRequestParams]] = OMIT,
- name: typing.Optional[str] = OMIT,
- file: typing.Optional[FileRequestParams] = OMIT,
- request_options: typing.Optional[RequestOptions] = None,
- ) -> EvaluationResponse:
+ async def update_evaluation_run(
+ self, id: str, run_id: str, *, control: bool, request_options: typing.Optional[RequestOptions] = None
+ ) -> EvaluationRunResponse:
"""
- Update an Evaluation.
+ Update an Evaluation Run.
- Update the setup of an Evaluation by specifying the Dataset, versions to be
- evaluated (Evaluatees), and which Evaluators to provide judgments.
+ Specify whether this Run should be used as the control for the Evaluation.
Parameters
----------
id : str
Unique identifier for Evaluation.
- dataset : typing.Optional[EvaluationsDatasetRequestParams]
- Dataset to use in this Evaluation.
-
- evaluatees : typing.Optional[typing.Sequence[EvaluateeRequestParams]]
- Unique identifiers for the Prompt/Tool Versions to include in the Evaluation. Can be left unpopulated if you wish to add evaluatees to this Evaluation by specifying `evaluation_id` in Log calls.
-
- evaluators : typing.Optional[typing.Sequence[EvaluationsRequestParams]]
- The Evaluators used to evaluate.
-
- name : typing.Optional[str]
- Name of the Evaluation to help identify it. Must be unique within the associated File.
+ run_id : str
+ Unique identifier for Run.
- file : typing.Optional[FileRequestParams]
- The File to associate with the Evaluation.
+ control : bool
+ If `True`, this Run will be used as the control in the Evaluation. Stats for other Runs will be compared to this Run. This will replace any existing control Run.
request_options : typing.Optional[RequestOptions]
Request-specific configuration.
Returns
-------
- EvaluationResponse
+ EvaluationRunResponse
Successful Response
Examples
@@ -1160,35 +1882,20 @@ async def update_setup(
async def main() -> None:
- await client.evaluations.update_setup(
- id="ev_567yza",
- dataset={"version_id": "dsv_6L78pqrdFi2xa"},
- evaluatees=[
- {"version_id": "prv_7ZlQREDScH0xkhUwtXruN", "orchestrated": False}
- ],
- evaluators=[{"version_id": "evv_012def", "orchestrated": False}],
+ await client.evaluations.update_evaluation_run(
+ id="id",
+ run_id="run_id",
+ control=True,
)
asyncio.run(main())
"""
_response = await self._client_wrapper.httpx_client.request(
- f"evaluations/{jsonable_encoder(id)}",
+ f"evaluations/{jsonable_encoder(id)}/runs/{jsonable_encoder(run_id)}",
method="PATCH",
json={
- "dataset": convert_and_respect_annotation_metadata(
- object_=dataset, annotation=EvaluationsDatasetRequestParams, direction="write"
- ),
- "evaluatees": convert_and_respect_annotation_metadata(
- object_=evaluatees, annotation=typing.Sequence[EvaluateeRequestParams], direction="write"
- ),
- "evaluators": convert_and_respect_annotation_metadata(
- object_=evaluators, annotation=typing.Sequence[EvaluationsRequestParams], direction="write"
- ),
- "name": name,
- "file": convert_and_respect_annotation_metadata(
- object_=file, annotation=FileRequestParams, direction="write"
- ),
+ "control": control,
},
request_options=request_options,
omit=OMIT,
@@ -1196,9 +1903,9 @@ async def main() -> None:
try:
if 200 <= _response.status_code < 300:
return typing.cast(
- EvaluationResponse,
+ EvaluationRunResponse,
construct_type(
- type_=EvaluationResponse, # type: ignore
+ type_=EvaluationRunResponse, # type: ignore
object_=_response.json(),
),
)
@@ -1217,28 +1924,38 @@ async def main() -> None:
raise ApiError(status_code=_response.status_code, body=_response.text)
raise ApiError(status_code=_response.status_code, body=_response_json)
- async def update_status(
- self, id: str, *, status: EvaluationStatus, request_options: typing.Optional[RequestOptions] = None
- ) -> EvaluationResponse:
+ async def add_logs_to_run(
+ self,
+ id: str,
+ run_id: str,
+ *,
+ log_ids: typing.Sequence[str],
+ request_options: typing.Optional[RequestOptions] = None,
+ ) -> EvaluationRunResponse:
"""
- Update the status of an Evaluation.
+ Add Logs to an Evaluation Run.
- Can be used to cancel a running Evaluation, or mark an Evaluation that uses
- external or human evaluators as completed.
+ This is supported only for Runs that have a fixed set of Logs.
+ (Runs can either have a fixed set of Logs, or can be set to dynamically retrieve the latest Logs
+ if a Dataset and Version are provided.)
Parameters
----------
id : str
Unique identifier for Evaluation.
- status : EvaluationStatus
+ run_id : str
+ Unique identifier for Run.
+
+ log_ids : typing.Sequence[str]
+ The IDs of the Logs to add to the Run.
request_options : typing.Optional[RequestOptions]
Request-specific configuration.
Returns
-------
- EvaluationResponse
+ EvaluationRunResponse
Successful Response
Examples
@@ -1253,19 +1970,20 @@ async def update_status(
async def main() -> None:
- await client.evaluations.update_status(
+ await client.evaluations.add_logs_to_run(
id="id",
- status="pending",
+ run_id="run_id",
+ log_ids=["log_ids"],
)
asyncio.run(main())
"""
_response = await self._client_wrapper.httpx_client.request(
- f"evaluations/{jsonable_encoder(id)}/status",
- method="PATCH",
+ f"evaluations/{jsonable_encoder(id)}/runs/{jsonable_encoder(run_id)}/logs",
+ method="POST",
json={
- "status": status,
+ "log_ids": log_ids,
},
request_options=request_options,
omit=OMIT,
@@ -1273,9 +1991,9 @@ async def main() -> None:
try:
if 200 <= _response.status_code < 300:
return typing.cast(
- EvaluationResponse,
+ EvaluationRunResponse,
construct_type(
- type_=EvaluationResponse, # type: ignore
+ type_=EvaluationRunResponse, # type: ignore
object_=_response.json(),
),
)
@@ -1299,7 +2017,8 @@ async def get_stats(self, id: str, *, request_options: typing.Optional[RequestOp
Get Evaluation Stats.
Retrieve aggregate stats for the specified Evaluation.
- This includes the number of generated Logs for each evaluated version and the
+
+ This includes the number of generated Logs for each Run and the
corresponding Evaluator statistics (such as the mean and percentiles).
Parameters
@@ -1370,13 +2089,10 @@ async def get_logs(
page: typing.Optional[int] = None,
size: typing.Optional[int] = None,
request_options: typing.Optional[RequestOptions] = None,
- ) -> PaginatedDataEvaluationReportLogResponse:
+ ) -> PaginatedDataEvaluationLogResponse:
"""
Get the Logs associated to a specific Evaluation.
- Each Datapoint in your Dataset will have a corresponding Log for each File version evaluated.
- e.g. If you have 50 Datapoints and are evaluating 2 Prompts, there will be 100 Logs associated with the Evaluation.
-
Parameters
----------
id : str
@@ -1393,7 +2109,7 @@ async def get_logs(
Returns
-------
- PaginatedDataEvaluationReportLogResponse
+ PaginatedDataEvaluationLogResponse
Successful Response
Examples
@@ -1427,115 +2143,9 @@ async def main() -> None:
try:
if 200 <= _response.status_code < 300:
return typing.cast(
- PaginatedDataEvaluationReportLogResponse,
- construct_type(
- type_=PaginatedDataEvaluationReportLogResponse, # type: ignore
- object_=_response.json(),
- ),
- )
- if _response.status_code == 422:
- raise UnprocessableEntityError(
- typing.cast(
- HttpValidationError,
- construct_type(
- type_=HttpValidationError, # type: ignore
- object_=_response.json(),
- ),
- )
- )
- _response_json = _response.json()
- except JSONDecodeError:
- raise ApiError(status_code=_response.status_code, body=_response.text)
- raise ApiError(status_code=_response.status_code, body=_response_json)
-
- async def pin_evaluatee(
- self,
- id: str,
- *,
- version_id: typing.Optional[str] = OMIT,
- path: typing.Optional[str] = OMIT,
- file_id: typing.Optional[str] = OMIT,
- environment: typing.Optional[str] = OMIT,
- batch_id: typing.Optional[str] = OMIT,
- orchestrated: typing.Optional[bool] = OMIT,
- request_options: typing.Optional[RequestOptions] = None,
- ) -> EvaluationResponse:
- """
- Pin the specified Evaluatee.
-
- Pinned Evaluatees are always displayed in the Evaluation Overview,
- and serve as the baseline for comparison with other Evaluatees.
-
- Parameters
- ----------
- id : str
- Unique identifier for Evaluation.
-
- version_id : typing.Optional[str]
- Unique identifier for the File Version. If provided, none of the other fields should be specified.
-
- path : typing.Optional[str]
- Path identifying a File. Provide either this or `file_id` if you want to specify a File.
-
- file_id : typing.Optional[str]
- Unique identifier for the File. Provide either this or `path` if you want to specify a File.
-
- environment : typing.Optional[str]
- Name of the Environment a Version is deployed to. Only provide this when specifying a File. If not provided (and a File is specified), the default Environment is used.
-
- batch_id : typing.Optional[str]
- Unique identifier for the batch of Logs to include in the Evaluation Report.
-
- orchestrated : typing.Optional[bool]
- Whether the Prompt/Tool is orchestrated by Humanloop. Default is `True`. If `False`, a log for the Prompt/Tool should be submitted by the user via the API.
-
- request_options : typing.Optional[RequestOptions]
- Request-specific configuration.
-
- Returns
- -------
- EvaluationResponse
- Successful Response
-
- Examples
- --------
- import asyncio
-
- from humanloop import AsyncHumanloop
-
- client = AsyncHumanloop(
- api_key="YOUR_API_KEY",
- )
-
-
- async def main() -> None:
- await client.evaluations.pin_evaluatee(
- id="id",
- )
-
-
- asyncio.run(main())
- """
- _response = await self._client_wrapper.httpx_client.request(
- f"evaluations/{jsonable_encoder(id)}/pin-evaluatee",
- method="POST",
- json={
- "version_id": version_id,
- "path": path,
- "file_id": file_id,
- "environment": environment,
- "batch_id": batch_id,
- "orchestrated": orchestrated,
- },
- request_options=request_options,
- omit=OMIT,
- )
- try:
- if 200 <= _response.status_code < 300:
- return typing.cast(
- EvaluationResponse,
+ PaginatedDataEvaluationLogResponse,
construct_type(
- type_=EvaluationResponse, # type: ignore
+ type_=PaginatedDataEvaluationLogResponse, # type: ignore
object_=_response.json(),
),
)
diff --git a/src/humanloop/evaluators/client.py b/src/humanloop/evaluators/client.py
index e05ae5cc..5e7ae73e 100644
--- a/src/humanloop/evaluators/client.py
+++ b/src/humanloop/evaluators/client.py
@@ -64,7 +64,6 @@ def log(
metadata: typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]] = OMIT,
source_datapoint_id: typing.Optional[str] = OMIT,
trace_parent_id: typing.Optional[str] = OMIT,
- batch_id: typing.Optional[str] = OMIT,
user: typing.Optional[str] = OMIT,
create_evaluator_log_request_environment: typing.Optional[str] = OMIT,
save: typing.Optional[bool] = OMIT,
@@ -136,9 +135,6 @@ def log(
trace_parent_id : typing.Optional[str]
The ID of the parent Log to nest this Log under in a Trace.
- batch_id : typing.Optional[str]
- Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist.
-
user : typing.Optional[str]
End-user ID related to the Log.
@@ -197,7 +193,6 @@ def log(
"parent_id": parent_id,
"source_datapoint_id": source_datapoint_id,
"trace_parent_id": trace_parent_id,
- "batch_id": batch_id,
"user": user,
"environment": create_evaluator_log_request_environment,
"save": save,
@@ -1093,7 +1088,6 @@ async def log(
metadata: typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]] = OMIT,
source_datapoint_id: typing.Optional[str] = OMIT,
trace_parent_id: typing.Optional[str] = OMIT,
- batch_id: typing.Optional[str] = OMIT,
user: typing.Optional[str] = OMIT,
create_evaluator_log_request_environment: typing.Optional[str] = OMIT,
save: typing.Optional[bool] = OMIT,
@@ -1165,9 +1159,6 @@ async def log(
trace_parent_id : typing.Optional[str]
The ID of the parent Log to nest this Log under in a Trace.
- batch_id : typing.Optional[str]
- Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist.
-
user : typing.Optional[str]
End-user ID related to the Log.
@@ -1234,7 +1225,6 @@ async def main() -> None:
"parent_id": parent_id,
"source_datapoint_id": source_datapoint_id,
"trace_parent_id": trace_parent_id,
- "batch_id": batch_id,
"user": user,
"environment": create_evaluator_log_request_environment,
"save": save,
diff --git a/src/humanloop/flows/client.py b/src/humanloop/flows/client.py
index 6b75d942..4b4671e7 100644
--- a/src/humanloop/flows/client.py
+++ b/src/humanloop/flows/client.py
@@ -45,7 +45,7 @@ def log(
*,
version_id: typing.Optional[str] = None,
environment: typing.Optional[str] = None,
- evaluation_id: typing.Optional[str] = OMIT,
+ run_id: typing.Optional[str] = OMIT,
path: typing.Optional[str] = OMIT,
id: typing.Optional[str] = OMIT,
start_time: typing.Optional[dt.datetime] = OMIT,
@@ -62,7 +62,6 @@ def log(
metadata: typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]] = OMIT,
source_datapoint_id: typing.Optional[str] = OMIT,
trace_parent_id: typing.Optional[str] = OMIT,
- batch_id: typing.Optional[str] = OMIT,
user: typing.Optional[str] = OMIT,
flow_log_request_environment: typing.Optional[str] = OMIT,
save: typing.Optional[bool] = OMIT,
@@ -85,8 +84,8 @@ def log(
environment : typing.Optional[str]
Name of the Environment identifying a deployed version to log to.
- evaluation_id : typing.Optional[str]
- Unique identifier for the Evaluation Report to associate the Log to.
+ run_id : typing.Optional[str]
+ Unique identifier for the Run to associate the Log to.
path : typing.Optional[str]
Path of the Flow, including the name. This locates the Flow in the Humanloop filesystem and is used as a unique identifier. For example: `folder/name` or just `name`.
@@ -136,9 +135,6 @@ def log(
trace_parent_id : typing.Optional[str]
The ID of the parent Log to nest this Log under in a Trace.
- batch_id : typing.Optional[str]
- Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist.
-
user : typing.Optional[str]
End-user ID related to the Log.
@@ -197,10 +193,10 @@ def log(
output="The patient is likely experiencing a myocardial infarction. Immediate medical attention is required.",
trace_status="incomplete",
start_time=datetime.datetime.fromisoformat(
- "2024-07-08 22:40:35+00:00",
+ "2024-07-08 21:40:35+00:00",
),
end_time=datetime.datetime.fromisoformat(
- "2024-07-08 22:40:39+00:00",
+ "2024-07-08 21:40:39+00:00",
),
)
"""
@@ -212,7 +208,7 @@ def log(
"environment": environment,
},
json={
- "evaluation_id": evaluation_id,
+ "run_id": run_id,
"path": path,
"id": id,
"start_time": start_time,
@@ -229,7 +225,6 @@ def log(
"metadata": metadata,
"source_datapoint_id": source_datapoint_id,
"trace_parent_id": trace_parent_id,
- "batch_id": batch_id,
"user": user,
"environment": flow_log_request_environment,
"save": save,
@@ -1210,7 +1205,7 @@ async def log(
*,
version_id: typing.Optional[str] = None,
environment: typing.Optional[str] = None,
- evaluation_id: typing.Optional[str] = OMIT,
+ run_id: typing.Optional[str] = OMIT,
path: typing.Optional[str] = OMIT,
id: typing.Optional[str] = OMIT,
start_time: typing.Optional[dt.datetime] = OMIT,
@@ -1227,7 +1222,6 @@ async def log(
metadata: typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]] = OMIT,
source_datapoint_id: typing.Optional[str] = OMIT,
trace_parent_id: typing.Optional[str] = OMIT,
- batch_id: typing.Optional[str] = OMIT,
user: typing.Optional[str] = OMIT,
flow_log_request_environment: typing.Optional[str] = OMIT,
save: typing.Optional[bool] = OMIT,
@@ -1250,8 +1244,8 @@ async def log(
environment : typing.Optional[str]
Name of the Environment identifying a deployed version to log to.
- evaluation_id : typing.Optional[str]
- Unique identifier for the Evaluation Report to associate the Log to.
+ run_id : typing.Optional[str]
+ Unique identifier for the Run to associate the Log to.
path : typing.Optional[str]
Path of the Flow, including the name. This locates the Flow in the Humanloop filesystem and is used as a unique identifier. For example: `folder/name` or just `name`.
@@ -1301,9 +1295,6 @@ async def log(
trace_parent_id : typing.Optional[str]
The ID of the parent Log to nest this Log under in a Trace.
- batch_id : typing.Optional[str]
- Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist.
-
user : typing.Optional[str]
End-user ID related to the Log.
@@ -1366,10 +1357,10 @@ async def main() -> None:
output="The patient is likely experiencing a myocardial infarction. Immediate medical attention is required.",
trace_status="incomplete",
start_time=datetime.datetime.fromisoformat(
- "2024-07-08 22:40:35+00:00",
+ "2024-07-08 21:40:35+00:00",
),
end_time=datetime.datetime.fromisoformat(
- "2024-07-08 22:40:39+00:00",
+ "2024-07-08 21:40:39+00:00",
),
)
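With `evaluation_id` and `batch_id` removed, Flow Logs are tied to a Run via `run_id`. A hedged synchronous sketch with placeholder values, reusing the argument shapes from the examples above:

    import datetime

    from humanloop import Humanloop

    client = Humanloop(api_key="YOUR_API_KEY")

    client.flows.log(
        path="folder/name",  # illustrative Flow path
        run_id="run_id",     # placeholder Run ID returned by evaluations.create_run
        output="...",
        trace_status="incomplete",
        start_time=datetime.datetime.fromisoformat("2024-07-08 21:40:35+00:00"),
        end_time=datetime.datetime.fromisoformat("2024-07-08 21:40:39+00:00"),
    )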
@@ -1384,7 +1375,7 @@ async def main() -> None:
"environment": environment,
},
json={
- "evaluation_id": evaluation_id,
+ "run_id": run_id,
"path": path,
"id": id,
"start_time": start_time,
@@ -1401,7 +1392,6 @@ async def main() -> None:
"metadata": metadata,
"source_datapoint_id": source_datapoint_id,
"trace_parent_id": trace_parent_id,
- "batch_id": batch_id,
"user": user,
"environment": flow_log_request_environment,
"save": save,
diff --git a/src/humanloop/prompts/client.py b/src/humanloop/prompts/client.py
index a9332565..88cfa117 100644
--- a/src/humanloop/prompts/client.py
+++ b/src/humanloop/prompts/client.py
@@ -60,7 +60,7 @@ def log(
*,
version_id: typing.Optional[str] = None,
environment: typing.Optional[str] = None,
- evaluation_id: typing.Optional[str] = OMIT,
+ run_id: typing.Optional[str] = OMIT,
path: typing.Optional[str] = OMIT,
id: typing.Optional[str] = OMIT,
output_message: typing.Optional[ChatMessageParams] = OMIT,
@@ -86,7 +86,6 @@ def log(
metadata: typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]] = OMIT,
source_datapoint_id: typing.Optional[str] = OMIT,
trace_parent_id: typing.Optional[str] = OMIT,
- batch_id: typing.Optional[str] = OMIT,
user: typing.Optional[str] = OMIT,
prompt_log_request_environment: typing.Optional[str] = OMIT,
save: typing.Optional[bool] = OMIT,
@@ -111,8 +110,8 @@ def log(
environment : typing.Optional[str]
Name of the Environment identifying a deployed version to log to.
- evaluation_id : typing.Optional[str]
- Unique identifier for the Evaluation Report to associate the Log to.
+ run_id : typing.Optional[str]
+ Unique identifier for the Run to associate the Log to.
path : typing.Optional[str]
Path of the Prompt, including the name. This locates the Prompt in the Humanloop filesystem and is used as a unique identifier. For example: `folder/name` or just `name`.
@@ -193,9 +192,6 @@ def log(
trace_parent_id : typing.Optional[str]
The ID of the parent Log to nest this Log under in a Trace.
- batch_id : typing.Optional[str]
- Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist.
-
user : typing.Optional[str]
End-user ID related to the Log.
@@ -236,7 +232,7 @@ def log(
messages=[{"role": "user", "content": "What really happened at Roswell?"}],
inputs={"person": "Trump"},
created_at=datetime.datetime.fromisoformat(
- "2024-07-19 00:29:35.178000+00:00",
+ "2024-07-18 23:29:35.178000+00:00",
),
provider_latency=6.5931549072265625,
output_message={
@@ -258,7 +254,7 @@ def log(
"environment": environment,
},
json={
- "evaluation_id": evaluation_id,
+ "run_id": run_id,
"path": path,
"id": id,
"output_message": convert_and_respect_annotation_metadata(
@@ -292,7 +288,6 @@ def log(
"metadata": metadata,
"source_datapoint_id": source_datapoint_id,
"trace_parent_id": trace_parent_id,
- "batch_id": batch_id,
"user": user,
"environment": prompt_log_request_environment,
"save": save,
@@ -523,7 +518,6 @@ def call_stream(
end_time: typing.Optional[dt.datetime] = OMIT,
source_datapoint_id: typing.Optional[str] = OMIT,
trace_parent_id: typing.Optional[str] = OMIT,
- batch_id: typing.Optional[str] = OMIT,
user: typing.Optional[str] = OMIT,
prompts_call_stream_request_environment: typing.Optional[str] = OMIT,
save: typing.Optional[bool] = OMIT,
@@ -596,9 +590,6 @@ def call_stream(
trace_parent_id : typing.Optional[str]
The ID of the parent Log to nest this Log under in a Trace.
- batch_id : typing.Optional[str]
- Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist.
-
user : typing.Optional[str]
End-user ID related to the Log.
@@ -672,7 +663,6 @@ def call_stream(
),
source_datapoint_id="string",
trace_parent_id="string",
- batch_id="string",
user="string",
prompts_call_stream_request_environment="string",
save=True,
@@ -720,7 +710,6 @@ def call_stream(
"end_time": end_time,
"source_datapoint_id": source_datapoint_id,
"trace_parent_id": trace_parent_id,
- "batch_id": batch_id,
"user": user,
"environment": prompts_call_stream_request_environment,
"save": save,
@@ -784,7 +773,6 @@ def call(
end_time: typing.Optional[dt.datetime] = OMIT,
source_datapoint_id: typing.Optional[str] = OMIT,
trace_parent_id: typing.Optional[str] = OMIT,
- batch_id: typing.Optional[str] = OMIT,
user: typing.Optional[str] = OMIT,
prompts_call_request_environment: typing.Optional[str] = OMIT,
save: typing.Optional[bool] = OMIT,
@@ -857,9 +845,6 @@ def call(
trace_parent_id : typing.Optional[str]
The ID of the parent Log to nest this Log under in a Trace.
- batch_id : typing.Optional[str]
- Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist.
-
user : typing.Optional[str]
End-user ID related to the Log.
@@ -956,7 +941,6 @@ def call(
"end_time": end_time,
"source_datapoint_id": source_datapoint_id,
"trace_parent_id": trace_parent_id,
- "batch_id": batch_id,
"user": user,
"environment": prompts_call_request_environment,
"save": save,
@@ -1935,7 +1919,7 @@ async def log(
*,
version_id: typing.Optional[str] = None,
environment: typing.Optional[str] = None,
- evaluation_id: typing.Optional[str] = OMIT,
+ run_id: typing.Optional[str] = OMIT,
path: typing.Optional[str] = OMIT,
id: typing.Optional[str] = OMIT,
output_message: typing.Optional[ChatMessageParams] = OMIT,
@@ -1961,7 +1945,6 @@ async def log(
metadata: typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]] = OMIT,
source_datapoint_id: typing.Optional[str] = OMIT,
trace_parent_id: typing.Optional[str] = OMIT,
- batch_id: typing.Optional[str] = OMIT,
user: typing.Optional[str] = OMIT,
prompt_log_request_environment: typing.Optional[str] = OMIT,
save: typing.Optional[bool] = OMIT,
@@ -1986,8 +1969,8 @@ async def log(
environment : typing.Optional[str]
Name of the Environment identifying a deployed version to log to.
- evaluation_id : typing.Optional[str]
- Unique identifier for the Evaluation Report to associate the Log to.
+ run_id : typing.Optional[str]
+ Unique identifier for the Run to associate the Log to.
path : typing.Optional[str]
Path of the Prompt, including the name. This locates the Prompt in the Humanloop filesystem and is used as a unique identifier. For example: `folder/name` or just `name`.
@@ -2068,9 +2051,6 @@ async def log(
trace_parent_id : typing.Optional[str]
The ID of the parent Log to nest this Log under in a Trace.
- batch_id : typing.Optional[str]
- Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist.
-
user : typing.Optional[str]
End-user ID related to the Log.
@@ -2117,7 +2097,7 @@ async def main() -> None:
],
inputs={"person": "Trump"},
created_at=datetime.datetime.fromisoformat(
- "2024-07-19 00:29:35.178000+00:00",
+ "2024-07-18 23:29:35.178000+00:00",
),
provider_latency=6.5931549072265625,
output_message={
@@ -2142,7 +2122,7 @@ async def main() -> None:
"environment": environment,
},
json={
- "evaluation_id": evaluation_id,
+ "run_id": run_id,
"path": path,
"id": id,
"output_message": convert_and_respect_annotation_metadata(
@@ -2176,7 +2156,6 @@ async def main() -> None:
"metadata": metadata,
"source_datapoint_id": source_datapoint_id,
"trace_parent_id": trace_parent_id,
- "batch_id": batch_id,
"user": user,
"environment": prompt_log_request_environment,
"save": save,
@@ -2415,7 +2394,6 @@ async def call_stream(
end_time: typing.Optional[dt.datetime] = OMIT,
source_datapoint_id: typing.Optional[str] = OMIT,
trace_parent_id: typing.Optional[str] = OMIT,
- batch_id: typing.Optional[str] = OMIT,
user: typing.Optional[str] = OMIT,
prompts_call_stream_request_environment: typing.Optional[str] = OMIT,
save: typing.Optional[bool] = OMIT,
@@ -2488,9 +2466,6 @@ async def call_stream(
trace_parent_id : typing.Optional[str]
The ID of the parent Log to nest this Log under in a Trace.
- batch_id : typing.Optional[str]
- Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist.
-
user : typing.Optional[str]
End-user ID related to the Log.
@@ -2568,7 +2543,6 @@ async def main() -> None:
),
source_datapoint_id="string",
trace_parent_id="string",
- batch_id="string",
user="string",
prompts_call_stream_request_environment="string",
save=True,
@@ -2619,7 +2593,6 @@ async def main() -> None:
"end_time": end_time,
"source_datapoint_id": source_datapoint_id,
"trace_parent_id": trace_parent_id,
- "batch_id": batch_id,
"user": user,
"environment": prompts_call_stream_request_environment,
"save": save,
@@ -2683,7 +2656,6 @@ async def call(
end_time: typing.Optional[dt.datetime] = OMIT,
source_datapoint_id: typing.Optional[str] = OMIT,
trace_parent_id: typing.Optional[str] = OMIT,
- batch_id: typing.Optional[str] = OMIT,
user: typing.Optional[str] = OMIT,
prompts_call_request_environment: typing.Optional[str] = OMIT,
save: typing.Optional[bool] = OMIT,
@@ -2756,9 +2728,6 @@ async def call(
trace_parent_id : typing.Optional[str]
The ID of the parent Log to nest this Log under in a Trace.
- batch_id : typing.Optional[str]
- Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist.
-
user : typing.Optional[str]
End-user ID related to the Log.
@@ -2863,7 +2832,6 @@ async def main() -> None:
"end_time": end_time,
"source_datapoint_id": source_datapoint_id,
"trace_parent_id": trace_parent_id,
- "batch_id": batch_id,
"user": user,
"environment": prompts_call_request_environment,
"save": save,
diff --git a/src/humanloop/requests/__init__.py b/src/humanloop/requests/__init__.py
index a8f2e97e..e33e9078 100644
--- a/src/humanloop/requests/__init__.py
+++ b/src/humanloop/requests/__init__.py
@@ -23,12 +23,13 @@
DirectoryWithParentsAndChildrenResponseFilesItemParams,
)
from .environment_response import EnvironmentResponseParams
-from .evaluated_version_response import EvaluatedVersionResponseParams
from .evaluatee_request import EvaluateeRequestParams
from .evaluatee_response import EvaluateeResponseParams
from .evaluation_evaluator_response import EvaluationEvaluatorResponseParams
-from .evaluation_report_log_response import EvaluationReportLogResponseParams
+from .evaluation_log_response import EvaluationLogResponseParams
from .evaluation_response import EvaluationResponseParams
+from .evaluation_run_response import EvaluationRunResponseParams
+from .evaluation_runs_response import EvaluationRunsResponseParams
from .evaluation_stats import EvaluationStatsParams
from .evaluations_dataset_request import EvaluationsDatasetRequestParams
from .evaluations_request import EvaluationsRequestParams
@@ -74,7 +75,7 @@
from .monitoring_evaluator_version_request import MonitoringEvaluatorVersionRequestParams
from .numeric_evaluator_stats_response import NumericEvaluatorStatsResponseParams
from .overall_stats import OverallStatsParams
-from .paginated_data_evaluation_report_log_response import PaginatedDataEvaluationReportLogResponseParams
+from .paginated_data_evaluation_log_response import PaginatedDataEvaluationLogResponseParams
from .paginated_data_evaluator_response import PaginatedDataEvaluatorResponseParams
from .paginated_data_flow_response import PaginatedDataFlowResponseParams
from .paginated_data_log_response import PaginatedDataLogResponseParams
@@ -103,6 +104,9 @@
from .prompt_response_template import PromptResponseTemplateParams
from .provider_api_keys import ProviderApiKeysParams
from .response_format import ResponseFormatParams
+from .run_stats_response import RunStatsResponseParams
+from .run_stats_response_evaluator_stats_item import RunStatsResponseEvaluatorStatsItemParams
+from .run_version_response import RunVersionResponseParams
from .select_evaluator_stats_response import SelectEvaluatorStatsResponseParams
from .text_chat_content import TextChatContentParams
from .text_evaluator_stats_response import TextEvaluatorStatsResponseParams
@@ -119,6 +123,7 @@
from .version_id_response import VersionIdResponseParams
from .version_id_response_version import VersionIdResponseVersionParams
from .version_reference_response import VersionReferenceResponseParams
+from .version_specification import VersionSpecificationParams
from .version_stats_response import VersionStatsResponseParams
from .version_stats_response_evaluator_version_stats_item import VersionStatsResponseEvaluatorVersionStatsItemParams
@@ -144,12 +149,13 @@
"DirectoryWithParentsAndChildrenResponseFilesItemParams",
"DirectoryWithParentsAndChildrenResponseParams",
"EnvironmentResponseParams",
- "EvaluatedVersionResponseParams",
"EvaluateeRequestParams",
"EvaluateeResponseParams",
"EvaluationEvaluatorResponseParams",
- "EvaluationReportLogResponseParams",
+ "EvaluationLogResponseParams",
"EvaluationResponseParams",
+ "EvaluationRunResponseParams",
+ "EvaluationRunsResponseParams",
"EvaluationStatsParams",
"EvaluationsDatasetRequestParams",
"EvaluationsRequestParams",
@@ -191,7 +197,7 @@
"MonitoringEvaluatorVersionRequestParams",
"NumericEvaluatorStatsResponseParams",
"OverallStatsParams",
- "PaginatedDataEvaluationReportLogResponseParams",
+ "PaginatedDataEvaluationLogResponseParams",
"PaginatedDataEvaluatorResponseParams",
"PaginatedDataFlowResponseParams",
"PaginatedDataLogResponseParams",
@@ -216,6 +222,9 @@
"PromptResponseTemplateParams",
"ProviderApiKeysParams",
"ResponseFormatParams",
+ "RunStatsResponseEvaluatorStatsItemParams",
+ "RunStatsResponseParams",
+ "RunVersionResponseParams",
"SelectEvaluatorStatsResponseParams",
"TextChatContentParams",
"TextEvaluatorStatsResponseParams",
@@ -232,6 +241,7 @@
"VersionIdResponseParams",
"VersionIdResponseVersionParams",
"VersionReferenceResponseParams",
+ "VersionSpecificationParams",
"VersionStatsResponseEvaluatorVersionStatsItemParams",
"VersionStatsResponseParams",
]
diff --git a/src/humanloop/requests/boolean_evaluator_stats_response.py b/src/humanloop/requests/boolean_evaluator_stats_response.py
index 33d9b44f..18618f40 100644
--- a/src/humanloop/requests/boolean_evaluator_stats_response.py
+++ b/src/humanloop/requests/boolean_evaluator_stats_response.py
@@ -6,7 +6,7 @@
class BooleanEvaluatorStatsResponseParams(typing_extensions.TypedDict):
"""
Base attributes for stats for an Evaluator Version-Evaluated Version pair
- in the Evaluation Report.
+ in the Evaluation.
"""
evaluator_version_id: str
diff --git a/src/humanloop/requests/dataset_response.py b/src/humanloop/requests/dataset_response.py
index 941cf0d0..56fcc4ed 100644
--- a/src/humanloop/requests/dataset_response.py
+++ b/src/humanloop/requests/dataset_response.py
@@ -56,6 +56,16 @@ class DatasetResponseParams(typing_extensions.TypedDict):
The user who created the Dataset.
"""
+ committed_by: typing_extensions.NotRequired[UserResponse]
+ """
+ The user who committed the Dataset Version.
+ """
+
+ committed_at: typing_extensions.NotRequired[dt.datetime]
+ """
+ The date and time the Dataset Version was committed.
+ """
+
status: VersionStatus
"""
The status of the Dataset Version.
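
Since `committed_by` and `committed_at` are `NotRequired`, a small illustrative helper (assuming `DatasetResponseParams` is importable from `humanloop.requests`, like the other params types) should read them defensively:

```python
from humanloop.requests import DatasetResponseParams


def commit_summary(dataset: DatasetResponseParams) -> str:
    # Both new fields are NotRequired, so uncommitted Versions omit them.
    committed_at = dataset.get("committed_at")
    if committed_at is None:
        return "not committed"
    return f"committed at {committed_at.isoformat()}"
```
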
diff --git a/src/humanloop/requests/evaluatee_request.py b/src/humanloop/requests/evaluatee_request.py
index 2eba177f..26e048c5 100644
--- a/src/humanloop/requests/evaluatee_request.py
+++ b/src/humanloop/requests/evaluatee_request.py
@@ -38,7 +38,7 @@ class EvaluateeRequestParams(typing_extensions.TypedDict):
batch_id: typing_extensions.NotRequired[str]
"""
- Unique identifier for the batch of Logs to include in the Evaluation Report.
+ Unique identifier for the batch of Logs to include in the Evaluation.
"""
orchestrated: typing_extensions.NotRequired[bool]
diff --git a/src/humanloop/requests/evaluatee_response.py b/src/humanloop/requests/evaluatee_response.py
index 6ea5d9ba..411ba5ba 100644
--- a/src/humanloop/requests/evaluatee_response.py
+++ b/src/humanloop/requests/evaluatee_response.py
@@ -1,8 +1,8 @@
# This file was auto-generated by Fern from our API Definition.
import typing_extensions
-from .evaluated_version_response import EvaluatedVersionResponseParams
import typing_extensions
+from .run_version_response import RunVersionResponseParams
import datetime as dt
@@ -11,10 +11,10 @@ class EvaluateeResponseParams(typing_extensions.TypedDict):
Version of the Evaluatee being evaluated.
"""
- version: EvaluatedVersionResponseParams
+ version: typing_extensions.NotRequired[RunVersionResponseParams]
batch_id: typing_extensions.NotRequired[str]
"""
- Unique identifier for the batch of Logs to include in the Evaluation Report.
+ Unique identifier for the batch of Logs to include in the Evaluation.
"""
orchestrated: bool
diff --git a/src/humanloop/requests/evaluation_report_log_response.py b/src/humanloop/requests/evaluation_log_response.py
similarity index 59%
rename from src/humanloop/requests/evaluation_report_log_response.py
rename to src/humanloop/requests/evaluation_log_response.py
index 5aa9a042..8fe5d762 100644
--- a/src/humanloop/requests/evaluation_report_log_response.py
+++ b/src/humanloop/requests/evaluation_log_response.py
@@ -1,17 +1,15 @@
# This file was auto-generated by Fern from our API Definition.
import typing_extensions
-from .evaluated_version_response import EvaluatedVersionResponseParams
from .datapoint_response import DatapointResponseParams
-import typing_extensions
from .log_response import LogResponseParams
import typing
-class EvaluationReportLogResponseParams(typing_extensions.TypedDict):
- evaluated_version: EvaluatedVersionResponseParams
+class EvaluationLogResponseParams(typing_extensions.TypedDict):
+ run_id: str
"""
- The version of the Prompt, Tool or Evaluator that the Log belongs to.
+ Unique identifier for the Run.
"""
datapoint: DatapointResponseParams
@@ -19,7 +17,7 @@ class EvaluationReportLogResponseParams(typing_extensions.TypedDict):
The Datapoint used to generate the Log
"""
- log: typing_extensions.NotRequired[LogResponseParams]
+ log: LogResponseParams
"""
The Log that was evaluated by the Evaluator.
"""
diff --git a/src/humanloop/requests/evaluation_response.py b/src/humanloop/requests/evaluation_response.py
index fe09cad4..27d9da73 100644
--- a/src/humanloop/requests/evaluation_response.py
+++ b/src/humanloop/requests/evaluation_response.py
@@ -1,11 +1,8 @@
# This file was auto-generated by Fern from our API Definition.
import typing_extensions
-from .dataset_response import DatasetResponseParams
import typing
-from .evaluatee_response import EvaluateeResponseParams
from .evaluation_evaluator_response import EvaluationEvaluatorResponseParams
-from ..types.evaluation_status import EvaluationStatus
import typing_extensions
import datetime as dt
from ..types.user_response import UserResponse
@@ -17,14 +14,9 @@ class EvaluationResponseParams(typing_extensions.TypedDict):
Unique identifier for the Evaluation. Starts with `evr`.
"""
- dataset: DatasetResponseParams
+ runs_count: int
"""
- The Dataset used in the Evaluation.
- """
-
- evaluatees: typing.Sequence[EvaluateeResponseParams]
- """
- The Prompt/Tool Versions included in the Evaluation.
+ The total number of Runs in the Evaluation.
"""
evaluators: typing.Sequence[EvaluationEvaluatorResponseParams]
@@ -32,16 +24,6 @@ class EvaluationResponseParams(typing_extensions.TypedDict):
The Evaluator Versions used to evaluate.
"""
- status: EvaluationStatus
- """
- The current status of the Evaluation.
-
- - `"pending"`: The Evaluation has been created but is not actively being worked on by Humanloop.
- - `"running"`: Humanloop is checking for any missing Logs and Evaluator Logs, and will generate them where appropriate.
- - `"completed"`: All Logs an Evaluator Logs have been generated.
- - `"cancelled"`: The Evaluation has been cancelled by the user. Humanloop will stop generating Logs and Evaluator Logs.
- """
-
name: typing_extensions.NotRequired[str]
"""
Name of the Evaluation to help identify it. Must be unique among Evaluations associated with File.
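
A sketch of consuming the slimmed-down Evaluation response; it only touches keys confirmed in this file (`id`, `runs_count`, `evaluators`).

```python
from humanloop.requests import EvaluationResponseParams


def summarize(evaluation: EvaluationResponseParams) -> str:
    # `dataset`, `evaluatees` and `status` are gone; Runs carry that detail now.
    return (
        f"{evaluation['id']}: {evaluation['runs_count']} Run(s), "
        f"{len(evaluation['evaluators'])} Evaluator(s)"
    )
```
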
diff --git a/src/humanloop/requests/evaluation_run_response.py b/src/humanloop/requests/evaluation_run_response.py
new file mode 100644
index 00000000..98ccfd75
--- /dev/null
+++ b/src/humanloop/requests/evaluation_run_response.py
@@ -0,0 +1,56 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing_extensions
+import typing_extensions
+from .dataset_response import DatasetResponseParams
+from .run_version_response import RunVersionResponseParams
+import datetime as dt
+from ..types.user_response import UserResponse
+from ..types.evaluation_status import EvaluationStatus
+
+
+class EvaluationRunResponseParams(typing_extensions.TypedDict):
+ id: str
+ """
+ Unique identifier for the Run.
+ """
+
+ dataset: typing_extensions.NotRequired[DatasetResponseParams]
+ """
+ The Dataset used in the Run.
+ """
+
+ version: typing_extensions.NotRequired[RunVersionResponseParams]
+ """
+ The version used in the Run.
+ """
+
+ orchestrated: bool
+ """
+ Whether the Run is orchestrated by Humanloop.
+ """
+
+ added_at: dt.datetime
+ """
+ When the Run was added to the Evaluation.
+ """
+
+ created_at: dt.datetime
+ """
+ When the Run was created.
+ """
+
+ created_by: typing_extensions.NotRequired[UserResponse]
+ """
+ The User who created the Run.
+ """
+
+ status: EvaluationStatus
+ """
+ The status of the Run.
+ """
+
+ control: bool
+ """
+ Stats for other Runs will be displayed in comparison to the control Run.
+ """
diff --git a/src/humanloop/requests/evaluation_runs_response.py b/src/humanloop/requests/evaluation_runs_response.py
new file mode 100644
index 00000000..a6e86d68
--- /dev/null
+++ b/src/humanloop/requests/evaluation_runs_response.py
@@ -0,0 +1,12 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing_extensions
+import typing
+from .evaluation_run_response import EvaluationRunResponseParams
+
+
+class EvaluationRunsResponseParams(typing_extensions.TypedDict):
+ runs: typing.Sequence[EvaluationRunResponseParams]
+ """
+ The Runs in the Evaluation.
+ """
diff --git a/src/humanloop/requests/evaluation_stats.py b/src/humanloop/requests/evaluation_stats.py
index b605ac2b..0a5a6a4a 100644
--- a/src/humanloop/requests/evaluation_stats.py
+++ b/src/humanloop/requests/evaluation_stats.py
@@ -1,22 +1,16 @@
# This file was auto-generated by Fern from our API Definition.
import typing_extensions
-from .overall_stats import OverallStatsParams
import typing
-from .version_stats_response import VersionStatsResponseParams
+from .run_stats_response import RunStatsResponseParams
import typing_extensions
from ..types.evaluation_status import EvaluationStatus
class EvaluationStatsParams(typing_extensions.TypedDict):
- overall_stats: OverallStatsParams
+ run_stats: typing.Sequence[RunStatsResponseParams]
"""
- Stats for the Evaluation Report as a whole.
- """
-
- version_stats: typing.Sequence[VersionStatsResponseParams]
- """
- Stats for each Evaluated Version in the Evaluation Report.
+ Stats for each Run in the Evaluation.
"""
progress: typing_extensions.NotRequired[str]
diff --git a/src/humanloop/requests/evaluator_response.py b/src/humanloop/requests/evaluator_response.py
index 888a55ff..609c11e4 100644
--- a/src/humanloop/requests/evaluator_response.py
+++ b/src/humanloop/requests/evaluator_response.py
@@ -66,6 +66,16 @@ class EvaluatorResponseParams(typing_extensions.TypedDict):
The user who created the Evaluator.
"""
+ committed_by: typing_extensions.NotRequired[UserResponse]
+ """
+ The user who committed the Evaluator Version.
+ """
+
+ committed_at: typing_extensions.NotRequired[dt.datetime]
+ """
+ The date and time the Evaluator Version was committed.
+ """
+
status: VersionStatus
last_used_at: dt.datetime
version_logs_count: int
diff --git a/src/humanloop/requests/flow_response.py b/src/humanloop/requests/flow_response.py
index 27a004ec..60b7753a 100644
--- a/src/humanloop/requests/flow_response.py
+++ b/src/humanloop/requests/flow_response.py
@@ -68,6 +68,16 @@ class FlowResponseParams(typing_extensions.TypedDict):
The user who created the Flow.
"""
+ committed_by: typing_extensions.NotRequired[UserResponse]
+ """
+ The user who committed the Flow Version.
+ """
+
+ committed_at: typing_extensions.NotRequired[dt.datetime]
+ """
+ The date and time the Flow Version was committed.
+ """
+
status: VersionStatus
"""
The status of the Flow Version.
diff --git a/src/humanloop/requests/numeric_evaluator_stats_response.py b/src/humanloop/requests/numeric_evaluator_stats_response.py
index 91eb5b4c..4edbda84 100644
--- a/src/humanloop/requests/numeric_evaluator_stats_response.py
+++ b/src/humanloop/requests/numeric_evaluator_stats_response.py
@@ -8,7 +8,7 @@
class NumericEvaluatorStatsResponseParams(typing_extensions.TypedDict):
"""
Base attributes for stats for an Evaluator Version-Evaluated Version pair
- in the Evaluation Report.
+ in the Evaluation.
"""
evaluator_version_id: str
diff --git a/src/humanloop/requests/overall_stats.py b/src/humanloop/requests/overall_stats.py
index 5946d210..da04f19f 100644
--- a/src/humanloop/requests/overall_stats.py
+++ b/src/humanloop/requests/overall_stats.py
@@ -6,15 +6,15 @@
class OverallStatsParams(typing_extensions.TypedDict):
num_datapoints: int
"""
- The total number of Datapoints in the Evaluation Report's Dataset Version.
+ The total number of Datapoints in the Evaluation's Dataset Version.
"""
total_logs: int
"""
- The total number of Logs in the Evaluation Report.
+ The total number of Logs in the Evaluation.
"""
total_evaluator_logs: int
"""
- The total number of Evaluator Logs in the Evaluation Report.
+ The total number of Evaluator Logs in the Evaluation.
"""
diff --git a/src/humanloop/requests/paginated_data_evaluation_log_response.py b/src/humanloop/requests/paginated_data_evaluation_log_response.py
new file mode 100644
index 00000000..e9723472
--- /dev/null
+++ b/src/humanloop/requests/paginated_data_evaluation_log_response.py
@@ -0,0 +1,12 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing_extensions
+import typing
+from .evaluation_log_response import EvaluationLogResponseParams
+
+
+class PaginatedDataEvaluationLogResponseParams(typing_extensions.TypedDict):
+ records: typing.Sequence[EvaluationLogResponseParams]
+ page: int
+ size: int
+ total: int
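
A sketch of working with the renamed paginated type, assuming a page is already in hand; `run_id` is the grouping key that replaces the old `evaluated_version`.

```python
from collections import Counter

from humanloop.requests import PaginatedDataEvaluationLogResponseParams


def logs_per_run(page: PaginatedDataEvaluationLogResponseParams) -> Counter[str]:
    # Each record now exposes `run_id` in place of `evaluated_version`.
    return Counter(record["run_id"] for record in page["records"])
```
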
diff --git a/src/humanloop/requests/paginated_data_evaluation_report_log_response.py b/src/humanloop/requests/paginated_data_evaluation_report_log_response.py
deleted file mode 100644
index bdc88d6a..00000000
--- a/src/humanloop/requests/paginated_data_evaluation_report_log_response.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# This file was auto-generated by Fern from our API Definition.
-
-import typing_extensions
-import typing
-from .evaluation_report_log_response import EvaluationReportLogResponseParams
-
-
-class PaginatedDataEvaluationReportLogResponseParams(typing_extensions.TypedDict):
- records: typing.Sequence[EvaluationReportLogResponseParams]
- page: int
- size: int
- total: int
diff --git a/src/humanloop/requests/prompt_call_response.py b/src/humanloop/requests/prompt_call_response.py
index 685d6b44..7a66ecbd 100644
--- a/src/humanloop/requests/prompt_call_response.py
+++ b/src/humanloop/requests/prompt_call_response.py
@@ -70,11 +70,6 @@ class PromptCallResponseParams(typing_extensions.TypedDict):
The ID of the parent Log to nest this Log under in a Trace.
"""
- batch_id: typing_extensions.NotRequired[str]
- """
- Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist.
- """
-
user: typing_extensions.NotRequired[str]
"""
End-user ID related to the Log.
diff --git a/src/humanloop/requests/prompt_response.py b/src/humanloop/requests/prompt_response.py
index 918813ec..50039007 100644
--- a/src/humanloop/requests/prompt_response.py
+++ b/src/humanloop/requests/prompt_response.py
@@ -159,6 +159,16 @@ class PromptResponseParams(typing_extensions.TypedDict):
The user who created the Prompt.
"""
+ committed_by: typing_extensions.NotRequired[UserResponse]
+ """
+ The user who committed the Prompt Version.
+ """
+
+ committed_at: typing_extensions.NotRequired[dt.datetime]
+ """
+ The date and time the Prompt Version was committed.
+ """
+
status: VersionStatus
"""
The status of the Prompt Version.
diff --git a/src/humanloop/requests/run_stats_response.py b/src/humanloop/requests/run_stats_response.py
new file mode 100644
index 00000000..0cb19389
--- /dev/null
+++ b/src/humanloop/requests/run_stats_response.py
@@ -0,0 +1,37 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing_extensions
+import typing_extensions
+import typing
+from .run_stats_response_evaluator_stats_item import RunStatsResponseEvaluatorStatsItemParams
+
+
+class RunStatsResponseParams(typing_extensions.TypedDict):
+ """
+ Stats for a Run in the Evaluation.
+ """
+
+ run_id: str
+ """
+ Unique identifier for the Run.
+ """
+
+ version_id: typing_extensions.NotRequired[str]
+ """
+ Unique identifier for the evaluated Version.
+ """
+
+ batch_id: typing_extensions.NotRequired[str]
+ """
+ Unique identifier for the batch of Logs to include in the Evaluation.
+ """
+
+ num_logs: int
+ """
+ The total number of existing Logs in this Run.
+ """
+
+ evaluator_stats: typing.Sequence[RunStatsResponseEvaluatorStatsItemParams]
+ """
+ Stats for each Evaluator Version applied to this Run.
+ """
diff --git a/src/humanloop/requests/run_stats_response_evaluator_stats_item.py b/src/humanloop/requests/run_stats_response_evaluator_stats_item.py
new file mode 100644
index 00000000..a42aea0b
--- /dev/null
+++ b/src/humanloop/requests/run_stats_response_evaluator_stats_item.py
@@ -0,0 +1,14 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+from .numeric_evaluator_stats_response import NumericEvaluatorStatsResponseParams
+from .boolean_evaluator_stats_response import BooleanEvaluatorStatsResponseParams
+from .select_evaluator_stats_response import SelectEvaluatorStatsResponseParams
+from .text_evaluator_stats_response import TextEvaluatorStatsResponseParams
+
+RunStatsResponseEvaluatorStatsItemParams = typing.Union[
+ NumericEvaluatorStatsResponseParams,
+ BooleanEvaluatorStatsResponseParams,
+ SelectEvaluatorStatsResponseParams,
+ TextEvaluatorStatsResponseParams,
+]
diff --git a/src/humanloop/requests/evaluated_version_response.py b/src/humanloop/requests/run_version_response.py
similarity index 88%
rename from src/humanloop/requests/evaluated_version_response.py
rename to src/humanloop/requests/run_version_response.py
index d35a602d..879ea25c 100644
--- a/src/humanloop/requests/evaluated_version_response.py
+++ b/src/humanloop/requests/run_version_response.py
@@ -6,6 +6,6 @@
from .evaluator_response import EvaluatorResponseParams
from .flow_response import FlowResponseParams
-EvaluatedVersionResponseParams = typing.Union[
+RunVersionResponseParams = typing.Union[
PromptResponseParams, ToolResponseParams, EvaluatorResponseParams, FlowResponseParams
]
diff --git a/src/humanloop/requests/text_evaluator_stats_response.py b/src/humanloop/requests/text_evaluator_stats_response.py
index d1d97f81..8f0f358d 100644
--- a/src/humanloop/requests/text_evaluator_stats_response.py
+++ b/src/humanloop/requests/text_evaluator_stats_response.py
@@ -6,7 +6,7 @@
class TextEvaluatorStatsResponseParams(typing_extensions.TypedDict):
"""
Base attributes for stats for an Evaluator Version-Evaluated Version pair
- in the Evaluation Report.
+ in the Evaluation.
"""
evaluator_version_id: str
diff --git a/src/humanloop/requests/tool_response.py b/src/humanloop/requests/tool_response.py
index 44313db7..57b9b608 100644
--- a/src/humanloop/requests/tool_response.py
+++ b/src/humanloop/requests/tool_response.py
@@ -94,6 +94,16 @@ class ToolResponseParams(typing_extensions.TypedDict):
The user who created the Tool.
"""
+ committed_by: typing_extensions.NotRequired[UserResponse]
+ """
+ The user who committed the Tool Version.
+ """
+
+ committed_at: typing_extensions.NotRequired[dt.datetime]
+ """
+ The date and time the Tool Version was committed.
+ """
+
status: VersionStatus
"""
The status of the Tool Version.
diff --git a/src/humanloop/requests/version_specification.py b/src/humanloop/requests/version_specification.py
new file mode 100644
index 00000000..34606269
--- /dev/null
+++ b/src/humanloop/requests/version_specification.py
@@ -0,0 +1,37 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing_extensions
+import typing_extensions
+
+
+class VersionSpecificationParams(typing_extensions.TypedDict):
+ """
+ Specification of a File version on Humanloop.
+
+ This can be done in a couple of ways:
+
+ - Specifying `version_id` directly.
+ - Specifying a File (and optionally an Environment).
+ - A File can be specified by either `path` or `file_id`.
+ - An Environment can be specified by `environment_id`. If no Environment is specified, the default Environment is used.
+ """
+
+ version_id: typing_extensions.NotRequired[str]
+ """
+ Unique identifier for the File Version. If provided, none of the other fields should be specified.
+ """
+
+ path: typing_extensions.NotRequired[str]
+ """
+ Path identifying a File. Provide either this or `file_id` if you want to specify a File.
+ """
+
+ file_id: typing_extensions.NotRequired[str]
+ """
+ Unique identifier for the File. Provide either this or `path` if you want to specify a File.
+ """
+
+ environment: typing_extensions.NotRequired[str]
+ """
+ Name of the Environment a Version is deployed to. Only provide this when specifying a File. If not provided (and a File is specified), the default Environment is used.
+ """
diff --git a/src/humanloop/requests/version_stats_response.py b/src/humanloop/requests/version_stats_response.py
index 34f753f4..053c0ac9 100644
--- a/src/humanloop/requests/version_stats_response.py
+++ b/src/humanloop/requests/version_stats_response.py
@@ -7,26 +7,22 @@
class VersionStatsResponseParams(typing_extensions.TypedDict):
- """
- Stats for an Evaluated Version in the Evaluation Report.
- """
-
version_id: str
"""
- Unique identifier for the Evaluated Version.
+ Unique identifier for the evaluated Version.
"""
batch_id: typing_extensions.NotRequired[str]
"""
- Unique identifier for the batch of Logs to include in the Evaluation Report.
+ Unique identifier for the batch of Logs to include in the Evaluation.
"""
num_logs: int
"""
- The total number of existing Logs for this Evaluated Version within the Evaluation Report. These are Logs that have been generated by this Evaluated Version on a Datapoint belonging to the Evaluation Report's Dataset Version.
+ The total number of existing Logs in this Run.
"""
evaluator_version_stats: typing.Sequence[VersionStatsResponseEvaluatorVersionStatsItemParams]
"""
- Stats for each Evaluator Version used to evaluate this Evaluated Version.
+ Stats for each Evaluator Version applied to this Run.
"""
diff --git a/src/humanloop/tools/client.py b/src/humanloop/tools/client.py
index 4d23bb8e..7226e60b 100644
--- a/src/humanloop/tools/client.py
+++ b/src/humanloop/tools/client.py
@@ -62,7 +62,6 @@ def log(
metadata: typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]] = OMIT,
source_datapoint_id: typing.Optional[str] = OMIT,
trace_parent_id: typing.Optional[str] = OMIT,
- batch_id: typing.Optional[str] = OMIT,
user: typing.Optional[str] = OMIT,
tool_log_request_environment: typing.Optional[str] = OMIT,
save: typing.Optional[bool] = OMIT,
@@ -136,9 +135,6 @@ def log(
trace_parent_id : typing.Optional[str]
The ID of the parent Log to nest this Log under in a Trace.
- batch_id : typing.Optional[str]
- Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist.
-
user : typing.Optional[str]
End-user ID related to the Log.
@@ -210,7 +206,6 @@ def log(
"metadata": metadata,
"source_datapoint_id": source_datapoint_id,
"trace_parent_id": trace_parent_id,
- "batch_id": batch_id,
"user": user,
"environment": tool_log_request_environment,
"save": save,
@@ -1258,7 +1253,6 @@ async def log(
metadata: typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]] = OMIT,
source_datapoint_id: typing.Optional[str] = OMIT,
trace_parent_id: typing.Optional[str] = OMIT,
- batch_id: typing.Optional[str] = OMIT,
user: typing.Optional[str] = OMIT,
tool_log_request_environment: typing.Optional[str] = OMIT,
save: typing.Optional[bool] = OMIT,
@@ -1332,9 +1326,6 @@ async def log(
trace_parent_id : typing.Optional[str]
The ID of the parent Log to nest this Log under in a Trace.
- batch_id : typing.Optional[str]
- Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist.
-
user : typing.Optional[str]
End-user ID related to the Log.
@@ -1414,7 +1405,6 @@ async def main() -> None:
"metadata": metadata,
"source_datapoint_id": source_datapoint_id,
"trace_parent_id": trace_parent_id,
- "batch_id": batch_id,
"user": user,
"environment": tool_log_request_environment,
"save": save,
diff --git a/src/humanloop/types/__init__.py b/src/humanloop/types/__init__.py
index c9b3180f..8c973b52 100644
--- a/src/humanloop/types/__init__.py
+++ b/src/humanloop/types/__init__.py
@@ -26,12 +26,13 @@
from .directory_with_parents_and_children_response_files_item import DirectoryWithParentsAndChildrenResponseFilesItem
from .environment_response import EnvironmentResponse
from .environment_tag import EnvironmentTag
-from .evaluated_version_response import EvaluatedVersionResponse
from .evaluatee_request import EvaluateeRequest
from .evaluatee_response import EvaluateeResponse
from .evaluation_evaluator_response import EvaluationEvaluatorResponse
-from .evaluation_report_log_response import EvaluationReportLogResponse
+from .evaluation_log_response import EvaluationLogResponse
from .evaluation_response import EvaluationResponse
+from .evaluation_run_response import EvaluationRunResponse
+from .evaluation_runs_response import EvaluationRunsResponse
from .evaluation_stats import EvaluationStats
from .evaluation_status import EvaluationStatus
from .evaluations_dataset_request import EvaluationsDatasetRequest
@@ -78,6 +79,7 @@
from .list_tools import ListTools
from .llm_evaluator_request import LlmEvaluatorRequest
from .log_response import LogResponse
+from .logs_association_type import LogsAssociationType
from .model_endpoints import ModelEndpoints
from .model_providers import ModelProviders
from .monitoring_evaluator_environment_request import MonitoringEvaluatorEnvironmentRequest
@@ -87,7 +89,7 @@
from .numeric_evaluator_stats_response import NumericEvaluatorStatsResponse
from .observability_status import ObservabilityStatus
from .overall_stats import OverallStats
-from .paginated_data_evaluation_report_log_response import PaginatedDataEvaluationReportLogResponse
+from .paginated_data_evaluation_log_response import PaginatedDataEvaluationLogResponse
from .paginated_data_evaluator_response import PaginatedDataEvaluatorResponse
from .paginated_data_flow_response import PaginatedDataFlowResponse
from .paginated_data_log_response import PaginatedDataLogResponse
@@ -121,6 +123,9 @@
from .provider_api_keys import ProviderApiKeys
from .response_format import ResponseFormat
from .response_format_type import ResponseFormatType
+from .run_stats_response import RunStatsResponse
+from .run_stats_response_evaluator_stats_item import RunStatsResponseEvaluatorStatsItem
+from .run_version_response import RunVersionResponse
from .select_evaluator_stats_response import SelectEvaluatorStatsResponse
from .sort_order import SortOrder
from .text_chat_content import TextChatContent
@@ -144,6 +149,7 @@
from .version_id_response import VersionIdResponse
from .version_id_response_version import VersionIdResponseVersion
from .version_reference_response import VersionReferenceResponse
+from .version_specification import VersionSpecification
from .version_stats_response import VersionStatsResponse
from .version_stats_response_evaluator_version_stats_item import VersionStatsResponseEvaluatorVersionStatsItem
from .version_status import VersionStatus
@@ -175,12 +181,13 @@
"DirectoryWithParentsAndChildrenResponseFilesItem",
"EnvironmentResponse",
"EnvironmentTag",
- "EvaluatedVersionResponse",
"EvaluateeRequest",
"EvaluateeResponse",
"EvaluationEvaluatorResponse",
- "EvaluationReportLogResponse",
+ "EvaluationLogResponse",
"EvaluationResponse",
+ "EvaluationRunResponse",
+ "EvaluationRunsResponse",
"EvaluationStats",
"EvaluationStatus",
"EvaluationsDatasetRequest",
@@ -225,6 +232,7 @@
"ListTools",
"LlmEvaluatorRequest",
"LogResponse",
+ "LogsAssociationType",
"ModelEndpoints",
"ModelProviders",
"MonitoringEvaluatorEnvironmentRequest",
@@ -234,7 +242,7 @@
"NumericEvaluatorStatsResponse",
"ObservabilityStatus",
"OverallStats",
- "PaginatedDataEvaluationReportLogResponse",
+ "PaginatedDataEvaluationLogResponse",
"PaginatedDataEvaluatorResponse",
"PaginatedDataFlowResponse",
"PaginatedDataLogResponse",
@@ -264,6 +272,9 @@
"ProviderApiKeys",
"ResponseFormat",
"ResponseFormatType",
+ "RunStatsResponse",
+ "RunStatsResponseEvaluatorStatsItem",
+ "RunVersionResponse",
"SelectEvaluatorStatsResponse",
"SortOrder",
"TextChatContent",
@@ -287,6 +298,7 @@
"VersionIdResponse",
"VersionIdResponseVersion",
"VersionReferenceResponse",
+ "VersionSpecification",
"VersionStatsResponse",
"VersionStatsResponseEvaluatorVersionStatsItem",
"VersionStatus",
diff --git a/src/humanloop/types/boolean_evaluator_stats_response.py b/src/humanloop/types/boolean_evaluator_stats_response.py
index 9ce51712..3deca81b 100644
--- a/src/humanloop/types/boolean_evaluator_stats_response.py
+++ b/src/humanloop/types/boolean_evaluator_stats_response.py
@@ -9,7 +9,7 @@
class BooleanEvaluatorStatsResponse(UncheckedBaseModel):
"""
Base attributes for stats for an Evaluator Version-Evaluated Version pair
- in the Evaluation Report.
+ in the Evaluation.
"""
evaluator_version_id: str = pydantic.Field()
diff --git a/src/humanloop/types/dataset_response.py b/src/humanloop/types/dataset_response.py
index 132a7abf..942a9ee1 100644
--- a/src/humanloop/types/dataset_response.py
+++ b/src/humanloop/types/dataset_response.py
@@ -57,6 +57,16 @@ class DatasetResponse(UncheckedBaseModel):
The user who created the Dataset.
"""
+ committed_by: typing.Optional[UserResponse] = pydantic.Field(default=None)
+ """
+ The user who committed the Dataset Version.
+ """
+
+ committed_at: typing.Optional[dt.datetime] = pydantic.Field(default=None)
+ """
+ The date and time the Dataset Version was committed.
+ """
+
status: VersionStatus = pydantic.Field()
"""
The status of the Dataset Version.
diff --git a/src/humanloop/types/evaluatee_request.py b/src/humanloop/types/evaluatee_request.py
index 32f5f867..d976f840 100644
--- a/src/humanloop/types/evaluatee_request.py
+++ b/src/humanloop/types/evaluatee_request.py
@@ -40,7 +40,7 @@ class EvaluateeRequest(UncheckedBaseModel):
batch_id: typing.Optional[str] = pydantic.Field(default=None)
"""
- Unique identifier for the batch of Logs to include in the Evaluation Report.
+ Unique identifier for the batch of Logs to include in the Evaluation.
"""
orchestrated: typing.Optional[bool] = pydantic.Field(default=None)
diff --git a/src/humanloop/types/evaluatee_response.py b/src/humanloop/types/evaluatee_response.py
index 4dd78cb7..baa33f79 100644
--- a/src/humanloop/types/evaluatee_response.py
+++ b/src/humanloop/types/evaluatee_response.py
@@ -9,8 +9,8 @@
from .tool_response import ToolResponse
from .version_deployment_response import VersionDeploymentResponse
from .version_id_response import VersionIdResponse
-from .evaluated_version_response import EvaluatedVersionResponse
import typing
+from .run_version_response import RunVersionResponse
import pydantic
import datetime as dt
from ..core.pydantic_utilities import IS_PYDANTIC_V2
@@ -22,10 +22,10 @@ class EvaluateeResponse(UncheckedBaseModel):
Version of the Evaluatee being evaluated.
"""
- version: EvaluatedVersionResponse
+ version: typing.Optional[RunVersionResponse] = None
batch_id: typing.Optional[str] = pydantic.Field(default=None)
"""
- Unique identifier for the batch of Logs to include in the Evaluation Report.
+ Unique identifier for the batch of Logs to include in the Evaluation.
"""
orchestrated: bool = pydantic.Field()
diff --git a/src/humanloop/types/evaluation_report_log_response.py b/src/humanloop/types/evaluation_log_response.py
similarity index 53%
rename from src/humanloop/types/evaluation_report_log_response.py
rename to src/humanloop/types/evaluation_log_response.py
index a92d3414..d0ad938d 100644
--- a/src/humanloop/types/evaluation_report_log_response.py
+++ b/src/humanloop/types/evaluation_log_response.py
@@ -2,30 +2,29 @@
from __future__ import annotations
from ..core.unchecked_base_model import UncheckedBaseModel
+from .evaluator_log_response import EvaluatorLogResponse
from .evaluator_response import EvaluatorResponse
+from .flow_log_response import FlowLogResponse
from .flow_response import FlowResponse
from .monitoring_evaluator_response import MonitoringEvaluatorResponse
+from .prompt_log_response import PromptLogResponse
from .prompt_response import PromptResponse
+from .tool_log_response import ToolLogResponse
from .tool_response import ToolResponse
from .version_deployment_response import VersionDeploymentResponse
from .version_id_response import VersionIdResponse
-from .evaluator_log_response import EvaluatorLogResponse
-from .flow_log_response import FlowLogResponse
-from .prompt_log_response import PromptLogResponse
-from .tool_log_response import ToolLogResponse
-from .evaluated_version_response import EvaluatedVersionResponse
import pydantic
from .datapoint_response import DatapointResponse
-import typing
from .log_response import LogResponse
+import typing
from ..core.pydantic_utilities import IS_PYDANTIC_V2
from ..core.pydantic_utilities import update_forward_refs
-class EvaluationReportLogResponse(UncheckedBaseModel):
- evaluated_version: EvaluatedVersionResponse = pydantic.Field()
+class EvaluationLogResponse(UncheckedBaseModel):
+ run_id: str = pydantic.Field()
"""
- The version of the Prompt, Tool or Evaluator that the Log belongs to.
+ Unique identifier for the Run.
"""
datapoint: DatapointResponse = pydantic.Field()
@@ -33,7 +32,7 @@ class EvaluationReportLogResponse(UncheckedBaseModel):
The Datapoint used to generate the Log
"""
- log: typing.Optional[LogResponse] = pydantic.Field(default=None)
+ log: LogResponse = pydantic.Field()
"""
The Log that was evaluated by the Evaluator.
"""
@@ -53,14 +52,14 @@ class Config:
extra = pydantic.Extra.allow
-update_forward_refs(EvaluatorResponse, EvaluationReportLogResponse=EvaluationReportLogResponse)
-update_forward_refs(FlowResponse, EvaluationReportLogResponse=EvaluationReportLogResponse)
-update_forward_refs(MonitoringEvaluatorResponse, EvaluationReportLogResponse=EvaluationReportLogResponse)
-update_forward_refs(PromptResponse, EvaluationReportLogResponse=EvaluationReportLogResponse)
-update_forward_refs(ToolResponse, EvaluationReportLogResponse=EvaluationReportLogResponse)
-update_forward_refs(VersionDeploymentResponse, EvaluationReportLogResponse=EvaluationReportLogResponse)
-update_forward_refs(VersionIdResponse, EvaluationReportLogResponse=EvaluationReportLogResponse)
-update_forward_refs(EvaluatorLogResponse, EvaluationReportLogResponse=EvaluationReportLogResponse)
-update_forward_refs(FlowLogResponse, EvaluationReportLogResponse=EvaluationReportLogResponse)
-update_forward_refs(PromptLogResponse, EvaluationReportLogResponse=EvaluationReportLogResponse)
-update_forward_refs(ToolLogResponse, EvaluationReportLogResponse=EvaluationReportLogResponse)
+update_forward_refs(EvaluatorLogResponse, EvaluationLogResponse=EvaluationLogResponse)
+update_forward_refs(EvaluatorResponse, EvaluationLogResponse=EvaluationLogResponse)
+update_forward_refs(FlowLogResponse, EvaluationLogResponse=EvaluationLogResponse)
+update_forward_refs(FlowResponse, EvaluationLogResponse=EvaluationLogResponse)
+update_forward_refs(MonitoringEvaluatorResponse, EvaluationLogResponse=EvaluationLogResponse)
+update_forward_refs(PromptLogResponse, EvaluationLogResponse=EvaluationLogResponse)
+update_forward_refs(PromptResponse, EvaluationLogResponse=EvaluationLogResponse)
+update_forward_refs(ToolLogResponse, EvaluationLogResponse=EvaluationLogResponse)
+update_forward_refs(ToolResponse, EvaluationLogResponse=EvaluationLogResponse)
+update_forward_refs(VersionDeploymentResponse, EvaluationLogResponse=EvaluationLogResponse)
+update_forward_refs(VersionIdResponse, EvaluationLogResponse=EvaluationLogResponse)
diff --git a/src/humanloop/types/evaluation_response.py b/src/humanloop/types/evaluation_response.py
index b8864204..a4c2336a 100644
--- a/src/humanloop/types/evaluation_response.py
+++ b/src/humanloop/types/evaluation_response.py
@@ -10,11 +10,8 @@
from .version_deployment_response import VersionDeploymentResponse
from .version_id_response import VersionIdResponse
import pydantic
-from .dataset_response import DatasetResponse
import typing
-from .evaluatee_response import EvaluateeResponse
from .evaluation_evaluator_response import EvaluationEvaluatorResponse
-from .evaluation_status import EvaluationStatus
import datetime as dt
from .user_response import UserResponse
from ..core.pydantic_utilities import IS_PYDANTIC_V2
@@ -27,14 +24,9 @@ class EvaluationResponse(UncheckedBaseModel):
Unique identifier for the Evaluation. Starts with `evr`.
"""
- dataset: DatasetResponse = pydantic.Field()
+ runs_count: int = pydantic.Field()
"""
- The Dataset used in the Evaluation.
- """
-
- evaluatees: typing.List[EvaluateeResponse] = pydantic.Field()
- """
- The Prompt/Tool Versions included in the Evaluation.
+ The total number of Runs in the Evaluation.
"""
evaluators: typing.List[EvaluationEvaluatorResponse] = pydantic.Field()
@@ -42,16 +34,6 @@ class EvaluationResponse(UncheckedBaseModel):
The Evaluator Versions used to evaluate.
"""
- status: EvaluationStatus = pydantic.Field()
- """
- The current status of the Evaluation.
-
- - `"pending"`: The Evaluation has been created but is not actively being worked on by Humanloop.
- - `"running"`: Humanloop is checking for any missing Logs and Evaluator Logs, and will generate them where appropriate.
- - `"completed"`: All Logs an Evaluator Logs have been generated.
- - `"cancelled"`: The Evaluation has been cancelled by the user. Humanloop will stop generating Logs and Evaluator Logs.
- """
-
name: typing.Optional[str] = pydantic.Field(default=None)
"""
Name of the Evaluation to help identify it. Must be unique among Evaluations associated with File.
diff --git a/src/humanloop/types/evaluation_run_response.py b/src/humanloop/types/evaluation_run_response.py
new file mode 100644
index 00000000..46f9308d
--- /dev/null
+++ b/src/humanloop/types/evaluation_run_response.py
@@ -0,0 +1,85 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from __future__ import annotations
+from ..core.unchecked_base_model import UncheckedBaseModel
+from .evaluator_response import EvaluatorResponse
+from .flow_response import FlowResponse
+from .monitoring_evaluator_response import MonitoringEvaluatorResponse
+from .prompt_response import PromptResponse
+from .tool_response import ToolResponse
+from .version_deployment_response import VersionDeploymentResponse
+from .version_id_response import VersionIdResponse
+import pydantic
+import typing
+from .dataset_response import DatasetResponse
+from .run_version_response import RunVersionResponse
+import datetime as dt
+from .user_response import UserResponse
+from .evaluation_status import EvaluationStatus
+from ..core.pydantic_utilities import IS_PYDANTIC_V2
+from ..core.pydantic_utilities import update_forward_refs
+
+
+class EvaluationRunResponse(UncheckedBaseModel):
+ id: str = pydantic.Field()
+ """
+ Unique identifier for the Run.
+ """
+
+ dataset: typing.Optional[DatasetResponse] = pydantic.Field(default=None)
+ """
+ The Dataset used in the Run.
+ """
+
+ version: typing.Optional[RunVersionResponse] = pydantic.Field(default=None)
+ """
+ The version used in the Run.
+ """
+
+ orchestrated: bool = pydantic.Field()
+ """
+ Whether the Run is orchestrated by Humanloop.
+ """
+
+ added_at: dt.datetime = pydantic.Field()
+ """
+ When the Run was added to the Evaluation.
+ """
+
+ created_at: dt.datetime = pydantic.Field()
+ """
+ When the Run was created.
+ """
+
+ created_by: typing.Optional[UserResponse] = pydantic.Field(default=None)
+ """
+ The User who created the Run.
+ """
+
+ status: EvaluationStatus = pydantic.Field()
+ """
+ The status of the Run.
+ """
+
+ control: bool = pydantic.Field()
+ """
+ Stats for other Runs will be displayed in comparison to the control Run.
+ """
+
+ if IS_PYDANTIC_V2:
+ model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
+ else:
+
+ class Config:
+ frozen = True
+ smart_union = True
+ extra = pydantic.Extra.allow
+
+
+update_forward_refs(EvaluatorResponse, EvaluationRunResponse=EvaluationRunResponse)
+update_forward_refs(FlowResponse, EvaluationRunResponse=EvaluationRunResponse)
+update_forward_refs(MonitoringEvaluatorResponse, EvaluationRunResponse=EvaluationRunResponse)
+update_forward_refs(PromptResponse, EvaluationRunResponse=EvaluationRunResponse)
+update_forward_refs(ToolResponse, EvaluationRunResponse=EvaluationRunResponse)
+update_forward_refs(VersionDeploymentResponse, EvaluationRunResponse=EvaluationRunResponse)
+update_forward_refs(VersionIdResponse, EvaluationRunResponse=EvaluationRunResponse)
diff --git a/src/humanloop/types/evaluation_runs_response.py b/src/humanloop/types/evaluation_runs_response.py
new file mode 100644
index 00000000..208a7529
--- /dev/null
+++ b/src/humanloop/types/evaluation_runs_response.py
@@ -0,0 +1,41 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from __future__ import annotations
+from ..core.unchecked_base_model import UncheckedBaseModel
+from .evaluator_response import EvaluatorResponse
+from .flow_response import FlowResponse
+from .monitoring_evaluator_response import MonitoringEvaluatorResponse
+from .prompt_response import PromptResponse
+from .tool_response import ToolResponse
+from .version_deployment_response import VersionDeploymentResponse
+from .version_id_response import VersionIdResponse
+import typing
+from .evaluation_run_response import EvaluationRunResponse
+import pydantic
+from ..core.pydantic_utilities import IS_PYDANTIC_V2
+from ..core.pydantic_utilities import update_forward_refs
+
+
+class EvaluationRunsResponse(UncheckedBaseModel):
+ runs: typing.List[EvaluationRunResponse] = pydantic.Field()
+ """
+ The Runs in the Evaluation.
+ """
+
+ if IS_PYDANTIC_V2:
+ model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
+ else:
+
+ class Config:
+ frozen = True
+ smart_union = True
+ extra = pydantic.Extra.allow
+
+
+update_forward_refs(EvaluatorResponse, EvaluationRunsResponse=EvaluationRunsResponse)
+update_forward_refs(FlowResponse, EvaluationRunsResponse=EvaluationRunsResponse)
+update_forward_refs(MonitoringEvaluatorResponse, EvaluationRunsResponse=EvaluationRunsResponse)
+update_forward_refs(PromptResponse, EvaluationRunsResponse=EvaluationRunsResponse)
+update_forward_refs(ToolResponse, EvaluationRunsResponse=EvaluationRunsResponse)
+update_forward_refs(VersionDeploymentResponse, EvaluationRunsResponse=EvaluationRunsResponse)
+update_forward_refs(VersionIdResponse, EvaluationRunsResponse=EvaluationRunsResponse)
diff --git a/src/humanloop/types/evaluation_stats.py b/src/humanloop/types/evaluation_stats.py
index 350cf1db..9a6a07a7 100644
--- a/src/humanloop/types/evaluation_stats.py
+++ b/src/humanloop/types/evaluation_stats.py
@@ -1,23 +1,17 @@
# This file was auto-generated by Fern from our API Definition.
from ..core.unchecked_base_model import UncheckedBaseModel
-from .overall_stats import OverallStats
-import pydantic
import typing
-from .version_stats_response import VersionStatsResponse
+from .run_stats_response import RunStatsResponse
+import pydantic
from .evaluation_status import EvaluationStatus
from ..core.pydantic_utilities import IS_PYDANTIC_V2
class EvaluationStats(UncheckedBaseModel):
- overall_stats: OverallStats = pydantic.Field()
- """
- Stats for the Evaluation Report as a whole.
- """
-
- version_stats: typing.List[VersionStatsResponse] = pydantic.Field()
+ run_stats: typing.List[RunStatsResponse] = pydantic.Field()
"""
- Stats for each Evaluated Version in the Evaluation Report.
+ Stats for each Run in the Evaluation.
"""
progress: typing.Optional[str] = pydantic.Field(default=None)
diff --git a/src/humanloop/types/evaluator_response.py b/src/humanloop/types/evaluator_response.py
index 69111519..fcaf0326 100644
--- a/src/humanloop/types/evaluator_response.py
+++ b/src/humanloop/types/evaluator_response.py
@@ -64,6 +64,16 @@ class EvaluatorResponse(UncheckedBaseModel):
The user who created the Evaluator.
"""
+ committed_by: typing.Optional[UserResponse] = pydantic.Field(default=None)
+ """
+ The user who committed the Evaluator Version.
+ """
+
+ committed_at: typing.Optional[dt.datetime] = pydantic.Field(default=None)
+ """
+ The date and time the Evaluator Version was committed.
+ """
+
status: VersionStatus
last_used_at: dt.datetime
version_logs_count: int = pydantic.Field()
diff --git a/src/humanloop/types/flow_response.py b/src/humanloop/types/flow_response.py
index 2c478605..874782a1 100644
--- a/src/humanloop/types/flow_response.py
+++ b/src/humanloop/types/flow_response.py
@@ -66,6 +66,16 @@ class FlowResponse(UncheckedBaseModel):
The user who created the Flow.
"""
+ committed_by: typing.Optional[UserResponse] = pydantic.Field(default=None)
+ """
+ The user who committed the Flow Version.
+ """
+
+ committed_at: typing.Optional[dt.datetime] = pydantic.Field(default=None)
+ """
+ The date and time the Flow Version was committed.
+ """
+
status: VersionStatus = pydantic.Field()
"""
The status of the Flow Version.
diff --git a/src/humanloop/types/logs_association_type.py b/src/humanloop/types/logs_association_type.py
new file mode 100644
index 00000000..c904b93c
--- /dev/null
+++ b/src/humanloop/types/logs_association_type.py
@@ -0,0 +1,5 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+
+LogsAssociationType = typing.Union[typing.Literal["dynamic", "fixed"], typing.Any]
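
A trivial sketch of using the alias as an annotation; the `typing.Any` arm means unknown future values still type-check.

```python
from humanloop.types import LogsAssociationType


def is_fixed(association: LogsAssociationType) -> bool:
    # Known values are "dynamic" and "fixed"; anything else passes through.
    return association == "fixed"
```
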
diff --git a/src/humanloop/types/numeric_evaluator_stats_response.py b/src/humanloop/types/numeric_evaluator_stats_response.py
index 6ca2e662..eec24ff5 100644
--- a/src/humanloop/types/numeric_evaluator_stats_response.py
+++ b/src/humanloop/types/numeric_evaluator_stats_response.py
@@ -9,7 +9,7 @@
class NumericEvaluatorStatsResponse(UncheckedBaseModel):
"""
Base attributes for stats for an Evaluator Version-Evaluated Version pair
- in the Evaluation Report.
+ in the Evaluation.
"""
evaluator_version_id: str = pydantic.Field()
diff --git a/src/humanloop/types/overall_stats.py b/src/humanloop/types/overall_stats.py
index 8258f898..b1d6e6dc 100644
--- a/src/humanloop/types/overall_stats.py
+++ b/src/humanloop/types/overall_stats.py
@@ -9,17 +9,17 @@
class OverallStats(UncheckedBaseModel):
num_datapoints: int = pydantic.Field()
"""
- The total number of Datapoints in the Evaluation Report's Dataset Version.
+ The total number of Datapoints in the Evaluation's Dataset Version.
"""
total_logs: int = pydantic.Field()
"""
- The total number of Logs in the Evaluation Report.
+ The total number of Logs in the Evaluation.
"""
total_evaluator_logs: int = pydantic.Field()
"""
- The total number of Evaluator Logs in the Evaluation Report.
+ The total number of Evaluator Logs in the Evaluation.
"""
if IS_PYDANTIC_V2:
diff --git a/src/humanloop/types/paginated_data_evaluation_log_response.py b/src/humanloop/types/paginated_data_evaluation_log_response.py
new file mode 100644
index 00000000..c6e19791
--- /dev/null
+++ b/src/humanloop/types/paginated_data_evaluation_log_response.py
@@ -0,0 +1,49 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from __future__ import annotations
+from ..core.unchecked_base_model import UncheckedBaseModel
+from .evaluator_log_response import EvaluatorLogResponse
+from .evaluator_response import EvaluatorResponse
+from .flow_log_response import FlowLogResponse
+from .flow_response import FlowResponse
+from .monitoring_evaluator_response import MonitoringEvaluatorResponse
+from .prompt_log_response import PromptLogResponse
+from .prompt_response import PromptResponse
+from .tool_log_response import ToolLogResponse
+from .tool_response import ToolResponse
+from .version_deployment_response import VersionDeploymentResponse
+from .version_id_response import VersionIdResponse
+import typing
+from .evaluation_log_response import EvaluationLogResponse
+from ..core.pydantic_utilities import IS_PYDANTIC_V2
+import pydantic
+from ..core.pydantic_utilities import update_forward_refs
+
+
+class PaginatedDataEvaluationLogResponse(UncheckedBaseModel):
+ records: typing.List[EvaluationLogResponse]
+ page: int
+ size: int
+ total: int
+
+ if IS_PYDANTIC_V2:
+ model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
+ else:
+
+ class Config:
+ frozen = True
+ smart_union = True
+ extra = pydantic.Extra.allow
+
+
+update_forward_refs(EvaluatorLogResponse, PaginatedDataEvaluationLogResponse=PaginatedDataEvaluationLogResponse)
+update_forward_refs(EvaluatorResponse, PaginatedDataEvaluationLogResponse=PaginatedDataEvaluationLogResponse)
+update_forward_refs(FlowLogResponse, PaginatedDataEvaluationLogResponse=PaginatedDataEvaluationLogResponse)
+update_forward_refs(FlowResponse, PaginatedDataEvaluationLogResponse=PaginatedDataEvaluationLogResponse)
+update_forward_refs(MonitoringEvaluatorResponse, PaginatedDataEvaluationLogResponse=PaginatedDataEvaluationLogResponse)
+update_forward_refs(PromptLogResponse, PaginatedDataEvaluationLogResponse=PaginatedDataEvaluationLogResponse)
+update_forward_refs(PromptResponse, PaginatedDataEvaluationLogResponse=PaginatedDataEvaluationLogResponse)
+update_forward_refs(ToolLogResponse, PaginatedDataEvaluationLogResponse=PaginatedDataEvaluationLogResponse)
+update_forward_refs(ToolResponse, PaginatedDataEvaluationLogResponse=PaginatedDataEvaluationLogResponse)
+update_forward_refs(VersionDeploymentResponse, PaginatedDataEvaluationLogResponse=PaginatedDataEvaluationLogResponse)
+update_forward_refs(VersionIdResponse, PaginatedDataEvaluationLogResponse=PaginatedDataEvaluationLogResponse)
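
The new paginated model carries the usual `records`/`page`/`size`/`total` fields, so client code can walk it page by page. A hedged sketch; `fetch_page` here is a stand-in for whichever SDK call returns this model, not an SDK method itself:

```python
import typing

from humanloop.types.evaluation_log_response import EvaluationLogResponse
from humanloop.types.paginated_data_evaluation_log_response import (
    PaginatedDataEvaluationLogResponse,
)


def iter_records(
    fetch_page: typing.Callable[[int], PaginatedDataEvaluationLogResponse],
) -> typing.Iterator[EvaluationLogResponse]:
    # Assumes a 1-based page index; stops once page * size covers the total.
    page = 1
    while True:
        data = fetch_page(page)
        yield from data.records
        if not data.records or page * data.size >= data.total:
            break
        page += 1
```
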
diff --git a/src/humanloop/types/paginated_data_evaluation_report_log_response.py b/src/humanloop/types/paginated_data_evaluation_report_log_response.py
deleted file mode 100644
index 95c1725d..00000000
--- a/src/humanloop/types/paginated_data_evaluation_report_log_response.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# This file was auto-generated by Fern from our API Definition.
-
-from __future__ import annotations
-from ..core.unchecked_base_model import UncheckedBaseModel
-from .evaluator_log_response import EvaluatorLogResponse
-from .evaluator_response import EvaluatorResponse
-from .flow_log_response import FlowLogResponse
-from .flow_response import FlowResponse
-from .monitoring_evaluator_response import MonitoringEvaluatorResponse
-from .prompt_log_response import PromptLogResponse
-from .prompt_response import PromptResponse
-from .tool_log_response import ToolLogResponse
-from .tool_response import ToolResponse
-from .version_deployment_response import VersionDeploymentResponse
-from .version_id_response import VersionIdResponse
-import typing
-from .evaluation_report_log_response import EvaluationReportLogResponse
-from ..core.pydantic_utilities import IS_PYDANTIC_V2
-import pydantic
-from ..core.pydantic_utilities import update_forward_refs
-
-
-class PaginatedDataEvaluationReportLogResponse(UncheckedBaseModel):
- records: typing.List[EvaluationReportLogResponse]
- page: int
- size: int
- total: int
-
- if IS_PYDANTIC_V2:
- model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
- else:
-
- class Config:
- frozen = True
- smart_union = True
- extra = pydantic.Extra.allow
-
-
-update_forward_refs(
- EvaluatorLogResponse, PaginatedDataEvaluationReportLogResponse=PaginatedDataEvaluationReportLogResponse
-)
-update_forward_refs(
- EvaluatorResponse, PaginatedDataEvaluationReportLogResponse=PaginatedDataEvaluationReportLogResponse
-)
-update_forward_refs(FlowLogResponse, PaginatedDataEvaluationReportLogResponse=PaginatedDataEvaluationReportLogResponse)
-update_forward_refs(FlowResponse, PaginatedDataEvaluationReportLogResponse=PaginatedDataEvaluationReportLogResponse)
-update_forward_refs(
- MonitoringEvaluatorResponse, PaginatedDataEvaluationReportLogResponse=PaginatedDataEvaluationReportLogResponse
-)
-update_forward_refs(
- PromptLogResponse, PaginatedDataEvaluationReportLogResponse=PaginatedDataEvaluationReportLogResponse
-)
-update_forward_refs(PromptResponse, PaginatedDataEvaluationReportLogResponse=PaginatedDataEvaluationReportLogResponse)
-update_forward_refs(ToolLogResponse, PaginatedDataEvaluationReportLogResponse=PaginatedDataEvaluationReportLogResponse)
-update_forward_refs(ToolResponse, PaginatedDataEvaluationReportLogResponse=PaginatedDataEvaluationReportLogResponse)
-update_forward_refs(
- VersionDeploymentResponse, PaginatedDataEvaluationReportLogResponse=PaginatedDataEvaluationReportLogResponse
-)
-update_forward_refs(
- VersionIdResponse, PaginatedDataEvaluationReportLogResponse=PaginatedDataEvaluationReportLogResponse
-)
diff --git a/src/humanloop/types/prompt_call_response.py b/src/humanloop/types/prompt_call_response.py
index 64db5f49..492d10aa 100644
--- a/src/humanloop/types/prompt_call_response.py
+++ b/src/humanloop/types/prompt_call_response.py
@@ -79,11 +79,6 @@ class PromptCallResponse(UncheckedBaseModel):
The ID of the parent Log to nest this Log under in a Trace.
"""
- batch_id: typing.Optional[str] = pydantic.Field(default=None)
- """
- Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist.
- """
-
user: typing.Optional[str] = pydantic.Field(default=None)
"""
End-user ID related to the Log.
diff --git a/src/humanloop/types/prompt_response.py b/src/humanloop/types/prompt_response.py
index 6f1029f5..64db52d5 100644
--- a/src/humanloop/types/prompt_response.py
+++ b/src/humanloop/types/prompt_response.py
@@ -157,6 +157,16 @@ class PromptResponse(UncheckedBaseModel):
The user who created the Prompt.
"""
+ committed_by: typing.Optional[UserResponse] = pydantic.Field(default=None)
+ """
+ The user who committed the Prompt Version.
+ """
+
+ committed_at: typing.Optional[dt.datetime] = pydantic.Field(default=None)
+ """
+ The date and time the Prompt Version was committed.
+ """
+
status: VersionStatus = pydantic.Field()
"""
The status of the Prompt Version.
diff --git a/src/humanloop/types/run_stats_response.py b/src/humanloop/types/run_stats_response.py
new file mode 100644
index 00000000..201c6e76
--- /dev/null
+++ b/src/humanloop/types/run_stats_response.py
@@ -0,0 +1,47 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ..core.unchecked_base_model import UncheckedBaseModel
+import pydantic
+import typing
+from .run_stats_response_evaluator_stats_item import RunStatsResponseEvaluatorStatsItem
+from ..core.pydantic_utilities import IS_PYDANTIC_V2
+
+
+class RunStatsResponse(UncheckedBaseModel):
+ """
+ Stats for a Run in the Evaluation.
+ """
+
+ run_id: str = pydantic.Field()
+ """
+ Unique identifier for the Run.
+ """
+
+ version_id: typing.Optional[str] = pydantic.Field(default=None)
+ """
+ Unique identifier for the evaluated Version.
+ """
+
+ batch_id: typing.Optional[str] = pydantic.Field(default=None)
+ """
+ Unique identifier for the batch of Logs to include in the Evaluation.
+ """
+
+ num_logs: int = pydantic.Field()
+ """
+ The total number of existing Logs in this Run.
+ """
+
+ evaluator_stats: typing.List[RunStatsResponseEvaluatorStatsItem] = pydantic.Field()
+ """
+ Stats for each Evaluator Version applied to this Run.
+ """
+
+ if IS_PYDANTIC_V2:
+ model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
+ else:
+
+ class Config:
+ frozen = True
+ smart_union = True
+ extra = pydantic.Extra.allow
diff --git a/src/humanloop/types/run_stats_response_evaluator_stats_item.py b/src/humanloop/types/run_stats_response_evaluator_stats_item.py
new file mode 100644
index 00000000..c7fe6056
--- /dev/null
+++ b/src/humanloop/types/run_stats_response_evaluator_stats_item.py
@@ -0,0 +1,14 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+from .numeric_evaluator_stats_response import NumericEvaluatorStatsResponse
+from .boolean_evaluator_stats_response import BooleanEvaluatorStatsResponse
+from .select_evaluator_stats_response import SelectEvaluatorStatsResponse
+from .text_evaluator_stats_response import TextEvaluatorStatsResponse
+
+RunStatsResponseEvaluatorStatsItem = typing.Union[
+ NumericEvaluatorStatsResponse,
+ BooleanEvaluatorStatsResponse,
+ SelectEvaluatorStatsResponse,
+ TextEvaluatorStatsResponse,
+]
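
Since `evaluator_stats` on the new `RunStatsResponse` is a union of these four stats models, narrowing with `isinstance` is the straightforward way to pick out one variant. A small illustrative helper (not part of the SDK):

```python
import typing

from humanloop.types.numeric_evaluator_stats_response import NumericEvaluatorStatsResponse
from humanloop.types.run_stats_response import RunStatsResponse


def numeric_evaluator_stats(run: RunStatsResponse) -> typing.List[NumericEvaluatorStatsResponse]:
    # evaluator_stats mixes numeric, boolean, select and text stats models;
    # isinstance narrows to the numeric variant.
    return [item for item in run.evaluator_stats if isinstance(item, NumericEvaluatorStatsResponse)]
```
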
diff --git a/src/humanloop/types/evaluated_version_response.py b/src/humanloop/types/run_version_response.py
similarity index 71%
rename from src/humanloop/types/evaluated_version_response.py
rename to src/humanloop/types/run_version_response.py
index 3064bfb1..d94b1178 100644
--- a/src/humanloop/types/evaluated_version_response.py
+++ b/src/humanloop/types/run_version_response.py
@@ -6,4 +6,4 @@
from .evaluator_response import EvaluatorResponse
from .flow_response import FlowResponse
-EvaluatedVersionResponse = typing.Union[PromptResponse, ToolResponse, EvaluatorResponse, FlowResponse]
+RunVersionResponse = typing.Union[PromptResponse, ToolResponse, EvaluatorResponse, FlowResponse]
diff --git a/src/humanloop/types/text_evaluator_stats_response.py b/src/humanloop/types/text_evaluator_stats_response.py
index 735b4eb7..652c7aa6 100644
--- a/src/humanloop/types/text_evaluator_stats_response.py
+++ b/src/humanloop/types/text_evaluator_stats_response.py
@@ -9,7 +9,7 @@
class TextEvaluatorStatsResponse(UncheckedBaseModel):
"""
Base attributes for stats for an Evaluator Version-Evaluated Version pair
- in the Evaluation Report.
+ in the Evaluation.
"""
evaluator_version_id: str = pydantic.Field()
diff --git a/src/humanloop/types/tool_response.py b/src/humanloop/types/tool_response.py
index 3099da27..c1db98bb 100644
--- a/src/humanloop/types/tool_response.py
+++ b/src/humanloop/types/tool_response.py
@@ -92,6 +92,16 @@ class ToolResponse(UncheckedBaseModel):
The user who created the Tool.
"""
+ committed_by: typing.Optional[UserResponse] = pydantic.Field(default=None)
+ """
+ The user who committed the Tool Version.
+ """
+
+ committed_at: typing.Optional[dt.datetime] = pydantic.Field(default=None)
+ """
+ The date and time the Tool Version was committed.
+ """
+
status: VersionStatus = pydantic.Field()
"""
The status of the Tool Version.
diff --git a/src/humanloop/types/version_specification.py b/src/humanloop/types/version_specification.py
new file mode 100644
index 00000000..bb3464ce
--- /dev/null
+++ b/src/humanloop/types/version_specification.py
@@ -0,0 +1,48 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ..core.unchecked_base_model import UncheckedBaseModel
+import typing
+import pydantic
+from ..core.pydantic_utilities import IS_PYDANTIC_V2
+
+
+class VersionSpecification(UncheckedBaseModel):
+ """
+ Specification of a File version on Humanloop.
+
+ This can be done in a couple of ways:
+
+ - Specifying `version_id` directly.
+ - Specifying a File (and optionally an Environment).
+ - A File can be specified by either `path` or `file_id`.
+    - An Environment can be specified by `environment`. If no Environment is specified, the default Environment is used.
+ """
+
+ version_id: typing.Optional[str] = pydantic.Field(default=None)
+ """
+ Unique identifier for the File Version. If provided, none of the other fields should be specified.
+ """
+
+ path: typing.Optional[str] = pydantic.Field(default=None)
+ """
+ Path identifying a File. Provide either this or `file_id` if you want to specify a File.
+ """
+
+ file_id: typing.Optional[str] = pydantic.Field(default=None)
+ """
+ Unique identifier for the File. Provide either this or `path` if you want to specify a File.
+ """
+
+ environment: typing.Optional[str] = pydantic.Field(default=None)
+ """
+ Name of the Environment a Version is deployed to. Only provide this when specifying a File. If not provided (and a File is specified), the default Environment is used.
+ """
+
+ if IS_PYDANTIC_V2:
+ model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2
+ else:
+
+ class Config:
+ frozen = True
+ smart_union = True
+ extra = pydantic.Extra.allow
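
The two ways of pointing at a Version described in the docstring look like this in practice (paths and IDs below are placeholders):

```python
from humanloop.types.version_specification import VersionSpecification

# Pin an exact Version by ID; no other fields should be set alongside it.
by_id = VersionSpecification(version_id="prv_placeholder")

# Or name a File by path; leaving `environment` unset falls back to the
# default Environment's deployed Version.
by_path = VersionSpecification(path="Support/Triage Assistant", environment="staging")
```
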
diff --git a/src/humanloop/types/version_stats_response.py b/src/humanloop/types/version_stats_response.py
index 25c0a682..6439fca4 100644
--- a/src/humanloop/types/version_stats_response.py
+++ b/src/humanloop/types/version_stats_response.py
@@ -8,28 +8,24 @@
class VersionStatsResponse(UncheckedBaseModel):
- """
- Stats for an Evaluated Version in the Evaluation Report.
- """
-
version_id: str = pydantic.Field()
"""
- Unique identifier for the Evaluated Version.
+ Unique identifier for the evaluated Version.
"""
batch_id: typing.Optional[str] = pydantic.Field(default=None)
"""
- Unique identifier for the batch of Logs to include in the Evaluation Report.
+ Unique identifier for the batch of Logs to include in the Evaluation.
"""
num_logs: int = pydantic.Field()
"""
- The total number of existing Logs for this Evaluated Version within the Evaluation Report. These are Logs that have been generated by this Evaluated Version on a Datapoint belonging to the Evaluation Report's Dataset Version.
+ The total number of existing Logs in this Run.
"""
evaluator_version_stats: typing.List[VersionStatsResponseEvaluatorVersionStatsItem] = pydantic.Field()
"""
- Stats for each Evaluator Version used to evaluate this Evaluated Version.
+ Stats for each Evaluator Version applied to this Run.
"""
if IS_PYDANTIC_V2: