diff --git a/README.md b/README.md index fda3a6eb..2445b8f0 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ client.prompts.log( messages=[{"role": "user", "content": "What really happened at Roswell?"}], inputs={"person": "Trump"}, created_at=datetime.datetime.fromisoformat( - "2024-07-19 00:29:35.178000+00:00", + "2024-07-18 23:29:35.178000+00:00", ), provider_latency=6.5931549072265625, output_message={ @@ -88,7 +88,7 @@ async def main() -> None: ], inputs={"person": "Trump"}, created_at=datetime.datetime.fromisoformat( - "2024-07-19 00:29:35.178000+00:00", + "2024-07-18 23:29:35.178000+00:00", ), provider_latency=6.5931549072265625, output_message={ @@ -165,7 +165,6 @@ response = client.prompts.call_stream( ), source_datapoint_id="string", trace_parent_id="string", - batch_id="string", user="string", prompts_call_stream_request_environment="string", save=True, diff --git a/pyproject.toml b/pyproject.toml index 279764ab..a62db965 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "humanloop" -version = "0.8.8" +version = "0.8.9" description = "" readme = "README.md" authors = [] diff --git a/reference.md b/reference.md index aae70be1..27cd7ce4 100644 --- a/reference.md +++ b/reference.md @@ -56,7 +56,7 @@ client.prompts.log( messages=[{"role": "user", "content": "What really happened at Roswell?"}], inputs={"person": "Trump"}, created_at=datetime.datetime.fromisoformat( - "2024-07-19 00:29:35.178000+00:00", + "2024-07-18 23:29:35.178000+00:00", ), provider_latency=6.5931549072265625, output_message={ @@ -100,7 +100,7 @@ client.prompts.log(
-**evaluation_id:** `typing.Optional[str]` — Unique identifier for the Evaluation Report to associate the Log to. +**run_id:** `typing.Optional[str]` — Unique identifier for the Run to associate the Log to.
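A minimal sketch of the renamed parameter in use; the `id`, `version_id`, and `run_id` values below are placeholders (a real `run_id` is returned by `evaluations.create_run`, documented further down):

```python
from humanloop import Humanloop

client = Humanloop(api_key="YOUR_API_KEY")

# Attach a Prompt Log to an Evaluation Run via `run_id`
# (replaces the removed `evaluation_id`/`batch_id` pair).
client.prompts.log(
    id="prompt_id",           # placeholder Prompt File ID
    version_id="version_id",  # placeholder Version ID
    inputs={"person": "Trump"},
    output="...",
    run_id="run_id",          # placeholder Run ID
)
```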
@@ -314,14 +314,6 @@ Controls how the model uses tools. The following options are supported:
-**batch_id:** `typing.Optional[str]` — Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist. - -
-
- -
-
- **user:** `typing.Optional[str]` — End-user ID related to the Log.
@@ -682,7 +674,6 @@ response = client.prompts.call_stream( ), source_datapoint_id="string", trace_parent_id="string", - batch_id="string", user="string", prompts_call_stream_request_environment="string", save=True, @@ -836,14 +827,6 @@ Controls how the model uses tools. The following options are supported:
-**batch_id:** `typing.Optional[str]` — Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist. - -
-
- -
-
- **user:** `typing.Optional[str]` — End-user ID related to the Log.
@@ -1102,14 +1085,6 @@ Controls how the model uses tools. The following options are supported:
-**batch_id:** `typing.Optional[str]` — Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist. - -
-
- -
-
- **user:** `typing.Optional[str]` — End-user ID related to the Log.
@@ -2525,14 +2500,6 @@ client.tools.log(
-**batch_id:** `typing.Optional[str]` — Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist. - -
-
- -
-
- **user:** `typing.Optional[str]` — End-user ID related to the Log.
@@ -4497,6 +4464,14 @@ client.datasets.list_versions(
+**include_datapoints:** `typing.Optional[typing.Literal["latest_committed"]]` — If set to 'latest_committed', include the Datapoints for the latest committed version. Defaults to `None`. + +
+
+ +
+
+ **request_options:** `typing.Optional[RequestOptions]` — Request-specific configuration.
@@ -5157,14 +5132,6 @@ client.evaluators.log(
-**batch_id:** `typing.Optional[str]` — Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist. - -
-
- -
-
- **user:** `typing.Optional[str]` — End-user ID related to the Log.
@@ -6258,10 +6225,10 @@ client.flows.log( output="The patient is likely experiencing a myocardial infarction. Immediate medical attention is required.", trace_status="incomplete", start_time=datetime.datetime.fromisoformat( - "2024-07-08 22:40:35+00:00", + "2024-07-08 21:40:35+00:00", ), end_time=datetime.datetime.fromisoformat( - "2024-07-08 22:40:39+00:00", + "2024-07-08 21:40:39+00:00", ), ) @@ -6295,7 +6262,7 @@ client.flows.log(
-**evaluation_id:** `typing.Optional[str]` — Unique identifier for the Evaluation Report to associate the Log to. +**run_id:** `typing.Optional[str]` — Unique identifier for the Run to associate the Log to.
@@ -6431,14 +6398,6 @@ client.flows.log(
-**batch_id:** `typing.Optional[str]` — Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist. - -
-
- -
-
- **user:** `typing.Optional[str]` — End-user ID related to the Log.
@@ -8212,16 +8171,10 @@ for page in response.iter_pages(): Create an Evaluation. -Create a new Evaluation by specifying the Dataset, versions to be -evaluated (Evaluatees), and which Evaluators to provide judgments. +Create an Evaluation by specifying the File to evaluate, and a name +for the Evaluation. -Humanloop will automatically start generating Logs and running Evaluators where -`orchestrated=true`. If you own the runtime for the Evaluatee or Evaluator, you -can set `orchestrated=false` and then generate and submit the required logs using -your runtime. - -To keep updated on the progress of the Evaluation, you can poll the Evaluation using -the `GET /evaluations/:id` endpoint and check its status. +You can then add Runs to this Evaluation using the `POST /evaluations/{id}/runs` endpoint.
@@ -8242,11 +8195,7 @@ client = Humanloop( api_key="YOUR_API_KEY", ) client.evaluations.create( - dataset={"version_id": "dsv_6L78pqrdFi2xa"}, - evaluatees=[ - {"version_id": "prv_7ZlQREDScH0xkhUwtXruN", "orchestrated": False} - ], - evaluators=[{"version_id": "evv_012def", "orchestrated": False}], + evaluators=[{}], ) ``` @@ -8263,7 +8212,7 @@ client.evaluations.create(
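The generated example above passes an empty Evaluator specification. A fuller sketch of the new create-then-run flow, using placeholder IDs and a placeholder Evaluator `path` (mirroring what `eval_utils.py` now does internally):

```python
from humanloop import Humanloop

client = Humanloop(api_key="YOUR_API_KEY")

# Create the Evaluation against the File whose Logs you want to evaluate.
evaluation = client.evaluations.create(
    name="My evaluation",                         # placeholder name
    file={"id": "file_id"},                       # placeholder File ID
    evaluators=[{"path": "Evals/My Evaluator"}],  # placeholder Evaluator path
)

# Runs are added separately; see `create_run` further down this reference.
run = client.evaluations.create_run(id=evaluation.id)
```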
-**dataset:** `EvaluationsDatasetRequestParams` — Dataset to use in this Evaluation. +**evaluators:** `typing.Sequence[EvaluationsRequestParams]` — The Evaluators used to evaluate.
@@ -8271,7 +8220,7 @@ client.evaluations.create(
-**evaluators:** `typing.Sequence[EvaluationsRequestParams]` — The Evaluators used to evaluate. +**file:** `typing.Optional[FileRequestParams]` — The File to associate with the Evaluation. This File contains the Logs you're evaluating.
@@ -8279,7 +8228,7 @@ client.evaluations.create(
-**evaluatees:** `typing.Optional[typing.Sequence[EvaluateeRequestParams]]` — Unique identifiers for the Prompt/Tool Versions to include in the Evaluation. Can be left unpopulated if you wish to add Evaluatees to this Evaluation by specifying `evaluation_id` in Log calls. +**name:** `typing.Optional[str]` — Name of the Evaluation to help identify it. Must be unique within the associated File.
@@ -8287,15 +8236,163 @@ client.evaluations.create(
-**name:** `typing.Optional[str]` — Name of the Evaluation to help identify it. Must be unique within the associated File. +**request_options:** `typing.Optional[RequestOptions]` — Request-specific configuration. +
+
+ +
+ + + +
+ + +
client.evaluations.add_evaluators(...) +
+
+ +#### 📝 Description + +
+
+ +
+
+ +Add Evaluators to an Evaluation. + +Add new Evaluators to an Evaluation. The Evaluators will be run on the Logs +generated for the Evaluation. +
+
+#### 🔌 Usage +
-**file:** `typing.Optional[FileRequestParams]` — The File to associate with the Evaluation. +
+
+ +```python +from humanloop import Humanloop + +client = Humanloop( + api_key="YOUR_API_KEY", +) +client.evaluations.add_evaluators( + id="id", + evaluators=[{}], +) + +``` +
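A sketch with a `path`-based Evaluator specification in place of the empty `{}` above; the path and Evaluation ID are placeholders:

```python
from humanloop import Humanloop

client = Humanloop(api_key="YOUR_API_KEY")

client.evaluations.add_evaluators(
    id="ev_567yza",                               # placeholder Evaluation ID
    evaluators=[{"path": "Evals/My Evaluator"}],  # placeholder Evaluator path
)
```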
+
+
+
+ +#### ⚙️ Parameters + +
+
+ +
+
+ +**id:** `str` — Unique identifier for Evaluation. + +
+
+ +
+
+ +**evaluators:** `typing.Sequence[EvaluationsRequestParams]` — The Evaluators to add to this Evaluation. + +
+
+ +
+
+ +**request_options:** `typing.Optional[RequestOptions]` — Request-specific configuration. + +
+
+
+
+ + +
+
+
+ +
client.evaluations.remove_evaluator(...) +
+
+ +#### 📝 Description + +
+
+ +
+
+ +Remove an Evaluator from an Evaluation. + +Remove an Evaluator from an Evaluation. The Evaluator will no longer be run on the Logs +generated for the Evaluation. +
+
+
+
+ +#### 🔌 Usage + +
+
+ +
+
+ +```python +from humanloop import Humanloop + +client = Humanloop( + api_key="YOUR_API_KEY", +) +client.evaluations.remove_evaluator( + id="id", + evaluator_version_id="evaluator_version_id", +) + +``` +
+
+
+
+ +#### ⚙️ Parameters + +
+
+ +
+
+ +**id:** `str` — Unique identifier for Evaluation. + +
+
+ +
+
+ +**evaluator_version_id:** `str` — Unique identifier for Evaluator Version.
@@ -8458,7 +8555,7 @@ client.evaluations.delete(
-
client.evaluations.update_setup(...) +
client.evaluations.list_runs_for_evaluation(...)
@@ -8470,10 +8567,7 @@ client.evaluations.delete(
-Update an Evaluation. - -Update the setup of an Evaluation by specifying the Dataset, versions to be -evaluated (Evaluatees), and which Evaluators to provide judgments. +List all Runs for an Evaluation.
@@ -8493,13 +8587,8 @@ from humanloop import Humanloop client = Humanloop( api_key="YOUR_API_KEY", ) -client.evaluations.update_setup( - id="ev_567yza", - dataset={"version_id": "dsv_6L78pqrdFi2xa"}, - evaluatees=[ - {"version_id": "prv_7ZlQREDScH0xkhUwtXruN", "orchestrated": False} - ], - evaluators=[{"version_id": "evv_012def", "orchestrated": False}], +client.evaluations.list_runs_for_evaluation( + id="id", ) ``` @@ -8524,7 +8613,87 @@ client.evaluations.update_setup(
-**dataset:** `typing.Optional[EvaluationsDatasetRequestParams]` — Dataset to use in this Evaluation. +**request_options:** `typing.Optional[RequestOptions]` — Request-specific configuration. + +
+
+ +
+ + + +
+ + +
client.evaluations.create_run(...) +
+
+ +#### 📝 Description + +
+
+ +
+
+
+Create an Evaluation Run.
+
+Create a new Evaluation Run. Optionally specify the Dataset and version to be
+evaluated.
+
+Humanloop will automatically start generating Logs and running Evaluators where
+`orchestrated=true`. If you are generating Logs yourself, you can set `orchestrated=false`
+and then generate and submit the required Logs via the API.
+
+The `logs` parameter controls which Logs are associated with the Run. Defaults to `dynamic`
+if `dataset` and `version` are provided. This means that Logs will automatically be retrieved
+if they're associated with the specified Version and have `source_datapoint_id` referencing
+a datapoint in the specified Dataset.
+If `logs` is set to `fixed`, no existing Logs will be automatically associated with the Run.
+You can then add Logs to the Run using the `POST /evaluations/{id}/runs/{run_id}/logs` endpoint,
+or by adding `run_id` to your `POST /prompts/logs` requests.
+
+To keep updated on the progress of the Run, you can poll the Run using
+the `GET /evaluations/{id}/runs` endpoint and check its status.
+
+
+
+ +#### 🔌 Usage + +
+
+ +
+
+ +```python +from humanloop import Humanloop + +client = Humanloop( + api_key="YOUR_API_KEY", +) +client.evaluations.create_run( + id="id", +) + +``` +
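The generated snippet passes only the required `id`. A sketch of the fixed-Logs flow described above, with placeholder IDs; every call used here appears elsewhere in this changeset:

```python
from humanloop import Humanloop

client = Humanloop(api_key="YOUR_API_KEY")

# Create a Run that will not pull in existing Logs automatically.
run = client.evaluations.create_run(
    id="ev_567yza",                     # placeholder Evaluation ID
    dataset={"file_id": "dataset_id"},  # placeholder Dataset File ID
    logs="fixed",
    orchestrated=False,
)

# Either reference the Run when logging...
client.prompts.log(
    id="prompt_id",  # placeholder Prompt File ID
    inputs={"person": "Trump"},
    output="...",
    run_id=run.id,
)

# ...or attach existing Logs afterwards.
client.evaluations.add_logs_to_run(
    id="ev_567yza",
    run_id=run.id,
    log_ids=["log_id"],  # placeholder Log IDs
)
```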
+
+
+
+ +#### ⚙️ Parameters + +
+
+ +
+
+ +**id:** `str` — Unique identifier for Evaluation.
@@ -8532,7 +8701,7 @@ client.evaluations.update_setup(
-**evaluatees:** `typing.Optional[typing.Sequence[EvaluateeRequestParams]]` — Unique identifiers for the Prompt/Tool Versions to include in the Evaluation. Can be left unpopulated if you wish to add evaluatees to this Evaluation by specifying `evaluation_id` in Log calls. +**dataset:** `typing.Optional[EvaluationsDatasetRequestParams]` — Dataset to use in this Run.
@@ -8540,7 +8709,7 @@ client.evaluations.update_setup(
-**evaluators:** `typing.Optional[typing.Sequence[EvaluationsRequestParams]]` — The Evaluators used to evaluate. +**version:** `typing.Optional[VersionSpecificationParams]` — Version to use in this Run.
@@ -8548,7 +8717,7 @@ client.evaluations.update_setup(
-**name:** `typing.Optional[str]` — Name of the Evaluation to help identify it. Must be unique within the associated File. +**orchestrated:** `typing.Optional[bool]` — Whether the Run is orchestrated by Humanloop. If `True`, Humanloop will generate Logs for the Run; `dataset` and `version` must be provided. If `False`, a log for the Prompt/Tool should be submitted by the user via the API.
@@ -8556,7 +8725,7 @@ client.evaluations.update_setup(
-**file:** `typing.Optional[FileRequestParams]` — The File to associate with the Evaluation. +**logs:** `typing.Optional[LogsAssociationType]` — How the Logs are associated with the Run. If `dynamic`, the latest relevant Logs will be inferred from the Dataset and Version. If `fixed`, the Logs will be explicitly associated. You can provide a list of Log IDs to associate with the Run, or add them to the Run later. Defaults to `dynamic` if `dataset` and `version` are provided; otherwise, defaults to `fixed`.
@@ -8576,7 +8745,7 @@ client.evaluations.update_setup(
-
client.evaluations.update_status(...) +
client.evaluations.add_existing_run(...)
@@ -8588,10 +8757,7 @@ client.evaluations.update_setup(
-Update the status of an Evaluation. - -Can be used to cancel a running Evaluation, or mark an Evaluation that uses -external or human evaluators as completed. +Add an existing Run to an Evaluation.
@@ -8611,9 +8777,9 @@ from humanloop import Humanloop client = Humanloop( api_key="YOUR_API_KEY", ) -client.evaluations.update_status( +client.evaluations.add_existing_run( id="id", - status="pending", + run_id="run_id", ) ``` @@ -8638,7 +8804,7 @@ client.evaluations.update_status(
-**status:** `EvaluationStatus` +**run_id:** `str` — Unique identifier for Run.
@@ -8658,7 +8824,7 @@ client.evaluations.update_status(
-
client.evaluations.get_stats(...) +
client.evaluations.remove_run_from_evaluation(...)
@@ -8670,11 +8836,10 @@ client.evaluations.update_status(
-Get Evaluation Stats. +Remove a Run from an Evaluation. -Retrieve aggregate stats for the specified Evaluation. -This includes the number of generated Logs for each evaluated version and the -corresponding Evaluator statistics (such as the mean and percentiles). +Remove a Run from an Evaluation. The Logs and Versions used in the Run will not be deleted. +If this Run is used in any other Evaluations, it will still be available in those Evaluations.
@@ -8694,8 +8859,9 @@ from humanloop import Humanloop client = Humanloop( api_key="YOUR_API_KEY", ) -client.evaluations.get_stats( +client.evaluations.remove_run_from_evaluation( id="id", + run_id="run_id", ) ``` @@ -8720,6 +8886,14 @@ client.evaluations.get_stats(
+**run_id:** `str` — Unique identifier for Run. + +
+
+ +
+
+ **request_options:** `typing.Optional[RequestOptions]` — Request-specific configuration.
@@ -8732,7 +8906,7 @@ client.evaluations.get_stats(
-
client.evaluations.get_logs(...) +
client.evaluations.update_evaluation_run(...)
@@ -8744,10 +8918,9 @@ client.evaluations.get_stats(
-Get the Logs associated to a specific Evaluation. +Update an Evaluation Run. -Each Datapoint in your Dataset will have a corresponding Log for each File version evaluated. -e.g. If you have 50 Datapoints and are evaluating 2 Prompts, there will be 100 Logs associated with the Evaluation. +Update the Dataset and version to be evaluated for an existing Run.
@@ -8767,8 +8940,10 @@ from humanloop import Humanloop client = Humanloop( api_key="YOUR_API_KEY", ) -client.evaluations.get_logs( +client.evaluations.update_evaluation_run( id="id", + run_id="run_id", + control=True, ) ``` @@ -8785,7 +8960,7 @@ client.evaluations.get_logs(
-**id:** `str` — String ID of evaluation. Starts with `ev_` or `evr_`. +**id:** `str` — Unique identifier for Evaluation.
@@ -8793,7 +8968,7 @@ client.evaluations.get_logs(
-**page:** `typing.Optional[int]` — Page number for pagination. +**run_id:** `str` — Unique identifier for Run.
@@ -8801,7 +8976,7 @@ client.evaluations.get_logs(
-**size:** `typing.Optional[int]` — Page size for pagination. Number of Logs to fetch. +**control:** `bool` — If `True`, this Run will be used as the control in the Evaluation. Stats for other Runs will be compared to this Run. This will replace any existing control Run.
@@ -8821,7 +8996,7 @@ client.evaluations.get_logs(
-
client.evaluations.pin_evaluatee(...) +
client.evaluations.add_logs_to_run(...)
@@ -8833,10 +9008,11 @@ client.evaluations.get_logs(
-Pin the specified Evaluatee. +Add Logs to an Evaluation Run. -Pinned Evaluatees are always displayed in the Evaluation Overview, -and serve as the baseline for comparison with other Evaluatees. +This is supported only for Runs that have a fixed set of Logs. +(Runs can either have a fixed set of Logs, or can be set to dynamically retrieve the latest Logs +if a Dataset and Version are provided.)
@@ -8856,8 +9032,10 @@ from humanloop import Humanloop client = Humanloop( api_key="YOUR_API_KEY", ) -client.evaluations.pin_evaluatee( +client.evaluations.add_logs_to_run( id="id", + run_id="run_id", + log_ids=["log_ids"], ) ``` @@ -8882,7 +9060,7 @@ client.evaluations.pin_evaluatee(
-**version_id:** `typing.Optional[str]` — Unique identifier for the File Version. If provided, none of the other fields should be specified. +**run_id:** `str` — Unique identifier for Run.
@@ -8890,7 +9068,7 @@ client.evaluations.pin_evaluatee(
-**path:** `typing.Optional[str]` — Path identifying a File. Provide either this or `file_id` if you want to specify a File. +**log_ids:** `typing.Sequence[str]` — The IDs of the Logs to add to the Run.
@@ -8898,15 +9076,74 @@ client.evaluations.pin_evaluatee(
-**file_id:** `typing.Optional[str]` — Unique identifier for the File. Provide either this or `path` if you want to specify a File. +**request_options:** `typing.Optional[RequestOptions]` — Request-specific configuration.
+ +
+ + + +
+ + +
client.evaluations.get_stats(...) +
+
+ +#### 📝 Description + +
+
-**environment:** `typing.Optional[str]` — Name of the Environment a Version is deployed to. Only provide this when specifying a File. If not provided (and a File is specified), the default Environment is used. +Get Evaluation Stats. + +Retrieve aggregate stats for the specified Evaluation. + +This includes the number of generated Logs for each Run and the +corresponding Evaluator statistics (such as the mean and percentiles). +
+
+
+
+ +#### 🔌 Usage + +
+
+ +
+
+ +```python +from humanloop import Humanloop + +client = Humanloop( + api_key="YOUR_API_KEY", +) +client.evaluations.get_stats( + id="id", +) + +``` +
+
+
+
+ +#### ⚙️ Parameters + +
+
+ +
+
+ +**id:** `str` — Unique identifier for Evaluation.
@@ -8914,15 +9151,85 @@ client.evaluations.pin_evaluatee(
-**batch_id:** `typing.Optional[str]` — Unique identifier for the batch of Logs to include in the Evaluation Report. +**request_options:** `typing.Optional[RequestOptions]` — Request-specific configuration.
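A sketch of consuming the per-Run stats; the `run_stats`, `evaluator_stats`, and `report` fields are those used by the `eval_utils.py` changes later in this diff, and the Evaluation ID is a placeholder:

```python
from humanloop import Humanloop

client = Humanloop(api_key="YOUR_API_KEY")

stats = client.evaluations.get_stats(
    id="ev_567yza",  # placeholder Evaluation ID
)

# Aggregate stats are now keyed by Run rather than by evaluated version.
for run_stat in stats.run_stats:
    evaluator_ids = [e.evaluator_version_id for e in run_stat.evaluator_stats]
    print(run_stat.run_id, evaluator_ids)

# Human-readable summary, as logged by the eval utilities.
print(stats.report)
```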
+
+
+ + +
+
+
+
client.evaluations.get_logs(...)
-**orchestrated:** `typing.Optional[bool]` — Whether the Prompt/Tool is orchestrated by Humanloop. Default is `True`. If `False`, a log for the Prompt/Tool should be submitted by the user via the API. +#### 📝 Description + +
+
+ +
+
+Get the Logs associated with a specific Evaluation.
+
+
+
+
+ +#### 🔌 Usage + +
+
+ +
+
+ +```python +from humanloop import Humanloop + +client = Humanloop( + api_key="YOUR_API_KEY", +) +client.evaluations.get_logs( + id="id", +) + +``` +
+
+
+
+ +#### ⚙️ Parameters + +
+
+ +
+
+ +**id:** `str` — String ID of evaluation. Starts with `ev_` or `evr_`. + +
+
+ +
+
+ +**page:** `typing.Optional[int]` — Page number for pagination. + +
+
+ +
+
+ +**size:** `typing.Optional[int]` — Page size for pagination. Number of Logs to fetch.
diff --git a/src/humanloop/__init__.py b/src/humanloop/__init__.py index f782b29d..ac2e2567 100644 --- a/src/humanloop/__init__.py +++ b/src/humanloop/__init__.py @@ -27,12 +27,13 @@ DirectoryWithParentsAndChildrenResponseFilesItem, EnvironmentResponse, EnvironmentTag, - EvaluatedVersionResponse, EvaluateeRequest, EvaluateeResponse, EvaluationEvaluatorResponse, - EvaluationReportLogResponse, + EvaluationLogResponse, EvaluationResponse, + EvaluationRunResponse, + EvaluationRunsResponse, EvaluationStats, EvaluationStatus, EvaluationsDatasetRequest, @@ -77,6 +78,7 @@ ListTools, LlmEvaluatorRequest, LogResponse, + LogsAssociationType, ModelEndpoints, ModelProviders, MonitoringEvaluatorEnvironmentRequest, @@ -86,7 +88,7 @@ NumericEvaluatorStatsResponse, ObservabilityStatus, OverallStats, - PaginatedDataEvaluationReportLogResponse, + PaginatedDataEvaluationLogResponse, PaginatedDataEvaluatorResponse, PaginatedDataFlowResponse, PaginatedDataLogResponse, @@ -116,6 +118,9 @@ ProviderApiKeys, ResponseFormat, ResponseFormatType, + RunStatsResponse, + RunStatsResponseEvaluatorStatsItem, + RunVersionResponse, SelectEvaluatorStatsResponse, SortOrder, TextChatContent, @@ -139,6 +144,7 @@ VersionIdResponse, VersionIdResponseVersion, VersionReferenceResponse, + VersionSpecification, VersionStatsResponse, VersionStatsResponseEvaluatorVersionStatsItem, VersionStatus, @@ -191,12 +197,13 @@ DirectoryWithParentsAndChildrenResponseFilesItemParams, DirectoryWithParentsAndChildrenResponseParams, EnvironmentResponseParams, - EvaluatedVersionResponseParams, EvaluateeRequestParams, EvaluateeResponseParams, EvaluationEvaluatorResponseParams, - EvaluationReportLogResponseParams, + EvaluationLogResponseParams, EvaluationResponseParams, + EvaluationRunResponseParams, + EvaluationRunsResponseParams, EvaluationStatsParams, EvaluationsDatasetRequestParams, EvaluationsRequestParams, @@ -238,7 +245,7 @@ MonitoringEvaluatorVersionRequestParams, NumericEvaluatorStatsResponseParams, OverallStatsParams, - PaginatedDataEvaluationReportLogResponseParams, + PaginatedDataEvaluationLogResponseParams, PaginatedDataEvaluatorResponseParams, PaginatedDataFlowResponseParams, PaginatedDataLogResponseParams, @@ -263,6 +270,9 @@ PromptResponseTemplateParams, ProviderApiKeysParams, ResponseFormatParams, + RunStatsResponseEvaluatorStatsItemParams, + RunStatsResponseParams, + RunVersionResponseParams, SelectEvaluatorStatsResponseParams, TextChatContentParams, TextEvaluatorStatsResponseParams, @@ -279,6 +289,7 @@ VersionIdResponseParams, VersionIdResponseVersionParams, VersionReferenceResponseParams, + VersionSpecificationParams, VersionStatsResponseEvaluatorVersionStatsItemParams, VersionStatsResponseParams, ) @@ -337,18 +348,20 @@ "EnvironmentResponse", "EnvironmentResponseParams", "EnvironmentTag", - "EvaluatedVersionResponse", - "EvaluatedVersionResponseParams", "EvaluateeRequest", "EvaluateeRequestParams", "EvaluateeResponse", "EvaluateeResponseParams", "EvaluationEvaluatorResponse", "EvaluationEvaluatorResponseParams", - "EvaluationReportLogResponse", - "EvaluationReportLogResponseParams", + "EvaluationLogResponse", + "EvaluationLogResponseParams", "EvaluationResponse", "EvaluationResponseParams", + "EvaluationRunResponse", + "EvaluationRunResponseParams", + "EvaluationRunsResponse", + "EvaluationRunsResponseParams", "EvaluationStats", "EvaluationStatsParams", "EvaluationStatus", @@ -431,6 +444,7 @@ "LlmEvaluatorRequestParams", "LogResponse", "LogResponseParams", + "LogsAssociationType", "ModelEndpoints", "ModelProviders", 
"MonitoringEvaluatorEnvironmentRequest", @@ -445,8 +459,8 @@ "ObservabilityStatus", "OverallStats", "OverallStatsParams", - "PaginatedDataEvaluationReportLogResponse", - "PaginatedDataEvaluationReportLogResponseParams", + "PaginatedDataEvaluationLogResponse", + "PaginatedDataEvaluationLogResponseParams", "PaginatedDataEvaluatorResponse", "PaginatedDataEvaluatorResponseParams", "PaginatedDataFlowResponse", @@ -512,6 +526,12 @@ "ResponseFormat", "ResponseFormatParams", "ResponseFormatType", + "RunStatsResponse", + "RunStatsResponseEvaluatorStatsItem", + "RunStatsResponseEvaluatorStatsItemParams", + "RunStatsResponseParams", + "RunVersionResponse", + "RunVersionResponseParams", "SelectEvaluatorStatsResponse", "SelectEvaluatorStatsResponseParams", "SortOrder", @@ -554,6 +574,8 @@ "VersionIdResponseVersionParams", "VersionReferenceResponse", "VersionReferenceResponseParams", + "VersionSpecification", + "VersionSpecificationParams", "VersionStatsResponse", "VersionStatsResponseEvaluatorVersionStatsItem", "VersionStatsResponseEvaluatorVersionStatsItemParams", diff --git a/src/humanloop/core/client_wrapper.py b/src/humanloop/core/client_wrapper.py index 04653533..4282222b 100644 --- a/src/humanloop/core/client_wrapper.py +++ b/src/humanloop/core/client_wrapper.py @@ -16,7 +16,7 @@ def get_headers(self) -> typing.Dict[str, str]: headers: typing.Dict[str, str] = { "X-Fern-Language": "Python", "X-Fern-SDK-Name": "humanloop", - "X-Fern-SDK-Version": "0.8.8", + "X-Fern-SDK-Version": "0.8.9", } headers["X-API-KEY"] = self.api_key return headers diff --git a/src/humanloop/datasets/client.py b/src/humanloop/datasets/client.py index a4e5dd99..ddfede4d 100644 --- a/src/humanloop/datasets/client.py +++ b/src/humanloop/datasets/client.py @@ -602,6 +602,7 @@ def list_versions( id: str, *, status: typing.Optional[VersionStatus] = None, + include_datapoints: typing.Optional[typing.Literal["latest_committed"]] = None, request_options: typing.Optional[RequestOptions] = None, ) -> ListDatasets: """ @@ -615,6 +616,9 @@ def list_versions( status : typing.Optional[VersionStatus] Filter versions by status: 'uncommitted', 'committed'. If no status is provided, all versions are returned. + include_datapoints : typing.Optional[typing.Literal["latest_committed"]] + If set to 'latest_committed', include the Datapoints for the latest committed version. Defaults to `None`. + request_options : typing.Optional[RequestOptions] Request-specific configuration. @@ -640,6 +644,7 @@ def list_versions( method="GET", params={ "status": status, + "include_datapoints": include_datapoints, }, request_options=request_options, ) @@ -1647,6 +1652,7 @@ async def list_versions( id: str, *, status: typing.Optional[VersionStatus] = None, + include_datapoints: typing.Optional[typing.Literal["latest_committed"]] = None, request_options: typing.Optional[RequestOptions] = None, ) -> ListDatasets: """ @@ -1660,6 +1666,9 @@ async def list_versions( status : typing.Optional[VersionStatus] Filter versions by status: 'uncommitted', 'committed'. If no status is provided, all versions are returned. + include_datapoints : typing.Optional[typing.Literal["latest_committed"]] + If set to 'latest_committed', include the Datapoints for the latest committed version. Defaults to `None`. + request_options : typing.Optional[RequestOptions] Request-specific configuration. 
@@ -1693,6 +1702,7 @@ async def main() -> None: method="GET", params={ "status": status, + "include_datapoints": include_datapoints, }, request_options=request_options, ) diff --git a/src/humanloop/eval_utils.py b/src/humanloop/eval_utils.py index e5112d19..c6bc3b98 100644 --- a/src/humanloop/eval_utils.py +++ b/src/humanloop/eval_utils.py @@ -18,7 +18,6 @@ from typing_extensions import NotRequired, TypedDict import time import sys -import uuid from concurrent.futures import ThreadPoolExecutor, as_completed from .client import BaseHumanloop @@ -41,11 +40,13 @@ from .types import ToolKernelRequest as Tool from .types import BooleanEvaluatorStatsResponse as BooleanStats from .types import NumericEvaluatorStatsResponse as NumericStats -from .types import UpdateDatesetAction as UpdateDatasetAction # TODO: fix original type typo +from .types import ( + UpdateDatesetAction as UpdateDatasetAction, +) # TODO: fix original type typo from .types import DatapointResponse as Datapoint from .types import ( EvaluationStats, - VersionStatsResponse, + RunStatsResponse, EvaluatorArgumentsType, EvaluatorReturnTypeEnum, EvaluationResponse, @@ -61,7 +62,9 @@ if not logger.hasHandlers(): logger.addHandler(console_handler) -EvaluatorDict = Union[CodeEvaluatorDict, LLMEvaluatorDict, HumanEvaluatorDict, ExternalEvaluator] +EvaluatorDict = Union[ + CodeEvaluatorDict, LLMEvaluatorDict, HumanEvaluatorDict, ExternalEvaluator +] Version = Union[FlowDict, PromptDict, ToolDict, EvaluatorDict] FileType = Literal["flow", "prompt", "tool", "evaluator"] @@ -202,9 +205,13 @@ def _run_eval( function_ = file.pop("callable") except KeyError as _: if type_ == "flow": - raise ValueError("You must provide a `callable` for your Flow `file` to run a local eval.") + raise ValueError( + "You must provide a `callable` for your Flow `file` to run a local eval." + ) else: - logger.info(f"No `callable` provided for your {type_} file - will attempt to generate logs on Humanloop.") + logger.info( + f"No `callable` provided for your {type_} file - will attempt to generate logs on Humanloop." + ) custom_logger = file.pop("custom_logger", None) file_dict = {**file, **version} @@ -222,7 +229,9 @@ def _run_eval( try: _ = Prompt.parse_obj(version) except ValidationError as error_: - logger.error(msg=f"Invalid Prompt `version` in your `file` request. \n\nValidation error: \n)") + logger.error( + msg=f"Invalid Prompt `version` in your `file` request. \n\nValidation error: \n)" + ) raise error_ hl_file = client.prompts.upsert(**file_dict) @@ -230,7 +239,9 @@ def _run_eval( try: _ = Tool.parse_obj(version) except ValidationError as error_: - logger.error(msg=f"Invalid Tool `version` in your `file` request. \n\nValidation error: \n)") + logger.error( + msg=f"Invalid Tool `version` in your `file` request. 
\n\nValidation error: \n)" + ) raise error_ hl_file = client.tools.upsert(**file_dict) @@ -263,7 +274,9 @@ def _run_eval( attributes={"code": inspect.getsource(eval_function)}, evaluator_type="external", ) - _ = client.evaluators.upsert(id=evaluator.get("id"), path=evaluator.get("path"), spec=spec) + _ = client.evaluators.upsert( + id=evaluator.get("id"), path=evaluator.get("path"), spec=spec + ) # Validate upfront that the local Evaluators and Dataset fit requires_target = False @@ -286,7 +299,6 @@ def _run_eval( try: evaluation = client.evaluations.create( name=name, - dataset={"file_id": hl_dataset.id}, evaluators=[{"path": e["path"]} for e in evaluators], file={"id": hl_file.id}, ) @@ -301,15 +313,22 @@ def _run_eval( if not evaluation: raise ValueError(f"Evaluation with name {name} not found.") - # Every run will generate a new batch of logs - batch_id = uuid.uuid4().hex[:10] # ignore risk of collision + # Create a new Run + run = client.evaluations.create_run( + id=evaluation.id, + dataset={"file_id": hl_dataset.id}, + logs="fixed", + orchestrated=False, + ) + + # Every Run will generate a new batch of Logs + run_id = run.id log_func = _get_log_func( client=client, type_=type_, file_id=hl_file.id, version_id=hl_file.version_id, - evaluation_id=evaluation.id, - batch_id=batch_id, + run_id=run_id, ) # Define the function to execute your function in parallel and Log to Humanloop @@ -318,7 +337,9 @@ def process_datapoint(datapoint: Datapoint): datapoint_dict = datapoint.dict() try: if "messages" in datapoint_dict: - output = function_(**datapoint_dict["inputs"], messages=datapoint_dict["messages"]) + output = function_( + **datapoint_dict["inputs"], messages=datapoint_dict["messages"] + ) else: output = function_(**datapoint_dict["inputs"]) if custom_logger: @@ -343,7 +364,9 @@ def process_datapoint(datapoint: Datapoint): start_time=start_time, end_time=datetime.now(), ) - logger.warning(msg=f"\nYour {type_}'s `callable` failed for Datapoint: {datapoint.id}. \n Error: {str(e)}") + logger.warning( + msg=f"\nYour {type_}'s `callable` failed for Datapoint: {datapoint.id}. 
\n Error: {str(e)}" + ) # Apply local Evaluators for local_evaluator in local_evaluators: @@ -376,28 +399,35 @@ def process_datapoint(datapoint: Datapoint): start_time=start_time, end_time=datetime.now(), ) - logger.warning(f"\nEvaluator {local_evaluator['path']} failed with error {str(e)}") + logger.warning( + f"\nEvaluator {local_evaluator['path']} failed with error {str(e)}" + ) # Execute the function and send the logs to Humanloop in parallel total_datapoints = len(hl_dataset.datapoints) logger.info(f"\n{CYAN}Navigate to your Evaluation:{RESET}\n{evaluation.url}\n") logger.info(f"{CYAN}{type_.capitalize()} Version ID: {hl_file.version_id}{RESET}") - logger.info(f"{CYAN}Run ID: {batch_id}{RESET}") + logger.info(f"{CYAN}Run ID: {run_id}{RESET}") # Generate locally if a file `callable` is provided if function_: logger.info( - f"{CYAN}\nRunning {hl_file.name} over the Dataset {hl_dataset.name} using {workers} workers{RESET} " + f"{CYAN}\nRunning '{hl_file.name}' over the Dataset '{hl_dataset.name}' using {workers} workers{RESET} " ) completed_tasks = 0 with ThreadPoolExecutor(max_workers=workers) as executor: - futures = [executor.submit(process_datapoint, datapoint) for datapoint in hl_dataset.datapoints] + futures = [ + executor.submit(process_datapoint, datapoint) + for datapoint in hl_dataset.datapoints + ] for _ in as_completed(futures): completed_tasks += 1 _progress_bar(total_datapoints, completed_tasks) else: # TODO: trigger run when updated API is available - logger.info(f"{CYAN}\nRunning {hl_file.name} over the Dataset {hl_dataset.name}{RESET}") + logger.info( + f"{CYAN}\nRunning '{hl_file.name}' over the Dataset '{hl_dataset.name}'{RESET}" + ) # Wait for the Evaluation to complete then print the results complete = False @@ -413,39 +443,43 @@ def process_datapoint(datapoint: Datapoint): logger.info(stats.report) checks: List[EvaluatorCheck] = [] - if all(evaluator.get("threshold") is None for evaluator in evaluators) and len(stats.version_stats) == 1: - # Skip `check_evaluation_improvement` if no thresholds were provided and there is only one run. - # (Or the logs would not be helpful) - return checks - for evaluator in evaluators: - _, score, delta = check_evaluation_improvement( - evaluation=evaluation, - stats=stats, - evaluator_path=evaluator["path"], - batch_id=batch_id, - ) - threshold_check = None - threshold = evaluator.get("threshold") - if threshold is not None: - threshold_check = check_evaluation_threshold( + + # Skip `check_evaluation_improvement` if no thresholds were provided and there is only one run. 
+ # (Or the logs would not be helpful) + if ( + any(evaluator.get("threshold") is not None for evaluator in evaluators) + or len(stats.run_stats) > 1 + ): + for evaluator in evaluators: + _, score, delta = check_evaluation_improvement( evaluation=evaluation, stats=stats, evaluator_path=evaluator["path"], - threshold=threshold, - batch_id=batch_id, + run_id=run_id, ) - checks.append( - EvaluatorCheck( - path=evaluator["path"], - # TODO: Add back in with number valence on Evaluators - # improvement_check=improvement_check, - score=score, - delta=delta, - threshold=threshold, - threshold_check=threshold_check, - evaluation_id=evaluation.id, + threshold_check = None + threshold = evaluator.get("threshold") + if threshold is not None: + threshold_check = check_evaluation_threshold( + evaluation=evaluation, + stats=stats, + evaluator_path=evaluator["path"], + threshold=threshold, + run_id=run_id, + ) + checks.append( + EvaluatorCheck( + path=evaluator["path"], + # TODO: Add back in with number valence on Evaluators + # improvement_check=improvement_check, + score=score, + delta=delta, + threshold=threshold, + threshold_check=threshold_check, + evaluation_id=evaluation.id, + ) ) - ) + logger.info(f"\n{CYAN}View your Evaluation:{RESET}\n{evaluation.url}\n") return checks @@ -455,8 +489,7 @@ def _get_log_func( type_: FileType, file_id: str, version_id: str, - evaluation_id: str, - batch_id: str, + run_id: str, ) -> Callable: """Returns the appropriate log function pre-filled with common parameters.""" log_request = { @@ -464,8 +497,7 @@ def _get_log_func( # Why are both `id` and `version_id` needed in the API? "id": file_id, "version_id": version_id, - "evaluation_id": evaluation_id, - "batch_id": batch_id, + "run_id": run_id, } if type_ == "flow": return partial(client.flows.log, **log_request, trace_status="complete") @@ -479,7 +511,9 @@ def _get_log_func( raise NotImplementedError(f"Unsupported File version: {type_}") -def get_score_from_evaluator_stat(stat: Union[NumericStats, BooleanStats]) -> Union[float, None]: +def get_score_from_evaluator_stat( + stat: Union[NumericStats, BooleanStats], +) -> Union[float, None]: """Get the score from an Evaluator Stat.""" score = None if isinstance(stat, BooleanStats): @@ -526,14 +560,18 @@ def _progress_bar(total: int, progress: int): def get_evaluator_stats_by_path( - stat: VersionStatsResponse, evaluation: EvaluationResponse + stat: RunStatsResponse, evaluation: EvaluationResponse ) -> Dict[str, Union[NumericStats, BooleanStats]]: """Get the Evaluator stats by path.""" # TODO: Update the API so this is not necessary - evaluators_by_id = {evaluator.version.version_id: evaluator for evaluator in evaluation.evaluators} + evaluators_by_id = { + evaluator.version.version_id: evaluator for evaluator in evaluation.evaluators + } evaluator_stats_by_path = { - evaluators_by_id[evaluator_stat.evaluator_version_id].version.path: evaluator_stat - for evaluator_stat in stat.evaluator_version_stats + evaluators_by_id[ + evaluator_stat.evaluator_version_id + ].version.path: evaluator_stat + for evaluator_stat in stat.evaluator_stats } return evaluator_stats_by_path @@ -543,12 +581,13 @@ def check_evaluation_threshold( stats: EvaluationStats, evaluator_path: str, threshold: float, - batch_id: str, + run_id: str, ) -> bool: """Checks if the latest version has an average Evaluator result above a threshold.""" # TODO: Update the API so this is not necessary evaluator_stats_by_path = get_evaluator_stats_by_path( - stat=next((stat for stat in stats.version_stats if 
stat.batch_id == batch_id), None), evaluation=evaluation + stat=next((stat for stat in stats.run_stats if stat.run_id == run_id), None), + evaluation=evaluation, ) if evaluator_path in evaluator_stats_by_path: evaluator_stat = evaluator_stats_by_path[evaluator_path] @@ -571,7 +610,7 @@ def check_evaluation_improvement( evaluation: EvaluationResponse, evaluator_path: str, stats: EvaluationStats, - batch_id: str, + run_id: str, ) -> Tuple[bool, float, float]: """ Check the latest version has improved across for a specific Evaluator. @@ -581,24 +620,34 @@ def check_evaluation_improvement( # TODO: Update the API so this is not necessary latest_evaluator_stats_by_path = get_evaluator_stats_by_path( - stat=next((stat for stat in stats.version_stats if stat.batch_id == batch_id), None), evaluation=evaluation + stat=next((stat for stat in stats.run_stats if stat.run_id == run_id), None), + evaluation=evaluation, ) - if len(stats.version_stats) == 1: + if len(stats.run_stats) == 1: logger.info(f"{YELLOW}⚠️ No previous versions to compare with.{RESET}") return True, 0, 0 - previous_evaluator_stats_by_path = get_evaluator_stats_by_path(stat=stats.version_stats[-2], evaluation=evaluation) - if evaluator_path in latest_evaluator_stats_by_path and evaluator_path in previous_evaluator_stats_by_path: + previous_evaluator_stats_by_path = get_evaluator_stats_by_path( + stat=stats.run_stats[-2], evaluation=evaluation + ) + if ( + evaluator_path in latest_evaluator_stats_by_path + and evaluator_path in previous_evaluator_stats_by_path + ): latest_evaluator_stat = latest_evaluator_stats_by_path[evaluator_path] previous_evaluator_stat = previous_evaluator_stats_by_path[evaluator_path] latest_score = get_score_from_evaluator_stat(stat=latest_evaluator_stat) previous_score = get_score_from_evaluator_stat(stat=previous_evaluator_stat) diff = round(latest_score - previous_score, 2) if diff >= 0: - logger.info(f"{CYAN}Change of [{diff}] for Evaluator {evaluator_path}{RESET}") + logger.info( + f"{CYAN}Change of [{diff}] for Evaluator {evaluator_path}{RESET}" + ) return True, latest_score, diff else: - logger.info(f"{CYAN}Change of [{diff}] for Evaluator {evaluator_path}{RESET}") + logger.info( + f"{CYAN}Change of [{diff}] for Evaluator {evaluator_path}{RESET}" + ) return False, latest_score, diff else: raise ValueError(f"Evaluator {evaluator_path} not found in the stats.") diff --git a/src/humanloop/evaluations/client.py b/src/humanloop/evaluations/client.py index 92cf4033..c2190762 100644 --- a/src/humanloop/evaluations/client.py +++ b/src/humanloop/evaluations/client.py @@ -11,15 +11,17 @@ from ..types.http_validation_error import HttpValidationError from json.decoder import JSONDecodeError from ..core.api_error import ApiError -from ..requests.evaluations_dataset_request import EvaluationsDatasetRequestParams from ..requests.evaluations_request import EvaluationsRequestParams -from ..requests.evaluatee_request import EvaluateeRequestParams from ..requests.file_request import FileRequestParams from ..core.serialization import convert_and_respect_annotation_metadata from ..core.jsonable_encoder import jsonable_encoder -from ..types.evaluation_status import EvaluationStatus +from ..types.evaluation_runs_response import EvaluationRunsResponse +from ..requests.evaluations_dataset_request import EvaluationsDatasetRequestParams +from ..requests.version_specification import VersionSpecificationParams +from ..types.logs_association_type import LogsAssociationType +from ..types.evaluation_run_response import 
EvaluationRunResponse from ..types.evaluation_stats import EvaluationStats -from ..types.paginated_data_evaluation_report_log_response import PaginatedDataEvaluationReportLogResponse +from ..types.paginated_data_evaluation_log_response import PaginatedDataEvaluationLogResponse from ..core.client_wrapper import AsyncClientWrapper from ..core.pagination import AsyncPager @@ -127,44 +129,30 @@ def list( def create( self, *, - dataset: EvaluationsDatasetRequestParams, evaluators: typing.Sequence[EvaluationsRequestParams], - evaluatees: typing.Optional[typing.Sequence[EvaluateeRequestParams]] = OMIT, - name: typing.Optional[str] = OMIT, file: typing.Optional[FileRequestParams] = OMIT, + name: typing.Optional[str] = OMIT, request_options: typing.Optional[RequestOptions] = None, ) -> EvaluationResponse: """ Create an Evaluation. - Create a new Evaluation by specifying the Dataset, versions to be - evaluated (Evaluatees), and which Evaluators to provide judgments. + Create an Evaluation by specifying the File to evaluate, and a name + for the Evaluation. - Humanloop will automatically start generating Logs and running Evaluators where - `orchestrated=true`. If you own the runtime for the Evaluatee or Evaluator, you - can set `orchestrated=false` and then generate and submit the required logs using - your runtime. - - To keep updated on the progress of the Evaluation, you can poll the Evaluation using - the `GET /evaluations/:id` endpoint and check its status. + You can then add Runs to this Evaluation using the `POST /evaluations/{id}/runs` endpoint. Parameters ---------- - dataset : EvaluationsDatasetRequestParams - Dataset to use in this Evaluation. - evaluators : typing.Sequence[EvaluationsRequestParams] The Evaluators used to evaluate. - evaluatees : typing.Optional[typing.Sequence[EvaluateeRequestParams]] - Unique identifiers for the Prompt/Tool Versions to include in the Evaluation. Can be left unpopulated if you wish to add Evaluatees to this Evaluation by specifying `evaluation_id` in Log calls. + file : typing.Optional[FileRequestParams] + The File to associate with the Evaluation. This File contains the Logs you're evaluating. name : typing.Optional[str] Name of the Evaluation to help identify it. Must be unique within the associated File. - file : typing.Optional[FileRequestParams] - The File to associate with the Evaluation. - request_options : typing.Optional[RequestOptions] Request-specific configuration. 
@@ -181,30 +169,20 @@ def create( api_key="YOUR_API_KEY", ) client.evaluations.create( - dataset={"version_id": "dsv_6L78pqrdFi2xa"}, - evaluatees=[ - {"version_id": "prv_7ZlQREDScH0xkhUwtXruN", "orchestrated": False} - ], - evaluators=[{"version_id": "evv_012def", "orchestrated": False}], + evaluators=[{}], ) """ _response = self._client_wrapper.httpx_client.request( "evaluations", method="POST", json={ - "dataset": convert_and_respect_annotation_metadata( - object_=dataset, annotation=EvaluationsDatasetRequestParams, direction="write" - ), - "evaluatees": convert_and_respect_annotation_metadata( - object_=evaluatees, annotation=typing.Sequence[EvaluateeRequestParams], direction="write" + "file": convert_and_respect_annotation_metadata( + object_=file, annotation=FileRequestParams, direction="write" ), + "name": name, "evaluators": convert_and_respect_annotation_metadata( object_=evaluators, annotation=typing.Sequence[EvaluationsRequestParams], direction="write" ), - "name": name, - "file": convert_and_respect_annotation_metadata( - object_=file, annotation=FileRequestParams, direction="write" - ), }, request_options=request_options, omit=OMIT, @@ -233,15 +211,27 @@ def create( raise ApiError(status_code=_response.status_code, body=_response.text) raise ApiError(status_code=_response.status_code, body=_response_json) - def get(self, id: str, *, request_options: typing.Optional[RequestOptions] = None) -> EvaluationResponse: + def add_evaluators( + self, + id: str, + *, + evaluators: typing.Sequence[EvaluationsRequestParams], + request_options: typing.Optional[RequestOptions] = None, + ) -> EvaluationResponse: """ - Get an Evaluation. + Add Evaluators to an Evaluation. + + Add new Evaluators to an Evaluation. The Evaluators will be run on the Logs + generated for the Evaluation. Parameters ---------- id : str Unique identifier for Evaluation. + evaluators : typing.Sequence[EvaluationsRequestParams] + The Evaluators to add to this Evaluation. + request_options : typing.Optional[RequestOptions] Request-specific configuration. @@ -257,14 +247,21 @@ def get(self, id: str, *, request_options: typing.Optional[RequestOptions] = Non client = Humanloop( api_key="YOUR_API_KEY", ) - client.evaluations.get( - id="ev_567yza", + client.evaluations.add_evaluators( + id="id", + evaluators=[{}], ) """ _response = self._client_wrapper.httpx_client.request( - f"evaluations/{jsonable_encoder(id)}", - method="GET", + f"evaluations/{jsonable_encoder(id)}/evaluators", + method="POST", + json={ + "evaluators": convert_and_respect_annotation_metadata( + object_=evaluators, annotation=typing.Sequence[EvaluationsRequestParams], direction="write" + ), + }, request_options=request_options, + omit=OMIT, ) try: if 200 <= _response.status_code < 300: @@ -290,24 +287,30 @@ def get(self, id: str, *, request_options: typing.Optional[RequestOptions] = Non raise ApiError(status_code=_response.status_code, body=_response.text) raise ApiError(status_code=_response.status_code, body=_response_json) - def delete(self, id: str, *, request_options: typing.Optional[RequestOptions] = None) -> None: + def remove_evaluator( + self, id: str, evaluator_version_id: str, *, request_options: typing.Optional[RequestOptions] = None + ) -> EvaluationResponse: """ - Delete an Evaluation. + Remove an Evaluator from an Evaluation. - Remove an Evaluation from Humanloop. The Logs and Versions used in the Evaluation - will not be deleted. + Remove an Evaluator from an Evaluation. 
The Evaluator will no longer be run on the Logs + generated for the Evaluation. Parameters ---------- id : str Unique identifier for Evaluation. + evaluator_version_id : str + Unique identifier for Evaluator Version. + request_options : typing.Optional[RequestOptions] Request-specific configuration. Returns ------- - None + EvaluationResponse + Successful Response Examples -------- @@ -316,18 +319,25 @@ def delete(self, id: str, *, request_options: typing.Optional[RequestOptions] = client = Humanloop( api_key="YOUR_API_KEY", ) - client.evaluations.delete( - id="ev_567yza", + client.evaluations.remove_evaluator( + id="id", + evaluator_version_id="evaluator_version_id", ) """ _response = self._client_wrapper.httpx_client.request( - f"evaluations/{jsonable_encoder(id)}", + f"evaluations/{jsonable_encoder(id)}/evaluators/{jsonable_encoder(evaluator_version_id)}", method="DELETE", request_options=request_options, ) try: if 200 <= _response.status_code < 300: - return + return typing.cast( + EvaluationResponse, + construct_type( + type_=EvaluationResponse, # type: ignore + object_=_response.json(), + ), + ) if _response.status_code == 422: raise UnprocessableEntityError( typing.cast( @@ -343,43 +353,15 @@ def delete(self, id: str, *, request_options: typing.Optional[RequestOptions] = raise ApiError(status_code=_response.status_code, body=_response.text) raise ApiError(status_code=_response.status_code, body=_response_json) - def update_setup( - self, - id: str, - *, - dataset: typing.Optional[EvaluationsDatasetRequestParams] = OMIT, - evaluatees: typing.Optional[typing.Sequence[EvaluateeRequestParams]] = OMIT, - evaluators: typing.Optional[typing.Sequence[EvaluationsRequestParams]] = OMIT, - name: typing.Optional[str] = OMIT, - file: typing.Optional[FileRequestParams] = OMIT, - request_options: typing.Optional[RequestOptions] = None, - ) -> EvaluationResponse: + def get(self, id: str, *, request_options: typing.Optional[RequestOptions] = None) -> EvaluationResponse: """ - Update an Evaluation. - - Update the setup of an Evaluation by specifying the Dataset, versions to be - evaluated (Evaluatees), and which Evaluators to provide judgments. + Get an Evaluation. Parameters ---------- id : str Unique identifier for Evaluation. - dataset : typing.Optional[EvaluationsDatasetRequestParams] - Dataset to use in this Evaluation. - - evaluatees : typing.Optional[typing.Sequence[EvaluateeRequestParams]] - Unique identifiers for the Prompt/Tool Versions to include in the Evaluation. Can be left unpopulated if you wish to add evaluatees to this Evaluation by specifying `evaluation_id` in Log calls. - - evaluators : typing.Optional[typing.Sequence[EvaluationsRequestParams]] - The Evaluators used to evaluate. - - name : typing.Optional[str] - Name of the Evaluation to help identify it. Must be unique within the associated File. - - file : typing.Optional[FileRequestParams] - The File to associate with the Evaluation. - request_options : typing.Optional[RequestOptions] Request-specific configuration. 
@@ -395,35 +377,14 @@ def update_setup( client = Humanloop( api_key="YOUR_API_KEY", ) - client.evaluations.update_setup( + client.evaluations.get( id="ev_567yza", - dataset={"version_id": "dsv_6L78pqrdFi2xa"}, - evaluatees=[ - {"version_id": "prv_7ZlQREDScH0xkhUwtXruN", "orchestrated": False} - ], - evaluators=[{"version_id": "evv_012def", "orchestrated": False}], ) """ _response = self._client_wrapper.httpx_client.request( f"evaluations/{jsonable_encoder(id)}", - method="PATCH", - json={ - "dataset": convert_and_respect_annotation_metadata( - object_=dataset, annotation=EvaluationsDatasetRequestParams, direction="write" - ), - "evaluatees": convert_and_respect_annotation_metadata( - object_=evaluatees, annotation=typing.Sequence[EvaluateeRequestParams], direction="write" - ), - "evaluators": convert_and_respect_annotation_metadata( - object_=evaluators, annotation=typing.Sequence[EvaluationsRequestParams], direction="write" - ), - "name": name, - "file": convert_and_respect_annotation_metadata( - object_=file, annotation=FileRequestParams, direction="write" - ), - }, + method="GET", request_options=request_options, - omit=OMIT, ) try: if 200 <= _response.status_code < 300: @@ -449,29 +410,24 @@ def update_setup( raise ApiError(status_code=_response.status_code, body=_response.text) raise ApiError(status_code=_response.status_code, body=_response_json) - def update_status( - self, id: str, *, status: EvaluationStatus, request_options: typing.Optional[RequestOptions] = None - ) -> EvaluationResponse: + def delete(self, id: str, *, request_options: typing.Optional[RequestOptions] = None) -> None: """ - Update the status of an Evaluation. + Delete an Evaluation. - Can be used to cancel a running Evaluation, or mark an Evaluation that uses - external or human evaluators as completed. + Remove an Evaluation from Humanloop. The Logs and Versions used in the Evaluation + will not be deleted. Parameters ---------- id : str Unique identifier for Evaluation. - status : EvaluationStatus - request_options : typing.Optional[RequestOptions] Request-specific configuration. Returns ------- - EvaluationResponse - Successful Response + None Examples -------- @@ -480,29 +436,18 @@ def update_status( client = Humanloop( api_key="YOUR_API_KEY", ) - client.evaluations.update_status( - id="id", - status="pending", + client.evaluations.delete( + id="ev_567yza", ) """ _response = self._client_wrapper.httpx_client.request( - f"evaluations/{jsonable_encoder(id)}/status", - method="PATCH", - json={ - "status": status, - }, + f"evaluations/{jsonable_encoder(id)}", + method="DELETE", request_options=request_options, - omit=OMIT, ) try: if 200 <= _response.status_code < 300: - return typing.cast( - EvaluationResponse, - construct_type( - type_=EvaluationResponse, # type: ignore - object_=_response.json(), - ), - ) + return if _response.status_code == 422: raise UnprocessableEntityError( typing.cast( @@ -518,13 +463,11 @@ def update_status( raise ApiError(status_code=_response.status_code, body=_response.text) raise ApiError(status_code=_response.status_code, body=_response_json) - def get_stats(self, id: str, *, request_options: typing.Optional[RequestOptions] = None) -> EvaluationStats: + def list_runs_for_evaluation( + self, id: str, *, request_options: typing.Optional[RequestOptions] = None + ) -> EvaluationRunsResponse: """ - Get Evaluation Stats. - - Retrieve aggregate stats for the specified Evaluation. 
- This includes the number of generated Logs for each evaluated version and the - corresponding Evaluator statistics (such as the mean and percentiles). + List all Runs for an Evaluation. Parameters ---------- @@ -536,7 +479,7 @@ def get_stats(self, id: str, *, request_options: typing.Optional[RequestOptions] Returns ------- - EvaluationStats + EvaluationRunsResponse Successful Response Examples @@ -546,21 +489,21 @@ def get_stats(self, id: str, *, request_options: typing.Optional[RequestOptions] client = Humanloop( api_key="YOUR_API_KEY", ) - client.evaluations.get_stats( + client.evaluations.list_runs_for_evaluation( id="id", ) """ _response = self._client_wrapper.httpx_client.request( - f"evaluations/{jsonable_encoder(id)}/stats", + f"evaluations/{jsonable_encoder(id)}/runs", method="GET", request_options=request_options, ) try: if 200 <= _response.status_code < 300: return typing.cast( - EvaluationStats, + EvaluationRunsResponse, construct_type( - type_=EvaluationStats, # type: ignore + type_=EvaluationRunsResponse, # type: ignore object_=_response.json(), ), ) @@ -579,37 +522,60 @@ def get_stats(self, id: str, *, request_options: typing.Optional[RequestOptions] raise ApiError(status_code=_response.status_code, body=_response.text) raise ApiError(status_code=_response.status_code, body=_response_json) - def get_logs( + def create_run( self, id: str, *, - page: typing.Optional[int] = None, - size: typing.Optional[int] = None, + dataset: typing.Optional[EvaluationsDatasetRequestParams] = OMIT, + version: typing.Optional[VersionSpecificationParams] = OMIT, + orchestrated: typing.Optional[bool] = OMIT, + logs: typing.Optional[LogsAssociationType] = OMIT, request_options: typing.Optional[RequestOptions] = None, - ) -> PaginatedDataEvaluationReportLogResponse: + ) -> EvaluationRunResponse: """ - Get the Logs associated to a specific Evaluation. + Create an Evaluation Run. - Each Datapoint in your Dataset will have a corresponding Log for each File version evaluated. - e.g. If you have 50 Datapoints and are evaluating 2 Prompts, there will be 100 Logs associated with the Evaluation. + Create a new Evaluation Run. Optionally specify the Dataset and version to be + evaluated. + + Humanloop will automatically start generating Logs and running Evaluators where + `orchestrated=true`. If you are generating Logs yourself, you can set `orchestrated=false` + and then generate and submit the required Logs via the API. + + The `logs` parameter controls which Logs are associated with the Run. Defaults to `dynamic` + if `dataset` and `version` are provided. This means that Logs will automatically be retrieved + if they're associated with the specified Version and has `source_datapoint_id` referencing + a datapoint in the specified Dataset. + If `logs` is set to `fixed`, no existing Logs will be automatically associated with the Run. + You can then add Logs to the Run using the `POST /evaluations/{id}/runs/{run_id}/logs` endpoint, + or by adding `run_id` to your `POST /prompts/logs` requests. + + To keep updated on the progress of the Run, you can poll the Run using + the `GET /evaluations/{id}/runs` endpoint and check its status. Parameters ---------- id : str - String ID of evaluation. Starts with `ev_` or `evr_`. + Unique identifier for Evaluation. - page : typing.Optional[int] - Page number for pagination. + dataset : typing.Optional[EvaluationsDatasetRequestParams] + Dataset to use in this Run. - size : typing.Optional[int] - Page size for pagination. Number of Logs to fetch. 
+ version : typing.Optional[VersionSpecificationParams] + Version to use in this Run. + + orchestrated : typing.Optional[bool] + Whether the Run is orchestrated by Humanloop. If `True`, Humanloop will generate Logs for the Run; `dataset` and `version` must be provided. If `False`, a log for the Prompt/Tool should be submitted by the user via the API. + + logs : typing.Optional[LogsAssociationType] + How the Logs are associated with the Run. If `dynamic`, the latest relevant Logs will be inferred from the Dataset and Version. If `fixed`, the Logs will be explicitly associated. You can provide a list of Log IDs to associate with the Run, or add them to the Run later. Defaults to `dynamic` if `dataset` and `version` are provided; otherwise, defaults to `fixed`. request_options : typing.Optional[RequestOptions] Request-specific configuration. Returns ------- - PaginatedDataEvaluationReportLogResponse + EvaluationRunResponse Successful Response Examples @@ -619,25 +585,32 @@ def get_logs( client = Humanloop( api_key="YOUR_API_KEY", ) - client.evaluations.get_logs( + client.evaluations.create_run( id="id", ) """ _response = self._client_wrapper.httpx_client.request( - f"evaluations/{jsonable_encoder(id)}/logs", - method="GET", - params={ - "page": page, - "size": size, + f"evaluations/{jsonable_encoder(id)}/runs", + method="POST", + json={ + "dataset": convert_and_respect_annotation_metadata( + object_=dataset, annotation=EvaluationsDatasetRequestParams, direction="write" + ), + "version": convert_and_respect_annotation_metadata( + object_=version, annotation=VersionSpecificationParams, direction="write" + ), + "orchestrated": orchestrated, + "logs": logs, }, request_options=request_options, + omit=OMIT, ) try: if 200 <= _response.status_code < 300: return typing.cast( - PaginatedDataEvaluationReportLogResponse, + EvaluationRunResponse, construct_type( - type_=PaginatedDataEvaluationReportLogResponse, # type: ignore + type_=EvaluationRunResponse, # type: ignore object_=_response.json(), ), ) @@ -656,53 +629,26 @@ def get_logs( raise ApiError(status_code=_response.status_code, body=_response.text) raise ApiError(status_code=_response.status_code, body=_response_json) - def pin_evaluatee( - self, - id: str, - *, - version_id: typing.Optional[str] = OMIT, - path: typing.Optional[str] = OMIT, - file_id: typing.Optional[str] = OMIT, - environment: typing.Optional[str] = OMIT, - batch_id: typing.Optional[str] = OMIT, - orchestrated: typing.Optional[bool] = OMIT, - request_options: typing.Optional[RequestOptions] = None, - ) -> EvaluationResponse: + def add_existing_run( + self, id: str, run_id: str, *, request_options: typing.Optional[RequestOptions] = None + ) -> typing.Optional[typing.Any]: """ - Pin the specified Evaluatee. - - Pinned Evaluatees are always displayed in the Evaluation Overview, - and serve as the baseline for comparison with other Evaluatees. + Add an existing Run to an Evaluation. Parameters ---------- id : str Unique identifier for Evaluation. - version_id : typing.Optional[str] - Unique identifier for the File Version. If provided, none of the other fields should be specified. - - path : typing.Optional[str] - Path identifying a File. Provide either this or `file_id` if you want to specify a File. - - file_id : typing.Optional[str] - Unique identifier for the File. Provide either this or `path` if you want to specify a File. - - environment : typing.Optional[str] - Name of the Environment a Version is deployed to. Only provide this when specifying a File. 
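Taken together, `create_run` and `list_runs_for_evaluation` replace the removed `update_setup`/`update_status` flow. A minimal sketch of the orchestrated path, using placeholder IDs and assuming the `{"version_id": ...}` dict shape also applies to `VersionSpecificationParams`:

```python
from humanloop import Humanloop

client = Humanloop(api_key="YOUR_API_KEY")

# Start a Run on an existing Evaluation. With orchestrated=True, Humanloop
# generates the Logs and runs the Evaluators, so dataset and version are required.
run = client.evaluations.create_run(
    id="ev_567yza",                                       # placeholder Evaluation ID
    dataset={"version_id": "dsv_6L78pqrdFi2xa"},          # placeholder Dataset version
    version={"version_id": "prv_7ZlQREDScH0xkhUwtXruN"},  # assumed VersionSpecificationParams shape
    orchestrated=True,
)

# Poll the Evaluation's Runs to track progress.
runs = client.evaluations.list_runs_for_evaluation(id="ev_567yza")
print(runs)
```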
If not provided (and a File is specified), the default Environment is used. - - batch_id : typing.Optional[str] - Unique identifier for the batch of Logs to include in the Evaluation Report. - - orchestrated : typing.Optional[bool] - Whether the Prompt/Tool is orchestrated by Humanloop. Default is `True`. If `False`, a log for the Prompt/Tool should be submitted by the user via the API. + run_id : str + Unique identifier for Run. request_options : typing.Optional[RequestOptions] Request-specific configuration. Returns ------- - EvaluationResponse + typing.Optional[typing.Any] Successful Response Examples @@ -712,30 +658,22 @@ def pin_evaluatee( client = Humanloop( api_key="YOUR_API_KEY", ) - client.evaluations.pin_evaluatee( + client.evaluations.add_existing_run( id="id", + run_id="run_id", ) """ _response = self._client_wrapper.httpx_client.request( - f"evaluations/{jsonable_encoder(id)}/pin-evaluatee", + f"evaluations/{jsonable_encoder(id)}/runs/{jsonable_encoder(run_id)}", method="POST", - json={ - "version_id": version_id, - "path": path, - "file_id": file_id, - "environment": environment, - "batch_id": batch_id, - "orchestrated": orchestrated, - }, request_options=request_options, - omit=OMIT, ) try: if 200 <= _response.status_code < 300: return typing.cast( - EvaluationResponse, + typing.Optional[typing.Any], construct_type( - type_=EvaluationResponse, # type: ignore + type_=typing.Optional[typing.Any], # type: ignore object_=_response.json(), ), ) @@ -754,97 +692,889 @@ def pin_evaluatee( raise ApiError(status_code=_response.status_code, body=_response.text) raise ApiError(status_code=_response.status_code, body=_response_json) - -class AsyncEvaluationsClient: - def __init__(self, *, client_wrapper: AsyncClientWrapper): - self._client_wrapper = client_wrapper - - async def list( - self, - *, - file_id: str, - page: typing.Optional[int] = None, - size: typing.Optional[int] = None, - request_options: typing.Optional[RequestOptions] = None, - ) -> AsyncPager[EvaluationResponse]: + def remove_run_from_evaluation( + self, id: str, run_id: str, *, request_options: typing.Optional[RequestOptions] = None + ) -> None: """ - List all Evaluations for the specified `file_id`. + Remove a Run from an Evaluation. - Retrieve a list of Evaluations that evaluate versions of the specified File. + Remove a Run from an Evaluation. The Logs and Versions used in the Run will not be deleted. + If this Run is used in any other Evaluations, it will still be available in those Evaluations. Parameters ---------- - file_id : str - Filter by File ID. Only Evaluations for the specified File will be returned. - - page : typing.Optional[int] - Page number for pagination. + id : str + Unique identifier for Evaluation. - size : typing.Optional[int] - Page size for pagination. Number of Evaluations to fetch. + run_id : str + Unique identifier for Run. request_options : typing.Optional[RequestOptions] Request-specific configuration. 
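Runs are now first-class objects, so the same Run can be attached to several Evaluations and detached again without touching its Logs. A small sketch with placeholder IDs:

```python
from humanloop import Humanloop

client = Humanloop(api_key="YOUR_API_KEY")

# Attach a Run that was created under another Evaluation.
client.evaluations.add_existing_run(id="ev_567yza", run_id="run_id")

# Detach it again; the Run's Logs and Versions are kept, and the Run stays
# available in any other Evaluations that reference it.
client.evaluations.remove_run_from_evaluation(id="ev_567yza", run_id="run_id")
```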
Returns ------- - AsyncPager[EvaluationResponse] - Successful Response + None Examples -------- - import asyncio - - from humanloop import AsyncHumanloop + from humanloop import Humanloop - client = AsyncHumanloop( + client = Humanloop( api_key="YOUR_API_KEY", ) - - - async def main() -> None: - response = await client.evaluations.list( - file_id="pr_30gco7dx6JDq4200GVOHa", - size=1, - ) - async for item in response: - yield item - # alternatively, you can paginate page-by-page - async for page in response.iter_pages(): - yield page - - - asyncio.run(main()) + client.evaluations.remove_run_from_evaluation( + id="id", + run_id="run_id", + ) """ - page = page if page is not None else 1 - _response = await self._client_wrapper.httpx_client.request( - "evaluations", - method="GET", - params={ - "file_id": file_id, - "page": page, - "size": size, - }, + _response = self._client_wrapper.httpx_client.request( + f"evaluations/{jsonable_encoder(id)}/runs/{jsonable_encoder(run_id)}", + method="DELETE", request_options=request_options, ) try: if 200 <= _response.status_code < 300: - _parsed_response = typing.cast( - PaginatedEvaluationResponse, + return + if _response.status_code == 422: + raise UnprocessableEntityError( + typing.cast( + HttpValidationError, + construct_type( + type_=HttpValidationError, # type: ignore + object_=_response.json(), + ), + ) + ) + _response_json = _response.json() + except JSONDecodeError: + raise ApiError(status_code=_response.status_code, body=_response.text) + raise ApiError(status_code=_response.status_code, body=_response_json) + + def update_evaluation_run( + self, id: str, run_id: str, *, control: bool, request_options: typing.Optional[RequestOptions] = None + ) -> EvaluationRunResponse: + """ + Update an Evaluation Run. + + Update the Dataset and version to be evaluated for an existing Run. + + Parameters + ---------- + id : str + Unique identifier for Evaluation. + + run_id : str + Unique identifier for Run. + + control : bool + If `True`, this Run will be used as the control in the Evaluation. Stats for other Runs will be compared to this Run. This will replace any existing control Run. + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. 
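The `control` flag on `update_evaluation_run` takes over the role of the removed `pin_evaluatee` endpoint: one Run becomes the baseline that every other Run's stats are compared against. A minimal sketch with placeholder IDs:

```python
from humanloop import Humanloop

client = Humanloop(api_key="YOUR_API_KEY")

# Make this Run the control; stats for the Evaluation's other Runs are compared
# against it, replacing any previously set control Run.
client.evaluations.update_evaluation_run(
    id="ev_567yza",   # placeholder Evaluation ID
    run_id="run_id",  # placeholder Run ID
    control=True,
)
```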
+ + Returns + ------- + EvaluationRunResponse + Successful Response + + Examples + -------- + from humanloop import Humanloop + + client = Humanloop( + api_key="YOUR_API_KEY", + ) + client.evaluations.update_evaluation_run( + id="id", + run_id="run_id", + control=True, + ) + """ + _response = self._client_wrapper.httpx_client.request( + f"evaluations/{jsonable_encoder(id)}/runs/{jsonable_encoder(run_id)}", + method="PATCH", + json={ + "control": control, + }, + request_options=request_options, + omit=OMIT, + ) + try: + if 200 <= _response.status_code < 300: + return typing.cast( + EvaluationRunResponse, + construct_type( + type_=EvaluationRunResponse, # type: ignore + object_=_response.json(), + ), + ) + if _response.status_code == 422: + raise UnprocessableEntityError( + typing.cast( + HttpValidationError, + construct_type( + type_=HttpValidationError, # type: ignore + object_=_response.json(), + ), + ) + ) + _response_json = _response.json() + except JSONDecodeError: + raise ApiError(status_code=_response.status_code, body=_response.text) + raise ApiError(status_code=_response.status_code, body=_response_json) + + def add_logs_to_run( + self, + id: str, + run_id: str, + *, + log_ids: typing.Sequence[str], + request_options: typing.Optional[RequestOptions] = None, + ) -> EvaluationRunResponse: + """ + Add Logs to an Evaluation Run. + + This is supported only for Runs that have a fixed set of Logs. + (Runs can either have a fixed set of Logs, or can be set to dynamically retrieve the latest Logs + if a Dataset and Version are provided.) + + Parameters + ---------- + id : str + Unique identifier for Evaluation. + + run_id : str + Unique identifier for Run. + + log_ids : typing.Sequence[str] + The IDs of the Logs to add to the Run. + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. + + Returns + ------- + EvaluationRunResponse + Successful Response + + Examples + -------- + from humanloop import Humanloop + + client = Humanloop( + api_key="YOUR_API_KEY", + ) + client.evaluations.add_logs_to_run( + id="id", + run_id="run_id", + log_ids=["log_ids"], + ) + """ + _response = self._client_wrapper.httpx_client.request( + f"evaluations/{jsonable_encoder(id)}/runs/{jsonable_encoder(run_id)}/logs", + method="POST", + json={ + "log_ids": log_ids, + }, + request_options=request_options, + omit=OMIT, + ) + try: + if 200 <= _response.status_code < 300: + return typing.cast( + EvaluationRunResponse, + construct_type( + type_=EvaluationRunResponse, # type: ignore + object_=_response.json(), + ), + ) + if _response.status_code == 422: + raise UnprocessableEntityError( + typing.cast( + HttpValidationError, + construct_type( + type_=HttpValidationError, # type: ignore + object_=_response.json(), + ), + ) + ) + _response_json = _response.json() + except JSONDecodeError: + raise ApiError(status_code=_response.status_code, body=_response.text) + raise ApiError(status_code=_response.status_code, body=_response_json) + + def get_stats(self, id: str, *, request_options: typing.Optional[RequestOptions] = None) -> EvaluationStats: + """ + Get Evaluation Stats. + + Retrieve aggregate stats for the specified Evaluation. + + This includes the number of generated Logs for each Run and the + corresponding Evaluator statistics (such as the mean and percentiles). + + Parameters + ---------- + id : str + Unique identifier for Evaluation. + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. 
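For the non-orchestrated path, a Run keeps a fixed set of Logs that you populate yourself. A sketch with placeholder IDs; Logs can also be attached at creation time by passing `run_id` to the `log` endpoints, as shown further down in this diff:

```python
from humanloop import Humanloop

client = Humanloop(api_key="YOUR_API_KEY")

# No dataset/version is given, so `logs` defaults to "fixed" and nothing is
# associated automatically; orchestrated=False means we submit the Logs ourselves.
run = client.evaluations.create_run(
    id="ev_567yza",  # placeholder Evaluation ID
    orchestrated=False,
)

# Explicitly associate Logs we have already created with the Run.
client.evaluations.add_logs_to_run(
    id="ev_567yza",
    run_id="run_id",                   # placeholder Run ID
    log_ids=["log_id_1", "log_id_2"],  # placeholder Log IDs
)
```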
+ + Returns + ------- + EvaluationStats + Successful Response + + Examples + -------- + from humanloop import Humanloop + + client = Humanloop( + api_key="YOUR_API_KEY", + ) + client.evaluations.get_stats( + id="id", + ) + """ + _response = self._client_wrapper.httpx_client.request( + f"evaluations/{jsonable_encoder(id)}/stats", + method="GET", + request_options=request_options, + ) + try: + if 200 <= _response.status_code < 300: + return typing.cast( + EvaluationStats, + construct_type( + type_=EvaluationStats, # type: ignore + object_=_response.json(), + ), + ) + if _response.status_code == 422: + raise UnprocessableEntityError( + typing.cast( + HttpValidationError, + construct_type( + type_=HttpValidationError, # type: ignore + object_=_response.json(), + ), + ) + ) + _response_json = _response.json() + except JSONDecodeError: + raise ApiError(status_code=_response.status_code, body=_response.text) + raise ApiError(status_code=_response.status_code, body=_response_json) + + def get_logs( + self, + id: str, + *, + page: typing.Optional[int] = None, + size: typing.Optional[int] = None, + request_options: typing.Optional[RequestOptions] = None, + ) -> PaginatedDataEvaluationLogResponse: + """ + Get the Logs associated to a specific Evaluation. + + Parameters + ---------- + id : str + String ID of evaluation. Starts with `ev_` or `evr_`. + + page : typing.Optional[int] + Page number for pagination. + + size : typing.Optional[int] + Page size for pagination. Number of Logs to fetch. + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. + + Returns + ------- + PaginatedDataEvaluationLogResponse + Successful Response + + Examples + -------- + from humanloop import Humanloop + + client = Humanloop( + api_key="YOUR_API_KEY", + ) + client.evaluations.get_logs( + id="id", + ) + """ + _response = self._client_wrapper.httpx_client.request( + f"evaluations/{jsonable_encoder(id)}/logs", + method="GET", + params={ + "page": page, + "size": size, + }, + request_options=request_options, + ) + try: + if 200 <= _response.status_code < 300: + return typing.cast( + PaginatedDataEvaluationLogResponse, + construct_type( + type_=PaginatedDataEvaluationLogResponse, # type: ignore + object_=_response.json(), + ), + ) + if _response.status_code == 422: + raise UnprocessableEntityError( + typing.cast( + HttpValidationError, + construct_type( + type_=HttpValidationError, # type: ignore + object_=_response.json(), + ), + ) + ) + _response_json = _response.json() + except JSONDecodeError: + raise ApiError(status_code=_response.status_code, body=_response.text) + raise ApiError(status_code=_response.status_code, body=_response_json) + + +class AsyncEvaluationsClient: + def __init__(self, *, client_wrapper: AsyncClientWrapper): + self._client_wrapper = client_wrapper + + async def list( + self, + *, + file_id: str, + page: typing.Optional[int] = None, + size: typing.Optional[int] = None, + request_options: typing.Optional[RequestOptions] = None, + ) -> AsyncPager[EvaluationResponse]: + """ + List all Evaluations for the specified `file_id`. + + Retrieve a list of Evaluations that evaluate versions of the specified File. + + Parameters + ---------- + file_id : str + Filter by File ID. Only Evaluations for the specified File will be returned. + + page : typing.Optional[int] + Page number for pagination. + + size : typing.Optional[int] + Page size for pagination. Number of Evaluations to fetch. + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. 
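Once a Run has Logs, `get_stats` returns the per-Run aggregate view and the renamed paginated `get_logs` returns the underlying Logs. The exact fields of `EvaluationStats` and `PaginatedDataEvaluationLogResponse` are not spelled out in this diff, so `records` below is assumed by analogy with `PaginatedEvaluationResponse`:

```python
from humanloop import Humanloop

client = Humanloop(api_key="YOUR_API_KEY")

# Aggregate Evaluator statistics per Run (Log counts, means, percentiles).
stats = client.evaluations.get_stats(id="ev_567yza")
print(stats)

# Page through the individual Logs behind those stats.
page = client.evaluations.get_logs(id="ev_567yza", page=1, size=50)
for log in page.records:  # `records` is an assumed field name
    print(log)
```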
+ + Returns + ------- + AsyncPager[EvaluationResponse] + Successful Response + + Examples + -------- + import asyncio + + from humanloop import AsyncHumanloop + + client = AsyncHumanloop( + api_key="YOUR_API_KEY", + ) + + + async def main() -> None: + response = await client.evaluations.list( + file_id="pr_30gco7dx6JDq4200GVOHa", + size=1, + ) + async for item in response: + yield item + # alternatively, you can paginate page-by-page + async for page in response.iter_pages(): + yield page + + + asyncio.run(main()) + """ + page = page if page is not None else 1 + _response = await self._client_wrapper.httpx_client.request( + "evaluations", + method="GET", + params={ + "file_id": file_id, + "page": page, + "size": size, + }, + request_options=request_options, + ) + try: + if 200 <= _response.status_code < 300: + _parsed_response = typing.cast( + PaginatedEvaluationResponse, + construct_type( + type_=PaginatedEvaluationResponse, # type: ignore + object_=_response.json(), + ), + ) + _has_next = True + _get_next = lambda: self.list( + file_id=file_id, + page=page + 1, + size=size, + request_options=request_options, + ) + _items = _parsed_response.records + return AsyncPager(has_next=_has_next, items=_items, get_next=_get_next) + if _response.status_code == 422: + raise UnprocessableEntityError( + typing.cast( + HttpValidationError, + construct_type( + type_=HttpValidationError, # type: ignore + object_=_response.json(), + ), + ) + ) + _response_json = _response.json() + except JSONDecodeError: + raise ApiError(status_code=_response.status_code, body=_response.text) + raise ApiError(status_code=_response.status_code, body=_response_json) + + async def create( + self, + *, + evaluators: typing.Sequence[EvaluationsRequestParams], + file: typing.Optional[FileRequestParams] = OMIT, + name: typing.Optional[str] = OMIT, + request_options: typing.Optional[RequestOptions] = None, + ) -> EvaluationResponse: + """ + Create an Evaluation. + + Create an Evaluation by specifying the File to evaluate, and a name + for the Evaluation. + + You can then add Runs to this Evaluation using the `POST /evaluations/{id}/runs` endpoint. + + Parameters + ---------- + evaluators : typing.Sequence[EvaluationsRequestParams] + The Evaluators used to evaluate. + + file : typing.Optional[FileRequestParams] + The File to associate with the Evaluation. This File contains the Logs you're evaluating. + + name : typing.Optional[str] + Name of the Evaluation to help identify it. Must be unique within the associated File. + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. 
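Note that an `async def` containing `yield` is an async generator and cannot be passed to `asyncio.run` directly; to drain the pager inside a script, a runnable variant of the same pattern collects the items instead:

```python
import asyncio

from humanloop import AsyncHumanloop

client = AsyncHumanloop(api_key="YOUR_API_KEY")


async def main() -> None:
    response = await client.evaluations.list(
        file_id="pr_30gco7dx6JDq4200GVOHa",
        size=1,
    )
    # Iterating the AsyncPager walks every page; collect the items into a list.
    evaluations = [item async for item in response]
    print(len(evaluations))


asyncio.run(main())
```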
+ + Returns + ------- + EvaluationResponse + Successful Response + + Examples + -------- + import asyncio + + from humanloop import AsyncHumanloop + + client = AsyncHumanloop( + api_key="YOUR_API_KEY", + ) + + + async def main() -> None: + await client.evaluations.create( + evaluators=[{}], + ) + + + asyncio.run(main()) + """ + _response = await self._client_wrapper.httpx_client.request( + "evaluations", + method="POST", + json={ + "file": convert_and_respect_annotation_metadata( + object_=file, annotation=FileRequestParams, direction="write" + ), + "name": name, + "evaluators": convert_and_respect_annotation_metadata( + object_=evaluators, annotation=typing.Sequence[EvaluationsRequestParams], direction="write" + ), + }, + request_options=request_options, + omit=OMIT, + ) + try: + if 200 <= _response.status_code < 300: + return typing.cast( + EvaluationResponse, + construct_type( + type_=EvaluationResponse, # type: ignore + object_=_response.json(), + ), + ) + if _response.status_code == 422: + raise UnprocessableEntityError( + typing.cast( + HttpValidationError, + construct_type( + type_=HttpValidationError, # type: ignore + object_=_response.json(), + ), + ) + ) + _response_json = _response.json() + except JSONDecodeError: + raise ApiError(status_code=_response.status_code, body=_response.text) + raise ApiError(status_code=_response.status_code, body=_response_json) + + async def add_evaluators( + self, + id: str, + *, + evaluators: typing.Sequence[EvaluationsRequestParams], + request_options: typing.Optional[RequestOptions] = None, + ) -> EvaluationResponse: + """ + Add Evaluators to an Evaluation. + + Add new Evaluators to an Evaluation. The Evaluators will be run on the Logs + generated for the Evaluation. + + Parameters + ---------- + id : str + Unique identifier for Evaluation. + + evaluators : typing.Sequence[EvaluationsRequestParams] + The Evaluators to add to this Evaluation. + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. + + Returns + ------- + EvaluationResponse + Successful Response + + Examples + -------- + import asyncio + + from humanloop import AsyncHumanloop + + client = AsyncHumanloop( + api_key="YOUR_API_KEY", + ) + + + async def main() -> None: + await client.evaluations.add_evaluators( + id="id", + evaluators=[{}], + ) + + + asyncio.run(main()) + """ + _response = await self._client_wrapper.httpx_client.request( + f"evaluations/{jsonable_encoder(id)}/evaluators", + method="POST", + json={ + "evaluators": convert_and_respect_annotation_metadata( + object_=evaluators, annotation=typing.Sequence[EvaluationsRequestParams], direction="write" + ), + }, + request_options=request_options, + omit=OMIT, + ) + try: + if 200 <= _response.status_code < 300: + return typing.cast( + EvaluationResponse, + construct_type( + type_=EvaluationResponse, # type: ignore + object_=_response.json(), + ), + ) + if _response.status_code == 422: + raise UnprocessableEntityError( + typing.cast( + HttpValidationError, + construct_type( + type_=HttpValidationError, # type: ignore + object_=_response.json(), + ), + ) + ) + _response_json = _response.json() + except JSONDecodeError: + raise ApiError(status_code=_response.status_code, body=_response.text) + raise ApiError(status_code=_response.status_code, body=_response_json) + + async def remove_evaluator( + self, id: str, evaluator_version_id: str, *, request_options: typing.Optional[RequestOptions] = None + ) -> EvaluationResponse: + """ + Remove an Evaluator from an Evaluation. 
+ + Remove an Evaluator from an Evaluation. The Evaluator will no longer be run on the Logs + generated for the Evaluation. + + Parameters + ---------- + id : str + Unique identifier for Evaluation. + + evaluator_version_id : str + Unique identifier for Evaluator Version. + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. + + Returns + ------- + EvaluationResponse + Successful Response + + Examples + -------- + import asyncio + + from humanloop import AsyncHumanloop + + client = AsyncHumanloop( + api_key="YOUR_API_KEY", + ) + + + async def main() -> None: + await client.evaluations.remove_evaluator( + id="id", + evaluator_version_id="evaluator_version_id", + ) + + + asyncio.run(main()) + """ + _response = await self._client_wrapper.httpx_client.request( + f"evaluations/{jsonable_encoder(id)}/evaluators/{jsonable_encoder(evaluator_version_id)}", + method="DELETE", + request_options=request_options, + ) + try: + if 200 <= _response.status_code < 300: + return typing.cast( + EvaluationResponse, + construct_type( + type_=EvaluationResponse, # type: ignore + object_=_response.json(), + ), + ) + if _response.status_code == 422: + raise UnprocessableEntityError( + typing.cast( + HttpValidationError, + construct_type( + type_=HttpValidationError, # type: ignore + object_=_response.json(), + ), + ) + ) + _response_json = _response.json() + except JSONDecodeError: + raise ApiError(status_code=_response.status_code, body=_response.text) + raise ApiError(status_code=_response.status_code, body=_response_json) + + async def get(self, id: str, *, request_options: typing.Optional[RequestOptions] = None) -> EvaluationResponse: + """ + Get an Evaluation. + + Parameters + ---------- + id : str + Unique identifier for Evaluation. + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. + + Returns + ------- + EvaluationResponse + Successful Response + + Examples + -------- + import asyncio + + from humanloop import AsyncHumanloop + + client = AsyncHumanloop( + api_key="YOUR_API_KEY", + ) + + + async def main() -> None: + await client.evaluations.get( + id="ev_567yza", + ) + + + asyncio.run(main()) + """ + _response = await self._client_wrapper.httpx_client.request( + f"evaluations/{jsonable_encoder(id)}", + method="GET", + request_options=request_options, + ) + try: + if 200 <= _response.status_code < 300: + return typing.cast( + EvaluationResponse, + construct_type( + type_=EvaluationResponse, # type: ignore + object_=_response.json(), + ), + ) + if _response.status_code == 422: + raise UnprocessableEntityError( + typing.cast( + HttpValidationError, + construct_type( + type_=HttpValidationError, # type: ignore + object_=_response.json(), + ), + ) + ) + _response_json = _response.json() + except JSONDecodeError: + raise ApiError(status_code=_response.status_code, body=_response.text) + raise ApiError(status_code=_response.status_code, body=_response_json) + + async def delete(self, id: str, *, request_options: typing.Optional[RequestOptions] = None) -> None: + """ + Delete an Evaluation. + + Remove an Evaluation from Humanloop. The Logs and Versions used in the Evaluation + will not be deleted. + + Parameters + ---------- + id : str + Unique identifier for Evaluation. + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. 
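Evaluators can now be attached to and detached from an Evaluation after it has been created. A sketch on the async client, reusing the `{"version_id": ...}` shape from the earlier evaluator examples; all IDs are placeholders:

```python
import asyncio

from humanloop import AsyncHumanloop

client = AsyncHumanloop(api_key="YOUR_API_KEY")


async def main() -> None:
    # Attach an extra Evaluator; it will run on the Evaluation's Logs.
    await client.evaluations.add_evaluators(
        id="ev_567yza",
        evaluators=[{"version_id": "evv_012def"}],
    )

    # Detach it again; it will no longer run on Logs generated for the Evaluation.
    await client.evaluations.remove_evaluator(
        id="ev_567yza",
        evaluator_version_id="evv_012def",
    )


asyncio.run(main())
```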
+ + Returns + ------- + None + + Examples + -------- + import asyncio + + from humanloop import AsyncHumanloop + + client = AsyncHumanloop( + api_key="YOUR_API_KEY", + ) + + + async def main() -> None: + await client.evaluations.delete( + id="ev_567yza", + ) + + + asyncio.run(main()) + """ + _response = await self._client_wrapper.httpx_client.request( + f"evaluations/{jsonable_encoder(id)}", + method="DELETE", + request_options=request_options, + ) + try: + if 200 <= _response.status_code < 300: + return + if _response.status_code == 422: + raise UnprocessableEntityError( + typing.cast( + HttpValidationError, + construct_type( + type_=HttpValidationError, # type: ignore + object_=_response.json(), + ), + ) + ) + _response_json = _response.json() + except JSONDecodeError: + raise ApiError(status_code=_response.status_code, body=_response.text) + raise ApiError(status_code=_response.status_code, body=_response_json) + + async def list_runs_for_evaluation( + self, id: str, *, request_options: typing.Optional[RequestOptions] = None + ) -> EvaluationRunsResponse: + """ + List all Runs for an Evaluation. + + Parameters + ---------- + id : str + Unique identifier for Evaluation. + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. + + Returns + ------- + EvaluationRunsResponse + Successful Response + + Examples + -------- + import asyncio + + from humanloop import AsyncHumanloop + + client = AsyncHumanloop( + api_key="YOUR_API_KEY", + ) + + + async def main() -> None: + await client.evaluations.list_runs_for_evaluation( + id="id", + ) + + + asyncio.run(main()) + """ + _response = await self._client_wrapper.httpx_client.request( + f"evaluations/{jsonable_encoder(id)}/runs", + method="GET", + request_options=request_options, + ) + try: + if 200 <= _response.status_code < 300: + return typing.cast( + EvaluationRunsResponse, construct_type( - type_=PaginatedEvaluationResponse, # type: ignore + type_=EvaluationRunsResponse, # type: ignore object_=_response.json(), ), ) - _has_next = True - _get_next = lambda: self.list( - file_id=file_id, - page=page + 1, - size=size, - request_options=request_options, - ) - _items = _parsed_response.records - return AsyncPager(has_next=_has_next, items=_items, get_next=_get_next) if _response.status_code == 422: raise UnprocessableEntityError( typing.cast( @@ -860,53 +1590,60 @@ async def main() -> None: raise ApiError(status_code=_response.status_code, body=_response.text) raise ApiError(status_code=_response.status_code, body=_response_json) - async def create( + async def create_run( self, + id: str, *, - dataset: EvaluationsDatasetRequestParams, - evaluators: typing.Sequence[EvaluationsRequestParams], - evaluatees: typing.Optional[typing.Sequence[EvaluateeRequestParams]] = OMIT, - name: typing.Optional[str] = OMIT, - file: typing.Optional[FileRequestParams] = OMIT, + dataset: typing.Optional[EvaluationsDatasetRequestParams] = OMIT, + version: typing.Optional[VersionSpecificationParams] = OMIT, + orchestrated: typing.Optional[bool] = OMIT, + logs: typing.Optional[LogsAssociationType] = OMIT, request_options: typing.Optional[RequestOptions] = None, - ) -> EvaluationResponse: + ) -> EvaluationRunResponse: """ - Create an Evaluation. + Create an Evaluation Run. - Create a new Evaluation by specifying the Dataset, versions to be - evaluated (Evaluatees), and which Evaluators to provide judgments. + Create a new Evaluation Run. Optionally specify the Dataset and version to be + evaluated. 
Humanloop will automatically start generating Logs and running Evaluators where - `orchestrated=true`. If you own the runtime for the Evaluatee or Evaluator, you - can set `orchestrated=false` and then generate and submit the required logs using - your runtime. + `orchestrated=true`. If you are generating Logs yourself, you can set `orchestrated=false` + and then generate and submit the required Logs via the API. + + The `logs` parameter controls which Logs are associated with the Run. Defaults to `dynamic` + if `dataset` and `version` are provided. This means that Logs will automatically be retrieved + if they're associated with the specified Version and has `source_datapoint_id` referencing + a datapoint in the specified Dataset. + If `logs` is set to `fixed`, no existing Logs will be automatically associated with the Run. + You can then add Logs to the Run using the `POST /evaluations/{id}/runs/{run_id}/logs` endpoint, + or by adding `run_id` to your `POST /prompts/logs` requests. - To keep updated on the progress of the Evaluation, you can poll the Evaluation using - the `GET /evaluations/:id` endpoint and check its status. + To keep updated on the progress of the Run, you can poll the Run using + the `GET /evaluations/{id}/runs` endpoint and check its status. Parameters ---------- - dataset : EvaluationsDatasetRequestParams - Dataset to use in this Evaluation. + id : str + Unique identifier for Evaluation. - evaluators : typing.Sequence[EvaluationsRequestParams] - The Evaluators used to evaluate. + dataset : typing.Optional[EvaluationsDatasetRequestParams] + Dataset to use in this Run. - evaluatees : typing.Optional[typing.Sequence[EvaluateeRequestParams]] - Unique identifiers for the Prompt/Tool Versions to include in the Evaluation. Can be left unpopulated if you wish to add Evaluatees to this Evaluation by specifying `evaluation_id` in Log calls. + version : typing.Optional[VersionSpecificationParams] + Version to use in this Run. - name : typing.Optional[str] - Name of the Evaluation to help identify it. Must be unique within the associated File. + orchestrated : typing.Optional[bool] + Whether the Run is orchestrated by Humanloop. If `True`, Humanloop will generate Logs for the Run; `dataset` and `version` must be provided. If `False`, a log for the Prompt/Tool should be submitted by the user via the API. - file : typing.Optional[FileRequestParams] - The File to associate with the Evaluation. + logs : typing.Optional[LogsAssociationType] + How the Logs are associated with the Run. If `dynamic`, the latest relevant Logs will be inferred from the Dataset and Version. If `fixed`, the Logs will be explicitly associated. You can provide a list of Log IDs to associate with the Run, or add them to the Run later. Defaults to `dynamic` if `dataset` and `version` are provided; otherwise, defaults to `fixed`. request_options : typing.Optional[RequestOptions] Request-specific configuration. 
Returns ------- - EvaluationResponse + EvaluationRunResponse Successful Response Examples @@ -921,34 +1658,25 @@ async def create( async def main() -> None: - await client.evaluations.create( - dataset={"version_id": "dsv_6L78pqrdFi2xa"}, - evaluatees=[ - {"version_id": "prv_7ZlQREDScH0xkhUwtXruN", "orchestrated": False} - ], - evaluators=[{"version_id": "evv_012def", "orchestrated": False}], + await client.evaluations.create_run( + id="id", ) asyncio.run(main()) """ _response = await self._client_wrapper.httpx_client.request( - "evaluations", + f"evaluations/{jsonable_encoder(id)}/runs", method="POST", json={ "dataset": convert_and_respect_annotation_metadata( object_=dataset, annotation=EvaluationsDatasetRequestParams, direction="write" ), - "evaluatees": convert_and_respect_annotation_metadata( - object_=evaluatees, annotation=typing.Sequence[EvaluateeRequestParams], direction="write" - ), - "evaluators": convert_and_respect_annotation_metadata( - object_=evaluators, annotation=typing.Sequence[EvaluationsRequestParams], direction="write" - ), - "name": name, - "file": convert_and_respect_annotation_metadata( - object_=file, annotation=FileRequestParams, direction="write" + "version": convert_and_respect_annotation_metadata( + object_=version, annotation=VersionSpecificationParams, direction="write" ), + "orchestrated": orchestrated, + "logs": logs, }, request_options=request_options, omit=OMIT, @@ -956,9 +1684,9 @@ async def main() -> None: try: if 200 <= _response.status_code < 300: return typing.cast( - EvaluationResponse, + EvaluationRunResponse, construct_type( - type_=EvaluationResponse, # type: ignore + type_=EvaluationRunResponse, # type: ignore object_=_response.json(), ), ) @@ -977,21 +1705,26 @@ async def main() -> None: raise ApiError(status_code=_response.status_code, body=_response.text) raise ApiError(status_code=_response.status_code, body=_response_json) - async def get(self, id: str, *, request_options: typing.Optional[RequestOptions] = None) -> EvaluationResponse: + async def add_existing_run( + self, id: str, run_id: str, *, request_options: typing.Optional[RequestOptions] = None + ) -> typing.Optional[typing.Any]: """ - Get an Evaluation. + Add an existing Run to an Evaluation. Parameters ---------- id : str Unique identifier for Evaluation. + run_id : str + Unique identifier for Run. + request_options : typing.Optional[RequestOptions] Request-specific configuration. 
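The async `create` and `create_run` pair mirrors the sync lifecycle: create the Evaluation first, then attach Runs to it and poll them. A sketch assuming `EvaluationResponse` exposes an `id` field; the Evaluator version is a placeholder:

```python
import asyncio

from humanloop import AsyncHumanloop

client = AsyncHumanloop(api_key="YOUR_API_KEY")


async def main() -> None:
    # Create the Evaluation, naming the Evaluators that will judge the Logs.
    evaluation = await client.evaluations.create(
        name="my-evaluation",
        evaluators=[{"version_id": "evv_012def"}],  # placeholder Evaluator version
    )

    # Attach a Run; with no dataset/version it starts with a fixed, initially
    # empty set of Logs that can be populated later.
    run = await client.evaluations.create_run(id=evaluation.id)  # `id` field assumed

    # The Runs endpoint reports progress for every Run on the Evaluation.
    runs = await client.evaluations.list_runs_for_evaluation(id=evaluation.id)
    print(run, runs)


asyncio.run(main())
```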
Returns ------- - EvaluationResponse + typing.Optional[typing.Any] Successful Response Examples @@ -1006,24 +1739,25 @@ async def get(self, id: str, *, request_options: typing.Optional[RequestOptions] async def main() -> None: - await client.evaluations.get( - id="ev_567yza", + await client.evaluations.add_existing_run( + id="id", + run_id="run_id", ) asyncio.run(main()) """ _response = await self._client_wrapper.httpx_client.request( - f"evaluations/{jsonable_encoder(id)}", - method="GET", + f"evaluations/{jsonable_encoder(id)}/runs/{jsonable_encoder(run_id)}", + method="POST", request_options=request_options, ) try: if 200 <= _response.status_code < 300: return typing.cast( - EvaluationResponse, + typing.Optional[typing.Any], construct_type( - type_=EvaluationResponse, # type: ignore + type_=typing.Optional[typing.Any], # type: ignore object_=_response.json(), ), ) @@ -1042,18 +1776,23 @@ async def main() -> None: raise ApiError(status_code=_response.status_code, body=_response.text) raise ApiError(status_code=_response.status_code, body=_response_json) - async def delete(self, id: str, *, request_options: typing.Optional[RequestOptions] = None) -> None: + async def remove_run_from_evaluation( + self, id: str, run_id: str, *, request_options: typing.Optional[RequestOptions] = None + ) -> None: """ - Delete an Evaluation. + Remove a Run from an Evaluation. - Remove an Evaluation from Humanloop. The Logs and Versions used in the Evaluation - will not be deleted. + Remove a Run from an Evaluation. The Logs and Versions used in the Run will not be deleted. + If this Run is used in any other Evaluations, it will still be available in those Evaluations. Parameters ---------- id : str Unique identifier for Evaluation. + run_id : str + Unique identifier for Run. + request_options : typing.Optional[RequestOptions] Request-specific configuration. @@ -1073,15 +1812,16 @@ async def delete(self, id: str, *, request_options: typing.Optional[RequestOptio async def main() -> None: - await client.evaluations.delete( - id="ev_567yza", + await client.evaluations.remove_run_from_evaluation( + id="id", + run_id="run_id", ) asyncio.run(main()) """ _response = await self._client_wrapper.httpx_client.request( - f"evaluations/{jsonable_encoder(id)}", + f"evaluations/{jsonable_encoder(id)}/runs/{jsonable_encoder(run_id)}", method="DELETE", request_options=request_options, ) @@ -1103,49 +1843,31 @@ async def main() -> None: raise ApiError(status_code=_response.status_code, body=_response.text) raise ApiError(status_code=_response.status_code, body=_response_json) - async def update_setup( - self, - id: str, - *, - dataset: typing.Optional[EvaluationsDatasetRequestParams] = OMIT, - evaluatees: typing.Optional[typing.Sequence[EvaluateeRequestParams]] = OMIT, - evaluators: typing.Optional[typing.Sequence[EvaluationsRequestParams]] = OMIT, - name: typing.Optional[str] = OMIT, - file: typing.Optional[FileRequestParams] = OMIT, - request_options: typing.Optional[RequestOptions] = None, - ) -> EvaluationResponse: + async def update_evaluation_run( + self, id: str, run_id: str, *, control: bool, request_options: typing.Optional[RequestOptions] = None + ) -> EvaluationRunResponse: """ - Update an Evaluation. + Update an Evaluation Run. - Update the setup of an Evaluation by specifying the Dataset, versions to be - evaluated (Evaluatees), and which Evaluators to provide judgments. + Update the Dataset and version to be evaluated for an existing Run. Parameters ---------- id : str Unique identifier for Evaluation. 
- dataset : typing.Optional[EvaluationsDatasetRequestParams] - Dataset to use in this Evaluation. - - evaluatees : typing.Optional[typing.Sequence[EvaluateeRequestParams]] - Unique identifiers for the Prompt/Tool Versions to include in the Evaluation. Can be left unpopulated if you wish to add evaluatees to this Evaluation by specifying `evaluation_id` in Log calls. - - evaluators : typing.Optional[typing.Sequence[EvaluationsRequestParams]] - The Evaluators used to evaluate. - - name : typing.Optional[str] - Name of the Evaluation to help identify it. Must be unique within the associated File. + run_id : str + Unique identifier for Run. - file : typing.Optional[FileRequestParams] - The File to associate with the Evaluation. + control : bool + If `True`, this Run will be used as the control in the Evaluation. Stats for other Runs will be compared to this Run. This will replace any existing control Run. request_options : typing.Optional[RequestOptions] Request-specific configuration. Returns ------- - EvaluationResponse + EvaluationRunResponse Successful Response Examples @@ -1160,35 +1882,20 @@ async def update_setup( async def main() -> None: - await client.evaluations.update_setup( - id="ev_567yza", - dataset={"version_id": "dsv_6L78pqrdFi2xa"}, - evaluatees=[ - {"version_id": "prv_7ZlQREDScH0xkhUwtXruN", "orchestrated": False} - ], - evaluators=[{"version_id": "evv_012def", "orchestrated": False}], + await client.evaluations.update_evaluation_run( + id="id", + run_id="run_id", + control=True, ) asyncio.run(main()) """ _response = await self._client_wrapper.httpx_client.request( - f"evaluations/{jsonable_encoder(id)}", + f"evaluations/{jsonable_encoder(id)}/runs/{jsonable_encoder(run_id)}", method="PATCH", json={ - "dataset": convert_and_respect_annotation_metadata( - object_=dataset, annotation=EvaluationsDatasetRequestParams, direction="write" - ), - "evaluatees": convert_and_respect_annotation_metadata( - object_=evaluatees, annotation=typing.Sequence[EvaluateeRequestParams], direction="write" - ), - "evaluators": convert_and_respect_annotation_metadata( - object_=evaluators, annotation=typing.Sequence[EvaluationsRequestParams], direction="write" - ), - "name": name, - "file": convert_and_respect_annotation_metadata( - object_=file, annotation=FileRequestParams, direction="write" - ), + "control": control, }, request_options=request_options, omit=OMIT, @@ -1196,9 +1903,9 @@ async def main() -> None: try: if 200 <= _response.status_code < 300: return typing.cast( - EvaluationResponse, + EvaluationRunResponse, construct_type( - type_=EvaluationResponse, # type: ignore + type_=EvaluationRunResponse, # type: ignore object_=_response.json(), ), ) @@ -1217,28 +1924,38 @@ async def main() -> None: raise ApiError(status_code=_response.status_code, body=_response.text) raise ApiError(status_code=_response.status_code, body=_response_json) - async def update_status( - self, id: str, *, status: EvaluationStatus, request_options: typing.Optional[RequestOptions] = None - ) -> EvaluationResponse: + async def add_logs_to_run( + self, + id: str, + run_id: str, + *, + log_ids: typing.Sequence[str], + request_options: typing.Optional[RequestOptions] = None, + ) -> EvaluationRunResponse: """ - Update the status of an Evaluation. + Add Logs to an Evaluation Run. - Can be used to cancel a running Evaluation, or mark an Evaluation that uses - external or human evaluators as completed. + This is supported only for Runs that have a fixed set of Logs. 
+ (Runs can either have a fixed set of Logs, or can be set to dynamically retrieve the latest Logs + if a Dataset and Version are provided.) Parameters ---------- id : str Unique identifier for Evaluation. - status : EvaluationStatus + run_id : str + Unique identifier for Run. + + log_ids : typing.Sequence[str] + The IDs of the Logs to add to the Run. request_options : typing.Optional[RequestOptions] Request-specific configuration. Returns ------- - EvaluationResponse + EvaluationRunResponse Successful Response Examples @@ -1253,19 +1970,20 @@ async def update_status( async def main() -> None: - await client.evaluations.update_status( + await client.evaluations.add_logs_to_run( id="id", - status="pending", + run_id="run_id", + log_ids=["log_ids"], ) asyncio.run(main()) """ _response = await self._client_wrapper.httpx_client.request( - f"evaluations/{jsonable_encoder(id)}/status", - method="PATCH", + f"evaluations/{jsonable_encoder(id)}/runs/{jsonable_encoder(run_id)}/logs", + method="POST", json={ - "status": status, + "log_ids": log_ids, }, request_options=request_options, omit=OMIT, @@ -1273,9 +1991,9 @@ async def main() -> None: try: if 200 <= _response.status_code < 300: return typing.cast( - EvaluationResponse, + EvaluationRunResponse, construct_type( - type_=EvaluationResponse, # type: ignore + type_=EvaluationRunResponse, # type: ignore object_=_response.json(), ), ) @@ -1299,7 +2017,8 @@ async def get_stats(self, id: str, *, request_options: typing.Optional[RequestOp Get Evaluation Stats. Retrieve aggregate stats for the specified Evaluation. - This includes the number of generated Logs for each evaluated version and the + + This includes the number of generated Logs for each Run and the corresponding Evaluator statistics (such as the mean and percentiles). Parameters @@ -1370,13 +2089,10 @@ async def get_logs( page: typing.Optional[int] = None, size: typing.Optional[int] = None, request_options: typing.Optional[RequestOptions] = None, - ) -> PaginatedDataEvaluationReportLogResponse: + ) -> PaginatedDataEvaluationLogResponse: """ Get the Logs associated to a specific Evaluation. - Each Datapoint in your Dataset will have a corresponding Log for each File version evaluated. - e.g. If you have 50 Datapoints and are evaluating 2 Prompts, there will be 100 Logs associated with the Evaluation. 
- Parameters ---------- id : str @@ -1393,7 +2109,7 @@ async def get_logs( Returns ------- - PaginatedDataEvaluationReportLogResponse + PaginatedDataEvaluationLogResponse Successful Response Examples @@ -1427,115 +2143,9 @@ async def main() -> None: try: if 200 <= _response.status_code < 300: return typing.cast( - PaginatedDataEvaluationReportLogResponse, - construct_type( - type_=PaginatedDataEvaluationReportLogResponse, # type: ignore - object_=_response.json(), - ), - ) - if _response.status_code == 422: - raise UnprocessableEntityError( - typing.cast( - HttpValidationError, - construct_type( - type_=HttpValidationError, # type: ignore - object_=_response.json(), - ), - ) - ) - _response_json = _response.json() - except JSONDecodeError: - raise ApiError(status_code=_response.status_code, body=_response.text) - raise ApiError(status_code=_response.status_code, body=_response_json) - - async def pin_evaluatee( - self, - id: str, - *, - version_id: typing.Optional[str] = OMIT, - path: typing.Optional[str] = OMIT, - file_id: typing.Optional[str] = OMIT, - environment: typing.Optional[str] = OMIT, - batch_id: typing.Optional[str] = OMIT, - orchestrated: typing.Optional[bool] = OMIT, - request_options: typing.Optional[RequestOptions] = None, - ) -> EvaluationResponse: - """ - Pin the specified Evaluatee. - - Pinned Evaluatees are always displayed in the Evaluation Overview, - and serve as the baseline for comparison with other Evaluatees. - - Parameters - ---------- - id : str - Unique identifier for Evaluation. - - version_id : typing.Optional[str] - Unique identifier for the File Version. If provided, none of the other fields should be specified. - - path : typing.Optional[str] - Path identifying a File. Provide either this or `file_id` if you want to specify a File. - - file_id : typing.Optional[str] - Unique identifier for the File. Provide either this or `path` if you want to specify a File. - - environment : typing.Optional[str] - Name of the Environment a Version is deployed to. Only provide this when specifying a File. If not provided (and a File is specified), the default Environment is used. - - batch_id : typing.Optional[str] - Unique identifier for the batch of Logs to include in the Evaluation Report. - - orchestrated : typing.Optional[bool] - Whether the Prompt/Tool is orchestrated by Humanloop. Default is `True`. If `False`, a log for the Prompt/Tool should be submitted by the user via the API. - - request_options : typing.Optional[RequestOptions] - Request-specific configuration. 
- - Returns - ------- - EvaluationResponse - Successful Response - - Examples - -------- - import asyncio - - from humanloop import AsyncHumanloop - - client = AsyncHumanloop( - api_key="YOUR_API_KEY", - ) - - - async def main() -> None: - await client.evaluations.pin_evaluatee( - id="id", - ) - - - asyncio.run(main()) - """ - _response = await self._client_wrapper.httpx_client.request( - f"evaluations/{jsonable_encoder(id)}/pin-evaluatee", - method="POST", - json={ - "version_id": version_id, - "path": path, - "file_id": file_id, - "environment": environment, - "batch_id": batch_id, - "orchestrated": orchestrated, - }, - request_options=request_options, - omit=OMIT, - ) - try: - if 200 <= _response.status_code < 300: - return typing.cast( - EvaluationResponse, + PaginatedDataEvaluationLogResponse, construct_type( - type_=EvaluationResponse, # type: ignore + type_=PaginatedDataEvaluationLogResponse, # type: ignore object_=_response.json(), ), ) diff --git a/src/humanloop/evaluators/client.py b/src/humanloop/evaluators/client.py index e05ae5cc..5e7ae73e 100644 --- a/src/humanloop/evaluators/client.py +++ b/src/humanloop/evaluators/client.py @@ -64,7 +64,6 @@ def log( metadata: typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]] = OMIT, source_datapoint_id: typing.Optional[str] = OMIT, trace_parent_id: typing.Optional[str] = OMIT, - batch_id: typing.Optional[str] = OMIT, user: typing.Optional[str] = OMIT, create_evaluator_log_request_environment: typing.Optional[str] = OMIT, save: typing.Optional[bool] = OMIT, @@ -136,9 +135,6 @@ def log( trace_parent_id : typing.Optional[str] The ID of the parent Log to nest this Log under in a Trace. - batch_id : typing.Optional[str] - Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist. - user : typing.Optional[str] End-user ID related to the Log. @@ -197,7 +193,6 @@ def log( "parent_id": parent_id, "source_datapoint_id": source_datapoint_id, "trace_parent_id": trace_parent_id, - "batch_id": batch_id, "user": user, "environment": create_evaluator_log_request_environment, "save": save, @@ -1093,7 +1088,6 @@ async def log( metadata: typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]] = OMIT, source_datapoint_id: typing.Optional[str] = OMIT, trace_parent_id: typing.Optional[str] = OMIT, - batch_id: typing.Optional[str] = OMIT, user: typing.Optional[str] = OMIT, create_evaluator_log_request_environment: typing.Optional[str] = OMIT, save: typing.Optional[bool] = OMIT, @@ -1165,9 +1159,6 @@ async def log( trace_parent_id : typing.Optional[str] The ID of the parent Log to nest this Log under in a Trace. - batch_id : typing.Optional[str] - Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist. - user : typing.Optional[str] End-user ID related to the Log. 
@@ -1234,7 +1225,6 @@ async def main() -> None: "parent_id": parent_id, "source_datapoint_id": source_datapoint_id, "trace_parent_id": trace_parent_id, - "batch_id": batch_id, "user": user, "environment": create_evaluator_log_request_environment, "save": save, diff --git a/src/humanloop/flows/client.py b/src/humanloop/flows/client.py index 6b75d942..4b4671e7 100644 --- a/src/humanloop/flows/client.py +++ b/src/humanloop/flows/client.py @@ -45,7 +45,7 @@ def log( *, version_id: typing.Optional[str] = None, environment: typing.Optional[str] = None, - evaluation_id: typing.Optional[str] = OMIT, + run_id: typing.Optional[str] = OMIT, path: typing.Optional[str] = OMIT, id: typing.Optional[str] = OMIT, start_time: typing.Optional[dt.datetime] = OMIT, @@ -62,7 +62,6 @@ def log( metadata: typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]] = OMIT, source_datapoint_id: typing.Optional[str] = OMIT, trace_parent_id: typing.Optional[str] = OMIT, - batch_id: typing.Optional[str] = OMIT, user: typing.Optional[str] = OMIT, flow_log_request_environment: typing.Optional[str] = OMIT, save: typing.Optional[bool] = OMIT, @@ -85,8 +84,8 @@ def log( environment : typing.Optional[str] Name of the Environment identifying a deployed version to log to. - evaluation_id : typing.Optional[str] - Unique identifier for the Evaluation Report to associate the Log to. + run_id : typing.Optional[str] + Unique identifier for the Run to associate the Log to. path : typing.Optional[str] Path of the Flow, including the name. This locates the Flow in the Humanloop filesystem and is used as as a unique identifier. For example: `folder/name` or just `name`. @@ -136,9 +135,6 @@ def log( trace_parent_id : typing.Optional[str] The ID of the parent Log to nest this Log under in a Trace. - batch_id : typing.Optional[str] - Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist. - user : typing.Optional[str] End-user ID related to the Log. @@ -197,10 +193,10 @@ def log( output="The patient is likely experiencing a myocardial infarction. 
Immediate medical attention is required.", trace_status="incomplete", start_time=datetime.datetime.fromisoformat( - "2024-07-08 22:40:35+00:00", + "2024-07-08 21:40:35+00:00", ), end_time=datetime.datetime.fromisoformat( - "2024-07-08 22:40:39+00:00", + "2024-07-08 21:40:39+00:00", ), ) """ @@ -212,7 +208,7 @@ def log( "environment": environment, }, json={ - "evaluation_id": evaluation_id, + "run_id": run_id, "path": path, "id": id, "start_time": start_time, @@ -229,7 +225,6 @@ def log( "metadata": metadata, "source_datapoint_id": source_datapoint_id, "trace_parent_id": trace_parent_id, - "batch_id": batch_id, "user": user, "environment": flow_log_request_environment, "save": save, @@ -1210,7 +1205,7 @@ async def log( *, version_id: typing.Optional[str] = None, environment: typing.Optional[str] = None, - evaluation_id: typing.Optional[str] = OMIT, + run_id: typing.Optional[str] = OMIT, path: typing.Optional[str] = OMIT, id: typing.Optional[str] = OMIT, start_time: typing.Optional[dt.datetime] = OMIT, @@ -1227,7 +1222,6 @@ async def log( metadata: typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]] = OMIT, source_datapoint_id: typing.Optional[str] = OMIT, trace_parent_id: typing.Optional[str] = OMIT, - batch_id: typing.Optional[str] = OMIT, user: typing.Optional[str] = OMIT, flow_log_request_environment: typing.Optional[str] = OMIT, save: typing.Optional[bool] = OMIT, @@ -1250,8 +1244,8 @@ async def log( environment : typing.Optional[str] Name of the Environment identifying a deployed version to log to. - evaluation_id : typing.Optional[str] - Unique identifier for the Evaluation Report to associate the Log to. + run_id : typing.Optional[str] + Unique identifier for the Run to associate the Log to. path : typing.Optional[str] Path of the Flow, including the name. This locates the Flow in the Humanloop filesystem and is used as as a unique identifier. For example: `folder/name` or just `name`. @@ -1301,9 +1295,6 @@ async def log( trace_parent_id : typing.Optional[str] The ID of the parent Log to nest this Log under in a Trace. - batch_id : typing.Optional[str] - Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist. - user : typing.Optional[str] End-user ID related to the Log. @@ -1366,10 +1357,10 @@ async def main() -> None: output="The patient is likely experiencing a myocardial infarction. 
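For Flows, `run_id` plays the same role as it does for Prompts: it attaches the Log to an Evaluation Run in place of the removed `evaluation_id`/`batch_id` grouping. A minimal sketch with a placeholder path and Run ID:

```python
import datetime

from humanloop import Humanloop

client = Humanloop(api_key="YOUR_API_KEY")

# Log a Flow execution and associate it with an Evaluation Run.
client.flows.log(
    path="folder/name",  # placeholder Flow path
    output="...",
    start_time=datetime.datetime.fromisoformat("2024-07-08 21:40:35+00:00"),
    end_time=datetime.datetime.fromisoformat("2024-07-08 21:40:39+00:00"),
    run_id="run_id",     # placeholder Run ID
)
```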
Immediate medical attention is required.", trace_status="incomplete", start_time=datetime.datetime.fromisoformat( - "2024-07-08 22:40:35+00:00", + "2024-07-08 21:40:35+00:00", ), end_time=datetime.datetime.fromisoformat( - "2024-07-08 22:40:39+00:00", + "2024-07-08 21:40:39+00:00", ), ) @@ -1384,7 +1375,7 @@ async def main() -> None: "environment": environment, }, json={ - "evaluation_id": evaluation_id, + "run_id": run_id, "path": path, "id": id, "start_time": start_time, @@ -1401,7 +1392,6 @@ async def main() -> None: "metadata": metadata, "source_datapoint_id": source_datapoint_id, "trace_parent_id": trace_parent_id, - "batch_id": batch_id, "user": user, "environment": flow_log_request_environment, "save": save, diff --git a/src/humanloop/prompts/client.py b/src/humanloop/prompts/client.py index a9332565..88cfa117 100644 --- a/src/humanloop/prompts/client.py +++ b/src/humanloop/prompts/client.py @@ -60,7 +60,7 @@ def log( *, version_id: typing.Optional[str] = None, environment: typing.Optional[str] = None, - evaluation_id: typing.Optional[str] = OMIT, + run_id: typing.Optional[str] = OMIT, path: typing.Optional[str] = OMIT, id: typing.Optional[str] = OMIT, output_message: typing.Optional[ChatMessageParams] = OMIT, @@ -86,7 +86,6 @@ def log( metadata: typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]] = OMIT, source_datapoint_id: typing.Optional[str] = OMIT, trace_parent_id: typing.Optional[str] = OMIT, - batch_id: typing.Optional[str] = OMIT, user: typing.Optional[str] = OMIT, prompt_log_request_environment: typing.Optional[str] = OMIT, save: typing.Optional[bool] = OMIT, @@ -111,8 +110,8 @@ def log( environment : typing.Optional[str] Name of the Environment identifying a deployed version to log to. - evaluation_id : typing.Optional[str] - Unique identifier for the Evaluation Report to associate the Log to. + run_id : typing.Optional[str] + Unique identifier for the Run to associate the Log to. path : typing.Optional[str] Path of the Prompt, including the name. This locates the Prompt in the Humanloop filesystem and is used as as a unique identifier. For example: `folder/name` or just `name`. @@ -193,9 +192,6 @@ def log( trace_parent_id : typing.Optional[str] The ID of the parent Log to nest this Log under in a Trace. - batch_id : typing.Optional[str] - Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist. - user : typing.Optional[str] End-user ID related to the Log. 
@@ -236,7 +232,7 @@ def log( messages=[{"role": "user", "content": "What really happened at Roswell?"}], inputs={"person": "Trump"}, created_at=datetime.datetime.fromisoformat( - "2024-07-19 00:29:35.178000+00:00", + "2024-07-18 23:29:35.178000+00:00", ), provider_latency=6.5931549072265625, output_message={ @@ -258,7 +254,7 @@ def log( "environment": environment, }, json={ - "evaluation_id": evaluation_id, + "run_id": run_id, "path": path, "id": id, "output_message": convert_and_respect_annotation_metadata( @@ -292,7 +288,6 @@ def log( "metadata": metadata, "source_datapoint_id": source_datapoint_id, "trace_parent_id": trace_parent_id, - "batch_id": batch_id, "user": user, "environment": prompt_log_request_environment, "save": save, @@ -523,7 +518,6 @@ def call_stream( end_time: typing.Optional[dt.datetime] = OMIT, source_datapoint_id: typing.Optional[str] = OMIT, trace_parent_id: typing.Optional[str] = OMIT, - batch_id: typing.Optional[str] = OMIT, user: typing.Optional[str] = OMIT, prompts_call_stream_request_environment: typing.Optional[str] = OMIT, save: typing.Optional[bool] = OMIT, @@ -596,9 +590,6 @@ def call_stream( trace_parent_id : typing.Optional[str] The ID of the parent Log to nest this Log under in a Trace. - batch_id : typing.Optional[str] - Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist. - user : typing.Optional[str] End-user ID related to the Log. @@ -672,7 +663,6 @@ def call_stream( ), source_datapoint_id="string", trace_parent_id="string", - batch_id="string", user="string", prompts_call_stream_request_environment="string", save=True, @@ -720,7 +710,6 @@ def call_stream( "end_time": end_time, "source_datapoint_id": source_datapoint_id, "trace_parent_id": trace_parent_id, - "batch_id": batch_id, "user": user, "environment": prompts_call_stream_request_environment, "save": save, @@ -784,7 +773,6 @@ def call( end_time: typing.Optional[dt.datetime] = OMIT, source_datapoint_id: typing.Optional[str] = OMIT, trace_parent_id: typing.Optional[str] = OMIT, - batch_id: typing.Optional[str] = OMIT, user: typing.Optional[str] = OMIT, prompts_call_request_environment: typing.Optional[str] = OMIT, save: typing.Optional[bool] = OMIT, @@ -857,9 +845,6 @@ def call( trace_parent_id : typing.Optional[str] The ID of the parent Log to nest this Log under in a Trace. - batch_id : typing.Optional[str] - Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist. - user : typing.Optional[str] End-user ID related to the Log. 
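The same renaming applies to Prompt Logs: `run_id` ties the Log to an Evaluation Run, and `source_datapoint_id` links it back to the Datapoint it was generated from, which is what the `dynamic` Log association described earlier keys on. A minimal sketch with placeholder IDs:

```python
from humanloop import Humanloop

client = Humanloop(api_key="YOUR_API_KEY")

# Log a Prompt generation produced by your own runtime against a Run.
client.prompts.log(
    path="folder/name",  # placeholder Prompt path
    messages=[{"role": "user", "content": "..."}],
    output_message={"role": "assistant", "content": "..."},
    run_id="run_id",                     # placeholder Run ID
    source_datapoint_id="datapoint_id",  # placeholder Datapoint ID
)
```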
@@ -956,7 +941,6 @@ def call( "end_time": end_time, "source_datapoint_id": source_datapoint_id, "trace_parent_id": trace_parent_id, - "batch_id": batch_id, "user": user, "environment": prompts_call_request_environment, "save": save, @@ -1935,7 +1919,7 @@ async def log( *, version_id: typing.Optional[str] = None, environment: typing.Optional[str] = None, - evaluation_id: typing.Optional[str] = OMIT, + run_id: typing.Optional[str] = OMIT, path: typing.Optional[str] = OMIT, id: typing.Optional[str] = OMIT, output_message: typing.Optional[ChatMessageParams] = OMIT, @@ -1961,7 +1945,6 @@ async def log( metadata: typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]] = OMIT, source_datapoint_id: typing.Optional[str] = OMIT, trace_parent_id: typing.Optional[str] = OMIT, - batch_id: typing.Optional[str] = OMIT, user: typing.Optional[str] = OMIT, prompt_log_request_environment: typing.Optional[str] = OMIT, save: typing.Optional[bool] = OMIT, @@ -1986,8 +1969,8 @@ async def log( environment : typing.Optional[str] Name of the Environment identifying a deployed version to log to. - evaluation_id : typing.Optional[str] - Unique identifier for the Evaluation Report to associate the Log to. + run_id : typing.Optional[str] + Unique identifier for the Run to associate the Log to. path : typing.Optional[str] Path of the Prompt, including the name. This locates the Prompt in the Humanloop filesystem and is used as as a unique identifier. For example: `folder/name` or just `name`. @@ -2068,9 +2051,6 @@ async def log( trace_parent_id : typing.Optional[str] The ID of the parent Log to nest this Log under in a Trace. - batch_id : typing.Optional[str] - Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist. - user : typing.Optional[str] End-user ID related to the Log. @@ -2117,7 +2097,7 @@ async def main() -> None: ], inputs={"person": "Trump"}, created_at=datetime.datetime.fromisoformat( - "2024-07-19 00:29:35.178000+00:00", + "2024-07-18 23:29:35.178000+00:00", ), provider_latency=6.5931549072265625, output_message={ @@ -2142,7 +2122,7 @@ async def main() -> None: "environment": environment, }, json={ - "evaluation_id": evaluation_id, + "run_id": run_id, "path": path, "id": id, "output_message": convert_and_respect_annotation_metadata( @@ -2176,7 +2156,6 @@ async def main() -> None: "metadata": metadata, "source_datapoint_id": source_datapoint_id, "trace_parent_id": trace_parent_id, - "batch_id": batch_id, "user": user, "environment": prompt_log_request_environment, "save": save, @@ -2415,7 +2394,6 @@ async def call_stream( end_time: typing.Optional[dt.datetime] = OMIT, source_datapoint_id: typing.Optional[str] = OMIT, trace_parent_id: typing.Optional[str] = OMIT, - batch_id: typing.Optional[str] = OMIT, user: typing.Optional[str] = OMIT, prompts_call_stream_request_environment: typing.Optional[str] = OMIT, save: typing.Optional[bool] = OMIT, @@ -2488,9 +2466,6 @@ async def call_stream( trace_parent_id : typing.Optional[str] The ID of the parent Log to nest this Log under in a Trace. - batch_id : typing.Optional[str] - Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist. - user : typing.Optional[str] End-user ID related to the Log. 
@@ -2568,7 +2543,6 @@ async def main() -> None: ), source_datapoint_id="string", trace_parent_id="string", - batch_id="string", user="string", prompts_call_stream_request_environment="string", save=True, @@ -2619,7 +2593,6 @@ async def main() -> None: "end_time": end_time, "source_datapoint_id": source_datapoint_id, "trace_parent_id": trace_parent_id, - "batch_id": batch_id, "user": user, "environment": prompts_call_stream_request_environment, "save": save, @@ -2683,7 +2656,6 @@ async def call( end_time: typing.Optional[dt.datetime] = OMIT, source_datapoint_id: typing.Optional[str] = OMIT, trace_parent_id: typing.Optional[str] = OMIT, - batch_id: typing.Optional[str] = OMIT, user: typing.Optional[str] = OMIT, prompts_call_request_environment: typing.Optional[str] = OMIT, save: typing.Optional[bool] = OMIT, @@ -2756,9 +2728,6 @@ async def call( trace_parent_id : typing.Optional[str] The ID of the parent Log to nest this Log under in a Trace. - batch_id : typing.Optional[str] - Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist. - user : typing.Optional[str] End-user ID related to the Log. @@ -2863,7 +2832,6 @@ async def main() -> None: "end_time": end_time, "source_datapoint_id": source_datapoint_id, "trace_parent_id": trace_parent_id, - "batch_id": batch_id, "user": user, "environment": prompts_call_request_environment, "save": save, diff --git a/src/humanloop/requests/__init__.py b/src/humanloop/requests/__init__.py index a8f2e97e..e33e9078 100644 --- a/src/humanloop/requests/__init__.py +++ b/src/humanloop/requests/__init__.py @@ -23,12 +23,13 @@ DirectoryWithParentsAndChildrenResponseFilesItemParams, ) from .environment_response import EnvironmentResponseParams -from .evaluated_version_response import EvaluatedVersionResponseParams from .evaluatee_request import EvaluateeRequestParams from .evaluatee_response import EvaluateeResponseParams from .evaluation_evaluator_response import EvaluationEvaluatorResponseParams -from .evaluation_report_log_response import EvaluationReportLogResponseParams +from .evaluation_log_response import EvaluationLogResponseParams from .evaluation_response import EvaluationResponseParams +from .evaluation_run_response import EvaluationRunResponseParams +from .evaluation_runs_response import EvaluationRunsResponseParams from .evaluation_stats import EvaluationStatsParams from .evaluations_dataset_request import EvaluationsDatasetRequestParams from .evaluations_request import EvaluationsRequestParams @@ -74,7 +75,7 @@ from .monitoring_evaluator_version_request import MonitoringEvaluatorVersionRequestParams from .numeric_evaluator_stats_response import NumericEvaluatorStatsResponseParams from .overall_stats import OverallStatsParams -from .paginated_data_evaluation_report_log_response import PaginatedDataEvaluationReportLogResponseParams +from .paginated_data_evaluation_log_response import PaginatedDataEvaluationLogResponseParams from .paginated_data_evaluator_response import PaginatedDataEvaluatorResponseParams from .paginated_data_flow_response import PaginatedDataFlowResponseParams from .paginated_data_log_response import PaginatedDataLogResponseParams @@ -103,6 +104,9 @@ from .prompt_response_template import PromptResponseTemplateParams from .provider_api_keys import ProviderApiKeysParams from .response_format import ResponseFormatParams +from .run_stats_response import RunStatsResponseParams +from 
.run_stats_response_evaluator_stats_item import RunStatsResponseEvaluatorStatsItemParams +from .run_version_response import RunVersionResponseParams from .select_evaluator_stats_response import SelectEvaluatorStatsResponseParams from .text_chat_content import TextChatContentParams from .text_evaluator_stats_response import TextEvaluatorStatsResponseParams @@ -119,6 +123,7 @@ from .version_id_response import VersionIdResponseParams from .version_id_response_version import VersionIdResponseVersionParams from .version_reference_response import VersionReferenceResponseParams +from .version_specification import VersionSpecificationParams from .version_stats_response import VersionStatsResponseParams from .version_stats_response_evaluator_version_stats_item import VersionStatsResponseEvaluatorVersionStatsItemParams @@ -144,12 +149,13 @@ "DirectoryWithParentsAndChildrenResponseFilesItemParams", "DirectoryWithParentsAndChildrenResponseParams", "EnvironmentResponseParams", - "EvaluatedVersionResponseParams", "EvaluateeRequestParams", "EvaluateeResponseParams", "EvaluationEvaluatorResponseParams", - "EvaluationReportLogResponseParams", + "EvaluationLogResponseParams", "EvaluationResponseParams", + "EvaluationRunResponseParams", + "EvaluationRunsResponseParams", "EvaluationStatsParams", "EvaluationsDatasetRequestParams", "EvaluationsRequestParams", @@ -191,7 +197,7 @@ "MonitoringEvaluatorVersionRequestParams", "NumericEvaluatorStatsResponseParams", "OverallStatsParams", - "PaginatedDataEvaluationReportLogResponseParams", + "PaginatedDataEvaluationLogResponseParams", "PaginatedDataEvaluatorResponseParams", "PaginatedDataFlowResponseParams", "PaginatedDataLogResponseParams", @@ -216,6 +222,9 @@ "PromptResponseTemplateParams", "ProviderApiKeysParams", "ResponseFormatParams", + "RunStatsResponseEvaluatorStatsItemParams", + "RunStatsResponseParams", + "RunVersionResponseParams", "SelectEvaluatorStatsResponseParams", "TextChatContentParams", "TextEvaluatorStatsResponseParams", @@ -232,6 +241,7 @@ "VersionIdResponseParams", "VersionIdResponseVersionParams", "VersionReferenceResponseParams", + "VersionSpecificationParams", "VersionStatsResponseEvaluatorVersionStatsItemParams", "VersionStatsResponseParams", ] diff --git a/src/humanloop/requests/boolean_evaluator_stats_response.py b/src/humanloop/requests/boolean_evaluator_stats_response.py index 33d9b44f..18618f40 100644 --- a/src/humanloop/requests/boolean_evaluator_stats_response.py +++ b/src/humanloop/requests/boolean_evaluator_stats_response.py @@ -6,7 +6,7 @@ class BooleanEvaluatorStatsResponseParams(typing_extensions.TypedDict): """ Base attributes for stats for an Evaluator Version-Evaluated Version pair - in the Evaluation Report. + in the Evaluation. """ evaluator_version_id: str diff --git a/src/humanloop/requests/dataset_response.py b/src/humanloop/requests/dataset_response.py index 941cf0d0..56fcc4ed 100644 --- a/src/humanloop/requests/dataset_response.py +++ b/src/humanloop/requests/dataset_response.py @@ -56,6 +56,16 @@ class DatasetResponseParams(typing_extensions.TypedDict): The user who created the Dataset. """ + committed_by: typing_extensions.NotRequired[UserResponse] + """ + The user who committed the Dataset Version. + """ + + committed_at: typing_extensions.NotRequired[dt.datetime] + """ + The date and time the Dataset Version was committed. + """ + status: VersionStatus """ The status of the Dataset Version. 
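The new `committed_by` and `committed_at` fields are optional on the version response models; a hedged helper that reads them off an already-fetched `DatasetResponse` (how the model is fetched is left out, and the `UserResponse` attribute names are probed defensively because they are not shown here):

```python
# Hedged sketch: summarise commit provenance from the new optional fields on
# DatasetResponse. Only `committed_at` / `committed_by` are taken as given;
# the user attribute names are guesses, hence the defensive getattr calls.
from humanloop.types import DatasetResponse  # assumed exported like the other response models


def dataset_commit_info(dataset: DatasetResponse) -> str:
    if dataset.committed_at is None:
        return "uncommitted draft"
    by = dataset.committed_by  # optional even when committed_at is set
    who = getattr(by, "full_name", None) or getattr(by, "email_address", None) or "unknown user"
    return f"committed {dataset.committed_at.isoformat()} by {who}"
```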
diff --git a/src/humanloop/requests/evaluatee_request.py b/src/humanloop/requests/evaluatee_request.py index 2eba177f..26e048c5 100644 --- a/src/humanloop/requests/evaluatee_request.py +++ b/src/humanloop/requests/evaluatee_request.py @@ -38,7 +38,7 @@ class EvaluateeRequestParams(typing_extensions.TypedDict): batch_id: typing_extensions.NotRequired[str] """ - Unique identifier for the batch of Logs to include in the Evaluation Report. + Unique identifier for the batch of Logs to include in the Evaluation. """ orchestrated: typing_extensions.NotRequired[bool] diff --git a/src/humanloop/requests/evaluatee_response.py b/src/humanloop/requests/evaluatee_response.py index 6ea5d9ba..411ba5ba 100644 --- a/src/humanloop/requests/evaluatee_response.py +++ b/src/humanloop/requests/evaluatee_response.py @@ -1,8 +1,8 @@ # This file was auto-generated by Fern from our API Definition. import typing_extensions -from .evaluated_version_response import EvaluatedVersionResponseParams import typing_extensions +from .run_version_response import RunVersionResponseParams import datetime as dt @@ -11,10 +11,10 @@ class EvaluateeResponseParams(typing_extensions.TypedDict): Version of the Evaluatee being evaluated. """ - version: EvaluatedVersionResponseParams + version: typing_extensions.NotRequired[RunVersionResponseParams] batch_id: typing_extensions.NotRequired[str] """ - Unique identifier for the batch of Logs to include in the Evaluation Report. + Unique identifier for the batch of Logs to include in the Evaluation. """ orchestrated: bool diff --git a/src/humanloop/requests/evaluation_report_log_response.py b/src/humanloop/requests/evaluation_log_response.py similarity index 59% rename from src/humanloop/requests/evaluation_report_log_response.py rename to src/humanloop/requests/evaluation_log_response.py index 5aa9a042..8fe5d762 100644 --- a/src/humanloop/requests/evaluation_report_log_response.py +++ b/src/humanloop/requests/evaluation_log_response.py @@ -1,17 +1,15 @@ # This file was auto-generated by Fern from our API Definition. import typing_extensions -from .evaluated_version_response import EvaluatedVersionResponseParams from .datapoint_response import DatapointResponseParams -import typing_extensions from .log_response import LogResponseParams import typing -class EvaluationReportLogResponseParams(typing_extensions.TypedDict): - evaluated_version: EvaluatedVersionResponseParams +class EvaluationLogResponseParams(typing_extensions.TypedDict): + run_id: str """ - The version of the Prompt, Tool or Evaluator that the Log belongs to. + Unique identifier for the Run. """ datapoint: DatapointResponseParams @@ -19,7 +17,7 @@ class EvaluationReportLogResponseParams(typing_extensions.TypedDict): The Datapoint used to generate the Log """ - log: typing_extensions.NotRequired[LogResponseParams] + log: LogResponseParams """ The Log that was evaluated by the Evaluator. """ diff --git a/src/humanloop/requests/evaluation_response.py b/src/humanloop/requests/evaluation_response.py index fe09cad4..27d9da73 100644 --- a/src/humanloop/requests/evaluation_response.py +++ b/src/humanloop/requests/evaluation_response.py @@ -1,11 +1,8 @@ # This file was auto-generated by Fern from our API Definition. 
import typing_extensions -from .dataset_response import DatasetResponseParams import typing -from .evaluatee_response import EvaluateeResponseParams from .evaluation_evaluator_response import EvaluationEvaluatorResponseParams -from ..types.evaluation_status import EvaluationStatus import typing_extensions import datetime as dt from ..types.user_response import UserResponse @@ -17,14 +14,9 @@ class EvaluationResponseParams(typing_extensions.TypedDict): Unique identifier for the Evaluation. Starts with `evr`. """ - dataset: DatasetResponseParams + runs_count: int """ - The Dataset used in the Evaluation. - """ - - evaluatees: typing.Sequence[EvaluateeResponseParams] - """ - The Prompt/Tool Versions included in the Evaluation. + The total number of Runs in the Evaluation. """ evaluators: typing.Sequence[EvaluationEvaluatorResponseParams] @@ -32,16 +24,6 @@ class EvaluationResponseParams(typing_extensions.TypedDict): The Evaluator Versions used to evaluate. """ - status: EvaluationStatus - """ - The current status of the Evaluation. - - - `"pending"`: The Evaluation has been created but is not actively being worked on by Humanloop. - - `"running"`: Humanloop is checking for any missing Logs and Evaluator Logs, and will generate them where appropriate. - - `"completed"`: All Logs an Evaluator Logs have been generated. - - `"cancelled"`: The Evaluation has been cancelled by the user. Humanloop will stop generating Logs and Evaluator Logs. - """ - name: typing_extensions.NotRequired[str] """ Name of the Evaluation to help identify it. Must be unique among Evaluations associated with File. diff --git a/src/humanloop/requests/evaluation_run_response.py b/src/humanloop/requests/evaluation_run_response.py new file mode 100644 index 00000000..98ccfd75 --- /dev/null +++ b/src/humanloop/requests/evaluation_run_response.py @@ -0,0 +1,56 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing_extensions +import typing_extensions +from .dataset_response import DatasetResponseParams +from .run_version_response import RunVersionResponseParams +import datetime as dt +from ..types.user_response import UserResponse +from ..types.evaluation_status import EvaluationStatus + + +class EvaluationRunResponseParams(typing_extensions.TypedDict): + id: str + """ + Unique identifier for the Run. + """ + + dataset: typing_extensions.NotRequired[DatasetResponseParams] + """ + The Dataset used in the Run. + """ + + version: typing_extensions.NotRequired[RunVersionResponseParams] + """ + The version used in the Run. + """ + + orchestrated: bool + """ + Whether the Run is orchestrated by Humanloop. + """ + + added_at: dt.datetime + """ + When the Run was added to the Evaluation. + """ + + created_at: dt.datetime + """ + When the Run was created. + """ + + created_by: typing_extensions.NotRequired[UserResponse] + """ + The User who created the Run. + """ + + status: EvaluationStatus + """ + The status of the Run. + """ + + control: bool + """ + Stats for other Runs will be displayed in comparison to the control Run. + """ diff --git a/src/humanloop/requests/evaluation_runs_response.py b/src/humanloop/requests/evaluation_runs_response.py new file mode 100644 index 00000000..a6e86d68 --- /dev/null +++ b/src/humanloop/requests/evaluation_runs_response.py @@ -0,0 +1,12 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing_extensions +import typing +from .evaluation_run_response import EvaluationRunResponseParams + + +class EvaluationRunsResponseParams(typing_extensions.TypedDict): + runs: typing.Sequence[EvaluationRunResponseParams] + """ + The Runs in the Evaluation. + """ diff --git a/src/humanloop/requests/evaluation_stats.py b/src/humanloop/requests/evaluation_stats.py index b605ac2b..0a5a6a4a 100644 --- a/src/humanloop/requests/evaluation_stats.py +++ b/src/humanloop/requests/evaluation_stats.py @@ -1,22 +1,16 @@ # This file was auto-generated by Fern from our API Definition. import typing_extensions -from .overall_stats import OverallStatsParams import typing -from .version_stats_response import VersionStatsResponseParams +from .run_stats_response import RunStatsResponseParams import typing_extensions from ..types.evaluation_status import EvaluationStatus class EvaluationStatsParams(typing_extensions.TypedDict): - overall_stats: OverallStatsParams + run_stats: typing.Sequence[RunStatsResponseParams] """ - Stats for the Evaluation Report as a whole. - """ - - version_stats: typing.Sequence[VersionStatsResponseParams] - """ - Stats for each Evaluated Version in the Evaluation Report. + Stats for each Run in the Evaluation. """ progress: typing_extensions.NotRequired[str] diff --git a/src/humanloop/requests/evaluator_response.py b/src/humanloop/requests/evaluator_response.py index 888a55ff..609c11e4 100644 --- a/src/humanloop/requests/evaluator_response.py +++ b/src/humanloop/requests/evaluator_response.py @@ -66,6 +66,16 @@ class EvaluatorResponseParams(typing_extensions.TypedDict): The user who created the Evaluator. """ + committed_by: typing_extensions.NotRequired[UserResponse] + """ + The user who committed the Evaluator Version. + """ + + committed_at: typing_extensions.NotRequired[dt.datetime] + """ + The date and time the Evaluator Version was committed. + """ + status: VersionStatus last_used_at: dt.datetime version_logs_count: int diff --git a/src/humanloop/requests/flow_response.py b/src/humanloop/requests/flow_response.py index 27a004ec..60b7753a 100644 --- a/src/humanloop/requests/flow_response.py +++ b/src/humanloop/requests/flow_response.py @@ -68,6 +68,16 @@ class FlowResponseParams(typing_extensions.TypedDict): The user who created the Flow. """ + committed_by: typing_extensions.NotRequired[UserResponse] + """ + The user who committed the Flow Version. + """ + + committed_at: typing_extensions.NotRequired[dt.datetime] + """ + The date and time the Flow Version was committed. + """ + status: VersionStatus """ The status of the Flow Version. diff --git a/src/humanloop/requests/numeric_evaluator_stats_response.py b/src/humanloop/requests/numeric_evaluator_stats_response.py index 91eb5b4c..4edbda84 100644 --- a/src/humanloop/requests/numeric_evaluator_stats_response.py +++ b/src/humanloop/requests/numeric_evaluator_stats_response.py @@ -8,7 +8,7 @@ class NumericEvaluatorStatsResponseParams(typing_extensions.TypedDict): """ Base attributes for stats for an Evaluator Version-Evaluated Version pair - in the Evaluation Report. + in the Evaluation. """ evaluator_version_id: str diff --git a/src/humanloop/requests/overall_stats.py b/src/humanloop/requests/overall_stats.py index 5946d210..da04f19f 100644 --- a/src/humanloop/requests/overall_stats.py +++ b/src/humanloop/requests/overall_stats.py @@ -6,15 +6,15 @@ class OverallStatsParams(typing_extensions.TypedDict): num_datapoints: int """ - The total number of Datapoints in the Evaluation Report's Dataset Version. 
+ The total number of Datapoints in the Evaluation's Dataset Version. """ total_logs: int """ - The total number of Logs in the Evaluation Report. + The total number of Logs in the Evaluation. """ total_evaluator_logs: int """ - The total number of Evaluator Logs in the Evaluation Report. + The total number of Evaluator Logs in the Evaluation. """ diff --git a/src/humanloop/requests/paginated_data_evaluation_log_response.py b/src/humanloop/requests/paginated_data_evaluation_log_response.py new file mode 100644 index 00000000..e9723472 --- /dev/null +++ b/src/humanloop/requests/paginated_data_evaluation_log_response.py @@ -0,0 +1,12 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing_extensions +import typing +from .evaluation_log_response import EvaluationLogResponseParams + + +class PaginatedDataEvaluationLogResponseParams(typing_extensions.TypedDict): + records: typing.Sequence[EvaluationLogResponseParams] + page: int + size: int + total: int diff --git a/src/humanloop/requests/paginated_data_evaluation_report_log_response.py b/src/humanloop/requests/paginated_data_evaluation_report_log_response.py deleted file mode 100644 index bdc88d6a..00000000 --- a/src/humanloop/requests/paginated_data_evaluation_report_log_response.py +++ /dev/null @@ -1,12 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing_extensions -import typing -from .evaluation_report_log_response import EvaluationReportLogResponseParams - - -class PaginatedDataEvaluationReportLogResponseParams(typing_extensions.TypedDict): - records: typing.Sequence[EvaluationReportLogResponseParams] - page: int - size: int - total: int diff --git a/src/humanloop/requests/prompt_call_response.py b/src/humanloop/requests/prompt_call_response.py index 685d6b44..7a66ecbd 100644 --- a/src/humanloop/requests/prompt_call_response.py +++ b/src/humanloop/requests/prompt_call_response.py @@ -70,11 +70,6 @@ class PromptCallResponseParams(typing_extensions.TypedDict): The ID of the parent Log to nest this Log under in a Trace. """ - batch_id: typing_extensions.NotRequired[str] - """ - Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist. - """ - user: typing_extensions.NotRequired[str] """ End-user ID related to the Log. diff --git a/src/humanloop/requests/prompt_response.py b/src/humanloop/requests/prompt_response.py index 918813ec..50039007 100644 --- a/src/humanloop/requests/prompt_response.py +++ b/src/humanloop/requests/prompt_response.py @@ -159,6 +159,16 @@ class PromptResponseParams(typing_extensions.TypedDict): The user who created the Prompt. """ + committed_by: typing_extensions.NotRequired[UserResponse] + """ + The user who committed the Prompt Version. + """ + + committed_at: typing_extensions.NotRequired[dt.datetime] + """ + The date and time the Prompt Version was committed. + """ + status: VersionStatus """ The status of the Prompt Version. diff --git a/src/humanloop/requests/run_stats_response.py b/src/humanloop/requests/run_stats_response.py new file mode 100644 index 00000000..0cb19389 --- /dev/null +++ b/src/humanloop/requests/run_stats_response.py @@ -0,0 +1,37 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing_extensions +import typing_extensions +import typing +from .run_stats_response_evaluator_stats_item import RunStatsResponseEvaluatorStatsItemParams + + +class RunStatsResponseParams(typing_extensions.TypedDict): + """ + Stats for a Run in the Evaluation. + """ + + run_id: str + """ + Unique identifier for the Run. + """ + + version_id: typing_extensions.NotRequired[str] + """ + Unique identifier for the evaluated Version. + """ + + batch_id: typing_extensions.NotRequired[str] + """ + Unique identifier for the batch of Logs to include in the Evaluation. + """ + + num_logs: int + """ + The total number of existing Logs in this Run. + """ + + evaluator_stats: typing.Sequence[RunStatsResponseEvaluatorStatsItemParams] + """ + Stats for each Evaluator Version applied to this Run. + """ diff --git a/src/humanloop/requests/run_stats_response_evaluator_stats_item.py b/src/humanloop/requests/run_stats_response_evaluator_stats_item.py new file mode 100644 index 00000000..a42aea0b --- /dev/null +++ b/src/humanloop/requests/run_stats_response_evaluator_stats_item.py @@ -0,0 +1,14 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing +from .numeric_evaluator_stats_response import NumericEvaluatorStatsResponseParams +from .boolean_evaluator_stats_response import BooleanEvaluatorStatsResponseParams +from .select_evaluator_stats_response import SelectEvaluatorStatsResponseParams +from .text_evaluator_stats_response import TextEvaluatorStatsResponseParams + +RunStatsResponseEvaluatorStatsItemParams = typing.Union[ + NumericEvaluatorStatsResponseParams, + BooleanEvaluatorStatsResponseParams, + SelectEvaluatorStatsResponseParams, + TextEvaluatorStatsResponseParams, +] diff --git a/src/humanloop/requests/evaluated_version_response.py b/src/humanloop/requests/run_version_response.py similarity index 88% rename from src/humanloop/requests/evaluated_version_response.py rename to src/humanloop/requests/run_version_response.py index d35a602d..879ea25c 100644 --- a/src/humanloop/requests/evaluated_version_response.py +++ b/src/humanloop/requests/run_version_response.py @@ -6,6 +6,6 @@ from .evaluator_response import EvaluatorResponseParams from .flow_response import FlowResponseParams -EvaluatedVersionResponseParams = typing.Union[ +RunVersionResponseParams = typing.Union[ PromptResponseParams, ToolResponseParams, EvaluatorResponseParams, FlowResponseParams ] diff --git a/src/humanloop/requests/text_evaluator_stats_response.py b/src/humanloop/requests/text_evaluator_stats_response.py index d1d97f81..8f0f358d 100644 --- a/src/humanloop/requests/text_evaluator_stats_response.py +++ b/src/humanloop/requests/text_evaluator_stats_response.py @@ -6,7 +6,7 @@ class TextEvaluatorStatsResponseParams(typing_extensions.TypedDict): """ Base attributes for stats for an Evaluator Version-Evaluated Version pair - in the Evaluation Report. + in the Evaluation. """ evaluator_version_id: str diff --git a/src/humanloop/requests/tool_response.py b/src/humanloop/requests/tool_response.py index 44313db7..57b9b608 100644 --- a/src/humanloop/requests/tool_response.py +++ b/src/humanloop/requests/tool_response.py @@ -94,6 +94,16 @@ class ToolResponseParams(typing_extensions.TypedDict): The user who created the Tool. """ + committed_by: typing_extensions.NotRequired[UserResponse] + """ + The user who committed the Tool Version. + """ + + committed_at: typing_extensions.NotRequired[dt.datetime] + """ + The date and time the Tool Version was committed. 
+ """ + status: VersionStatus """ The status of the Tool Version. diff --git a/src/humanloop/requests/version_specification.py b/src/humanloop/requests/version_specification.py new file mode 100644 index 00000000..34606269 --- /dev/null +++ b/src/humanloop/requests/version_specification.py @@ -0,0 +1,37 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing_extensions +import typing_extensions + + +class VersionSpecificationParams(typing_extensions.TypedDict): + """ + Specification of a File version on Humanloop. + + This can be done in a couple of ways: + + - Specifying `version_id` directly. + - Specifying a File (and optionally an Environment). + - A File can be specified by either `path` or `file_id`. + - An Environment can be specified by `environment_id`. If no Environment is specified, the default Environment is used. + """ + + version_id: typing_extensions.NotRequired[str] + """ + Unique identifier for the File Version. If provided, none of the other fields should be specified. + """ + + path: typing_extensions.NotRequired[str] + """ + Path identifying a File. Provide either this or `file_id` if you want to specify a File. + """ + + file_id: typing_extensions.NotRequired[str] + """ + Unique identifier for the File. Provide either this or `path` if you want to specify a File. + """ + + environment: typing_extensions.NotRequired[str] + """ + Name of the Environment a Version is deployed to. Only provide this when specifying a File. If not provided (and a File is specified), the default Environment is used. + """ diff --git a/src/humanloop/requests/version_stats_response.py b/src/humanloop/requests/version_stats_response.py index 34f753f4..053c0ac9 100644 --- a/src/humanloop/requests/version_stats_response.py +++ b/src/humanloop/requests/version_stats_response.py @@ -7,26 +7,22 @@ class VersionStatsResponseParams(typing_extensions.TypedDict): - """ - Stats for an Evaluated Version in the Evaluation Report. - """ - version_id: str """ - Unique identifier for the Evaluated Version. + Unique identifier for the evaluated Version. """ batch_id: typing_extensions.NotRequired[str] """ - Unique identifier for the batch of Logs to include in the Evaluation Report. + Unique identifier for the batch of Logs to include in the Evaluation. """ num_logs: int """ - The total number of existing Logs for this Evaluated Version within the Evaluation Report. These are Logs that have been generated by this Evaluated Version on a Datapoint belonging to the Evaluation Report's Dataset Version. + The total number of existing Logs in this Run. """ evaluator_version_stats: typing.Sequence[VersionStatsResponseEvaluatorVersionStatsItemParams] """ - Stats for each Evaluator Version used to evaluate this Evaluated Version. + Stats for each Evaluator Version applied to this Run. """ diff --git a/src/humanloop/tools/client.py b/src/humanloop/tools/client.py index 4d23bb8e..7226e60b 100644 --- a/src/humanloop/tools/client.py +++ b/src/humanloop/tools/client.py @@ -62,7 +62,6 @@ def log( metadata: typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]] = OMIT, source_datapoint_id: typing.Optional[str] = OMIT, trace_parent_id: typing.Optional[str] = OMIT, - batch_id: typing.Optional[str] = OMIT, user: typing.Optional[str] = OMIT, tool_log_request_environment: typing.Optional[str] = OMIT, save: typing.Optional[bool] = OMIT, @@ -136,9 +135,6 @@ def log( trace_parent_id : typing.Optional[str] The ID of the parent Log to nest this Log under in a Trace. 
- batch_id : typing.Optional[str] - Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist. - user : typing.Optional[str] End-user ID related to the Log. @@ -210,7 +206,6 @@ def log( "metadata": metadata, "source_datapoint_id": source_datapoint_id, "trace_parent_id": trace_parent_id, - "batch_id": batch_id, "user": user, "environment": tool_log_request_environment, "save": save, @@ -1258,7 +1253,6 @@ async def log( metadata: typing.Optional[typing.Dict[str, typing.Optional[typing.Any]]] = OMIT, source_datapoint_id: typing.Optional[str] = OMIT, trace_parent_id: typing.Optional[str] = OMIT, - batch_id: typing.Optional[str] = OMIT, user: typing.Optional[str] = OMIT, tool_log_request_environment: typing.Optional[str] = OMIT, save: typing.Optional[bool] = OMIT, @@ -1332,9 +1326,6 @@ async def log( trace_parent_id : typing.Optional[str] The ID of the parent Log to nest this Log under in a Trace. - batch_id : typing.Optional[str] - Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist. - user : typing.Optional[str] End-user ID related to the Log. @@ -1414,7 +1405,6 @@ async def main() -> None: "metadata": metadata, "source_datapoint_id": source_datapoint_id, "trace_parent_id": trace_parent_id, - "batch_id": batch_id, "user": user, "environment": tool_log_request_environment, "save": save, diff --git a/src/humanloop/types/__init__.py b/src/humanloop/types/__init__.py index c9b3180f..8c973b52 100644 --- a/src/humanloop/types/__init__.py +++ b/src/humanloop/types/__init__.py @@ -26,12 +26,13 @@ from .directory_with_parents_and_children_response_files_item import DirectoryWithParentsAndChildrenResponseFilesItem from .environment_response import EnvironmentResponse from .environment_tag import EnvironmentTag -from .evaluated_version_response import EvaluatedVersionResponse from .evaluatee_request import EvaluateeRequest from .evaluatee_response import EvaluateeResponse from .evaluation_evaluator_response import EvaluationEvaluatorResponse -from .evaluation_report_log_response import EvaluationReportLogResponse +from .evaluation_log_response import EvaluationLogResponse from .evaluation_response import EvaluationResponse +from .evaluation_run_response import EvaluationRunResponse +from .evaluation_runs_response import EvaluationRunsResponse from .evaluation_stats import EvaluationStats from .evaluation_status import EvaluationStatus from .evaluations_dataset_request import EvaluationsDatasetRequest @@ -78,6 +79,7 @@ from .list_tools import ListTools from .llm_evaluator_request import LlmEvaluatorRequest from .log_response import LogResponse +from .logs_association_type import LogsAssociationType from .model_endpoints import ModelEndpoints from .model_providers import ModelProviders from .monitoring_evaluator_environment_request import MonitoringEvaluatorEnvironmentRequest @@ -87,7 +89,7 @@ from .numeric_evaluator_stats_response import NumericEvaluatorStatsResponse from .observability_status import ObservabilityStatus from .overall_stats import OverallStats -from .paginated_data_evaluation_report_log_response import PaginatedDataEvaluationReportLogResponse +from .paginated_data_evaluation_log_response import PaginatedDataEvaluationLogResponse from .paginated_data_evaluator_response import PaginatedDataEvaluatorResponse from .paginated_data_flow_response import 
PaginatedDataFlowResponse from .paginated_data_log_response import PaginatedDataLogResponse @@ -121,6 +123,9 @@ from .provider_api_keys import ProviderApiKeys from .response_format import ResponseFormat from .response_format_type import ResponseFormatType +from .run_stats_response import RunStatsResponse +from .run_stats_response_evaluator_stats_item import RunStatsResponseEvaluatorStatsItem +from .run_version_response import RunVersionResponse from .select_evaluator_stats_response import SelectEvaluatorStatsResponse from .sort_order import SortOrder from .text_chat_content import TextChatContent @@ -144,6 +149,7 @@ from .version_id_response import VersionIdResponse from .version_id_response_version import VersionIdResponseVersion from .version_reference_response import VersionReferenceResponse +from .version_specification import VersionSpecification from .version_stats_response import VersionStatsResponse from .version_stats_response_evaluator_version_stats_item import VersionStatsResponseEvaluatorVersionStatsItem from .version_status import VersionStatus @@ -175,12 +181,13 @@ "DirectoryWithParentsAndChildrenResponseFilesItem", "EnvironmentResponse", "EnvironmentTag", - "EvaluatedVersionResponse", "EvaluateeRequest", "EvaluateeResponse", "EvaluationEvaluatorResponse", - "EvaluationReportLogResponse", + "EvaluationLogResponse", "EvaluationResponse", + "EvaluationRunResponse", + "EvaluationRunsResponse", "EvaluationStats", "EvaluationStatus", "EvaluationsDatasetRequest", @@ -225,6 +232,7 @@ "ListTools", "LlmEvaluatorRequest", "LogResponse", + "LogsAssociationType", "ModelEndpoints", "ModelProviders", "MonitoringEvaluatorEnvironmentRequest", @@ -234,7 +242,7 @@ "NumericEvaluatorStatsResponse", "ObservabilityStatus", "OverallStats", - "PaginatedDataEvaluationReportLogResponse", + "PaginatedDataEvaluationLogResponse", "PaginatedDataEvaluatorResponse", "PaginatedDataFlowResponse", "PaginatedDataLogResponse", @@ -264,6 +272,9 @@ "ProviderApiKeys", "ResponseFormat", "ResponseFormatType", + "RunStatsResponse", + "RunStatsResponseEvaluatorStatsItem", + "RunVersionResponse", "SelectEvaluatorStatsResponse", "SortOrder", "TextChatContent", @@ -287,6 +298,7 @@ "VersionIdResponse", "VersionIdResponseVersion", "VersionReferenceResponse", + "VersionSpecification", "VersionStatsResponse", "VersionStatsResponseEvaluatorVersionStatsItem", "VersionStatus", diff --git a/src/humanloop/types/boolean_evaluator_stats_response.py b/src/humanloop/types/boolean_evaluator_stats_response.py index 9ce51712..3deca81b 100644 --- a/src/humanloop/types/boolean_evaluator_stats_response.py +++ b/src/humanloop/types/boolean_evaluator_stats_response.py @@ -9,7 +9,7 @@ class BooleanEvaluatorStatsResponse(UncheckedBaseModel): """ Base attributes for stats for an Evaluator Version-Evaluated Version pair - in the Evaluation Report. + in the Evaluation. """ evaluator_version_id: str = pydantic.Field() diff --git a/src/humanloop/types/dataset_response.py b/src/humanloop/types/dataset_response.py index 132a7abf..942a9ee1 100644 --- a/src/humanloop/types/dataset_response.py +++ b/src/humanloop/types/dataset_response.py @@ -57,6 +57,16 @@ class DatasetResponse(UncheckedBaseModel): The user who created the Dataset. """ + committed_by: typing.Optional[UserResponse] = pydantic.Field(default=None) + """ + The user who committed the Dataset Version. + """ + + committed_at: typing.Optional[dt.datetime] = pydantic.Field(default=None) + """ + The date and time the Dataset Version was committed. 
+ """ + status: VersionStatus = pydantic.Field() """ The status of the Dataset Version. diff --git a/src/humanloop/types/evaluatee_request.py b/src/humanloop/types/evaluatee_request.py index 32f5f867..d976f840 100644 --- a/src/humanloop/types/evaluatee_request.py +++ b/src/humanloop/types/evaluatee_request.py @@ -40,7 +40,7 @@ class EvaluateeRequest(UncheckedBaseModel): batch_id: typing.Optional[str] = pydantic.Field(default=None) """ - Unique identifier for the batch of Logs to include in the Evaluation Report. + Unique identifier for the batch of Logs to include in the Evaluation. """ orchestrated: typing.Optional[bool] = pydantic.Field(default=None) diff --git a/src/humanloop/types/evaluatee_response.py b/src/humanloop/types/evaluatee_response.py index 4dd78cb7..baa33f79 100644 --- a/src/humanloop/types/evaluatee_response.py +++ b/src/humanloop/types/evaluatee_response.py @@ -9,8 +9,8 @@ from .tool_response import ToolResponse from .version_deployment_response import VersionDeploymentResponse from .version_id_response import VersionIdResponse -from .evaluated_version_response import EvaluatedVersionResponse import typing +from .run_version_response import RunVersionResponse import pydantic import datetime as dt from ..core.pydantic_utilities import IS_PYDANTIC_V2 @@ -22,10 +22,10 @@ class EvaluateeResponse(UncheckedBaseModel): Version of the Evaluatee being evaluated. """ - version: EvaluatedVersionResponse + version: typing.Optional[RunVersionResponse] = None batch_id: typing.Optional[str] = pydantic.Field(default=None) """ - Unique identifier for the batch of Logs to include in the Evaluation Report. + Unique identifier for the batch of Logs to include in the Evaluation. """ orchestrated: bool = pydantic.Field() diff --git a/src/humanloop/types/evaluation_report_log_response.py b/src/humanloop/types/evaluation_log_response.py similarity index 53% rename from src/humanloop/types/evaluation_report_log_response.py rename to src/humanloop/types/evaluation_log_response.py index a92d3414..d0ad938d 100644 --- a/src/humanloop/types/evaluation_report_log_response.py +++ b/src/humanloop/types/evaluation_log_response.py @@ -2,30 +2,29 @@ from __future__ import annotations from ..core.unchecked_base_model import UncheckedBaseModel +from .evaluator_log_response import EvaluatorLogResponse from .evaluator_response import EvaluatorResponse +from .flow_log_response import FlowLogResponse from .flow_response import FlowResponse from .monitoring_evaluator_response import MonitoringEvaluatorResponse +from .prompt_log_response import PromptLogResponse from .prompt_response import PromptResponse +from .tool_log_response import ToolLogResponse from .tool_response import ToolResponse from .version_deployment_response import VersionDeploymentResponse from .version_id_response import VersionIdResponse -from .evaluator_log_response import EvaluatorLogResponse -from .flow_log_response import FlowLogResponse -from .prompt_log_response import PromptLogResponse -from .tool_log_response import ToolLogResponse -from .evaluated_version_response import EvaluatedVersionResponse import pydantic from .datapoint_response import DatapointResponse -import typing from .log_response import LogResponse +import typing from ..core.pydantic_utilities import IS_PYDANTIC_V2 from ..core.pydantic_utilities import update_forward_refs -class EvaluationReportLogResponse(UncheckedBaseModel): - evaluated_version: EvaluatedVersionResponse = pydantic.Field() +class EvaluationLogResponse(UncheckedBaseModel): + run_id: str = pydantic.Field() 
""" - The version of the Prompt, Tool or Evaluator that the Log belongs to. + Unique identifier for the Run. """ datapoint: DatapointResponse = pydantic.Field() @@ -33,7 +32,7 @@ class EvaluationReportLogResponse(UncheckedBaseModel): The Datapoint used to generate the Log """ - log: typing.Optional[LogResponse] = pydantic.Field(default=None) + log: LogResponse = pydantic.Field() """ The Log that was evaluated by the Evaluator. """ @@ -53,14 +52,14 @@ class Config: extra = pydantic.Extra.allow -update_forward_refs(EvaluatorResponse, EvaluationReportLogResponse=EvaluationReportLogResponse) -update_forward_refs(FlowResponse, EvaluationReportLogResponse=EvaluationReportLogResponse) -update_forward_refs(MonitoringEvaluatorResponse, EvaluationReportLogResponse=EvaluationReportLogResponse) -update_forward_refs(PromptResponse, EvaluationReportLogResponse=EvaluationReportLogResponse) -update_forward_refs(ToolResponse, EvaluationReportLogResponse=EvaluationReportLogResponse) -update_forward_refs(VersionDeploymentResponse, EvaluationReportLogResponse=EvaluationReportLogResponse) -update_forward_refs(VersionIdResponse, EvaluationReportLogResponse=EvaluationReportLogResponse) -update_forward_refs(EvaluatorLogResponse, EvaluationReportLogResponse=EvaluationReportLogResponse) -update_forward_refs(FlowLogResponse, EvaluationReportLogResponse=EvaluationReportLogResponse) -update_forward_refs(PromptLogResponse, EvaluationReportLogResponse=EvaluationReportLogResponse) -update_forward_refs(ToolLogResponse, EvaluationReportLogResponse=EvaluationReportLogResponse) +update_forward_refs(EvaluatorLogResponse, EvaluationLogResponse=EvaluationLogResponse) +update_forward_refs(EvaluatorResponse, EvaluationLogResponse=EvaluationLogResponse) +update_forward_refs(FlowLogResponse, EvaluationLogResponse=EvaluationLogResponse) +update_forward_refs(FlowResponse, EvaluationLogResponse=EvaluationLogResponse) +update_forward_refs(MonitoringEvaluatorResponse, EvaluationLogResponse=EvaluationLogResponse) +update_forward_refs(PromptLogResponse, EvaluationLogResponse=EvaluationLogResponse) +update_forward_refs(PromptResponse, EvaluationLogResponse=EvaluationLogResponse) +update_forward_refs(ToolLogResponse, EvaluationLogResponse=EvaluationLogResponse) +update_forward_refs(ToolResponse, EvaluationLogResponse=EvaluationLogResponse) +update_forward_refs(VersionDeploymentResponse, EvaluationLogResponse=EvaluationLogResponse) +update_forward_refs(VersionIdResponse, EvaluationLogResponse=EvaluationLogResponse) diff --git a/src/humanloop/types/evaluation_response.py b/src/humanloop/types/evaluation_response.py index b8864204..a4c2336a 100644 --- a/src/humanloop/types/evaluation_response.py +++ b/src/humanloop/types/evaluation_response.py @@ -10,11 +10,8 @@ from .version_deployment_response import VersionDeploymentResponse from .version_id_response import VersionIdResponse import pydantic -from .dataset_response import DatasetResponse import typing -from .evaluatee_response import EvaluateeResponse from .evaluation_evaluator_response import EvaluationEvaluatorResponse -from .evaluation_status import EvaluationStatus import datetime as dt from .user_response import UserResponse from ..core.pydantic_utilities import IS_PYDANTIC_V2 @@ -27,14 +24,9 @@ class EvaluationResponse(UncheckedBaseModel): Unique identifier for the Evaluation. Starts with `evr`. """ - dataset: DatasetResponse = pydantic.Field() + runs_count: int = pydantic.Field() """ - The Dataset used in the Evaluation. 
- """ - - evaluatees: typing.List[EvaluateeResponse] = pydantic.Field() - """ - The Prompt/Tool Versions included in the Evaluation. + The total number of Runs in the Evaluation. """ evaluators: typing.List[EvaluationEvaluatorResponse] = pydantic.Field() @@ -42,16 +34,6 @@ class EvaluationResponse(UncheckedBaseModel): The Evaluator Versions used to evaluate. """ - status: EvaluationStatus = pydantic.Field() - """ - The current status of the Evaluation. - - - `"pending"`: The Evaluation has been created but is not actively being worked on by Humanloop. - - `"running"`: Humanloop is checking for any missing Logs and Evaluator Logs, and will generate them where appropriate. - - `"completed"`: All Logs an Evaluator Logs have been generated. - - `"cancelled"`: The Evaluation has been cancelled by the user. Humanloop will stop generating Logs and Evaluator Logs. - """ - name: typing.Optional[str] = pydantic.Field(default=None) """ Name of the Evaluation to help identify it. Must be unique among Evaluations associated with File. diff --git a/src/humanloop/types/evaluation_run_response.py b/src/humanloop/types/evaluation_run_response.py new file mode 100644 index 00000000..46f9308d --- /dev/null +++ b/src/humanloop/types/evaluation_run_response.py @@ -0,0 +1,85 @@ +# This file was auto-generated by Fern from our API Definition. + +from __future__ import annotations +from ..core.unchecked_base_model import UncheckedBaseModel +from .evaluator_response import EvaluatorResponse +from .flow_response import FlowResponse +from .monitoring_evaluator_response import MonitoringEvaluatorResponse +from .prompt_response import PromptResponse +from .tool_response import ToolResponse +from .version_deployment_response import VersionDeploymentResponse +from .version_id_response import VersionIdResponse +import pydantic +import typing +from .dataset_response import DatasetResponse +from .run_version_response import RunVersionResponse +import datetime as dt +from .user_response import UserResponse +from .evaluation_status import EvaluationStatus +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.pydantic_utilities import update_forward_refs + + +class EvaluationRunResponse(UncheckedBaseModel): + id: str = pydantic.Field() + """ + Unique identifier for the Run. + """ + + dataset: typing.Optional[DatasetResponse] = pydantic.Field(default=None) + """ + The Dataset used in the Run. + """ + + version: typing.Optional[RunVersionResponse] = pydantic.Field(default=None) + """ + The version used in the Run. + """ + + orchestrated: bool = pydantic.Field() + """ + Whether the Run is orchestrated by Humanloop. + """ + + added_at: dt.datetime = pydantic.Field() + """ + When the Run was added to the Evaluation. + """ + + created_at: dt.datetime = pydantic.Field() + """ + When the Run was created. + """ + + created_by: typing.Optional[UserResponse] = pydantic.Field(default=None) + """ + The User who created the Run. + """ + + status: EvaluationStatus = pydantic.Field() + """ + The status of the Run. + """ + + control: bool = pydantic.Field() + """ + Stats for other Runs will be displayed in comparison to the control Run. 
+ """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +update_forward_refs(EvaluatorResponse, EvaluationRunResponse=EvaluationRunResponse) +update_forward_refs(FlowResponse, EvaluationRunResponse=EvaluationRunResponse) +update_forward_refs(MonitoringEvaluatorResponse, EvaluationRunResponse=EvaluationRunResponse) +update_forward_refs(PromptResponse, EvaluationRunResponse=EvaluationRunResponse) +update_forward_refs(ToolResponse, EvaluationRunResponse=EvaluationRunResponse) +update_forward_refs(VersionDeploymentResponse, EvaluationRunResponse=EvaluationRunResponse) +update_forward_refs(VersionIdResponse, EvaluationRunResponse=EvaluationRunResponse) diff --git a/src/humanloop/types/evaluation_runs_response.py b/src/humanloop/types/evaluation_runs_response.py new file mode 100644 index 00000000..208a7529 --- /dev/null +++ b/src/humanloop/types/evaluation_runs_response.py @@ -0,0 +1,41 @@ +# This file was auto-generated by Fern from our API Definition. + +from __future__ import annotations +from ..core.unchecked_base_model import UncheckedBaseModel +from .evaluator_response import EvaluatorResponse +from .flow_response import FlowResponse +from .monitoring_evaluator_response import MonitoringEvaluatorResponse +from .prompt_response import PromptResponse +from .tool_response import ToolResponse +from .version_deployment_response import VersionDeploymentResponse +from .version_id_response import VersionIdResponse +import typing +from .evaluation_run_response import EvaluationRunResponse +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.pydantic_utilities import update_forward_refs + + +class EvaluationRunsResponse(UncheckedBaseModel): + runs: typing.List[EvaluationRunResponse] = pydantic.Field() + """ + The Runs in the Evaluation. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +update_forward_refs(EvaluatorResponse, EvaluationRunsResponse=EvaluationRunsResponse) +update_forward_refs(FlowResponse, EvaluationRunsResponse=EvaluationRunsResponse) +update_forward_refs(MonitoringEvaluatorResponse, EvaluationRunsResponse=EvaluationRunsResponse) +update_forward_refs(PromptResponse, EvaluationRunsResponse=EvaluationRunsResponse) +update_forward_refs(ToolResponse, EvaluationRunsResponse=EvaluationRunsResponse) +update_forward_refs(VersionDeploymentResponse, EvaluationRunsResponse=EvaluationRunsResponse) +update_forward_refs(VersionIdResponse, EvaluationRunsResponse=EvaluationRunsResponse) diff --git a/src/humanloop/types/evaluation_stats.py b/src/humanloop/types/evaluation_stats.py index 350cf1db..9a6a07a7 100644 --- a/src/humanloop/types/evaluation_stats.py +++ b/src/humanloop/types/evaluation_stats.py @@ -1,23 +1,17 @@ # This file was auto-generated by Fern from our API Definition. 
from ..core.unchecked_base_model import UncheckedBaseModel -from .overall_stats import OverallStats -import pydantic import typing -from .version_stats_response import VersionStatsResponse +from .run_stats_response import RunStatsResponse +import pydantic from .evaluation_status import EvaluationStatus from ..core.pydantic_utilities import IS_PYDANTIC_V2 class EvaluationStats(UncheckedBaseModel): - overall_stats: OverallStats = pydantic.Field() - """ - Stats for the Evaluation Report as a whole. - """ - - version_stats: typing.List[VersionStatsResponse] = pydantic.Field() + run_stats: typing.List[RunStatsResponse] = pydantic.Field() """ - Stats for each Evaluated Version in the Evaluation Report. + Stats for each Run in the Evaluation. """ progress: typing.Optional[str] = pydantic.Field(default=None) diff --git a/src/humanloop/types/evaluator_response.py b/src/humanloop/types/evaluator_response.py index 69111519..fcaf0326 100644 --- a/src/humanloop/types/evaluator_response.py +++ b/src/humanloop/types/evaluator_response.py @@ -64,6 +64,16 @@ class EvaluatorResponse(UncheckedBaseModel): The user who created the Evaluator. """ + committed_by: typing.Optional[UserResponse] = pydantic.Field(default=None) + """ + The user who committed the Evaluator Version. + """ + + committed_at: typing.Optional[dt.datetime] = pydantic.Field(default=None) + """ + The date and time the Evaluator Version was committed. + """ + status: VersionStatus last_used_at: dt.datetime version_logs_count: int = pydantic.Field() diff --git a/src/humanloop/types/flow_response.py b/src/humanloop/types/flow_response.py index 2c478605..874782a1 100644 --- a/src/humanloop/types/flow_response.py +++ b/src/humanloop/types/flow_response.py @@ -66,6 +66,16 @@ class FlowResponse(UncheckedBaseModel): The user who created the Flow. """ + committed_by: typing.Optional[UserResponse] = pydantic.Field(default=None) + """ + The user who committed the Flow Version. + """ + + committed_at: typing.Optional[dt.datetime] = pydantic.Field(default=None) + """ + The date and time the Flow Version was committed. + """ + status: VersionStatus = pydantic.Field() """ The status of the Flow Version. diff --git a/src/humanloop/types/logs_association_type.py b/src/humanloop/types/logs_association_type.py new file mode 100644 index 00000000..c904b93c --- /dev/null +++ b/src/humanloop/types/logs_association_type.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +LogsAssociationType = typing.Union[typing.Literal["dynamic", "fixed"], typing.Any] diff --git a/src/humanloop/types/numeric_evaluator_stats_response.py b/src/humanloop/types/numeric_evaluator_stats_response.py index 6ca2e662..eec24ff5 100644 --- a/src/humanloop/types/numeric_evaluator_stats_response.py +++ b/src/humanloop/types/numeric_evaluator_stats_response.py @@ -9,7 +9,7 @@ class NumericEvaluatorStatsResponse(UncheckedBaseModel): """ Base attributes for stats for an Evaluator Version-Evaluated Version pair - in the Evaluation Report. + in the Evaluation. """ evaluator_version_id: str = pydantic.Field() diff --git a/src/humanloop/types/overall_stats.py b/src/humanloop/types/overall_stats.py index 8258f898..b1d6e6dc 100644 --- a/src/humanloop/types/overall_stats.py +++ b/src/humanloop/types/overall_stats.py @@ -9,17 +9,17 @@ class OverallStats(UncheckedBaseModel): num_datapoints: int = pydantic.Field() """ - The total number of Datapoints in the Evaluation Report's Dataset Version. 
+ The total number of Datapoints in the Evaluation's Dataset Version. """ total_logs: int = pydantic.Field() """ - The total number of Logs in the Evaluation Report. + The total number of Logs in the Evaluation. """ total_evaluator_logs: int = pydantic.Field() """ - The total number of Evaluator Logs in the Evaluation Report. + The total number of Evaluator Logs in the Evaluation. """ if IS_PYDANTIC_V2: diff --git a/src/humanloop/types/paginated_data_evaluation_log_response.py b/src/humanloop/types/paginated_data_evaluation_log_response.py new file mode 100644 index 00000000..c6e19791 --- /dev/null +++ b/src/humanloop/types/paginated_data_evaluation_log_response.py @@ -0,0 +1,49 @@ +# This file was auto-generated by Fern from our API Definition. + +from __future__ import annotations +from ..core.unchecked_base_model import UncheckedBaseModel +from .evaluator_log_response import EvaluatorLogResponse +from .evaluator_response import EvaluatorResponse +from .flow_log_response import FlowLogResponse +from .flow_response import FlowResponse +from .monitoring_evaluator_response import MonitoringEvaluatorResponse +from .prompt_log_response import PromptLogResponse +from .prompt_response import PromptResponse +from .tool_log_response import ToolLogResponse +from .tool_response import ToolResponse +from .version_deployment_response import VersionDeploymentResponse +from .version_id_response import VersionIdResponse +import typing +from .evaluation_log_response import EvaluationLogResponse +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +import pydantic +from ..core.pydantic_utilities import update_forward_refs + + +class PaginatedDataEvaluationLogResponse(UncheckedBaseModel): + records: typing.List[EvaluationLogResponse] + page: int + size: int + total: int + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +update_forward_refs(EvaluatorLogResponse, PaginatedDataEvaluationLogResponse=PaginatedDataEvaluationLogResponse) +update_forward_refs(EvaluatorResponse, PaginatedDataEvaluationLogResponse=PaginatedDataEvaluationLogResponse) +update_forward_refs(FlowLogResponse, PaginatedDataEvaluationLogResponse=PaginatedDataEvaluationLogResponse) +update_forward_refs(FlowResponse, PaginatedDataEvaluationLogResponse=PaginatedDataEvaluationLogResponse) +update_forward_refs(MonitoringEvaluatorResponse, PaginatedDataEvaluationLogResponse=PaginatedDataEvaluationLogResponse) +update_forward_refs(PromptLogResponse, PaginatedDataEvaluationLogResponse=PaginatedDataEvaluationLogResponse) +update_forward_refs(PromptResponse, PaginatedDataEvaluationLogResponse=PaginatedDataEvaluationLogResponse) +update_forward_refs(ToolLogResponse, PaginatedDataEvaluationLogResponse=PaginatedDataEvaluationLogResponse) +update_forward_refs(ToolResponse, PaginatedDataEvaluationLogResponse=PaginatedDataEvaluationLogResponse) +update_forward_refs(VersionDeploymentResponse, PaginatedDataEvaluationLogResponse=PaginatedDataEvaluationLogResponse) +update_forward_refs(VersionIdResponse, PaginatedDataEvaluationLogResponse=PaginatedDataEvaluationLogResponse) diff --git a/src/humanloop/types/paginated_data_evaluation_report_log_response.py b/src/humanloop/types/paginated_data_evaluation_report_log_response.py deleted file mode 100644 index 95c1725d..00000000 --- a/src/humanloop/types/paginated_data_evaluation_report_log_response.py +++ /dev/null 
diff --git a/src/humanloop/types/paginated_data_evaluation_report_log_response.py b/src/humanloop/types/paginated_data_evaluation_report_log_response.py
deleted file mode 100644
index 95c1725d..00000000
--- a/src/humanloop/types/paginated_data_evaluation_report_log_response.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# This file was auto-generated by Fern from our API Definition.
-
-from __future__ import annotations
-from ..core.unchecked_base_model import UncheckedBaseModel
-from .evaluator_log_response import EvaluatorLogResponse
-from .evaluator_response import EvaluatorResponse
-from .flow_log_response import FlowLogResponse
-from .flow_response import FlowResponse
-from .monitoring_evaluator_response import MonitoringEvaluatorResponse
-from .prompt_log_response import PromptLogResponse
-from .prompt_response import PromptResponse
-from .tool_log_response import ToolLogResponse
-from .tool_response import ToolResponse
-from .version_deployment_response import VersionDeploymentResponse
-from .version_id_response import VersionIdResponse
-import typing
-from .evaluation_report_log_response import EvaluationReportLogResponse
-from ..core.pydantic_utilities import IS_PYDANTIC_V2
-import pydantic
-from ..core.pydantic_utilities import update_forward_refs
-
-
-class PaginatedDataEvaluationReportLogResponse(UncheckedBaseModel):
-    records: typing.List[EvaluationReportLogResponse]
-    page: int
-    size: int
-    total: int
-
-    if IS_PYDANTIC_V2:
-        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
-    else:
-
-        class Config:
-            frozen = True
-            smart_union = True
-            extra = pydantic.Extra.allow
-
-
-update_forward_refs(
-    EvaluatorLogResponse, PaginatedDataEvaluationReportLogResponse=PaginatedDataEvaluationReportLogResponse
-)
-update_forward_refs(
-    EvaluatorResponse, PaginatedDataEvaluationReportLogResponse=PaginatedDataEvaluationReportLogResponse
-)
-update_forward_refs(FlowLogResponse, PaginatedDataEvaluationReportLogResponse=PaginatedDataEvaluationReportLogResponse)
-update_forward_refs(FlowResponse, PaginatedDataEvaluationReportLogResponse=PaginatedDataEvaluationReportLogResponse)
-update_forward_refs(
-    MonitoringEvaluatorResponse, PaginatedDataEvaluationReportLogResponse=PaginatedDataEvaluationReportLogResponse
-)
-update_forward_refs(
-    PromptLogResponse, PaginatedDataEvaluationReportLogResponse=PaginatedDataEvaluationReportLogResponse
-)
-update_forward_refs(PromptResponse, PaginatedDataEvaluationReportLogResponse=PaginatedDataEvaluationReportLogResponse)
-update_forward_refs(ToolLogResponse, PaginatedDataEvaluationReportLogResponse=PaginatedDataEvaluationReportLogResponse)
-update_forward_refs(ToolResponse, PaginatedDataEvaluationReportLogResponse=PaginatedDataEvaluationReportLogResponse)
-update_forward_refs(
-    VersionDeploymentResponse, PaginatedDataEvaluationReportLogResponse=PaginatedDataEvaluationReportLogResponse
-)
-update_forward_refs(
-    VersionIdResponse, PaginatedDataEvaluationReportLogResponse=PaginatedDataEvaluationReportLogResponse
-)
diff --git a/src/humanloop/types/prompt_call_response.py b/src/humanloop/types/prompt_call_response.py
index 64db5f49..492d10aa 100644
--- a/src/humanloop/types/prompt_call_response.py
+++ b/src/humanloop/types/prompt_call_response.py
@@ -79,11 +79,6 @@ class PromptCallResponse(UncheckedBaseModel):
     The ID of the parent Log to nest this Log under in a Trace.
     """
 
-    batch_id: typing.Optional[str] = pydantic.Field(default=None)
-    """
-    Unique identifier for the Batch to add this Batch to. Batches are used to group Logs together for Evaluations. A Batch will be created if one with the given ID does not exist.
-    """
-
     user: typing.Optional[str] = pydantic.Field(default=None)
     """
     End-user ID related to the Log.
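Because `batch_id` is removed from `PromptCallResponse`, code written against the previous release that still reads it should degrade gracefully. A hedged compatibility shim; the helper name is ours, not part of the SDK:

```python
import typing

from humanloop.types.prompt_call_response import PromptCallResponse


def legacy_batch_id(response: PromptCallResponse) -> typing.Optional[str]:
    # `batch_id` no longer exists on the model in this release; getattr keeps
    # older call sites working while they migrate to Run-based grouping.
    return getattr(response, "batch_id", None)
```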
diff --git a/src/humanloop/types/prompt_response.py b/src/humanloop/types/prompt_response.py
index 6f1029f5..64db52d5 100644
--- a/src/humanloop/types/prompt_response.py
+++ b/src/humanloop/types/prompt_response.py
@@ -157,6 +157,16 @@ class PromptResponse(UncheckedBaseModel):
     The user who created the Prompt.
     """
 
+    committed_by: typing.Optional[UserResponse] = pydantic.Field(default=None)
+    """
+    The user who committed the Prompt Version.
+    """
+
+    committed_at: typing.Optional[dt.datetime] = pydantic.Field(default=None)
+    """
+    The date and time the Prompt Version was committed.
+    """
+
     status: VersionStatus = pydantic.Field()
     """
     The status of the Prompt Version.
diff --git a/src/humanloop/types/run_stats_response.py b/src/humanloop/types/run_stats_response.py
new file mode 100644
index 00000000..201c6e76
--- /dev/null
+++ b/src/humanloop/types/run_stats_response.py
@@ -0,0 +1,47 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ..core.unchecked_base_model import UncheckedBaseModel
+import pydantic
+import typing
+from .run_stats_response_evaluator_stats_item import RunStatsResponseEvaluatorStatsItem
+from ..core.pydantic_utilities import IS_PYDANTIC_V2
+
+
+class RunStatsResponse(UncheckedBaseModel):
+    """
+    Stats for a Run in the Evaluation.
+    """
+
+    run_id: str = pydantic.Field()
+    """
+    Unique identifier for the Run.
+    """
+
+    version_id: typing.Optional[str] = pydantic.Field(default=None)
+    """
+    Unique identifier for the evaluated Version.
+    """
+
+    batch_id: typing.Optional[str] = pydantic.Field(default=None)
+    """
+    Unique identifier for the batch of Logs to include in the Evaluation.
+    """
+
+    num_logs: int = pydantic.Field()
+    """
+    The total number of existing Logs in this Run.
+    """
+
+    evaluator_stats: typing.List[RunStatsResponseEvaluatorStatsItem] = pydantic.Field()
+    """
+    Stats for each Evaluator Version applied to this Run.
+    """
+
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
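A short sketch of reading the new `RunStatsResponse`; it touches only fields defined on the model above, and the summary helper itself is illustrative rather than part of the SDK:

```python
from humanloop.types.run_stats_response import RunStatsResponse


def summarize_run(stats: RunStatsResponse) -> str:
    # version_id and batch_id are optional on the model, so fall back to a placeholder.
    version = stats.version_id or "<unknown version>"
    return (
        f"Run {stats.run_id} evaluating {version}: "
        f"{stats.num_logs} Logs, {len(stats.evaluator_stats)} Evaluator stat(s)"
    )
```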
diff --git a/src/humanloop/types/run_stats_response_evaluator_stats_item.py b/src/humanloop/types/run_stats_response_evaluator_stats_item.py
new file mode 100644
index 00000000..c7fe6056
--- /dev/null
+++ b/src/humanloop/types/run_stats_response_evaluator_stats_item.py
@@ -0,0 +1,14 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import typing
+from .numeric_evaluator_stats_response import NumericEvaluatorStatsResponse
+from .boolean_evaluator_stats_response import BooleanEvaluatorStatsResponse
+from .select_evaluator_stats_response import SelectEvaluatorStatsResponse
+from .text_evaluator_stats_response import TextEvaluatorStatsResponse
+
+RunStatsResponseEvaluatorStatsItem = typing.Union[
+    NumericEvaluatorStatsResponse,
+    BooleanEvaluatorStatsResponse,
+    SelectEvaluatorStatsResponse,
+    TextEvaluatorStatsResponse,
+]
diff --git a/src/humanloop/types/evaluated_version_response.py b/src/humanloop/types/run_version_response.py
similarity index 71%
rename from src/humanloop/types/evaluated_version_response.py
rename to src/humanloop/types/run_version_response.py
index 3064bfb1..d94b1178 100644
--- a/src/humanloop/types/evaluated_version_response.py
+++ b/src/humanloop/types/run_version_response.py
@@ -6,4 +6,4 @@
 from .evaluator_response import EvaluatorResponse
 from .flow_response import FlowResponse
 
-EvaluatedVersionResponse = typing.Union[PromptResponse, ToolResponse, EvaluatorResponse, FlowResponse]
+RunVersionResponse = typing.Union[PromptResponse, ToolResponse, EvaluatorResponse, FlowResponse]
diff --git a/src/humanloop/types/text_evaluator_stats_response.py b/src/humanloop/types/text_evaluator_stats_response.py
index 735b4eb7..652c7aa6 100644
--- a/src/humanloop/types/text_evaluator_stats_response.py
+++ b/src/humanloop/types/text_evaluator_stats_response.py
@@ -9,7 +9,7 @@
 class TextEvaluatorStatsResponse(UncheckedBaseModel):
     """
     Base attributes for stats for an Evaluator Version-Evaluated Version pair
-    in the Evaluation Report.
+    in the Evaluation.
     """
 
     evaluator_version_id: str = pydantic.Field()
diff --git a/src/humanloop/types/tool_response.py b/src/humanloop/types/tool_response.py
index 3099da27..c1db98bb 100644
--- a/src/humanloop/types/tool_response.py
+++ b/src/humanloop/types/tool_response.py
@@ -92,6 +92,16 @@ class ToolResponse(UncheckedBaseModel):
     The user who created the Tool.
     """
 
+    committed_by: typing.Optional[UserResponse] = pydantic.Field(default=None)
+    """
+    The user who committed the Tool Version.
+    """
+
+    committed_at: typing.Optional[dt.datetime] = pydantic.Field(default=None)
+    """
+    The date and time the Tool Version was committed.
+    """
+
     status: VersionStatus = pydantic.Field()
     """
     The status of the Tool Version.
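The rename to `RunVersionResponse` and the new commit metadata on Prompt and Tool Versions combine naturally. A hedged sketch: only `PromptResponse` and `ToolResponse` are shown here to carry `committed_at`, so the other union members simply fall back to None.

```python
import datetime as dt
import typing

from humanloop.types.run_version_response import RunVersionResponse


def version_committed_at(version: RunVersionResponse) -> typing.Optional[dt.datetime]:
    # committed_at is documented in this diff for Prompt and Tool Versions;
    # getattr lets Evaluator and Flow Versions return None without special-casing.
    return getattr(version, "committed_at", None)
```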
diff --git a/src/humanloop/types/version_specification.py b/src/humanloop/types/version_specification.py
new file mode 100644
index 00000000..bb3464ce
--- /dev/null
+++ b/src/humanloop/types/version_specification.py
@@ -0,0 +1,48 @@
+# This file was auto-generated by Fern from our API Definition.
+
+from ..core.unchecked_base_model import UncheckedBaseModel
+import typing
+import pydantic
+from ..core.pydantic_utilities import IS_PYDANTIC_V2
+
+
+class VersionSpecification(UncheckedBaseModel):
+    """
+    Specification of a File version on Humanloop.
+
+    This can be done in a couple of ways:
+
+    - Specifying `version_id` directly.
+    - Specifying a File (and optionally an Environment).
+      - A File can be specified by either `path` or `file_id`.
+      - An Environment can be specified by `environment_id`. If no Environment is specified, the default Environment is used.
+    """
+
+    version_id: typing.Optional[str] = pydantic.Field(default=None)
+    """
+    Unique identifier for the File Version. If provided, none of the other fields should be specified.
+    """
+
+    path: typing.Optional[str] = pydantic.Field(default=None)
+    """
+    Path identifying a File. Provide either this or `file_id` if you want to specify a File.
+    """
+
+    file_id: typing.Optional[str] = pydantic.Field(default=None)
+    """
+    Unique identifier for the File. Provide either this or `path` if you want to specify a File.
+    """
+
+    environment: typing.Optional[str] = pydantic.Field(default=None)
+    """
+    Name of the Environment a Version is deployed to. Only provide this when specifying a File. If not provided (and a File is specified), the default Environment is used.
+    """
+
+    if IS_PYDANTIC_V2:
+        model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True)  # type: ignore # Pydantic v2
+    else:
+
+        class Config:
+            frozen = True
+            smart_union = True
+            extra = pydantic.Extra.allow
diff --git a/src/humanloop/types/version_stats_response.py b/src/humanloop/types/version_stats_response.py
index 25c0a682..6439fca4 100644
--- a/src/humanloop/types/version_stats_response.py
+++ b/src/humanloop/types/version_stats_response.py
@@ -8,28 +8,24 @@
 
 
 class VersionStatsResponse(UncheckedBaseModel):
-    """
-    Stats for an Evaluated Version in the Evaluation Report.
-    """
-
     version_id: str = pydantic.Field()
     """
-    Unique identifier for the Evaluated Version.
+    Unique identifier for the evaluated Version.
     """
 
     batch_id: typing.Optional[str] = pydantic.Field(default=None)
     """
-    Unique identifier for the batch of Logs to include in the Evaluation Report.
+    Unique identifier for the batch of Logs to include in the Evaluation.
    """
 
     num_logs: int = pydantic.Field()
     """
-    The total number of existing Logs for this Evaluated Version within the Evaluation Report. These are Logs that have been generated by this Evaluated Version on a Datapoint belonging to the Evaluation Report's Dataset Version.
+    The total number of existing Logs in this Run.
     """
 
     evaluator_version_stats: typing.List[VersionStatsResponseEvaluatorVersionStatsItem] = pydantic.Field()
     """
-    Stats for each Evaluator Version used to evaluate this Evaluated Version.
+    Stats for each Evaluator Version applied to this Run.
     """
 
     if IS_PYDANTIC_V2:
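To close, a minimal sketch of the two ways `VersionSpecification` can identify a version, built directly from the fields defined above; the path, ID, and Environment name below are placeholders, not real resources:

```python
from humanloop.types.version_specification import VersionSpecification

# Either pin an exact version by its ID (placeholder value)...
by_version = VersionSpecification(version_id="prv_placeholder")

# ...or point at a File by `path` (or `file_id`) plus an optional Environment;
# if no Environment is given, the default Environment is used.
by_file = VersionSpecification(path="My Project/My Prompt", environment="production")
```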