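Updated qualitative eval step to use no CRI (cross-region inference): the Bedrock judge model now uses the direct `anthropic.` model ID instead of the `us.` inference-profile ID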

frgud · frgud · commit 7c0152710577 · 2025-10-15T10:43:11.000-04:00
diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/qualitative_eval_step.py b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/qualitative_eval_step.py
@@ -145,8 +145,8 @@ def create_bedrock_judge_metrics():
             ),
             examples=medical_accuracy_examples,
             version="v1",
-            model="bedrock:/us.anthropic.claude-3-haiku-20240307-v1:0",
-            # model="bedrock:/anthropic.claude-3-haiku-20240307-v1:0",
+            # model="bedrock:/us.anthropic.claude-3-haiku-20240307-v1:0",
+            model="bedrock:/anthropic.claude-3-haiku-20240307-v1:0",
             parameters={
                 "anthropic_version": "bedrock-2023-05-31",
                 "temperature": 0.0,
@@ -194,8 +194,8 @@ def create_bedrock_judge_metrics():
             ),
             examples=clinical_reasoning_examples,
             version="v1",
-            model="bedrock:/us.anthropic.claude-3-haiku-20240307-v1:0",
-            # model="bedrock:/anthropic.claude-3-haiku-20240307-v1:0",
+            # model="bedrock:/us.anthropic.claude-3-haiku-20240307-v1:0",
+            model="bedrock:/anthropic.claude-3-haiku-20240307-v1:0",
             parameters={
                 "anthropic_version": "bedrock-2023-05-31",
                 "temperature": 0.0,
@@ -241,8 +241,8 @@ def create_bedrock_judge_metrics():
             ),
             examples=patient_safety_examples,
             version="v1",
-            model="bedrock:/us.anthropic.claude-3-haiku-20240307-v1:0",
-            # model="bedrock:/anthropic.claude-3-haiku-20240307-v1:0",
+            # model="bedrock:/us.anthropic.claude-3-haiku-20240307-v1:0",
+            model="bedrock:/anthropic.claude-3-haiku-20240307-v1:0",
             parameters={
                 "anthropic_version": "bedrock-2023-05-31",
                 "temperature": 0.0,
@@ -481,7 +481,8 @@ def evaluate_model_qualitatively(model_config, dataset):
             mlflow.log_param("qualitative_evaluation_endpoint", endpoint_name)
             mlflow.log_param("qualitative_evaluation_num_samples", num_samples)
             mlflow.log_param("qualitative_evaluation_timestamp", datetime.now().isoformat())
-            mlflow.log_param("llm_judge_model", "bedrock:/us.anthropic.claude-3-haiku-20240307-v1:0")
+            mlflow.log_param("llm_judge_model", "bedrock:/anthropic.claude-3-haiku-20240307-v1:0")
+            # mlflow.log_param("llm_judge_model", "bedrock:/us.anthropic.claude-3-haiku-20240307-v1:0")
             
             # Load the test dataset
             try: