
Commit d8ecd33

Merge pull request #6 from BethanyJep/main
finetuning & distillation labs added
2 parents (ea40465 + 3870b33), commit d8ecd33

File tree

173 files changed (+13765 / -2 lines)


.gitignore

Lines changed: 5 additions & 1 deletion
@@ -418,4 +418,8 @@ FodyWeavers.xsd
 *.msp

 #mkdocs static site
-site/*
+site/*
+
+# .github
+.venv
+.github/chatmodes/

lab/3-Custom-Politeness-Evaluator/creating-grader.ipynb

Lines changed: 1231 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 257 additions & 0 deletions
@@ -0,0 +1,257 @@
# Original version from:
# https://raw.githubusercontent.com/azure-ai-foundry/build-2025-demos/refs/heads/main/Azure%20AI%20Model%20Customization/DistillationDemo/scripts/eval_utils.py
import openai
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np  # numpy is used for the percentile calculations

from concurrent.futures import ThreadPoolExecutor, as_completed


def get_eval_runs_list(client: openai.Client, eval_id: str) -> list:
    """
    Fetch the list of evaluation runs for a given evaluation ID.

    Args:
        client (openai.Client): The OpenAI client.
        eval_id (str): The evaluation ID.

    Returns:
        list: A list of evaluation runs with their details.
    """
    runs = client.evals.runs.list(eval_id)

    print(f"Get Evaluation Runs: {eval_id}")
    list_runs = []

    if runs:
        for run in runs:
            r = {
                'id': run.id,
                'name': run.name,
                'status': run.status,
                'model': run.model,
            }
            result = run.result_counts.to_dict()
            if result:
                passed = result.get('passed', 0)
                errored = result.get('errored', 0)
                failed = result.get('failed', 0)
                total = result.get('total', 0)
                pass_percentage = (passed * 100) / (passed + failed) if (passed + failed) > 0 else 0
                error_percentage = (errored * 100) / total if total > 0 else 0
                r['pass_percentage'] = pass_percentage
                r['error_percentage'] = error_percentage

            list_runs.append(r)

    return list_runs


def get_eval_details(client: openai.Client, eval_id: str) -> dict:
    """
    Fetch the details of a specific evaluation.

    Args:
        client (openai.Client): The OpenAI client.
        eval_id (str): The evaluation ID.

    Returns:
        dict: A dictionary containing evaluation details, including the name.
    """
    try:
        eval = client.evals.retrieve(eval_id)
        return eval.to_dict()
    except Exception as e:
        print(f"Failed to fetch evaluation details for ID: {eval_id}. Error: {e}")
        return {"name": f"Unknown Evaluation ({eval_id})"}


def display_evaluation_summary(client: openai.Client, eval_ids: list, x_range=(0, 7)):
    """
    Fetch and display a summary of evaluation runs for a list of evaluation IDs: a horizontal
    bar chart of pass percentages, a score summary table, and per-run score distributions
    plotted with at most 4 histograms per row.

    Args:
        client (openai.Client): The OpenAI client.
        eval_ids (list): A list of evaluation IDs.
        x_range (tuple): The x-axis range for the score-distribution histograms.
    """
    all_eval_runs = []
    eval_id_to_name = {}
    eval_id_to_color = {}

    # Assign unique colors for each evaluation ID
    colors = plt.cm.tab10.colors  # Use a colormap for distinct colors
    for i, eval_id in enumerate(eval_ids):
        eval_id_to_color[eval_id] = colors[i % len(colors)]

    # Fetch evaluation runs and details for each evaluation ID
    for eval_id in eval_ids:
        eval_runs = get_eval_runs_list(client, eval_id)

        # Fetch evaluation details using the helper method
        eval_details = get_eval_details(client, eval_id)
        eval_name = eval_details.get('name', f'Unknown Evaluation ({eval_id})')
        eval_id_to_name[eval_id] = eval_name

        # Add evaluation ID to each run for color coding
        for run in eval_runs:
            run['eval_id'] = eval_id
            all_eval_runs.append(run)

    # Combine all evaluation runs into a single DataFrame
    if all_eval_runs:
        df = pd.DataFrame(all_eval_runs)
        df = df[['id', 'name', 'model', 'status', 'pass_percentage', 'error_percentage', 'eval_id']]  # Select relevant columns
        df['eval_name'] = df['eval_id'].map(eval_id_to_name)  # Map eval_id to eval_name
        df['model'] = df['model'].str[:15]  # Truncate model names to 15 characters
        df = df.sort_values(by=['pass_percentage'], ascending=[False])  # Sort by pass_percentage descending

        print("\n" + "=" * 50)
        print("Combined Evaluation Summary")
        print("=" * 50)
        print(df.to_string(index=False, header=["Run ID", "Run Name", "Model", "Status", "Pass Percentage (%)", "Error Percentage (%)", "Evaluation ID", "Evaluation Name"]))
        print("=" * 50)

        # Dynamically adjust the figure height based on the number of rows
        num_rows = len(df)
        fig_height = max(3, num_rows * 0.5)  # Minimum height of 3, scaling by 0.5 per row

        # Create a horizontal bar chart with rows sorted by pass percentage across all eval_ids
        plt.figure(figsize=(12, fig_height))

        # Label each bar with the model name, falling back to the run name when no model is set
        df['display_label'] = df['model'].where(
            (df['model'].str.strip() != '') & (df['model'] != 'None') & (df['model'].notna()),
            df['name']
        )

        plt.barh(
            df['display_label'],
            df['pass_percentage'],
            color=[eval_id_to_color[eval_id] for eval_id in df['eval_id']],
            edgecolor='black'
        )
        plt.xlabel('Pass Percentage (%)')
        plt.ylabel('Model')
        plt.title("Pass Percentage by Model Across Evaluations")
        plt.xlim(0, 100)  # Set x-axis scale explicitly to 0-100
        plt.gca().invert_yaxis()  # Invert y-axis to show the highest percentage at the top
        plt.grid(axis='x', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.show()

        # Process each run to calculate and collect scores for distribution
        # (This part can be slow as we have to page over results for each run, so we parallelize it.)
        all_scores = []
        run_labels = []
        score_summary = []  # To store data for the summary table

        print("=" * 50)
        print("Fetching scores...")
        print("=" * 50)

        futures = {}  # maps future -> (model, eval_id) so we can recover which model powered the run
        with ThreadPoolExecutor(thread_name_prefix="eval-run-fetcher") as pool:
            for _, row in df.iterrows():
                run_id = row['id']
                eval_id = row['eval_id']
                future = pool.submit(get_eval_run_output_items, client, eval_id, run_id)
                futures[future] = (row['model'], eval_id)

            for f in as_completed(futures.keys()):
                try:
                    model, eval_id = futures[f]
                    scores = f.result()
                except Exception as e:
                    print(f"exception fetching future result: {e}")
                    scores = None
                if scores:
                    avg_score = sum(scores) / len(scores)
                    min_score = min(scores)
                    max_score = max(scores)
                    p10 = np.percentile(scores, 10)  # 10th percentile
                    p25 = np.percentile(scores, 25)  # 25th percentile
                    p50 = np.percentile(scores, 50)  # 50th percentile (median)
                    p75 = np.percentile(scores, 75)  # 75th percentile
                    p90 = np.percentile(scores, 90)  # 90th percentile

                    # Collect scores and labels for the combined chart
                    all_scores.append((scores, eval_id_to_color[eval_id]))  # Include color for the subplot
                    run_labels.append(f"{model} ({eval_id_to_name[eval_id]})")  # Include eval name in the label

                    # Add data to the summary table
                    score_summary.append({
                        "Model": model,
                        "Evaluation Name": eval_id_to_name[eval_id],
                        "Average Score": f"{avg_score:.2f}",
                        "Min Score": f"{min_score:.2f}",
                        "Max Score": f"{max_score:.2f}",
                        "10th Percentile": f"{p10:.2f}",
                        "25th Percentile": f"{p25:.2f}",
                        "50th Percentile": f"{p50:.2f}",
                        "75th Percentile": f"{p75:.2f}",
                        "90th Percentile": f"{p90:.2f}"
                    })

        # Display the score summary as a table
        if score_summary:
            score_df = pd.DataFrame(score_summary)
            score_df = score_df.sort_values(by=['Evaluation Name', 'Average Score'], ascending=[True, False])  # Sort by eval name, then average score
            print("\n" + "=" * 50)
            print("Score Summary Table:")
            print(score_df.to_string(index=False))
            print("=" * 50)

        # Plot all score distributions in a single figure with a maximum of 4 graphs per row
        if all_scores:
            num_runs = len(all_scores)
            max_cols = 4  # Maximum number of graphs per row
            num_rows = (num_runs + max_cols - 1) // max_cols  # Calculate the number of rows

            _, axes = plt.subplots(num_rows, max_cols, figsize=(5 * max_cols, 4 * num_rows), sharey=True)
            axes = axes.flatten()  # Flatten the axes array for easier indexing

            for i, ((scores, color), label) in enumerate(zip(all_scores, run_labels)):
                ax = axes[i]
                ax.hist(scores, bins=10, color=color, edgecolor='black')  # Use the eval's color for the histogram
                ax.set_title(label, fontsize=10)  # Include model and evaluation name
                ax.set_xlabel("Score")
                ax.set_ylabel("Frequency")
                ax.set_xlim(x_range[0], x_range[1])  # Fix the x-axis to x_range (defaults to 0-7)
                ax.grid(axis='y', linestyle='--', alpha=0.7)

            # Hide any unused subplots
            for j in range(len(all_scores), len(axes)):
                axes[j].axis('off')

            plt.tight_layout()
            plt.suptitle("Score Distributions for each Model", fontsize=16, y=1.02)
            plt.show()
    else:
        print("\n" + "=" * 50)
        print("No evaluation runs found for the provided Evaluation IDs.")
        print("=" * 50)


def get_eval_run_output_items(client: openai.Client, eval_id: str, run_id: str) -> list:
    """
    Fetch the output items for a specific evaluation run and extract the result scores.

    Args:
        client (openai.Client): The OpenAI client.
        eval_id (str): The evaluation ID.
        run_id (str): The run ID.

    Returns:
        list: A list of scores for the output items.
    """
    scores = []

    try:
        response = client.evals.runs.output_items.list(run_id=run_id, eval_id=eval_id)
        for page in response.iter_pages():
            for item in page.data:
                for result in item.results:
                    score = result.get("score")
                    if score is not None:
                        scores.append(score)
    except Exception as e:
        print(f"Failed to fetch output items for run {run_id}. Error: {e}")

    return scores
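
For context, here is a minimal usage sketch showing how these helpers can be called from a notebook. The Azure OpenAI client setup, environment-variable names, and evaluation IDs below are illustrative placeholders, not values taken from this commit.

# Minimal usage sketch (assumptions: env var names and eval IDs are placeholders).
import os
import openai

# openai.AzureOpenAI subclasses openai.Client, so it satisfies the type hints above.
client = openai.AzureOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version=os.environ["AZURE_OPENAI_API_VERSION"],
)

# Compare two evaluations side by side; the histograms default to a 0-7 score axis,
# which can be overridden via x_range for graders that use a different scale.
display_evaluation_summary(
    client,
    eval_ids=["eval_abc123", "eval_def456"],  # placeholder evaluation IDs
    x_range=(0, 7),
)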
