# Original version from:
# https://raw.githubusercontent.com/azure-ai-foundry/build-2025-demos/refs/heads/main/Azure%20AI%20Model%20Customization/DistillationDemo/scripts/eval_utils.py
import openai
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np  # Import numpy for percentile calculations

from concurrent.futures import ThreadPoolExecutor, as_completed


def get_eval_runs_list(client: openai.Client, eval_id: str) -> list:
    """
    Fetch the list of evaluation runs for a given evaluation ID.

    Args:
        client (openai.Client): Client used to call the Evals API.
        eval_id (str): The evaluation ID.

    Returns:
        list: A list of evaluation runs with their details.
    """
    runs = client.evals.runs.list(eval_id)

    print(f"Get Evaluation Runs: {eval_id}")
    list_runs = []

    if runs:
        for run in runs:
            r = {
                'id': run.id,
                'name': run.name,
                'status': run.status,
                'model': run.model,
            }
            result = run.result_counts.to_dict()
            if result:
                passed = result.get('passed', 0)
                errored = result.get('errored', 0)
                failed = result.get('failed', 0)
                total = result.get('total', 0)
                pass_percentage = (passed * 100) / (passed + failed) if (passed + failed) > 0 else 0
                error_percentage = (errored * 100) / total if total > 0 else 0
                r['pass_percentage'] = pass_percentage
                r['error_percentage'] = error_percentage

            list_runs.append(r)

    return list_runs
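

# Usage sketch (not part of the original script): prints the pass rate of every
# run under a single evaluation. "eval_abc123" is a hypothetical placeholder ID;
# substitute one from your own Evals resource. Note that 'pass_percentage' is
# only present when the run reported result counts.
def _example_print_pass_rates(client: openai.Client) -> None:
    for run in get_eval_runs_list(client, "eval_abc123"):  # hypothetical eval ID
        pass_pct = run.get('pass_percentage')
        label = f"{pass_pct:.1f}% passed" if pass_pct is not None else "no result counts"
        print(f"{run['name']} [{run['status']}]: {label}")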


def get_eval_details(client: openai.Client, eval_id: str) -> dict:
    """
    Fetch the details of a specific evaluation.

    Args:
        client (openai.Client): Client used to call the Evals API.
        eval_id (str): The evaluation ID.

    Returns:
        dict: A dictionary containing evaluation details, including the name.
    """
    try:
        evaluation = client.evals.retrieve(eval_id)
        return evaluation.to_dict()
    except Exception as e:
        print(f"Failed to fetch evaluation details for ID: {eval_id}. Error: {e}")
        return {"name": f"Unknown Evaluation ({eval_id})"}
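

# Usage sketch (not part of the original script): prints the human-readable name
# of an evaluation, falling back to the "Unknown Evaluation" placeholder returned
# above when the lookup fails. "eval_abc123" is a hypothetical ID.
def _example_print_eval_name(client: openai.Client) -> None:
    details = get_eval_details(client, "eval_abc123")  # hypothetical eval ID
    print(details.get("name", "<unnamed>"))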


def display_evaluation_summary(client: openai.Client, eval_ids: list, x_range=(0, 7)):
    """
    Fetch and display a summary of evaluation runs for a list of evaluation IDs, including a horizontal bar chart,
    average score, and score distribution for all runs in a single chart with a maximum of 4 graphs per row.

    Args:
        client (openai.Client): Client used to call the Evals API.
        eval_ids (list): A list of evaluation IDs.
        x_range (tuple): The (min, max) x-axis range for the score histograms. Defaults to (0, 7).
    """
    all_eval_runs = []
    eval_id_to_name = {}
    eval_id_to_color = {}

    # Assign unique colors for each evaluation ID
    colors = plt.cm.tab10.colors  # Use a colormap for distinct colors
    for i, eval_id in enumerate(eval_ids):
        eval_id_to_color[eval_id] = colors[i % len(colors)]

    # Fetch evaluation runs and details for each evaluation ID
    for eval_id in eval_ids:
        eval_runs = get_eval_runs_list(client, eval_id)

        # Fetch evaluation details using the helper method
        eval_details = get_eval_details(client, eval_id)
        eval_name = eval_details.get('name', f'Unknown Evaluation ({eval_id})')
        eval_id_to_name[eval_id] = eval_name

        # Add evaluation ID to each run for color coding
        for run in eval_runs:
            run['eval_id'] = eval_id
            all_eval_runs.append(run)

    # Combine all evaluation runs into a single DataFrame
    if all_eval_runs:
        df = pd.DataFrame(all_eval_runs)
        df = df[['id', 'name', 'model', 'status', 'pass_percentage', 'error_percentage', 'eval_id']]  # Select relevant columns
        df['eval_name'] = df['eval_id'].map(eval_id_to_name)  # Map eval_id to eval_name
        df['model'] = df['model'].str[:15]  # Truncate model names to 15 characters
        df = df.sort_values(by=['pass_percentage'], ascending=[False])  # Sort by pass_percentage descending

        print("\n" + "=" * 50)
        print("Combined Evaluation Summary")
        print("=" * 50)
        print(df.to_string(index=False, header=["Run ID", "Run Name", "Model", "Status", "Pass Percentage (%)", "Error Percentage (%)", "Evaluation ID", "Evaluation Name"]))
        print("=" * 50)

        # Dynamically adjust the figure height based on the number of rows
        num_rows = len(df)
        fig_height = max(3, num_rows * 0.5)  # Minimum height of 3, scaled by 0.5 per row

        # Create a horizontal bar chart with rows sorted by pass percentage across all eval_ids
        plt.figure(figsize=(12, fig_height))

        df['display_label'] = df['model'].where(
            (df['model'].str.strip() != '') & (df['model'] != 'None') & (df['model'].notna()),
            df['name']
        )

        plt.barh(
            df['display_label'],
            df['pass_percentage'],
            color=[eval_id_to_color[eval_id] for eval_id in df['eval_id']],
            edgecolor='black'
        )
        plt.xlabel('Pass Percentage (%)')
        plt.ylabel('Model')
        plt.title("Pass Percentage by Model Across Evaluations")
        plt.xlim(0, 100)  # Set x-axis scale explicitly to 0-100
        plt.gca().invert_yaxis()  # Invert y-axis to show the highest percentage at the top
        plt.grid(axis='x', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.show()

        # Process each run to calculate and collect scores for distribution
        # (This part can be slow as we have to page over results for each run, so we parallelize this.)
        all_scores = []
        run_labels = []
        score_summary = []  # To store data for the summary table

        print("=" * 50)
        print("Fetching scores...")
        print("=" * 50)

        futures = {}  # Maps each future to (model, eval_id) so we can easily access which model powered the run.
        with ThreadPoolExecutor(thread_name_prefix="eval-run-fetcher") as pool:
            for _, row in df.iterrows():
                run_id = row['id']
                eval_id = row['eval_id']
                future = pool.submit(get_eval_run_output_items, client, eval_id, run_id)
                futures[future] = (row['model'], eval_id)

            for f in as_completed(futures.keys()):
                try:
                    model, eval_id = futures[f]
                    scores = f.result()
                except Exception as e:
                    print(f"Exception fetching future result: {e}")
                    scores = None
                if scores:
                    avg_score = sum(scores) / len(scores)
                    min_score = min(scores)
                    max_score = max(scores)
                    p10 = np.percentile(scores, 10)  # 10th percentile
                    p25 = np.percentile(scores, 25)  # 25th percentile
                    p50 = np.percentile(scores, 50)  # 50th percentile (median)
                    p75 = np.percentile(scores, 75)  # 75th percentile
                    p90 = np.percentile(scores, 90)  # 90th percentile

                    # Collect scores and labels for the combined chart
                    all_scores.append((scores, eval_id_to_color[eval_id]))  # Include color for the subplot
                    run_labels.append(f"{model} ({eval_id_to_name[eval_id]})")  # Include eval name in the label

                    # Add data to the summary table
                    score_summary.append({
                        "Model": model,
                        "Evaluation Name": eval_id_to_name[eval_id],
                        "Average Score": f"{avg_score:.2f}",
                        "Min Score": f"{min_score:.2f}",
                        "Max Score": f"{max_score:.2f}",
                        "10th Percentile": f"{p10:.2f}",
                        "25th Percentile": f"{p25:.2f}",
                        "50th Percentile": f"{p50:.2f}",
                        "75th Percentile": f"{p75:.2f}",
                        "90th Percentile": f"{p90:.2f}"
                    })

        # Display the score summary as a table
        if score_summary:
            score_df = pd.DataFrame(score_summary)
            score_df = score_df.sort_values(by=['Evaluation Name', 'Average Score'], ascending=[True, False])  # Sort by eval name, then average score
            print("\n" + "=" * 50)
            print("Score Summary Table:")
            print(score_df.to_string(index=False))
            print("=" * 50)

        # Plot all score distributions in a single chart with a maximum of 4 graphs per row
        if all_scores:
            num_runs = len(all_scores)
            max_cols = 4  # Maximum number of graphs per row
            num_rows = (num_runs + max_cols - 1) // max_cols  # Calculate the number of rows

            _, axes = plt.subplots(num_rows, max_cols, figsize=(5 * max_cols, 4 * num_rows), sharey=True)
            axes = axes.flatten()  # Flatten the axes array for easier indexing

            for i, ((scores, color), label) in enumerate(zip(all_scores, run_labels)):
                ax = axes[i]
                ax.hist(scores, bins=10, color=color, edgecolor='black')  # Use the evaluation's color for the histogram
                ax.set_title(label, fontsize=10)  # Include model and evaluation name
                ax.set_xlabel("Score")
                ax.set_ylabel("Frequency")
                ax.set_xlim(x_range[0], x_range[1])  # Fix the x-axis to the provided range (default 0-7)
                ax.grid(axis='y', linestyle='--', alpha=0.7)

            # Hide any unused subplots
            for j in range(len(all_scores), len(axes)):
                axes[j].axis('off')

            plt.tight_layout()
            plt.suptitle("Score Distributions for each Model", fontsize=16, y=1.02)
            plt.show()
    else:
        print("\n" + "=" * 50)
        print("No evaluation runs found for the provided Evaluation IDs.")
        print("=" * 50)
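

# Usage sketch (not part of the original script): wires up a client and renders
# the combined summary for two hypothetical evaluation IDs. openai.OpenAI() reads
# OPENAI_API_KEY from the environment; swap in whatever client construction your
# Azure AI Foundry / OpenAI resource requires, and replace the placeholder IDs.
def _example_display_summary() -> None:
    client = openai.OpenAI()  # assumes credentials are configured via the environment
    display_evaluation_summary(
        client,
        eval_ids=["eval_abc123", "eval_def456"],  # hypothetical eval IDs
        x_range=(0, 7),  # histogram x-axis range; the default assumes scores fall in 0-7
    )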


def get_eval_run_output_items(client: openai.Client, eval_id: str, run_id: str) -> list:
    """
    Fetch the output items for a specific evaluation run and extract the result scores.

    Args:
        client (openai.Client): Client used to call the Evals API.
        eval_id (str): The evaluation ID.
        run_id (str): The run ID.

    Returns:
        list: A list of scores for the output items.
    """
    scores = []

    try:
        response = client.evals.runs.output_items.list(run_id=run_id, eval_id=eval_id)
        for page in response.iter_pages():
            for item in page.data:
                for result in item.results:
                    score = result.get("score")
                    if score is not None:
                        scores.append(score)
    except Exception as e:
        print(f"Failed to fetch output items for run {run_id}. Error: {e}")

    return scores
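

# Usage sketch (not part of the original script): pulls the per-item scores for a
# single run and prints a quick summary, mirroring the percentile statistics that
# display_evaluation_summary tabulates. Both IDs are hypothetical placeholders.
def _example_run_score_stats(client: openai.Client) -> None:
    scores = get_eval_run_output_items(client, "eval_abc123", "evalrun_xyz789")  # hypothetical IDs
    if scores:
        print(
            f"n={len(scores)}, mean={np.mean(scores):.2f}, "
            f"p10={np.percentile(scores, 10):.2f}, "
            f"median={np.percentile(scores, 50):.2f}, "
            f"p90={np.percentile(scores, 90):.2f}"
        )
    else:
        print("No scores returned for this run.")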