admin-healthelic committed on
Commit e544f11 · verified · 1 Parent(s): 2f93c3b

Update app.py

Files changed (1)
  1. app.py +56 -60
app.py CHANGED
@@ -2,6 +2,8 @@ import os
 import json
 import re
 import uuid
+import io
+import csv
 from datetime import datetime
 import openai
 import gradio as gr
@@ -31,12 +33,9 @@ GRADER_MODEL = "gpt-4o-mini"
 openai.api_key = os.getenv("OPENAI_API_KEY")
 genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
 
-# Models that only support default temperature (don’t allow setting temperature manually)
+# Models that only support default temperature
 MODEL_DEFAULT_TEMP = ["o4-mini"]
 
-# Local JSON file for storing runs
-RUNS_FILE = "/data/runs2.json"
-
 # -------------------------
 # Helper to read JSONL
 # -------------------------
@@ -67,9 +66,9 @@ class HealthBenchEval:
 
         self.scores = []
         self.htmls = ""
-        self.sample_records = []  # <-- store per-sample rows
+        self.sample_records = []
         self.seed = seed
-        self.eval_id = str(uuid.uuid4())  # unique id per evaluation run
+        self.eval_id = str(uuid.uuid4())
 
     def score_with_grader(self, prompt_text, completion_text, example_index):
         prompt = f"""
@@ -96,10 +95,7 @@ Return only a number between 0 and 1.
         return 0.0
 
     def generate_with_candidate(self, candidate_model, system_prompt, prompt_text, example_index, max_tokens=1024):
-        """
-        Generate completion with retry logic and better error logging.
-        """
-        for attempt in range(3):  # retry up to 3 times
+        for attempt in range(3):
             try:
                 if candidate_model.startswith("gemini"):
                     model = genai.GenerativeModel(candidate_model)
@@ -119,7 +115,6 @@ Return only a number between 0 and 1.
                 messages.append({"role": "system", "content": system_prompt})
                 messages.append({"role": "user", "content": prompt_text})
 
-                # Skip temperature for models that don't support it
                 if candidate_model in MODEL_DEFAULT_TEMP:
                     resp = openai.chat.completions.create(
                         model=candidate_model,
@@ -134,7 +129,6 @@ Return only a number between 0 and 1.
                         max_completion_tokens=max_tokens
                     )
                 completion = resp.choices[0].message.content
-                print(resp)
 
                 return completion.strip() if hasattr(completion, "strip") else completion
@@ -142,7 +136,7 @@ Return only a number between 0 and 1.
                 print(f"[ERROR] Candidate model {candidate_model} failed at dataset index {example_index} (attempt {attempt+1}/3)")
                 print(f"Prompt text: {prompt_text[:200]}...")
                 print(f"Error: {e}")
-                if attempt == 2:  # after last attempt
+                if attempt == 2:
                     return f"[ERROR after 3 retries: {str(e)}]"
 
     def __call__(self, candidate_model, system_prompt, eval_subset=""):
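The deleted docstring was the only statement of the retry contract, so for the record: generate_with_candidate still tries up to three times and, rather than raising, returns an "[ERROR ...]" string after the final failure. A minimal sketch of the same pattern, with hypothetical names:

    def call_with_retries(fn, attempts=3):
        # Try fn() up to `attempts` times; on the last failure return an
        # error marker string instead of raising, mirroring the hunk above.
        for attempt in range(attempts):
            try:
                return fn()
            except Exception as e:
                if attempt == attempts - 1:
                    return f"[ERROR after {attempts} retries: {e}]"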
@@ -150,21 +144,19 @@ Return only a number between 0 and 1.
 
         cumulative_total = 0.0
         for i, example in enumerate(self.dataset):
-            dataset_index = self.indices[i]  # actual dataset row index
+            dataset_index = self.indices[i]
             prompt_obj = example.get("prompt", [])
             prompt_text = " ".join([m.get("content", "") for m in prompt_obj])
 
             completion_text = self.generate_with_candidate(candidate_model, system_prompt, prompt_text, dataset_index)
             score = self.score_with_grader(prompt_text, completion_text, dataset_index)
 
-            # update running totals (per eval_id)
            cumulative_total += score
             cumulative_avg = cumulative_total / (i + 1)
 
             self.scores.append(score)
             html_lines.append(f"<li>Dataset Row {dataset_index}: Score = {score:.3f}</li>")
 
-            # create individual sample record
             self.sample_records.append({
                 "eval_id": self.eval_id,
                 "timestamp": datetime.utcnow().isoformat(),
@@ -184,19 +176,9 @@ Return only a number between 0 and 1.
         return self
 
 # -------------------------
-# Helper to generate HTML table from runs
+# Helpers
 # -------------------------
-def generate_runs_html():
-    runs = []
-    if os.path.exists(RUNS_FILE):
-        try:
-            with open(RUNS_FILE, "r", encoding="utf-8") as f:
-                runs = json.load(f)
-            if not isinstance(runs, list):
-                runs = []
-        except (json.JSONDecodeError, ValueError):
-            runs = []
-
+def generate_runs_html(runs):
     if runs:
         table_rows = ""
         for r in reversed(runs):
@@ -244,24 +226,40 @@ def generate_runs_html():
         """
     else:
         runs_html = "<p>No evaluations yet.</p>"
-
     return runs_html
 
-# -------------------------
-# Clear runs file
-# -------------------------
-def clear_runs():
-    with open(RUNS_FILE, "w", encoding="utf-8") as f:
-        json.dump([], f, indent=2)
-    return "<p>No evaluations yet.</p>"
+def generate_csv(runs):
+    if not runs:
+        return None
+    output = io.StringIO()
+    fieldnames = ["eval_id", "timestamp", "candidate_model", "system_prompt", "eval_subset",
+                  "seed", "dataset_index", "prompt_text", "completion_text", "score",
+                  "cumulative_total", "cumulative_avg"]
+    writer = csv.DictWriter(output, fieldnames=fieldnames)
+    writer.writeheader()
+    for run in runs:
+        writer.writerow(run)
+    csv_data = output.getvalue()
+    output.close()
+    return csv_data
+
+def prepare_download(runs):
+    csv_data = generate_csv(runs)
+    if not csv_data:
+        return None
+    filename = f"eval_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
+    filepath = os.path.join("/tmp", filename)
+    with open(filepath, "w", encoding="utf-8") as f:
+        f.write(csv_data)
+    return filepath
 
 # -------------------------
 # Gradio UI function
 # -------------------------
-def run_eval_ui(candidate_model, system_prompt, eval_subset, num_examples, seed):
+def run_eval_ui(candidate_model, system_prompt, eval_subset, num_examples, seed, runs):
     dataset_file = DATASET_FILES.get(eval_subset)
     if not dataset_file:
-        return "<p style='color:red'>Invalid dataset</p>", {}, generate_runs_html()
+        return "<p style='color:red'>Invalid dataset</p>", {}, generate_runs_html(runs), runs
 
     seed_val = int(seed) if seed else None
     num_val = int(num_examples) if num_examples else None
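One stdlib behavior worth flagging in generate_csv: csv.DictWriter defaults to extrasaction="raise", so writer.writerow(run) raises ValueError if a record carries any key outside fieldnames, while missing keys are silently filled with the empty-string restval. A self-contained check of that behavior (field list copied from the hunk above):

    import csv
    import io

    fieldnames = ["eval_id", "timestamp", "candidate_model", "system_prompt",
                  "eval_subset", "seed", "dataset_index", "prompt_text",
                  "completion_text", "score", "cumulative_total", "cumulative_avg"]

    out = io.StringIO()
    writer = csv.DictWriter(out, fieldnames=fieldnames)
    writer.writeheader()
    # Missing keys are written as "" (restval); an extra key would raise ValueError.
    writer.writerow({"eval_id": "demo", "score": 1.0})
    print(out.getvalue())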
@@ -269,23 +267,8 @@ def run_eval_ui(candidate_model, system_prompt, eval_subset, num_examples, seed)
     eval_obj = HealthBenchEval(dataset_file, num_examples=num_val, seed=seed_val)
     result = eval_obj(candidate_model, system_prompt, eval_subset=eval_subset)
 
-    # Load existing runs
-    runs = []
-    if os.path.exists(RUNS_FILE):
-        try:
-            with open(RUNS_FILE, "r", encoding="utf-8") as f:
-                runs = json.load(f)
-            if not isinstance(runs, list):
-                runs = []
-        except (json.JSONDecodeError, ValueError):
-            runs = []
-
     runs.extend(result.sample_records)
-
-    with open(RUNS_FILE, "w", encoding="utf-8") as f:
-        json.dump(runs, f, indent=2)
-
-    runs_html = generate_runs_html()
+    runs_html = generate_runs_html(runs)
 
     metrics = {
         "eval_id": result.eval_id,
@@ -295,7 +278,10 @@ def run_eval_ui(candidate_model, system_prompt, eval_subset, num_examples, seed)
         "seed": seed_val
     }
 
-    return result.htmls, metrics, runs_html
+    return result.htmls, metrics, runs_html, runs
+
+def clear_runs():
+    return "<p>No evaluations yet.</p>", []
 
 # -------------------------
 # Gradio UI
@@ -328,22 +314,32 @@ def ui():
 
         output_html = gr.HTML(label="Evaluation Report")
         output_metrics = gr.JSON(label="Metrics JSON")
-        output_all_runs = gr.HTML(label="Evaluation History", value=generate_runs_html())
+        output_all_runs = gr.HTML(label="Evaluation History")
+        session_runs = gr.State([])
 
         with gr.Row():
             clear_btn = gr.Button("Clear History")
+            download_btn = gr.DownloadButton(
+                label="Download CSV",
+                variant="secondary"
+            )
 
-        # Connect buttons
         run_btn.click(
             fn=run_eval_ui,
-            inputs=[candidate_model, system_prompt, eval_subset, num_examples, seed],
-            outputs=[output_html, output_metrics, output_all_runs]
+            inputs=[candidate_model, system_prompt, eval_subset, num_examples, seed, session_runs],
+            outputs=[output_html, output_metrics, output_all_runs, session_runs]
         )
 
         clear_btn.click(
             fn=clear_runs,
             inputs=[],
-            outputs=[output_all_runs]
+            outputs=[output_all_runs, session_runs]
+        )
+
+        download_btn.click(
+            fn=prepare_download,
+            inputs=[session_runs],
+            outputs=[download_btn]
         )
 
     return demo
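Taken together, the commit moves run history from the shared /data/runs2.json file into a per-session gr.State list and exports it through gr.DownloadButton, so history is now isolated per browser session and is lost on reload. A minimal, self-contained sketch of the same wiring, with hypothetical names (assumes a Gradio version that provides gr.DownloadButton, i.e. 4.x or later):

    import os
    import tempfile

    import gradio as gr

    def add_run(runs):
        # State is passed in as an input and returned as an output;
        # each browser session gets its own copy of the list.
        runs = runs + [f"run {len(runs) + 1}"]
        return "<br>".join(runs), runs

    def export_csv(runs):
        # Write a temp file; returning its path to the DownloadButton
        # sets the file the button serves.
        path = os.path.join(tempfile.gettempdir(), "runs.csv")
        with open(path, "w", encoding="utf-8") as f:
            f.write("run\n" + "\n".join(runs))
        return path

    with gr.Blocks() as demo:
        history = gr.HTML()
        runs_state = gr.State([])
        add_btn = gr.Button("Add run")
        dl_btn = gr.DownloadButton("Download CSV")
        add_btn.click(add_run, inputs=[runs_state], outputs=[history, runs_state])
        dl_btn.click(export_csv, inputs=[runs_state], outputs=[dl_btn])

    demo.launch()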