import logging
import sys
from dataclasses import dataclass, make_dataclass
from enum import Enum
import numpy as np
from src.display.formatting import make_clickable_model, model_hyperlink
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
level=logging.INFO,
)
def fields(raw_class):
return [
v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"
]
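# Illustrative note: `fields` plays the role of dataclasses.fields for the classes built
# below; e.g. `[c.name for c in fields(AutoEvalColumn)]` (assuming AutoEvalColumn, defined
# further down) lists every display column.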
### The Models (System-Under-Study, SUT) we're evaluating. ###
class ModelType(Enum):
    BASE = "🔥 Base"
    SFT = "⭕ SFT"
    PREFERENCE_ALIGNED = "♦️ Preference-aligned"
    UNKNOWN = "❓ Unknown"
class Multilingual(Enum):
    MONOLINGUAL = "🟠 Monolingual"
    MULTILINGUAL = "🟢 Multilingual"
    SEA = "🔵 SEA-Focused"
    UNKNOWN = "❓ Unknown"
@dataclass
class ModelSUT:
# fmt: off
    param_size: float  # Number of parameters in billions (-1 if unknown)
    model_type: str  # Model type: Base, SFT, or Preference-aligned
    multilingual: str  # Multilinguality: Monolingual, SEA-Focused, or Multilingual
# fmt: on
model_registry = {
# fmt: off
"gpt-4o-2024-08-06": ModelSUT(param_size=-1, model_type=ModelType.UNKNOWN.value, multilingual=Multilingual.MULTILINGUAL.value),
"gpt-4o-mini": ModelSUT(param_size=-1, model_type=ModelType.UNKNOWN.value, multilingual=Multilingual.MULTILINGUAL.value),
"aisingapore/gemma2-9b-cpt-sea-lionv3-instruct": ModelSUT(param_size=9, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
"aisingapore/llama3.1-8b-cpt-sea-lionv3-instruct": ModelSUT(param_size=8, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
"aisingapore/Llama-SEA-LION-v3-70B-IT": ModelSUT(param_size=70, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
"google/gemma-2-9b-it": ModelSUT(param_size=9, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"google/gemma-2-27b-it": ModelSUT(param_size=27, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"google/gemma-3-27b-it": ModelSUT(param_size=27, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"google/gemma-3-12b-it": ModelSUT(param_size=12, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"sail/Sailor2-20B-Chat": ModelSUT(param_size=20, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.SEA.value),
"sail/Sailor2-8B-Chat": ModelSUT(param_size=8, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.SEA.value),
"Qwen/Qwen2.5-72B-Instruct": ModelSUT(param_size=72, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"Qwen/Qwen2.5-32B-Instruct": ModelSUT(param_size=32, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"Qwen/Qwen2.5-14B-Instruct": ModelSUT(param_size=14, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"Qwen/Qwen2.5-7B-Instruct": ModelSUT(param_size=7, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"Qwen/Qwen3-32B": ModelSUT(param_size=32, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
"Qwen/Qwen3-14B": ModelSUT(param_size=14, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
"Qwen/Qwen3-8B": ModelSUT(param_size=8, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
"Qwen/Qwen3-4B": ModelSUT(param_size=4, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
"aisingapore/Llama-SEA-LION-v3.5-70B-R": ModelSUT(param_size=70, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
"aisingapore/Llama-SEA-LION-v3.5-8B-R": ModelSUT(param_size=8, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
"CohereLabs/c4ai-command-a-03-2025": ModelSUT(param_size=111, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
"CohereLabs/c4ai-command-r7b-12-2024": ModelSUT(param_size=7, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
"SeaLLMs/SeaLLMs-v3-1.5B-Chat": ModelSUT(param_size=1.5, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
"SeaLLMs/SeaLLMs-v3-7B-Chat": ModelSUT(param_size=7, model_type=ModelType.SFT.value, multilingual=Multilingual.SEA.value),
"mistralai/Ministral-8B-Instruct-2410": ModelSUT(param_size=8, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"mistralai/Mixtral-8x7B-Instruct-v0.1": ModelSUT(param_size=47, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"Tower-Babel/Babel-9B-Chat": ModelSUT(param_size=9, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"Tower-Babel/Babel-83B-Chat": ModelSUT(param_size=83, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"Tower-Babel/Babel-83B": ModelSUT(param_size=83, model_type=ModelType.BASE.value, multilingual=Multilingual.MULTILINGUAL.value),
"meta-llama/Llama-3.1-8B-Instruct": ModelSUT(param_size=8, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"meta-llama/Llama-3.1-70B-Instruct": ModelSUT(param_size=70, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": ModelSUT(param_size=400, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
"meta-llama/Llama-4-Scout-17B-16E-Instruct": ModelSUT(param_size=109, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
"mistralai/Mixtral-8x22B-Instruct-v0.1": ModelSUT(param_size=141, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"CohereForAI/aya-expanse-32b": ModelSUT(param_size=32, model_type=ModelType.PREFERENCE_ALIGNED.value, multilingual=Multilingual.MULTILINGUAL.value),
"neulab/Pangea-7B": ModelSUT(param_size=7, model_type=ModelType.SFT.value, multilingual=Multilingual.MULTILINGUAL.value),
"HuggingFaceTB/SmolLM-1.7B-Instruct": ModelSUT(param_size=1.7, model_type=ModelType.SFT.value, multilingual=Multilingual.MONOLINGUAL.value),
# fmt: on
}
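# Illustrative lookup (a sketch mirroring EvalResult.to_dict below): models missing from
# the registry fall back to a placeholder entry with unknown size/type/multilinguality.
#
#   details = model_registry.get(
#       "some-org/some-model",  # hypothetical model id, not in the registry
#       ModelSUT(param_size=-1, model_type=ModelType.UNKNOWN.value, multilingual=Multilingual.UNKNOWN.value),
#   )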
### The Task and Tasks classes store information about each benchmark we're scoring. ###
class TaskCategory(Enum):
    CULTURAL_KNOWLEDGE = "🌏 Cultural Knowledge"
    CLASSICAL_NLP = "🏛️ Classical NLP"
    READING_COMPREHENSION = "📖 Reading Comprehension"
    TRANSLATION = "📢 Generation"
@dataclass
class Task:
benchmark: str # benchmark name in the results file
metric: str # metric to display
col_name: str # column name to display
language: str # language being evaluated
    category: TaskCategory  # choice between different task categories
num_samples: int # canonical number of examples
class Tasks(Enum):
# fmt: off
    balita_tgl_mcf = Task("balita_tgl_mcf", "acc_", "🏛️ BalitaNLP", "tgl", TaskCategory.CLASSICAL_NLP, 35_177)
    belebele_ceb_mcf = Task("belebele_ceb_mcf", "acc_", "📖 Belebele (ceb)", "ceb", TaskCategory.READING_COMPREHENSION, 900)
    belebele_fil_mcf = Task("belebele_fil_mcf", "acc_", "📖 Belebele (fil)", "fil", TaskCategory.READING_COMPREHENSION, 900)
    cebuaner_ceb_mcf = Task("cebuaner_ceb_mcf", "acc_", "🏛️ CebuaNER", "ceb", TaskCategory.CLASSICAL_NLP, 1310)
    dengue_filipino_fil = Task("dengue_filipino_fil:_average", "acc_norm", "🏛️ Dengue", "fil", TaskCategory.CLASSICAL_NLP, 4015)
    firecs_fil_mcf = Task("firecs_fil_mcf", "acc_", "🏛️ FiReCS", "fil", TaskCategory.CLASSICAL_NLP, 7340)
    global_mmlu_all_tgl = Task("global_mmlu_all_tgl_mcf:_average", "acc_", "🌏 Global-MMLU", "tgl", TaskCategory.CULTURAL_KNOWLEDGE, 14_042)
    include_tgl_mcf = Task("include_tgl_mcf:_average", "acc_", "🌏 INCLUDE", "tgl", TaskCategory.CULTURAL_KNOWLEDGE, 500)
    kalahi_tgl_mcf = Task("kalahi_tgl_mcf", "acc_", "🌏 KALAHI", "tgl", TaskCategory.CULTURAL_KNOWLEDGE, 150)
    newsphnli_fil_mcf = Task("newsphnli_fil_mcf", "acc_", "📖 NewsPH NLI", "fil", TaskCategory.READING_COMPREHENSION, 90_000)
    ntrex128_fil = Task("ntrex128_fil", "rougeL", "📢 NTREX-128", "fil", TaskCategory.TRANSLATION, 1997)
    readability_ceb_mcf = Task("readability_ceb_mcf", "acc_", "📖 Readability (ceb)", "ceb", TaskCategory.READING_COMPREHENSION, 350)
    sib200_ceb_mcf = Task("sib200_ceb_mcf", "acc_", "🏛️ SIB-200 (ceb)", "ceb", TaskCategory.CLASSICAL_NLP, 99)
    sib200_tgl_mcf = Task("sib200_tgl_mcf", "acc_", "🏛️ SIB-200 (tgl)", "tgl", TaskCategory.CLASSICAL_NLP, 99)
    # stingraybench_corr_tgl_mcf = Task("stingraybench_correctness_tgl_mcf", "acc_", "StingrayBench (Correctness)", "tgl", TaskCategory.CULTURAL_KNOWLEDGE, 100)
    stingraybench_sem_appropriateness_tgl_mcf = Task("stingraybench_semantic_appropriateness_tgl_mcf", "acc_", "🌏 StingrayBench", "tgl", TaskCategory.CULTURAL_KNOWLEDGE, 100)
    tatoeba_ceb = Task("tatoeba_ceb", "rougeL", "📢 Tatoeba (ceb)", "ceb", TaskCategory.TRANSLATION, 377)
    tatoeba_tgl = Task("tatoeba_tgl", "rougeL", "📢 Tatoeba (tgl)", "tgl", TaskCategory.TRANSLATION, 2499)
    tico19_tgl = Task("tico19_tgl", "rougeL", "📢 TICO-19", "tgl", TaskCategory.TRANSLATION, 971)
    tlunifiedner_tgl_mcf = Task("tlunifiedner_tgl_mcf", "acc_", "🏛️ TLUnified NER", "tgl", TaskCategory.CLASSICAL_NLP, 1579)
    universalner_ceb_mcf = Task("universalner_ceb_mcf", "acc_", "🏛️ Universal NER (ceb)", "ceb", TaskCategory.CLASSICAL_NLP, 49)
    universalner_tgl_mcf = Task("universalner_tgl_mcf", "acc_", "🏛️ Universal NER (tgl)", "tgl", TaskCategory.CLASSICAL_NLP, 56)
# fmt: on
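# Each Tasks member ties together the benchmark key expected in a results file, the metric
# to read from it, the column header shown in the leaderboard, the evaluated language, the
# task category, and the number of examples used for sample-weighted aggregation (see
# EvalResult.compute_aggregate_results below).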
### These classes define how the columns will be represented ###
@dataclass(frozen=True)
class ColumnContent:
name: str
type: str
displayed_by_default: bool
hidden: bool = False
never_hidden: bool = False
aggregate: bool = False
meta: bool = False
auto_eval_cols = [
# fmt: off
["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True, meta=True)],
["average", ColumnContent, ColumnContent("Average β¬οΈ", "number", True, meta=True)],
["precision", ColumnContent, ColumnContent("Precision", "str", False, meta=True)],
["param_size", ColumnContent, ColumnContent("# Parameters", "number", False, meta=True)],
["multilingual", ColumnContent, ColumnContent("Multilingual", "markdown", False, meta=True)],
["model_type", ColumnContent, ColumnContent("Model Type", "markdown", False, meta=True)],
["is_submission", ColumnContent, ColumnContent("Submission", "boolean", False, meta=True)],
["submission_date", ColumnContent, ColumnContent("Submission Date", "str", False, meta=True)],
# fmt: on
]
for task in Tasks:
auto_eval_cols.append(
[task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)]
)
for task_category in TaskCategory:
auto_eval_cols.append(
[
task_category.name,
ColumnContent,
ColumnContent(task_category.value, "number", True, aggregate=True),
]
)
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_cols, frozen=True)
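# AutoEvalColumn is generated at import time: the metadata columns above, one column per
# Tasks member, and one aggregate column per TaskCategory. Illustrative access (the same
# pattern EvalResult.to_dict uses below): AutoEvalColumn.average.name == "Average ⬆️" and
# AutoEvalColumn.model.never_hidden is True.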
### These classes define how a single model evaluation result will be represented ###
@dataclass
class ModelDetails:
name: str
display_name: str = ""
symbol: str = "" # emoji
class Precision(Enum):
float16 = ModelDetails("float16")
bfloat16 = ModelDetails("bfloat16")
Unknown = ModelDetails("?")
    @staticmethod
    def from_str(precision):
if precision in ["torch.float16", "float16"]:
return Precision.float16
if precision in ["torch.bfloat16", "bfloat16"]:
return Precision.bfloat16
return Precision.Unknown
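# Illustrative behaviour of Precision.from_str: "torch.bfloat16" and "bfloat16" both map to
# Precision.bfloat16; any other dtype string falls back to Precision.Unknown.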
@dataclass
class EvalResult:
"""Represent one full model evaluation."""
eval_name: str
full_model: str
org: str
model: str
results: dict
average: float
aggregate_results: dict
precision: Precision = Precision.Unknown
# Submission metadata
is_submission: bool = False
param_size: float = -1
model_type: str = ModelType.UNKNOWN.value
multilingual: str = Multilingual.UNKNOWN.value
submission_date: str = ""
model_url: str = "https://huggingface.co/spaces/UD-Filipino/filbench-leaderboard"
@classmethod
    def init_from_dict(cls, data: dict, is_submission: bool = False) -> "EvalResult":
"""Populate results from a dictionary"""
# For model details, use user-provided metadata if it's a submission
config_key = "display_metadata" if is_submission else "config"
config = data.get(config_key)
precision = Precision.from_str(config.get("model_dtype"))
org_and_model = (
config.get("hf_id")
if is_submission
else config.get("model_name", config.get("model_args", None))
)
org_and_model = org_and_model.split("/", 1)
if len(org_and_model) == 1:
org = None
model = org_and_model[0]
result_key = f"{model}_{precision.value.name}"
else:
org = org_and_model[0]
model = org_and_model[1]
result_key = f"{org}_{model}_{precision.value.name}"
full_model = "/".join(org_and_model)
results = EvalResult.compute_scores_per_benchmark(data.get("results"))
aggregate_results = EvalResult.compute_aggregate_results(results)
filbench_score = np.mean(list(aggregate_results.values()))
# Format all results
if is_submission:
# Use pre-computed scores and check if they match our computed scores
category_scores = data.get("category_scores")
aggregate_results_precomputed = {
TaskCategory.CULTURAL_KNOWLEDGE.value: category_scores.get(
"CULTURAL_KNOWLEDGE"
),
TaskCategory.CLASSICAL_NLP.value: category_scores.get("CLASSICAL_NLP"),
TaskCategory.READING_COMPREHENSION.value: category_scores.get(
"READING_COMPREHENSION"
),
TaskCategory.TRANSLATION.value: category_scores.get("GENERATION"),
}
is_similar = EvalResult.compare_category_scores(
precomputed=aggregate_results_precomputed,
computed=aggregate_results,
)
if not is_similar:
logging.warning("Precomputed and computed category scores differ.")
logging.info("Will use computed scores for display.")
else:
logging.info("Precomputed and computed category scores are similar.")
aggregate_results = aggregate_results_precomputed
# Do the same comparison for FilBench score
filbench_score_precomputed = data.get("filbench_score")
is_filbench_score_similar = (
abs(filbench_score_precomputed - filbench_score) < 1e-2
)
if not is_filbench_score_similar:
logging.warning(
f"Precomputed filbench_score ({filbench_score_precomputed}) and"
f" official FilBench score ({filbench_score}) differ."
)
average = (
filbench_score_precomputed
if is_filbench_score_similar
else filbench_score
)
display_metadata = data.get("display_metadata")
return EvalResult(
eval_name=result_key,
full_model=full_model,
org=org,
model=model,
precision=precision,
results=results,
aggregate_results=aggregate_results,
average=average,
# Display Metadata
is_submission=True,
submission_date=display_metadata.get("submission_date", ""),
param_size=display_metadata.get("num_params", -1),
model_type=display_metadata.get("model_type", ModelType.UNKNOWN.value),
multilingual=display_metadata.get(
"multilinguality", Multilingual.UNKNOWN.value
),
model_url=display_metadata.get(
"url",
"https://huggingface.co/spaces/UD-Filipino/filbench-leaderboard",
),
)
else:
            return cls(
eval_name=result_key,
full_model=full_model,
org=org,
model=model,
precision=precision,
results=results,
aggregate_results=aggregate_results,
is_submission=False,
average=filbench_score,
)
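    # Sketch of the minimal non-submission payload init_from_dict expects (key names are
    # inferred from the accesses above; real result files may carry more keys):
    #
    #   {
    #       "config": {"model_name": "org/model", "model_dtype": "torch.bfloat16"},
    #       "results": {"balita_tgl_mcf": {"acc_": 0.42}, ...},
    #   }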
@classmethod
def compute_scores_per_benchmark(cls, results: dict) -> dict[str, float]:
scores_per_benchmark = {}
for task in Tasks:
task = task.value
if results.get(task.benchmark):
score = results.get(task.benchmark).get(task.metric)
if "acc_" in task.metric:
score = score * 100.0
if "rougeL" in task.metric:
score = score * 100.0
scores_per_benchmark[task.benchmark] = score
else:
scores_per_benchmark[task.benchmark] = None
return scores_per_benchmark
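    # compute_scores_per_benchmark rescales accuracy- and ROUGE-L-based metrics from [0, 1]
    # to [0, 100]; benchmarks missing from the results file are stored as None and later
    # treated as 0 by compute_aggregate_results.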
@classmethod
def compute_aggregate_results(cls, results: dict) -> dict[str, float]:
aggregate_results = {}
for task_category in TaskCategory:
tasks = [
task.value for task in Tasks if task.value.category == task_category
]
total_category = sum([task.num_samples for task in tasks])
weighted_total_category = 0
for task in tasks:
if results[task.benchmark]:
score = results[task.benchmark]
else:
score = 0
weighted_total_category += score * task.num_samples
aggregate_results[task_category.value] = (
weighted_total_category / total_category
)
return aggregate_results
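    # compute_aggregate_results takes a sample-weighted mean per category:
    #   score(category) = sum(score_t * n_t for tasks t in the category) / sum(n_t).
    # Illustrative arithmetic: tasks scoring (80, n=900) and (60, n=350) aggregate to
    # (80 * 900 + 60 * 350) / (900 + 350) = 74.4.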
@classmethod
def compare_category_scores(
cls, precomputed: dict, computed: dict, threshold: float = 1e-2
) -> bool:
"""Compares precomputed and computed category scores."""
is_similar = True
for key, precomputed_value in precomputed.items():
computed_value = computed.get(key)
if precomputed_value is not None and computed_value is not None:
if abs(precomputed_value - computed_value) > threshold:
logging.warning(
f"Aggregate result for '{key}' differs"
f" (precomputed={precomputed_value}, computed={computed_value})"
)
is_similar = False
return is_similar
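    # Note: compare_category_scores uses an absolute threshold (default 1e-2) on the
    # 0-100 score scale produced above.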
def to_dict(self):
"""Converts the EvalResult to a dict compatible with our dataframe display"""
if not self.is_submission:
model_details = model_registry.get(
self.full_model,
ModelSUT(
param_size=-1,
model_type=ModelType.UNKNOWN.value,
multilingual=Multilingual.UNKNOWN.value,
),
)
else:
model_details = ModelSUT(
param_size=self.param_size,
model_type=self.model_type,
multilingual=self.multilingual,
)
model_name_with_url = (
make_clickable_model(self.full_model)
if not self.is_submission
else f"π₯ {model_hyperlink(self.model_url, self.full_model)}"
)
data_dict = {
"eval_name": self.eval_name, # not a column, just a save name
AutoEvalColumn.precision.name: self.precision.value.name,
AutoEvalColumn.model.name: model_name_with_url,
AutoEvalColumn.average.name: self.average,
AutoEvalColumn.param_size.name: model_details.param_size,
AutoEvalColumn.model_type.name: model_details.model_type,
AutoEvalColumn.multilingual.name: model_details.multilingual,
AutoEvalColumn.is_submission.name: self.is_submission,
AutoEvalColumn.submission_date.name: self.submission_date,
}
for task in Tasks:
data_dict[task.value.col_name] = self.results[task.value.benchmark]
for task_category in TaskCategory:
data_dict[task_category.value] = self.aggregate_results[task_category.value]
return data_dict
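if __name__ == "__main__":
    # Minimal smoke test; a sketch only, assuming the module's own imports resolve
    # (it is normally imported by the leaderboard app rather than run directly).
    # The constant 50.0 below is a synthetic placeholder, not a real evaluation score.
    assert Precision.from_str("torch.float16") is Precision.float16
    dummy_results = {task.value.benchmark: 50.0 for task in Tasks}
    aggregates = EvalResult.compute_aggregate_results(dummy_results)
    # With a constant per-benchmark score, every category aggregate collapses to that score.
    for category, score in aggregates.items():
        logging.info(f"{category}: {score:.1f}")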