"""
DevRev Search Evaluation Leaderboard

An interactive leaderboard for benchmarking search and retrieval systems
on enterprise knowledge bases. Built with Gradio and ready for Hugging Face Spaces.

Uses MTEB-style standardized JSON format for evaluation results.
"""

import base64
import io
import json
import os
from datetime import datetime
from pathlib import Path

import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns


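# Each results/*.json file must provide at least "model_name" and "metrics"
# (enforced in load_results_from_json below). The other fields this app reads
# are optional: "model_type", "paper", "open_source", and "evaluation_date".

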
def load_results_from_json():
    """Load evaluation results from standardized JSON files"""
    results = []

    results_dirs = ["results", "leaderboard/results", "."]
    results_dir = None

    for dir_path in results_dirs:
        if os.path.exists(dir_path):
            temp_dir = Path(dir_path)
            if any(temp_dir.glob("*.json")):
                results_dir = temp_dir
                break

    if not results_dir:
        print(
            "No results directory found. Please create a 'results' directory with JSON files."
        )
        return []

    for json_file in results_dir.glob("*.json"):
        if json_file.name == "RESULT_SCHEMA.json":
            continue

        try:
            with open(json_file, "r") as f:
                data = json.load(f)

            if "model_name" in data and "metrics" in data:
                results.append(data)
                print(f"Loaded: {json_file.name}")
        except Exception as e:
            print(f"Error loading {json_file}: {e}")

    return results


def create_leaderboard_data():
    """Create the leaderboard dataframe from JSON results"""

    results = load_results_from_json()

    if not results:
        print(
            "No evaluation results found. Please add JSON files to the 'results' directory."
        )
        return pd.DataFrame()

    data = []
    for result in results:
        metrics = result.get("metrics", {})

        paper_field = result.get("paper", "N/A")
        if paper_field and paper_field != "N/A":
            references = [ref.strip() for ref in paper_field.split(";")]
            formatted_refs = []
            for ref in references:
                if ref.startswith("http"):
                    formatted_refs.append(f"[{ref}]({ref})")
                else:
                    formatted_refs.append(ref)
            paper_display = " | ".join(formatted_refs)
        else:
            paper_display = "N/A"

        row = {
            "🏆 Rank": 0,
            "🔧 Method": result.get("model_name", "Unknown"),
            "📄 Paper/Details": paper_display,
            "🏷️ Type": result.get("model_type", "Unknown"),
            "📈 Recall@5": metrics.get("recall@5", 0),
            "📈 Recall@10": metrics.get("recall@10", 0),
            "📈 Recall@25": metrics.get("recall@25", 0),
            "📈 Recall@50": metrics.get("recall@50", 0),
            "📊 Precision@5": metrics.get("precision@5", 0),
            "📊 Precision@10": metrics.get("precision@10", 0),
            "📊 Precision@25": metrics.get("precision@25", 0),
            "📊 Precision@50": metrics.get("precision@50", 0),
            "🔓 Open Source": "✅" if result.get("open_source", False) else "❌",
            "📅 Date": result.get("evaluation_date", "N/A"),
        }
        data.append(row)

    df = pd.DataFrame(data)

    df = df.sort_values(["📈 Recall@10", "📊 Precision@10"], ascending=False)

    df["🏆 Rank"] = range(1, len(df) + 1)

    columns_order = [
        "🏆 Rank",
        "🔧 Method",
        "📄 Paper/Details",
        "🏷️ Type",
        "📈 Recall@5",
        "📈 Recall@10",
        "📈 Recall@25",
        "📈 Recall@50",
        "📊 Precision@5",
        "📊 Precision@10",
        "📊 Precision@25",
        "📊 Precision@50",
        "🔓 Open Source",
        "📅 Date",
    ]
    df = df[columns_order]

    return df


def create_comparison_plot():
    """Create performance comparison visualizations"""
    df = create_leaderboard_data()

    if df.empty:
        return "<p style='text-align: center; color: #666;'>No data available for visualization. Please add evaluation results to the 'results' directory.</p>"

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    df_sorted = df.sort_values("📈 Recall@50", ascending=True)

    methods = df_sorted["🔧 Method"].tolist()
    recall_50 = df_sorted["📈 Recall@50"].tolist()
    colors = ["#ff6b6b" if "DevRev" in m else "#4ecdc4" for m in methods]

    ax1.barh(methods, recall_50, color=colors, alpha=0.8)
    ax1.set_xlabel("Recall@50 (%)", fontsize=12)
    ax1.set_title("Recall@50 Comparison", fontsize=14, fontweight="bold")
    ax1.grid(True, axis="x", alpha=0.3)

    for i, (method, recall) in enumerate(zip(methods, recall_50)):
        ax1.text(recall + 0.5, i, f"{recall:.1f}%", va="center", fontsize=10)

    precision_50 = df_sorted["📊 Precision@50"].tolist()

    ax2.barh(methods, precision_50, color=colors, alpha=0.8)
    ax2.set_xlabel("Precision@50 (%)", fontsize=12)
    ax2.set_title("Precision@50 Comparison", fontsize=14, fontweight="bold")
    ax2.grid(True, axis="x", alpha=0.3)

    for i, (method, precision) in enumerate(zip(methods, precision_50)):
        ax2.text(
            precision + 0.5,
            i,
            f"{precision:.1f}%",
            va="center",
            fontsize=10,
        )

    plt.tight_layout()

    buf = io.BytesIO()
    plt.savefig(buf, format="png", dpi=150, bbox_inches="tight")
    buf.seek(0)
    img_base64 = base64.b64encode(buf.read()).decode()
    plt.close()

    return f'<img src="data:image/png;base64,{img_base64}" style="width: 100%; max-width: 1000px; margin: 20px auto; display: block;">'


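# create_comparison_plot() is not referenced by create_interface() below. If the
# chart should appear in the UI, one option (a sketch, not wired up here) is to
# render its HTML output inside an extra tab:
#     with gr.TabItem("Performance Charts"):
#         gr.HTML(create_comparison_plot())

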
def create_interface():
    """Create the Gradio interface with leaderboard and visualizations"""

    deep_link_js = r"""
    () => {
      function openAboutAndScroll() {
        if (window.location.hash !== "#about") return;

        // Switch to the About tab (Gradio tabs are rendered as role="tab" buttons)
        const tabs = Array.from(document.querySelectorAll('button[role="tab"]'));
        const aboutTab = tabs.find((b) => (b.innerText || "").includes("About"));
        if (aboutTab) aboutTab.click();

        // The About content is mounted after tab switch; retry briefly.
        let attempts = 0;
        const timer = setInterval(() => {
          const el = document.getElementById("about");
          if (el) {
            el.scrollIntoView({ behavior: "smooth", block: "start" });
            clearInterval(timer);
          }
          attempts += 1;
          if (attempts > 25) clearInterval(timer);
        }, 200);
      }

      window.addEventListener("hashchange", openAboutAndScroll);
      openAboutAndScroll();
      setTimeout(openAboutAndScroll, 600);
    }
    """

    with gr.Blocks(
        title="DevRev Search Evaluation Leaderboard", js=deep_link_js
    ) as demo:
        gr.HTML(
            """
            <div style="text-align: center; margin-bottom: 30px;">
                <h1 style="font-size: 3em; font-weight: bold; margin-bottom: 10px;">
                    🏆 DevRev Search Evaluation Leaderboard
                </h1>
                <p style="font-size: 1.2em; color: #666;">
                    Benchmarking Search and Retrieval Systems for Enterprise Knowledge Bases
                </p>
            </div>
            """
        )

        with gr.Tabs():
            with gr.TabItem("📊 Main Leaderboard"):
                gr.Markdown(
                    """
                    ### Evaluation Overview
                    This leaderboard reports the performance of search systems on the test queries in the [DevRev Search Dataset](https://huggingface.co/datasets/devrev/search).
                    All methods are evaluated on the same set of agent support queries under a consistent evaluation protocol.

                    **Metrics**: Recall@K and Precision@K measure how effectively relevant articles are retrieved within the top K results.

                    **Leaderboard ranking**: Sorted by **Recall@10** (primary) and **Precision@10** (secondary).

                    **To add your results**: Submission details are available in the [About](#about) section.
                    """
                )

                df = create_leaderboard_data()

                if not df.empty:
                    default_columns = [
                        "🏆 Rank",
                        "🔧 Method",
                        "🏷️ Type",
                        "📈 Recall@10",
                        "📈 Recall@50",
                        "📊 Precision@10",
                        "📊 Precision@50",
                        "🔓 Open Source",
                    ]

                    type_column = ColumnFilter("🏷️ Type", type="checkboxgroup")
                    open_source_column = ColumnFilter(
                        "🔓 Open Source", type="checkboxgroup"
                    )

                    Leaderboard(
                        value=df,
                        datatype=[
                            "number",
                            "markdown",
                            "markdown",
                            "str",
                            "number",
                            "number",
                            "number",
                            "number",
                            "number",
                            "number",
                            "number",
                            "number",
                            "str",
                            "str",
                        ],
                        select_columns=SelectColumns(
                            default_selection=default_columns,
                            cant_deselect=[
                                "🏆 Rank",
                                "🔧 Method",
                                "📈 Recall@10",
                            ],
                            label="Select Columns to Display",
                        ),
                        search_columns=[
                            "🔧 Method",
                            "📄 Paper/Details",
                            "🏷️ Type",
                        ],
                        hide_columns=["📅 Date"],
                        filter_columns=[type_column, open_source_column],
                        interactive=False,
                    )
                else:
                    gr.HTML(
                        """
                        <div style="text-align: center; padding: 50px; background: #f5f5f5; border-radius: 10px;">
                            <h3>No Results Found</h3>
                            <p>Please add JSON evaluation files to the 'results' directory.</p>
                            <p>See the About tab for the required format.</p>
                        </div>
                        """
                    )

            with gr.TabItem("ℹ️ About"):
                gr.Markdown(
                    """
                    ## About This Leaderboard

                    This leaderboard tracks the performance of various search and retrieval systems on the [DevRev Search Dataset](https://huggingface.co/datasets/devrev/search).

                    ### 📊 Evaluation Metrics

                    - **Recall@K**: The percentage of all relevant article chunks that are retrieved within the top K article chunks
                    - **Precision@K**: The percentage of the top K retrieved article chunks that are relevant

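                    Concretely, for a query with R relevant chunks, Recall@K = (relevant chunks in the top K) / R and Precision@K = (relevant chunks in the top K) / K; the leaderboard reports both as percentages.
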
                    ### 📤 How to Submit

                    1. Run your retrieval on the test queries in the DevRev Search Dataset
                    2. Email your results, in the same format as the `annotated_queries` in the dataset, to prateek.jain@devrev.ai
                    3. Also include a **one-line system detail/link**, the **system type**, and whether it is **open source** (these populate the leaderboard entry shown below)

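                    ### 📄 Leaderboard Entry Format

                    Leaderboard entries live as JSON files in the `results` directory of this Space, one file per system. The field names below are the ones this app reads; the values are placeholders, and only `model_name` and `metrics` are strictly required:

                    ```json
                    {
                      "model_name": "my-retriever",
                      "model_type": "Dense retrieval",
                      "paper": "https://example.com/system-details; Short description",
                      "open_source": true,
                      "evaluation_date": "2026-01-15",
                      "metrics": {
                        "recall@5": 40.0,
                        "recall@10": 55.0,
                        "recall@25": 70.0,
                        "recall@50": 80.0,
                        "precision@5": 20.0,
                        "precision@10": 12.5,
                        "precision@25": 6.0,
                        "precision@50": 3.5
                      }
                    }
                    ```
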
                    ### 📚 Resources

                    - [Computer by DevRev](https://devrev.ai/meet-computer)
                    - [DevRev Search Dataset](https://huggingface.co/datasets/devrev/search)

                    ### 🙏 Acknowledgments

                    Inspired by:
                    - [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard)
                    - [Berkeley Function Calling Leaderboard](https://gorilla.cs.berkeley.edu/leaderboard)

                    ### 📝 Citation

                    ```bibtex
                    @misc{devrev_search_leaderboard_2026,
                        title={DevRev Search Leaderboard},
                        author={Research@DevRev},
                        year={2026},
                        url={https://huggingface.co/spaces/devrev/search}
                    }
                    ```
                    """,
                    elem_id="about",
                )

        gr.HTML(
            f"""
            <div style="text-align: center; margin-top: 50px; padding: 20px; border-top: 1px solid #e0e0e0; color: #666;">
                <p>
                    Last updated: {datetime.now().strftime("%Y-%m-%d %H:%M UTC")}
                </p>
            </div>
            """
        )

    return demo


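# Run locally (assuming this file is saved as app.py) with `python app.py`;
# the interface is then served on port 7860, as configured below.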
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True, show_api=False)