splade-pt-br / model_metadata.json
AxelPCG's picture
Upload SPLADE-PT-BR model v1.0.0
fff5e6a verified
{
"model_name": "SPLADE-PT-BR",
"version": "1.0.0",
"description": "SPLADE sparse retrieval model trained for Brazilian Portuguese",
"author": "AxelPCG",
"release_date": "2025-12-01",
"base_model": {
"name": "neuralmind/bert-base-portuguese-cased",
"type": "BERTimbau",
"language": "Portuguese (Brazilian)",
"vocab_size": 29794
},
"training": {
"training_dataset": "mMARCO Portuguese (unicamp-dl/mmarco)",
"validation_dataset": "mRobust (unicamp-dl/mrobust)",
"num_iterations": 150000,
"final_loss": 4.7e-05,
"batch_size": 8,
"effective_batch_size": 32,
"gradient_accumulation_steps": 4,
"learning_rate": 2e-05,
"weight_decay": 0.01,
"warmup_steps": 6000,
"max_length": 256,
"fp16": true,
"optimizer": "AdamW",
"scheduler": "linear_with_warmup",
"regularization": {
"type": "FLOPS",
"lambda_q": 0.0003,
"lambda_d": 0.0001,
"T": 50000
}
},
"model_specs": {
"architecture": "SPLADE",
"aggregation": "max",
"output_dim": 29794,
"expected_sparsity": 0.99,
"avg_active_dims_query": 120,
"avg_active_dims_doc": 150
},
"performance": {
"dataset": "mRobust (TREC Robust04 Portuguese)",
"num_documents": 528032,
"num_queries": 250,
"metrics": {
"MRR@10": 0.453,
"evaluation_date": "2025-12-02"
},
"comparison": {
"splade_en_mrr10": 0.383,
"improvement": "+18.3%"
}
},
"usage": {
"primary_use_case": "Sparse vector retrieval for Portuguese RAG systems",
"recommended_for": [
"Question answering in Portuguese",
"Document retrieval with Qdrant",
"Hybrid search (sparse + dense)",
"Interpretable search results"
],
"integration": {
"qdrant": "Use with SparseVectorParams",
"elasticsearch": "Compatible with sparse_vector field type",
"custom": "Standard inverted index on non-zero dimensions"
}
},
"files": {
"checkpoint": "model_final_checkpoint.tar",
"config": "config.yaml",
"tokenizer": "neuralmind/bert-base-portuguese-cased",
"size_mb": 450
},
"huggingface": {
"repo_id": "AxelPCG/splade-pt-br",
"model_type": "bert",
"pipeline_tag": "feature-extraction",
"license": "apache-2.0"
},
"comparison_with_original": {
"original_model": "SPLADE++",
"original_language": "English",
"original_mrr10": 0.368,
"improvements_for_portuguese": [
"Native Portuguese vocabulary",
"Contextual expansion in Portuguese",
"Less subword fragmentation of PT words (native WordPiece vocabulary)",
"Better semantic understanding of Brazilian Portuguese"
]
},
"limitations": [
"Optimized for Brazilian Portuguese",
"Not tested on European Portuguese",
"May require domain adaptation for specialized fields",
"Max sequence length: 256 tokens"
],
"citation": {
"bibtex": "@misc{splade-pt-br-2025, author = {Axel Chepanski}, title = {SPLADE-PT-BR: Sparse Retrieval for Portuguese}, year = {2025}, publisher = {Hugging Face}, url = {https://huggingface.co/AxelPCG/splade-pt-br}}"
}
}