| { | |
| "model_name": "SPLADE-PT-BR", | |
| "version": "1.0.0", | |
| "description": "SPLADE sparse retrieval model trained for Brazilian Portuguese", | |
| "author": "AxelPCG", | |
| "release_date": "2025-12-01", | |
| "base_model": { | |
| "name": "neuralmind/bert-base-portuguese-cased", | |
| "type": "BERTimbau", | |
| "language": "Portuguese (Brazilian)", | |
| "vocab_size": 29794 | |
| }, | |
| "training": { | |
| "training_dataset": "mMARCO Portuguese (unicamp-dl/mmarco)", | |
| "validation_dataset": "mRobust (unicamp-dl/mrobust)", | |
| "num_iterations": 150000, | |
| "final_loss": 4.7e-05, | |
| "batch_size": 8, | |
| "effective_batch_size": 32, | |
| "gradient_accumulation_steps": 4, | |
| "learning_rate": 2e-05, | |
| "weight_decay": 0.01, | |
| "warmup_steps": 6000, | |
| "max_length": 256, | |
| "fp16": true, | |
| "optimizer": "AdamW", | |
| "scheduler": "linear_with_warmup", | |
| "regularization": { | |
| "type": "FLOPS", | |
| "lambda_q": 0.0003, | |
| "lambda_d": 0.0001, | |
| "T": 50000 | |
| } | |
| }, | |
| "model_specs": { | |
| "architecture": "SPLADE", | |
| "aggregation": "max", | |
| "output_dim": 29794, | |
| "expected_sparsity": 0.99, | |
| "avg_active_dims_query": 120, | |
| "avg_active_dims_doc": 150 | |
| }, | |
| "performance": { | |
| "dataset": "mRobust (TREC Robust04 Portuguese)", | |
| "num_documents": 528032, | |
| "num_queries": 250, | |
| "metrics": { | |
| "MRR@10": 0.453, | |
| "evaluation_date": "2025-12-02" | |
| }, | |
| "comparison": { | |
| "splade_en_mrr10": 0.383, | |
| "improvement": "+18.3%" | |
| } | |
| }, | |
| "usage": { | |
| "primary_use_case": "Sparse vector retrieval for Portuguese RAG systems", | |
| "recommended_for": [ | |
| "Question answering in Portuguese", | |
| "Document retrieval with Qdrant", | |
| "Hybrid search (sparse + dense)", | |
| "Interpretable search results" | |
| ], | |
| "integration": { | |
| "qdrant": "Use with SparseVectorParams", | |
| "elasticsearch": "Compatible with sparse_vector field type", | |
| "custom": "Standard inverted index on non-zero dimensions" | |
| } | |
| }, | |
| "files": { | |
| "checkpoint": "model_final_checkpoint.tar", | |
| "config": "config.yaml", | |
| "tokenizer": "neuralmind/bert-base-portuguese-cased", | |
| "size_mb": 450 | |
| }, | |
| "huggingface": { | |
| "repo_id": "AxelPCG/splade-pt-br", | |
| "model_type": "bert", | |
| "pipeline_tag": "feature-extraction", | |
| "license": "apache-2.0" | |
| }, | |
| "comparison_with_original": { | |
| "original_model": "SPLADE++", | |
| "original_language": "English", | |
| "original_mrr10": 0.383, | |
| "improvements_for_portuguese": [ | |
| "Native Portuguese vocabulary", | |
| "Contextual expansion in Portuguese", | |
| "Fewer subword splits for PT words", | |
| "Better semantic understanding of Brazilian Portuguese" | |
| ] | |
| }, | |
| "limitations": [ | |
| "Optimized for Brazilian Portuguese", | |
| "Not tested on European Portuguese", | |
| "May require domain adaptation for specialized fields", | |
| "Max sequence length: 256 tokens" | |
| ], | |
| "citation": { | |
| "bibtex": "@misc{splade-pt-br-2025, author = {Axel Chepanski}, title = {SPLADE-PT-BR: Sparse Retrieval for Portuguese}, year = {2025}, publisher = {Hugging Face}, url = {https://huggingface.co/AxelPCG/splade-pt-br}}" | |
| } | |
| } |