{ "model_name": "SPLADE-PT-BR", "version": "1.0.0", "description": "SPLADE sparse retrieval model trained for Brazilian Portuguese", "author": "AxelPCG", "release_date": "2025-12-01", "base_model": { "name": "neuralmind/bert-base-portuguese-cased", "type": "BERTimbau", "language": "Portuguese (Brazilian)", "vocab_size": 29794 }, "training": { "training_dataset": "mMARCO Portuguese (unicamp-dl/mmarco)", "validation_dataset": "mRobust (unicamp-dl/mrobust)", "num_iterations": 150000, "final_loss": 4.7e-05, "batch_size": 8, "effective_batch_size": 32, "gradient_accumulation_steps": 4, "learning_rate": 2e-05, "weight_decay": 0.01, "warmup_steps": 6000, "max_length": 256, "fp16": true, "optimizer": "AdamW", "scheduler": "linear_with_warmup", "regularization": { "type": "FLOPS", "lambda_q": 0.0003, "lambda_d": 0.0001, "T": 50000 } }, "model_specs": { "architecture": "SPLADE", "aggregation": "max", "output_dim": 29794, "expected_sparsity": 0.99, "avg_active_dims_query": 120, "avg_active_dims_doc": 150 }, "performance": { "dataset": "mRobust (TREC Robust04 Portuguese)", "num_documents": 528032, "num_queries": 250, "metrics": { "MRR@10": 0.453, "evaluation_date": "2025-12-02" }, "comparison": { "splade_en_mrr10": 0.383, "improvement": "+18.3%" } }, "usage": { "primary_use_case": "Sparse vector retrieval for Portuguese RAG systems", "recommended_for": [ "Question answering in Portuguese", "Document retrieval with Qdrant", "Hybrid search (sparse + dense)", "Interpretable search results" ], "integration": { "qdrant": "Use with SparseVectorParams", "elasticsearch": "Compatible with sparse_vector field type", "custom": "Standard inverted index on non-zero dimensions" } }, "files": { "checkpoint": "model_final_checkpoint.tar", "config": "config.yaml", "tokenizer": "neuralmind/bert-base-portuguese-cased", "size_mb": 450 }, "huggingface": { "repo_id": "AxelPCG/splade-pt-br", "model_type": "bert", "pipeline_tag": "feature-extraction", "license": "apache-2.0" }, 
"comparison_with_original": { "original_model": "SPLADE++", "original_language": "English", "original_mrr10": 0.368, "improvements_for_portuguese": [ "Native Portuguese vocabulary", "Contextual expansion in Portuguese", "Less subword fragmentation of PT words", "Better semantic understanding of Brazilian Portuguese" ] }, "limitations": [ "Optimized for Brazilian Portuguese", "Not tested on European Portuguese", "May require domain adaptation for specialized fields", "Max sequence length: 256 tokens" ], "citation": { "bibtex": "@misc{splade-pt-br-2025, author = {Axel Chepanski}, title = {SPLADE-PT-BR: Sparse Retrieval for Portuguese}, year = {2025}, publisher = {Hugging Face}, url = {https://huggingface.co/AxelPCG/splade-pt-br}}" } }