NamanAgnih0tri committed on
Commit 73a27bc · verified · 1 Parent(s): 809884f

Upload folder using huggingface_hub

config.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "_name_or_path": "cross-encoder/ms-marco-MiniLM-L-6-v2",
+   "architectures": [
+     "BertForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 384,
+   "id2label": {
+     "0": "LABEL_0"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 1536,
+   "label2id": {
+     "LABEL_0": 0
+   },
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 6,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "sbert_ce_default_activation_function": "torch.nn.modules.linear.Identity",
+   "torch_dtype": "float32",
+   "transformers_version": "4.48.3",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 30522
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:44fac56139daae40c68cd68721c226ab6260d31fcb6ccfc3c9cad89cb5fbbcff
+ size 90866412
readme.md ADDED
@@ -0,0 +1,188 @@
+ ---
+ tags:
+ - reranker
+ - code search
+ - cross-encoder
+ - MiniLM
+ - staqc
+ - information retrieval
+ - MRR
+ - code understanding
+ - python
+ - stack-overflow
+ library_name: sentence-transformers
+ pipeline_tag: text-classification
+ license: apache-2.0
+ model-index:
+ - name: code-reranker-miniLM-staqc
+   results:
+   - task:
+       type: text-classification
+       name: Code Reranking
+     dataset:
+       name: StaQC (Stack Overflow Question-Code)
+       type: custom
+     metrics:
+     - name: MRR
+       type: mean_reciprocal_rank
+       value: 0.9380
+     - name: Top-1 Accuracy
+       type: accuracy
+       value: 0.9100
+ ---
+
+ # code-reranker-miniLM-staqc
+
+ **A fine-tuned cross-encoder based on `cross-encoder/ms-marco-MiniLM-L-6-v2` for reranking Python code snippets against natural language queries from Stack Overflow.**
+
+ ## Model Description
+
+ This model is a cross-encoder trained on the StaQC dataset (Stack Overflow Question-Code pairs) to rank candidate Python code snippets by relevance to a programming question or natural language intent. It is fine-tuned specifically for Python code search and retrieval tasks where accurate relevance scoring matters.
+
+ * **Architecture**: Cross-Encoder based on MiniLM-L6
+ * **Base model**: `cross-encoder/ms-marco-MiniLM-L-6-v2`
+ * **Fine-tuned on**: StaQC SCA (Stack Overflow Question-Code) dataset
+ * **Task**: Python code snippet reranking for natural language queries
+ * **Language**: Python code snippets
+
+ ## Use Cases
+
+ * Python code search engines
+ * Developer assistants for Python programming
+ * AI coding agents with natural language interfaces
+ * Evaluation modules in RAG pipelines for Python programming use cases
+ * Code recommendation systems
+
+ ## Evaluation Results
+
+ The model was evaluated on 500 query–code candidate sets from the CoNaLa curated dataset.
+
+ | Metric | Value |
+ | -------------- | ----- |
+ | MRR | 0.938 |
+ | Top-1 Accuracy | 0.910 |
+
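+ For reference, here is how these two metrics can be computed; a minimal sketch assuming each evaluation example holds one query, its candidate snippets, and the index of the gold snippet (this data layout is illustrative, not the exact evaluation harness):
+
+ ```python
+ from sentence_transformers import CrossEncoder
+
+ model = CrossEncoder("NamanAgnih0tri/code-reranker-miniLM-staqc")
+
+ def evaluate(examples):
+     """Compute MRR and Top-1 accuracy over (query, candidates, gold_index) triples."""
+     reciprocal_ranks, top1_hits = [], 0
+     for query, candidates, gold_index in examples:
+         scores = model.predict([[query, code] for code in candidates])
+         # Candidate indices, ordered from highest to lowest relevance score
+         ranking = sorted(range(len(candidates)), key=lambda i: scores[i], reverse=True)
+         rank = ranking.index(gold_index) + 1  # 1-based rank of the gold snippet
+         reciprocal_ranks.append(1.0 / rank)
+         top1_hits += int(rank == 1)
+     return sum(reciprocal_ranks) / len(examples), top1_hits / len(examples)
+ ```
+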
+ ## How to Use
+
+ ### Using sentence-transformers
+
+ ```python
+ from sentence_transformers import CrossEncoder
+
+ # Load the model
+ model = CrossEncoder("NamanAgnih0tri/code-reranker-miniLM-staqc")
+
+ # Sample input
+ query = "How to convert a string to int in Python?"
+ code_snippet = "int_value = int('123')"
+
+ # Get relevance score
+ score = model.predict([query, code_snippet])
+ print(f"Relevance Score: {score:.4f}")
+ ```
+
+ ### Using transformers directly
+
+ ```python
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ import torch
+
+ tokenizer = AutoTokenizer.from_pretrained("NamanAgnih0tri/code-reranker-miniLM-staqc")
+ model = AutoModelForSequenceClassification.from_pretrained("NamanAgnih0tri/code-reranker-miniLM-staqc")
+
+ # Sample input
+ query = "How to reverse a string in Python?"
+ code_snippet = "def reverse_string(s):\n return s[::-1]"
+
+ # Tokenize and predict relevance
+ inputs = tokenizer(query, code_snippet, return_tensors="pt", truncation=True, max_length=512)
+ with torch.no_grad():
+     logits = model(**inputs).logits
+ score = logits[0].item()
+
+ print(f"Relevance Score: {score:.4f}")
+ ```
+
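+ Note that the scores from both snippets are raw logits: the bundled `config.json` sets `sbert_ce_default_activation_function` to `torch.nn.modules.linear.Identity`, so outputs are unbounded. If a score in [0, 1] is preferred, a sigmoid can be applied on top (a small illustrative addition, not part of the model itself):
+
+ ```python
+ import torch
+
+ # `score` is the raw logit produced by either snippet above
+ probability = torch.sigmoid(torch.tensor(score)).item()
+ print(f"Probability-like score: {probability:.4f}")
+ ```
+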
+ ### Code Ranking Example
+
+ ```python
+ from sentence_transformers import CrossEncoder
+
+ model = CrossEncoder("NamanAgnih0tri/code-reranker-miniLM-staqc")
+
+ def rank_code_snippets(query, candidates):
+     """Rank code snippets by relevance to the query."""
+     pairs = [[query, code] for code in candidates]
+     scores = model.predict(pairs)
+     ranked_results = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
+     return ranked_results
+
+ # Example usage
+ query = "How to reverse a string in Python?"
+ candidates = [
+     "def reverse_string(s):\n return s[::-1]",
+     "print('hello'[::-1])",
+     "def add(a,b):\n return a + b",
+     "list = [1,2,3,4]"
+ ]
+
+ ranked_results = rank_code_snippets(query, candidates)
+ for rank, (code, score) in enumerate(ranked_results, 1):
+     print(f"{rank}. Score: {score:.4f}\n{code}\n")
+ ```
+
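+ In a retrieval pipeline (e.g., the RAG use case above), this reranker would typically sit behind a fast first-stage retriever. Below is a minimal sketch assuming a bi-encoder such as `sentence-transformers/all-MiniLM-L6-v2` as that first stage; the retriever choice and the tiny corpus are illustrative:
+
+ ```python
+ from sentence_transformers import CrossEncoder, SentenceTransformer, util
+
+ retriever = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+ reranker = CrossEncoder("NamanAgnih0tri/code-reranker-miniLM-staqc")
+
+ corpus = [
+     "def reverse_string(s):\n return s[::-1]",
+     "int_value = int('123')",
+     "squares = [x**2 for x in range(10)]",
+ ]
+ corpus_embeddings = retriever.encode(corpus, convert_to_tensor=True)
+
+ query = "How to reverse a string in Python?"
+ query_embedding = retriever.encode(query, convert_to_tensor=True)
+
+ # Stage 1: cheap vector search narrows the corpus to top-k candidates
+ hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=2)[0]
+ candidates = [corpus[hit["corpus_id"]] for hit in hits]
+
+ # Stage 2: the cross-encoder rescores the survivors more precisely
+ scores = reranker.predict([[query, code] for code in candidates])
+ best_code, best_score = max(zip(candidates, scores), key=lambda x: x[1])
+ print(f"Best match ({best_score:.4f}):\n{best_code}")
+ ```
+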
+ ## Dataset
+
+ * **StaQC SCA (Stack Overflow Question-Code pairs)**
+ * Each pair consists of a natural language programming question and a corresponding Python code snippet
+ * Positive and negative pairs were used for contrastive fine-tuning (see the sketch below)
+ * The dataset contains 85,294 training examples
+
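+ A sketch of how such pairs can be expressed for training, assuming the `InputExample` format from `sentence-transformers`; the helper name and the random sampling of negatives are illustrative, not the original preprocessing:
+
+ ```python
+ import random
+ from sentence_transformers import InputExample
+
+ def build_pairs(qc_pairs):
+     """Turn (question, code) pairs into positive and negative InputExamples."""
+     examples = []
+     for i, (question, code) in enumerate(qc_pairs):
+         # Positive: the question with its own code snippet
+         examples.append(InputExample(texts=[question, code], label=1.0))
+         # Negative: the question with a code snippet from another example
+         j = random.choice([k for k in range(len(qc_pairs)) if k != i])
+         examples.append(InputExample(texts=[question, qc_pairs[j][1]], label=0.0))
+     return examples
+ ```
+
+ One negative per positive also matches the ratio between the 85,294 raw pairs here and the 170,588 training samples reported below.
+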
+ ## Training Details
+
+ * **Base Model**: `cross-encoder/ms-marco-MiniLM-L-6-v2`
+ * **Optimizer**: AdamW
+ * **Epochs**: 3
+ * **Batch size**: 8
+ * **Learning rate**: 2e-5
+ * **Loss**: Cosine Similarity Loss
+ * **Training samples**: 170,588 (including negative samples)
+ * **Warmup steps**: 10% of total training steps
+
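+ With these settings, the fine-tuning loop can be approximated as follows; a minimal sketch assuming the classic `CrossEncoder.fit` API and reusing the hypothetical `build_pairs` helper above (a custom loss such as the one listed would be supplied via `loss_fct`, omitted here for brevity):
+
+ ```python
+ import math
+ from torch.utils.data import DataLoader
+ from sentence_transformers import CrossEncoder
+
+ model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", num_labels=1)
+
+ train_examples = build_pairs(qc_pairs)  # qc_pairs: the 85,294 StaQC (question, code) pairs
+ train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)
+
+ total_steps = len(train_dataloader) * 3        # 3 epochs
+ warmup_steps = math.ceil(total_steps * 0.10)   # 10% of total training steps
+
+ model.fit(
+     train_dataloader=train_dataloader,
+     epochs=3,
+     warmup_steps=warmup_steps,
+     optimizer_params={"lr": 2e-5},  # AdamW is the default optimizer class
+     output_path="code-reranker-miniLM-staqc",
+ )
+ ```
+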
+ ## Model Performance Comparison
+
+ | Model | MRR | Top-1 Accuracy |
+ |-------|-----|----------------|
+ | **code-reranker-miniLM-staqc** | **0.938** | **0.910** |
+ | cross-encoder/ms-marco-MiniLM-L-6-v2 | 0.895 | 0.844 |
+ | cross-encoder/ms-marco-TinyBERT-L-2-v2 | 0.823 | 0.756 |
+
+ ## Limitations
+
+ * Trained specifically on Python code snippets; may not generalize well to other programming languages
+ * Relatively small model; performance may lag behind larger rerankers on complex queries
+ * Fine-tuned on Stack Overflow-style questions; may not generalize to code from other domains
+ * Limited to text-based code snippets; does not model complex code structures or dependencies
+
+ ## Citation
+
+ If you use this model in your work, please cite it as:
+
+ ```bibtex
+ @misc{code-reranker-miniLM-staqc,
+   title={Code Reranker using MiniLM and StaQC for Python Code Search},
+   author={Naman Agnihotri},
+   year={2025},
+   howpublished={\url{https://huggingface.co/NamanAgnih0tri/code-reranker-miniLM-staqc}}
+ }
+ ```
+
+ ## Author
+
+ * **Name**: Naman Agnihotri
+ * **Contact**: [LinkedIn](https://www.linkedin.com/in/namanagnihotri)
+ * **GitHub**: [NamanAgnih0tri](https://github.com/NamanAgnih0tri)
+
+ ## License
+
+ This model is licensed under the Apache 2.0 License.
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "101": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "102": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "103": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": true,
+   "extra_special_tokens": {},
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff