| """ |
| Deterministic ID generation for code chunks. |
| |
| This module provides deterministic hashing for chunk IDs, ensuring that |
| identical code chunks receive the same ID across runs. This is crucial for: |
| 1. Version tracking and change detection |
| 2. Cache consistency |
| 3. Reproducible datasets |
| 4. Efficient deduplication |
| |
| ID GENERATION STRATEGY: |
| Hash = SHA256(file_path + chunk_type + name + parent + |
| start_line + end_line + start_byte + end_byte + code) |
| |
| Result: prefix_hash (e.g., "primary_5c442008") |
| |
| KEY PROPERTIES: |
| 1. Deterministic: Same input → same ID |
| 2. Content-aware: Code changes → ID changes |
| 3. Position-aware: Line/byte changes → ID changes |
| 4. Hierarchical: Parent relationships affect ID |
| |
| USE CASE: |
| Ensures that during RAG operations, identical code chunks are |
| recognized as the same entity, improving retrieval accuracy. |
| |
| EXAMPLE: |
| deterministic_chunk_id( |
| file_path="src/module.py", |
| chunk_type="class", |
| name="MyClass", |
| parent="module", |
| start_line=10, |
| end_line=50, |
| code="class MyClass: ...", |
| start_byte=100, |
| end_byte=500 |
| ) |
| → "primary_a1b2c3d4" |
| """ |
|
|
| import hashlib |
| from typing import Optional |
|
|
def deterministic_chunk_id(
    *,
    file_path: str,
    chunk_type: str,
    name: Optional[str],
    parent: Optional[str],
    start_line: Optional[int],
    end_line: Optional[int],
    code: str,
    prefix: str = "primary",
    start_byte: Optional[int] = None,
    end_byte: Optional[int] = None,
    hash_length: int = 8,
) -> str:
    """
    Generate a deterministic chunk ID from a chunk's location and content.

    Each field is rendered with ``str()`` (so ``None`` becomes ``"None"``),
    the fields are joined with newlines in a fixed order, and the result is
    stripped of leading/trailing whitespace, UTF-8 encoded and hashed with
    SHA-256. Because of the strip, leading whitespace in ``file_path`` and
    trailing whitespace in ``code`` do not influence the ID.

    Args:
        file_path: Path to source file
        chunk_type: Type of chunk (function, class, method, etc.)
        name: Name of the symbol
        parent: Parent symbol name
        start_line: Starting line number
        end_line: Ending line number
        code: Actual code content
        prefix: ID prefix (primary/secondary)
        start_byte: Starting byte offset
        end_byte: Ending byte offset
        hash_length: Number of leading hex characters of the SHA-256 digest
            to keep (1-64). The default of 8 keeps only 32 bits; pass a
            larger value when IDs must stay unique across a large corpus.

    Returns:
        Deterministic chunk ID of the form ``"{prefix}_{hex}"``.

    Raises:
        ValueError: If ``hash_length`` is not between 1 and the length of a
            SHA-256 hex digest (64).
    """
    # Field order is part of the ID format: reordering changes every ID.
    fields = (
        file_path,
        chunk_type,
        name,
        parent,
        start_line,
        end_line,
        start_byte,
        end_byte,
        code,
    )

    # Build the payload with an explicit join instead of a triple-quoted
    # literal so that re-indenting this function can never leak source-file
    # whitespace into the hashed bytes.
    payload = "\n".join(str(field) for field in fields).strip()

    hex_digest = hashlib.sha256(payload.encode("utf-8")).hexdigest()

    # A zero-length slice would map every chunk to the same ID, and a slice
    # longer than the digest would silently be truncated, so reject both.
    if not 1 <= hash_length <= len(hex_digest):
        raise ValueError(
            f"hash_length must be between 1 and {len(hex_digest)}, "
            f"got {hash_length!r}"
        )

    return f"{prefix}_{hex_digest[:hash_length]}"
|
|