File size: 2,625 Bytes
8bab08d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# file: scripts/seed_vectorstore.py
#!/usr/bin/env python3
"""Seed the vector store with initial data"""

import sys
import json
from pathlib import Path

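# Add the project root (the parent of this script's directory) to sys.path
# so the project imports below can be resolved when run as a script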
sys.path.insert(0, str(Path(__file__).parent.parent))

from vector.store import VectorStore
from vector.embeddings import get_embedding_model
from app.config import DATA_DIR

def _build_documents(companies):
    """Flatten company records into one metadata entry per document.

    Each company contributes, in order: one "description" entry, one "pain"
    entry per item of its optional "pains" list, and one "note" entry per
    item of its optional "notes" list. Every entry is a dict with the keys
    "company_id", "type" and "text".

    Args:
        companies: iterable of company dicts as loaded from companies.json.

    Returns:
        A list of metadata dicts, one per document, in insertion order.

    Raises:
        KeyError: if a company lacks "name", "industry", "size" or "id".
    """
    documents = []

    for company in companies:
        description = (
            f"{company['name']} is a {company['industry']} company "
            f"with {company['size']} employees"
        )
        company_id = company["id"]
        name = company["name"]

        documents.append({
            "company_id": company_id,
            "type": "description",
            "text": description,
        })

        # "pains" and "notes" are optional; a missing key means no entries.
        documents.extend(
            {
                "company_id": company_id,
                "type": "pain",
                "text": f"{name} challenge: {pain}",
            }
            for pain in company.get("pains", [])
        )
        documents.extend(
            {
                "company_id": company_id,
                "type": "note",
                "text": f"{name}: {note}",
            }
            for note in company.get("notes", [])
        )

    return documents


def seed_vectorstore():
    """Build the initial vector index from DATA_DIR / "companies.json".

    Steps:
      1. Create the VectorStore and the embedding model.
      2. Load the company records; if the file is missing, print an error
         and return without touching the store.
      3. Turn every company into description / pain / note documents.
      4. Encode the document texts and add them, with their metadata, to
         the store.
      5. Run a smoke-test retrieval for the first company and print the
         top results.

    NOTE(review): this function never calls an explicit save; the
    "Index saved to" message presumably relies on VectorStore.add (or the
    store itself) persisting the index -- confirm against vector.store.

    Returns:
        None. Progress and results are reported on stdout.
    """
    print("Initializing vector store...")
    store = VectorStore()
    model = get_embedding_model()

    # Load companies
    companies_file = DATA_DIR / "companies.json"
    if not companies_file.exists():
        print(f"Error: {companies_file} not found")
        return

    # Decode explicitly as UTF-8: without it open() falls back to the
    # platform locale encoding, which can corrupt or reject non-ASCII
    # company data on systems whose default is not UTF-8.
    with open(companies_file, encoding="utf-8") as f:
        companies = json.load(f)

    print(f"Loading {len(companies)} companies...")

    metadata = _build_documents(companies)
    # Each metadata entry stores the exact string that gets embedded, so the
    # text list is derived from it rather than maintained in parallel.
    texts = [entry["text"] for entry in metadata]

    print(f"Encoding {len(texts)} documents...")
    embeddings = model.encode(texts)

    print("Adding to index...")
    store.add(embeddings, metadata)

    print(f"Vector store initialized with {len(texts)} documents")
    print(f"Index saved to: {store.index_path}")

    # Test retrieval
    print("\nTesting retrieval...")
    # Kept as a function-local import, as in the original; presumably so the
    # Retriever is only loaded once the index has been built -- TODO confirm.
    from vector.retriever import Retriever
    retriever = Retriever()

    # Smoke test against the first company only (no-op for an empty list).
    for company in companies[:1]:
        results = retriever.retrieve(company["id"], k=3)
        print(f"\nTop results for {company['name']}:")
        for r in results:
            # Results without a "score" key are shown with a score of 0.
            print(f"  - {r['text'][:80]}... (score: {r.get('score', 0):.3f})")

# Run the seeding routine only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    seed_vectorstore()