cx_ai_agent_v1 / scripts /seed_vectorstore.py
muzakkirhussain011's picture
Add application files (text files only)
8bab08d
# file: scripts/seed_vectorstore.py
#!/usr/bin/env python3
"""Seed the vector store with initial data"""
import sys
import json
from pathlib import Path
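# Add the project root (the parent of the scripts/ directory) to sys.path so the
# `vector` and `app` packages imported below can be resolved when this file is run directly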
sys.path.insert(0, str(Path(__file__).parent.parent))
from vector.store import VectorStore
from vector.embeddings import get_embedding_model
from app.config import DATA_DIR
def _company_documents(company):
    """Return the metadata records for one company, in indexing order.

    One "description" record is produced first, followed by one "pain" record
    per entry in ``company["pains"]`` and one "note" record per entry in
    ``company["notes"]`` (both lists are optional). Each record carries the
    company id, the record type and the text that gets embedded.

    Raises:
        KeyError: if ``name``, ``industry``, ``size`` or ``id`` is missing.
    """
    name = company["name"]
    description = (
        f"{name} is a {company['industry']} company "
        f"with {company['size']} employees"
    )
    company_id = company["id"]

    typed_texts = [("description", description)]
    typed_texts.extend(
        ("pain", f"{name} challenge: {pain}")
        for pain in company.get("pains", [])
    )
    typed_texts.extend(
        ("note", f"{name}: {note}")
        for note in company.get("notes", [])
    )

    return [
        {"company_id": company_id, "type": doc_type, "text": text}
        for doc_type, text in typed_texts
    ]


def seed_vectorstore():
    """Build and persist the initial vector index.

    Reads ``companies.json`` from ``DATA_DIR``, turns every company into a
    description document plus one document per pain point and per note,
    encodes all documents with the embedding model and adds them, together
    with their metadata, to a ``VectorStore``. Afterwards a retrieval for the
    first company is printed as a smoke test.

    Progress is reported on stdout. If the companies file does not exist, an
    error line is printed and the function returns without touching the store.

    Returns:
        None
    """
    print("Initializing vector store...")
    store = VectorStore()
    model = get_embedding_model()

    # Load companies
    companies_file = DATA_DIR / "companies.json"
    if not companies_file.exists():
        print(f"Error: {companies_file} not found")
        return

    # Decode explicitly as UTF-8: without it, open() uses the locale's
    # preferred encoding, which breaks on non-ASCII names/notes on some hosts.
    with open(companies_file, encoding="utf-8") as f:
        companies = json.load(f)
    print(f"Loading {len(companies)} companies...")

    # Flatten per-company records into one list; order is preserved
    # (description, pains, notes for each company in file order).
    metadata = [
        record
        for company in companies
        for record in _company_documents(company)
    ]
    # Every record stores the exact text that is embedded, so the list of
    # texts is derived from the metadata and stays aligned with it.
    texts = [record["text"] for record in metadata]

    print(f"Encoding {len(texts)} documents...")
    embeddings = model.encode(texts)

    print("Adding to index...")
    # NOTE(review): presumably store.add() also persists the index to
    # store.index_path -- confirm against VectorStore.
    store.add(embeddings, metadata)
    print(f"Vector store initialized with {len(texts)} documents")
    print(f"Index saved to: {store.index_path}")

    # Test retrieval
    print("\nTesting retrieval...")
    # Imported here, as in the original; presumably so the retriever is only
    # loaded once the index has been built -- confirm.
    from vector.retriever import Retriever

    retriever = Retriever()
    for company in companies[:1]:  # Test with first company
        results = retriever.retrieve(company["id"], k=3)
        print(f"\nTop results for {company['name']}:")
        for r in results:
            print(f" - {r['text'][:80]}... (score: {r.get('score', 0):.3f})")
if __name__ == "__main__":
    # Seed the store only when this file is executed as a script, so that
    # importing the module has no side effects beyond its definitions.
    seed_vectorstore()