Spaces:

holistic-ai
/

HyPA-RAG

Build error

App Files Files Community

HyPA-RAG / utils_code.py

wu981526092

Upload 11 files

4559323 verified about 1 year ago

raw

history blame contribute delete

7.29 kB


	import os
	from dotenv import load_dotenv, find_dotenv
	from llama_index.llms.azure_openai import AzureOpenAI
	from llama_index.readers.file import PyMuPDFReader
	from llama_index.core.chat_engine import ContextChatEngine
	from llama_index.core import KnowledgeGraphIndex
	from llama_index.core.node_parser import SentenceSplitter
	from llama_index.embeddings.huggingface import HuggingFaceEmbedding

	def initialize_openai_creds():
	    """Load environment variables and return Azure OpenAI credential dicts.

	    NOTE(review): a second function with this exact name is defined further
	    down in this file; once the module is imported that later definition
	    replaces this one, so this two-dict variant is not reachable by name.

	    A ``.env`` file is located with ``find_dotenv`` (a message reports
	    whether one was found) and loaded before the variables are read.

	    :return: ``(general_creds, gpt4o_creds)``. ``general_creds`` holds the
	        shared key/endpoint/API version plus the gpt35 and gpt-4o-mini
	        deployment names; ``gpt4o_creds`` holds the GPT-4o specific values.
	        Values are ``None`` for variables that are not set.
	    :raises ValueError: If ``GPT4O_TEMPERATURE`` is set to a non-numeric value.
	    """
	    dotenv_path = find_dotenv()
	    if dotenv_path == "":
	        print("No .env file found. Make sure the .env file is in the correct directory.")
	    else:
	        print(f".env file found at: {dotenv_path}")

	    load_dotenv(dotenv_path)

	    # General Azure OpenAI settings for gpt35 and gpt-4o-mini
	    general_creds = {
	        "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
	        "api_version": os.getenv("AZURE_API_VERSION"),
	        "endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
	        "temperature": 0,  # Default temperature for models
	        "gpt35_deployment_name": os.getenv("AZURE_DEPLOYMENT_NAME"),
	        "gpt4o_mini_deployment_name": os.getenv("GPT4O_MINI_DEPLOYMENT_NAME"),
	    }

	    # os.getenv returns a string whenever the variable is set, so convert it
	    # to a number here; an unset or empty variable falls back to 0.0.
	    raw_temperature = os.getenv("GPT4O_TEMPERATURE")
	    gpt4o_temperature = float(raw_temperature) if raw_temperature else 0.0

	    # GPT-4o specific settings
	    gpt4o_creds = {
	        "api_key": os.getenv("GPT4O_API_KEY"),
	        "api_version": os.getenv("GPT4O_API_VERSION"),
	        "endpoint": os.getenv("GPT4O_AZURE_ENDPOINT"),
	        "deployment_name": os.getenv("GPT4O_DEPLOYMENT_NAME"),
	        "temperature": gpt4o_temperature,
	    }

	    return general_creds, gpt4o_creds



	def initialize_openai_creds():
	    """Load environment variables and return per-model Azure OpenAI credentials.

	    A ``.env`` file is located with ``find_dotenv`` (a message reports
	    whether one was found) and loaded before the variables are read.

	    :return: ``(gpt35_creds, gpt4o_mini_creds, gpt4o_creds)``. Each dict has
	        the keys ``api_key``, ``api_version``, ``endpoint``,
	        ``deployment_name`` and ``temperature``. String values are ``None``
	        for variables that are not set.
	    :raises ValueError: If ``GPT4O_TEMPERATURE`` is set to a non-numeric value.
	    """
	    dotenv_path = find_dotenv()
	    if dotenv_path == "":
	        print("No .env file found. Make sure the .env file is in the correct directory.")
	    else:
	        print(f".env file found at: {dotenv_path}")

	    load_dotenv(dotenv_path)

	    # GPT-3.5 credentials
	    gpt35_creds = {
	        "api_key": os.getenv("AZURE_OPENAI_API_KEY_GPT35"),
	        "api_version": os.getenv("AZURE_API_VERSION"),
	        "endpoint": os.getenv("AZURE_OPENAI_ENDPOINT_GPT35"),
	        "temperature": 0,  # Default temperature for models
	        "deployment_name": os.getenv("AZURE_DEPLOYMENT_NAME_GPT35"),
	    }

	    # GPT-4o-mini credentials: own API key, endpoint and deployment name;
	    # only the API version variable is the same one GPT-3.5 reads.
	    gpt4o_mini_creds = {
	        "api_key": os.getenv("AZURE_OPENAI_API_KEY_GPT4O_MINI"),
	        "api_version": os.getenv("AZURE_API_VERSION"),
	        "endpoint": os.getenv("AZURE_OPENAI_ENDPOINT_GPT4O_MINI"),
	        "temperature": 0,  # Default temperature for models
	        "deployment_name": os.getenv("GPT4O_MINI_DEPLOYMENT_NAME"),
	    }

	    # os.getenv returns a string whenever the variable is set, so convert it
	    # to a number here; an unset or empty variable falls back to 0.0.
	    raw_temperature = os.getenv("GPT4O_TEMPERATURE")
	    gpt4o_temperature = float(raw_temperature) if raw_temperature else 0.0

	    # GPT-4o specific credentials
	    gpt4o_creds = {
	        "api_key": os.getenv("GPT4O_API_KEY"),
	        "api_version": os.getenv("GPT4O_API_VERSION"),
	        "endpoint": os.getenv("GPT4O_AZURE_ENDPOINT"),
	        "deployment_name": os.getenv("GPT4O_DEPLOYMENT_NAME"),
	        "temperature": gpt4o_temperature,
	    }

	    return gpt35_creds, gpt4o_mini_creds, gpt4o_creds



	def create_llm(model: str, gpt35_creds: dict, gpt4o_mini_creds: dict, gpt4o_creds: dict):
	    """
	    Build the Azure OpenAI LLM client for the requested model.

	    :param model: Name of the model to build ("gpt35", "gpt-4o-mini", or "gpt4o").
	    :param gpt35_creds: Credentials dict used when ``model`` is "gpt35".
	    :param gpt4o_mini_creds: Credentials dict used when ``model`` is "gpt-4o-mini".
	    :param gpt4o_creds: Credentials dict used when ``model`` is "gpt4o".
	    :return: An ``AzureOpenAI`` instance configured from the matching dict.
	    :raises ValueError: If ``model`` is none of the supported names.
	    """
	    # Candidates are compared in a fixed order; only the credentials of the
	    # matching model are ever read.
	    candidates = (
	        ("gpt35", gpt35_creds),
	        ("gpt-4o-mini", gpt4o_mini_creds),
	        ("gpt4o", gpt4o_creds),
	    )
	    for model_name, creds in candidates:
	        if model == model_name:
	            return AzureOpenAI(
	                deployment_name=creds["deployment_name"],
	                temperature=creds["temperature"],
	                api_key=creds["api_key"],
	                azure_endpoint=creds["endpoint"],
	                api_version=creds["api_version"],
	            )

	    raise ValueError(f"Invalid model: {model}. Choose from 'gpt35', 'gpt4o', or 'gpt-4o-mini'.")



	def create_chat_engine(retriever, memory, llm):
	    """Build a ContextChatEngine from the supplied retriever, memory and LLM.

	    All three arguments are forwarded by keyword to
	    ``ContextChatEngine.from_defaults`` and the resulting engine is returned.
	    """
	    engine_options = {"retriever": retriever, "memory": memory, "llm": llm}
	    return ContextChatEngine.from_defaults(**engine_options)


	def load_documents(filepaths):
	    """
	    Load and return documents from the specified file paths using PyMuPDFReader.

	    :param filepaths: A single path (``str`` or ``os.PathLike`` such as
	        ``pathlib.Path``) or an iterable of paths.
	    :return: A list with the documents loaded from every path, in input
	        order. Empty input yields an empty list.
	    """
	    # A lone path is wrapped in a list so it is handled like the multi-path
	    # case; previously a single path-like object failed as "not iterable".
	    if isinstance(filepaths, (str, os.PathLike)):
	        paths = [filepaths]
	    else:
	        paths = list(filepaths)

	    # Nothing to read: skip constructing the reader altogether.
	    if not paths:
	        return []

	    loader = PyMuPDFReader()

	    # Load and accumulate documents
	    all_documents = []
	    for path in paths:
	        all_documents.extend(loader.load(file_path=path))

	    return all_documents


	# Sentinel that tells "embed_model not supplied" apart from an explicit None.
	_KG_EMBED_MODEL_UNSET = object()
	# Default embedding model, created on first use and then shared across calls.
	_kg_default_embed_model = None


	def _get_kg_default_embed_model():
	    """Return the default embedding model, instantiating it on first use."""
	    global _kg_default_embed_model
	    if _kg_default_embed_model is None:
	        _kg_default_embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")
	    return _kg_default_embed_model


	def create_kg_index(
	    documents,
	    storage_context,
	    llm,
	    max_triplets_per_chunk=10,
	    embed_model=_KG_EMBED_MODEL_UNSET,
	    include_embeddings=True,
	    chunk_size=512,
	):
	    """
	    Build a KnowledgeGraphIndex from documents.

	    :param documents: Documents passed to ``KnowledgeGraphIndex.from_documents``.
	    :param storage_context: Storage context forwarded to the index.
	    :param llm: LLM forwarded to the index.
	    :param max_triplets_per_chunk: Forwarded to the index (default 10).
	    :param embed_model: Embedding model forwarded to the index. When omitted,
	        a shared ``HuggingFaceEmbedding`` for "BAAI/bge-large-en-v1.5" is
	        created on first use. Any explicitly passed value, including
	        ``None``, is forwarded unchanged.
	    :param include_embeddings: Forwarded to the index (default True).
	    :param chunk_size: Chunk size for the ``SentenceSplitter`` transformation.
	    :return: The constructed index.
	    """
	    # The default model is resolved here rather than in the signature so that
	    # merely importing this module does not instantiate the embedding model.
	    if embed_model is _KG_EMBED_MODEL_UNSET:
	        embed_model = _get_kg_default_embed_model()

	    splitter = SentenceSplitter(chunk_size=chunk_size)
	    graph_index = KnowledgeGraphIndex.from_documents(
	        documents,
	        storage_context=storage_context,
	        max_triplets_per_chunk=max_triplets_per_chunk,
	        llm=llm,
	        embed_model=embed_model,
	        include_embeddings=include_embeddings,
	        transformations=[splitter],
	    )
	    return graph_index


	from llama_index.core.indices.property_graph import SimpleLLMPathExtractor
	from llama_index.core.indices.property_graph import DynamicLLMPathExtractor
	from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore
	from llama_index.core import PropertyGraphIndex


	# Sentinel that tells "embed_model not supplied" apart from an explicit None.
	_PG_EMBED_MODEL_UNSET = object()
	# Default embedding model, created on first use and then shared across calls.
	_pg_default_embed_model = None


	def _get_pg_default_embed_model():
	    """Return the default embedding model, instantiating it on first use."""
	    global _pg_default_embed_model
	    if _pg_default_embed_model is None:
	        _pg_default_embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")
	    return _pg_default_embed_model


	def create_pg_index(
	    llm,
	    documents,
	    graph_store,
	    max_triplets_per_chunk=10,
	    num_workers=4,
	    embed_kg_nodes=True,
	    embed_model=_PG_EMBED_MODEL_UNSET,
	    chunk_size=512,
	):
	    """
	    Build a PropertyGraphIndex from documents using a DynamicLLMPathExtractor.

	    :param llm: LLM given to the path extractor.
	    :param documents: Documents passed to ``PropertyGraphIndex.from_documents``.
	    :param graph_store: Forwarded to the index as ``property_graph_store``.
	    :param max_triplets_per_chunk: Forwarded to the extractor (default 10).
	    :param num_workers: Forwarded to the extractor (default 4).
	    :param embed_kg_nodes: Forwarded to the index (default True).
	    :param embed_model: Embedding model forwarded to the index. When omitted,
	        a shared ``HuggingFaceEmbedding`` for "BAAI/bge-large-en-v1.5" is
	        created on first use. Any explicitly passed value, including
	        ``None``, is forwarded unchanged.
	    :param chunk_size: Chunk size for the ``SentenceSplitter`` transformation
	        (default 512, the value that used to be hard-coded).
	    :return: The constructed index.
	    """
	    # The default model is resolved here rather than in the signature so that
	    # merely importing this module does not instantiate the embedding model.
	    if embed_model is _PG_EMBED_MODEL_UNSET:
	        embed_model = _get_pg_default_embed_model()

	    splitter = SentenceSplitter(chunk_size=chunk_size)

	    # Initialize the LLM path extractor
	    kg_extractor = DynamicLLMPathExtractor(
	        llm=llm,
	        max_triplets_per_chunk=max_triplets_per_chunk,
	        num_workers=num_workers,
	    )

	    # Create the Property Graph Index
	    graph_index = PropertyGraphIndex.from_documents(
	        documents,
	        property_graph_store=graph_store,
	        embed_model=embed_model,
	        embed_kg_nodes=embed_kg_nodes,
	        kg_extractors=[kg_extractor],
	        transformations=[splitter],
	    )

	    return graph_index