Spaces:

jskinner215
/

TAPAS_WTQ_Chunking

Build error

App Files Files Community

TAPAS_WTQ_Chunking / weaviate_utils.py

jskinner215

Update weaviate_utils.py

bda63f0 over 2 years ago

raw

history blame contribute delete

5.5 kB

	import weaviate
	import streamlit as st
	from weaviate.embedded import EmbeddedOptions
	from weaviate import Client
	import pandas as pd # <-- Add this import
	from io import StringIO # <-- Add this import
	import pandas as pd

	def hybrid_search_weaviate(client, selected_class, query):
	    """
	    Run a ``Like`` where-filter query against a Weaviate class.

	    Despite the name, this issues a plain GraphQL ``Get`` with a ``where``
	    filter; no hybrid (keyword + vector) ranking is requested.

	    Args:
	        client: Weaviate client exposing ``query.get(...).with_where(...).do()``.
	        selected_class: Name of the Weaviate class to query.
	        query: Pattern handed to the ``Like`` operator as ``valueString``.

	    Returns:
	        The list of objects found under ``data -> Get -> <selected_class>`` in
	        the response, or an empty list when that section is missing or null.
	    """
	    # with_where() takes the filter dict itself. The original wrapped it in an
	    # extra {"where": ...} level, which hides the "path" key from the client.
	    where_filter = {
	        "path": ["*"],
	        "operator": "Like",
	        "valueString": query,
	    }

	    # NOTE(review): "*" is kept from the original for both the property list
	    # and the filter path. Weaviate's GraphQL API appears to require concrete
	    # property names in both places -- confirm and substitute real properties.
	    results = client.query.get(selected_class, "*").with_where(where_filter).do()

	    # Get results are keyed by the queried class name (the original looked under
	    # a fixed 'Things' key and therefore always returned []). The `or` fallbacks
	    # cover sections that are present but null.
	    get_section = ((results or {}).get('data') or {}).get('Get') or {}
	    return get_section.get(selected_class) or []





	def convert_to_tapas_format(data):
	    """
	    Convert Weaviate objects into a header-plus-rows table.

	    Args:
	        data: Either a list of dictionaries (one per object, mapping column
	            name to value), or a raw query response dictionary shaped like
	            ``{"data": {"Get": {<name>: [...]}}}``. Objects that wrap their
	            fields in a ``"thing"`` dictionary are unwrapped.

	    Returns:
	        A list of lists: the first inner list holds the column names and each
	        following list holds one row of values. Empty input yields ``[[]]``.
	    """
	    if isinstance(data, dict):
	        # Raw response: gather the objects from every list under "Get".
	        get_section = (data.get('data') or {}).get('Get') or {}
	        records = [obj for objects in get_section.values() for obj in objects or []]
	    else:
	        # Documented input: already a list of per-object dictionaries. The
	        # original called .get() on it and raised AttributeError.
	        records = list(data or [])

	    # Keep supporting the nested {"thing": {...}} layout the original indexed.
	    rows = [
	        obj['thing'] if isinstance(obj.get('thing'), dict) else obj
	        for obj in records
	    ]

	    df = pd.DataFrame(rows)
	    return [df.columns.tolist()] + df.values.tolist()

	def initialize_weaviate_client():
	    """Build and return a Weaviate client configured with default EmbeddedOptions."""
	    embedded = EmbeddedOptions()
	    return weaviate.Client(embedded_options=embedded)

	def class_exists(client, class_name):
	    """
	    Report whether the Weaviate schema contains a class named ``class_name``.

	    Args:
	        client: Weaviate client exposing ``schema.get()``.
	        class_name: Class name to look for (compared exactly).

	    Returns:
	        True if a class with that name is listed in the schema. False if it is
	        not listed, or if the schema could not be fetched.
	    """
	    # NOTE(review): the original called client.schema.get_class(), which does
	    # not appear to be part of the v3 Python client's Schema API. Together with
	    # the bare `except:` that would make this return False unconditionally.
	    # Listing the schema via schema.get() avoids relying on that method.
	    try:
	        schema = client.schema.get() or {}
	    except Exception:
	        # Best-effort probe: an unreachable server is reported as "not found",
	        # as before, but KeyboardInterrupt/SystemExit are no longer swallowed.
	        return False
	    return any(cls.get("class") == class_name for cls in schema.get("classes") or [])

	def map_dtype_to_weaviate(dtype):
	    """
	    Map a pandas/NumPy dtype (or its name) to a Weaviate data type name.

	    The match is a case-insensitive substring test on ``str(dtype)``, checked
	    in the order int, float, bool. Lower-casing lets pandas nullable dtypes
	    such as ``Int64``, ``UInt8`` and ``Float64`` map to numeric types instead
	    of falling through to ``"string"``.

	    Args:
	        dtype: A dtype object or dtype name.

	    Returns:
	        ``"int"``, ``"number"``, ``"boolean"``, or ``"string"`` for anything
	        that matches none of the above.
	    """
	    dtype_name = str(dtype).lower()
	    if "int" in dtype_name:
	        return "int"
	    if "float" in dtype_name:
	        return "number"
	    if "bool" in dtype_name:
	        return "boolean"
	    return "string"

	def create_new_class_schema(client, class_name, class_description):
	    """
	    Register a new, property-less class in the Weaviate schema.

	    The outcome is reported through Streamlit: a success message when the
	    class is created, an error message carrying the exception text otherwise.
	    Nothing is returned and no exception is propagated.
	    """
	    new_class = {
	        "class": class_name,
	        "description": class_description,
	        "properties": [],
	    }
	    payload = {"classes": [new_class]}
	    try:
	        client.schema.create(payload)
	        st.success(f"Class {class_name} created successfully!")
	    except Exception as err:
	        st.error(f"Error creating class: {err}")

	def ingest_data_to_weaviate(client, csv_file, selected_class):
	    """
	    Load an uploaded CSV into the given Weaviate class.

	    Steps, in order:
	      1. Decode the upload and parse it into a DataFrame.
	      2. Check the class schema against the CSV columns, creating one
	         property per column when the class has no column properties yet.
	      3. Store one table-level object holding the embedded table and the CSV
	         text, using a content hash as the object UUID.
	      4. Store one object per CSV row.
	      5. Show a confirmation and a preview through Streamlit.

	    Args:
	        client: Weaviate client.
	        csv_file: File-like object whose ``read()`` returns UTF-8 encoded bytes.
	        selected_class: Name of the Weaviate class to ingest into.

	    Returns:
	        The parsed DataFrame, or None when the CSV columns do not match the
	        existing class schema (an error is shown in that case).
	    """
	    # The original body used `hashlib` and `tapas_utils` without importing them
	    # anywhere in the file, so it failed with NameError. Imported locally so
	    # this function is self-contained.
	    import hashlib
	    import uuid
	    # NOTE(review): presumably the project's own tapas_utils module providing
	    # embed_table(); confirm the module name/location.
	    import tapas_utils

	    # "utf-8-sig" decodes plain UTF-8 identically but also drops a leading BOM,
	    # which would otherwise end up inside the first column name.
	    raw_text = csv_file.read().decode("utf-8-sig")
	    dataframe = pd.read_csv(StringIO(raw_text))

	    # Properties that belong to the table-level object rather than to CSV
	    # columns; they must not take part in the column comparison below.
	    table_level_properties = {"embeddedTable", "content"}

	    # Validate the schema BEFORE writing anything. The original stored the
	    # table object first, so a mismatching CSV still left data behind and any
	    # properties added by that write polluted the comparison.
	    class_schema = get_class_schema(client, selected_class)
	    existing_properties = (class_schema or {}).get("properties") or []
	    schema_columns = {prop["name"] for prop in existing_properties} - table_level_properties

	    if not schema_columns:
	        # No column properties yet: derive them from the CSV columns.
	        for column_name, data_type in zip(dataframe.columns, dataframe.dtypes):
	            property_schema = {
	                "name": column_name,
	                "description": f"Property for {column_name}",
	                "dataType": [map_dtype_to_weaviate(data_type)]
	            }
	            try:
	                client.schema.property.create(selected_class, property_schema)
	            except weaviate.exceptions.SchemaValidationException:
	                # Property might already exist, so we can continue
	                pass
	    elif set(dataframe.columns) != schema_columns:
	        st.error("The columns in the uploaded CSV do not match the schema of the selected class. Please check and upload the correct CSV or create a new class.")
	        return

	    # Table-level object: embedded table plus the CSV text.
	    csv_text = dataframe.to_csv(index=False)
	    embedded_table = tapas_utils.embed_table(dataframe)

	    # Content-derived identifier. It is passed as the object's UUID instead of
	    # as an "id" property, which Weaviate treats as a reserved name.
	    table_hash = hashlib.md5(csv_text.encode()).hexdigest()
	    table_uuid = str(uuid.UUID(hex=table_hash))

	    client.data_object.create(
	        {
	            "embeddedTable": embedded_table.tolist(),
	            "content": csv_text,
	        },
	        selected_class,
	        uuid=table_uuid,
	    )

	    # Row-level objects: one per CSV record; a failing record is reported and
	    # skipped so the remaining rows are still ingested.
	    for record in dataframe.to_dict(orient="records"):
	        try:
	            client.data_object.create(record, selected_class)
	        except Exception as e:
	            st.error(f"Error ingesting record: {e}")

	    # Display a preview of the ingested data
	    st.write(f"Your CSV was successfully integrated into the vector database under the class '{selected_class}'")
	    st.write(dataframe.head())

	    return dataframe

	def get_class_schema(client, class_name):
	    """
	    Look up one class definition in the full Weaviate schema.

	    Returns the first entry of the schema's "classes" list whose "class" value
	    equals ``class_name``, or None when no entry matches or a
	    SchemaValidationException is raised while fetching.
	    """
	    try:
	        all_classes = client.schema.get()["classes"]
	        return next(
	            (entry for entry in all_classes if entry["class"] == class_name),
	            None,
	        )
	    except weaviate.exceptions.SchemaValidationException:
	        return None

	def retrieve_relevant_table(client, selected_class, question_embedding):
	    """
	    Fetch the best-matching stored table for a question and parse it.

	    Args:
	        client: Weaviate client exposing the ``query.get(...)`` builder.
	        selected_class: Name of the Weaviate class holding the tables.
	        question_embedding: What to search with. A dict is passed to
	            ``with_near_text`` unchanged (the original behaviour); a string is
	            sent as a single near-text concept; anything else is treated as an
	            embedding vector and sent through ``with_near_vector``.

	    Returns:
	        A DataFrame parsed from the ``content`` property (CSV text) of the
	        first result that has one.

	    Raises:
	        IndexError: If the query returns no object with ``content``.
	    """
	    query = client.query.get(selected_class, ["content"])

	    # with_near_text() expects a {"concepts": [...]} dict; an embedding has to
	    # go through with_near_vector() instead.
	    if isinstance(question_embedding, dict):
	        query = query.with_near_text(question_embedding)
	    elif isinstance(question_embedding, str):
	        query = query.with_near_text({"concepts": [question_embedding]})
	    else:
	        # NOTE(review): near-vector search presumably only ranks objects that
	        # were stored with a vector -- confirm the ingest side provides one.
	        query = query.with_near_vector({"vector": question_embedding})

	    results = query.do()

	    # Get results are keyed by the queried class name, not a fixed 'Things'
	    # key; `or` covers sections that are present but null.
	    get_section = ((results or {}).get('data') or {}).get('Get') or {}
	    matches = get_section.get(selected_class) or []

	    # Skip hits without CSV text so a content-less object cannot shadow a table.
	    table_content = next(
	        (hit.get('content') for hit in matches if hit.get('content')),
	        None,
	    )
	    if table_content is None:
	        # Same exception type the original raised via `[][0]`, with a message.
	        raise IndexError(f"No table content found for class '{selected_class}'")

	    return pd.read_csv(StringIO(table_content))