Spaces:
Build error
Build error
| import weaviate | |
| import streamlit as st | |
| from weaviate.embedded import EmbeddedOptions | |
| from weaviate import Client | |
| import pandas as pd # <-- Add this import | |
| from io import StringIO # <-- Add this import | |
| import pandas as pd | |
def hybrid_search_weaviate(client, selected_class, query):
    """
    Perform a hybrid search on Weaviate using the provided class and query.

    Args:
        client: Weaviate client exposing ``schema.get`` and ``query.get``.
        selected_class: Name of the class to search.
        query: Free-text search string.

    Returns:
        A list of dictionaries, one per matching object, keyed by property
        name. Empty when the class declares no properties or nothing matches.
    """
    # GraphQL has no "*" selector, so request every property the class
    # declares in its schema.
    class_schema = client.schema.get(selected_class) or {}
    property_names = [
        prop["name"] for prop in class_schema.get("properties") or []
    ]
    if not property_names:
        return []

    # NOTE(review): with_hybrid combines keyword and vector scoring; confirm
    # the deployed Weaviate version handles classes without a vectorizer.
    results = (
        client.query.get(selected_class, property_names)
        .with_hybrid(query=query)
        .do()
    )

    # Get results are keyed by the queried class name.
    # NOTE(review): assumes selected_class is spelled exactly as in the schema.
    get_section = ((results or {}).get("data") or {}).get("Get") or {}
    return get_section.get(selected_class) or []
def convert_to_tapas_format(data):
    """
    Convert Weaviate result objects into a header-plus-rows table.

    Args:
        data: Either a list of dictionaries (one per object), or a raw
            GraphQL response dict of the form ``{"data": {"Get": {...}}}``.
            Objects that wrap their fields under a ``"thing"`` key are
            unwrapped.

    Returns:
        The table as a list of lists: the first row holds the column names,
        the remaining rows hold the values. Empty input yields ``[[]]``.
    """
    if isinstance(data, dict):
        # Raw response: gather the objects of every class listed under "Get".
        get_section = (data.get("data") or {}).get("Get") or {}
        data_objects = [
            obj for objects in get_section.values() for obj in objects or []
        ]
    else:
        data_objects = list(data or [])

    # Unwrap the legacy {"thing": {...}} envelope when present.
    rows = [
        obj["thing"] if isinstance(obj.get("thing"), dict) else obj
        for obj in data_objects
    ]

    df = pd.DataFrame(rows)
    return [df.columns.tolist()] + df.values.tolist()
def initialize_weaviate_client():
    """Create and return a Weaviate client configured with default EmbeddedOptions."""
    embedded = EmbeddedOptions()
    return weaviate.Client(embedded_options=embedded)
def class_exists(client, class_name):
    """
    Report whether a class named ``class_name`` is present in the schema.

    Args:
        client: Weaviate client exposing ``schema.get()``.
        class_name: Class name to look for (compared exactly).

    Returns:
        True if the schema lists the class. False if it does not, or if the
        schema could not be read (best-effort: errors are not raised).
    """
    try:
        schema = client.schema.get() or {}
    except Exception:
        # Best-effort lookup: an unreadable schema is reported as "not found".
        # Unlike a bare except, this lets KeyboardInterrupt/SystemExit through.
        return False
    return any(
        cls.get("class") == class_name for cls in schema.get("classes") or []
    )
def map_dtype_to_weaviate(dtype):
    """
    Translate a dtype into a Weaviate data type name.

    The dtype's string form is searched for "int", "float" and "bool", in
    that order; the first match wins. Anything else maps to "string".
    """
    dtype_name = str(dtype)
    substring_to_type = (
        ("int", "int"),
        ("float", "number"),
        ("bool", "boolean"),
    )
    for marker, weaviate_type in substring_to_type:
        if marker in dtype_name:
            return weaviate_type
    return "string"
def create_new_class_schema(client, class_name, class_description):
    """
    Create a class with no properties in the Weaviate schema.

    The outcome is reported through Streamlit: a success message when the
    class is created, an error message carrying the exception otherwise.
    Nothing is returned and no exception propagates.
    """
    schema_payload = {
        "classes": [
            {
                "class": class_name,
                "description": class_description,
                "properties": [],
            },
        ],
    }
    try:
        client.schema.create(schema_payload)
        st.success(f"Class {class_name} created successfully!")
    except Exception as error:
        st.error(f"Error creating class: {error}")
def ingest_data_to_weaviate(client, csv_file, selected_class):
    """
    Ingest an uploaded CSV into Weaviate under ``selected_class``.

    The schema is checked first: if the class has no column properties yet,
    one property per CSV column is created; otherwise the CSV columns must
    match the existing properties. Only then is anything written: one
    table-level object (embedding plus raw CSV text), followed by one object
    per CSV row.

    Args:
        client: Weaviate client.
        csv_file: Binary file-like object whose ``read()`` returns
            UTF-8-encoded CSV bytes.
        selected_class: Name of the target class.

    Returns:
        The parsed DataFrame, or None when the CSV columns do not match the
        schema of the selected class (nothing is written in that case).
    """
    # Imported locally because the file's import block does not provide them.
    import hashlib
    import uuid

    # Properties of the table-level object; they are not CSV columns.
    table_level_properties = {"embeddedTable", "content"}

    # Read the CSV data
    raw_csv = csv_file.read().decode("utf-8")
    dataframe = pd.read_csv(StringIO(raw_csv))

    # Check the schema BEFORE writing anything: writing first can add
    # properties to the class and make the comparison below fail.
    class_schema = get_class_schema(client, selected_class) or {}
    ignored = table_level_properties - set(dataframe.columns)
    schema_columns = [
        prop["name"]
        for prop in class_schema.get("properties") or []
        if prop["name"] not in ignored
    ]

    if not schema_columns:
        # No column properties yet: create them from the CSV columns.
        for column_name, data_type in zip(dataframe.columns, dataframe.dtypes):
            property_schema = {
                "name": column_name,
                "description": f"Property for {column_name}",
                "dataType": [map_dtype_to_weaviate(data_type)]
            }
            try:
                client.schema.property.create(selected_class, property_schema)
            except weaviate.exceptions.SchemaValidationException:
                # Property might already exist, so we can continue
                pass
    elif set(dataframe.columns) != set(schema_columns):
        st.error("The columns in the uploaded CSV do not match the schema of the selected class. Please check and upload the correct CSV or create a new class.")
        return

    # NOTE(review): `tapas_utils` is not imported in the visible part of this
    # file; confirm the module exists and is imported.
    embedded_table = tapas_utils.embed_table(dataframe)

    # Derive a deterministic, UUID-formatted ID from the table content
    # (MD5 is used only as a content fingerprint, not for security).
    csv_text = dataframe.to_csv(index=False)
    table_id = str(uuid.UUID(hex=hashlib.md5(csv_text.encode()).hexdigest()))

    # Store the embedded table in Weaviate. The ID goes in the `uuid`
    # argument, not in the property dict.
    # NOTE(review): the embedding is stored as a plain property, not as the
    # object's vector; confirm that is what vector search should rank on.
    client.data_object.create(
        {
            "embeddedTable": embedded_table.tolist(),
            "content": csv_text,
        },
        selected_class,
        uuid=table_id,
    )

    # Ingest the rows; a failing record is reported and the rest continue.
    for record in dataframe.to_dict(orient="records"):
        try:
            client.data_object.create(record, selected_class)
        except Exception as e:
            st.error(f"Error ingesting record: {e}")

    # Display a preview of the ingested data
    st.write(f"Your CSV was successfully integrated into the vector database under the class '{selected_class}'")
    st.write(dataframe.head())  # Display the first few rows of the dataframe as a preview

    # Return the dataframe for preview
    return dataframe
def get_class_schema(client, class_name):
    """
    Return the schema entry of the class named ``class_name``.

    The full schema is fetched with ``client.schema.get()`` and its
    "classes" list is scanned for an exact name match. Returns None when no
    class matches or when a SchemaValidationException is raised.
    """
    try:
        all_classes = client.schema.get()["classes"]
        return next(
            (entry for entry in all_classes if entry["class"] == class_name),
            None,
        )
    except weaviate.exceptions.SchemaValidationException:
        return None
def retrieve_relevant_table(client, selected_class, question_embedding):
    """
    Fetch the stored table closest to a question and return it as a DataFrame.

    Args:
        client: Weaviate client exposing ``query.get``.
        selected_class: Name of the class holding the stored tables.
        question_embedding: The question's embedding vector (a sequence of
            floats, or an object with ``tolist()``). A dict containing
            ``"concepts"`` is still passed to a near-text search, as before.

    Returns:
        The best match's ``content`` property parsed as CSV, or an empty
        DataFrame when nothing matches.
    """
    query = client.query.get(selected_class, ["content"])

    if isinstance(question_embedding, dict) and "concepts" in question_embedding:
        # Backward-compatible path: the caller supplied a near-text filter.
        query = query.with_near_text(question_embedding)
    else:
        # An embedding is a vector, so it needs a near-vector search.
        if hasattr(question_embedding, "tolist"):
            vector = question_embedding.tolist()
        else:
            vector = list(question_embedding)
        query = query.with_near_vector({"vector": vector})

    # Only the best match is used.
    results = query.with_limit(1).do()

    # Get results are keyed by the queried class name.
    get_section = ((results or {}).get("data") or {}).get("Get") or {}
    matches = get_section.get(selected_class) or []
    if not matches or not matches[0].get("content"):
        return pd.DataFrame()

    # Convert the table content to a DataFrame
    return pd.read_csv(StringIO(matches[0]["content"]))