# 1. separate attributes and data
# 2. remove the datatypes from the attributes
# C:\Users\Niall Dcunha\DatasetCreator\house price prediction\21754539_dataset
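# A minimal sketch of the two notes above, assuming an ARFF-style header in which each
# attribute line carries a datatype (the file name, the "@attribute"/"@data" markers and
# the format itself are assumptions for illustration only; the pipeline below works on
# plain CSVs):
#
#     # raw lines, e.g. "@attribute price numeric", "@data", "250000,3,1200", ...
#     lines = open("raw_dataset.arff").read().splitlines()
#     split_at = lines.index("@data")
#     attribute_lines = [l for l in lines[:split_at] if l.lower().startswith("@attribute")]
#     data_lines = lines[split_at + 1:]                      # 1. separate attributes and data
#     columns = [l.split()[1] for l in attribute_lines]      # 2. drop the datatype, keep only the name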
# ---- Earlier draft (kept commented out; superseded by the implementation below) ----
# import os
# import glob
# import pandas as pd
# import openai
# from openai import OpenAI
# from dotenv import load_dotenv
# import ast
# import re
#
# def extract_dict_from_response(response: str) -> dict:
#     # Try extracting code block content containing the dictionary
#     match = re.search(r"```(?:python)?\s*(\{.*?\})\s*```", response, re.DOTALL)
#     if match:
#         mapping_str = match.group(1)
#     else:
#         # Try extracting the dictionary directly if it's not in a code block
#         match = re.search(r"(\{.*\})", response, re.DOTALL)
#         if not match:
#             raise ValueError("❌ Could not find a Python dictionary in the response.")
#         mapping_str = match.group(1)
#     try:
#         return ast.literal_eval(mapping_str)
#     except Exception as e:
#         print("⚠️ Failed to evaluate extracted dictionary string.")
#         print("String:", mapping_str)
#         raise e
#
# # Load environment variables
# load_dotenv()
# client = OpenAI(
#     api_key=os.getenv("OPENAI_API_KEY"),
#     base_url=os.getenv("OPENAI_API_BASE")  # Optional: for Azure or self-hosted
# )
#
# def load_csv_files(folder_path):
#     csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
#     dataframes = []
#     column_sets = []
#     valid_paths = []
#     print("📥 Reading CSV files...")
#     for file in csv_files:
#         try:
#             df = pd.read_csv(file)
#             dataframes.append(df)
#             column_sets.append(list(df.columns))
#             valid_paths.append(file)
#             print(f"✅ Loaded: {os.path.basename(file)}")
#         except pd.errors.ParserError as e:
#             print(f"❌ Skipping file due to parsing error: {os.path.basename(file)}")
#             print(f" ↳ {e}")
#         except Exception as e:
#             print(f"⚠️ Unexpected error with file {os.path.basename(file)}: {e}")
#     return dataframes, column_sets, valid_paths
#
# def generate_mapping_prompt(column_sets):
#     prompt = (
#         "You are a data scientist helping to merge multiple ML prediction datasets. "
#         "Each CSV may have different or similar column names. I need a unified mapping to standardize these datasets. "
#         "Also, please identify likely prediction label columns (e.g., price, quality, outcome).\n\n"
#         "Here are the column headers from each CSV:\n"
#     )
#     for i, columns in enumerate(column_sets):
#         prompt += f"CSV {i+1}: {columns}\n"
#     prompt += (
#         "\nPlease provide:\n"
#         "1. A Python dictionary mapping similar columns across these CSVs.\n"
#         "2. A list of columns most likely to represent prediction labels.\n\n"
#         "Format your response as:\n"
#         "```python\n"
#         "column_mapping = { ... }\n"
#         "label_columns = [ ... ]\n"
#         "```"
#     )
#     return prompt
#
# def get_column_mapping_from_openai(column_sets):
#     prompt = generate_mapping_prompt(column_sets)
#     response = client.chat.completions.create(
#         model="gpt-4",
#         messages=[
#             {"role": "system", "content": "You are a helpful data scientist."},
#             {"role": "user", "content": prompt}
#         ],
#         temperature=0.3
#     )
#     content = response.choices[0].message.content
#     print("\n📩 Received response from OpenAI.")
#     try:
#         # Try parsing both the dictionary and the label list from the response
#         column_mapping_match = re.search(r"column_mapping\s*=\s*(\{.*?\})", content, re.DOTALL)
#         label_columns_match = re.search(r"label_columns\s*=\s*(\[.*?\])", content, re.DOTALL)
#         if column_mapping_match:
#             mapping = ast.literal_eval(column_mapping_match.group(1))
#         else:
#             raise ValueError("❌ Could not find `column_mapping` in the response.")
#         if label_columns_match:
#             label_columns = ast.literal_eval(label_columns_match.group(1))
#         else:
#             label_columns = []
#     except Exception as e:
#         print("⚠️ Error parsing OpenAI response:")
#         print(content)
#         raise e
#     return mapping, label_columns
#
# def standardize_columns(df, mapping):
#     new_columns = {col: mapping.get(col, col) for col in df.columns}
#     return df.rename(columns=new_columns)
#
# def merge_csvs(folder_path, output_file="merged_dataset.csv"):
#     dfs, column_sets, csv_paths = load_csv_files(folder_path)
#     if not dfs:
#         print("❌ No valid CSVs found to merge.")
#         return
#     print("\n🧠 Requesting column mapping from OpenAI...")
#     mapping, label_columns = get_column_mapping_from_openai(column_sets)
#     print("\n📌 Column Mapping:")
#     for k, v in mapping.items():
#         print(f" '{k}' -> '{v}'")
#     print("\n🏷️ Suggested Label Columns:")
#     for label in label_columns:
#         print(f" - {label}")
#     standardized_dfs = [standardize_columns(df, mapping) for df in dfs]
#     merged_df = pd.concat(standardized_dfs, ignore_index=True, sort=False)
#     merged_df.to_csv(output_file, index=False)
#     print(f"\n✅ Merged dataset saved as '{output_file}'")
#
# if __name__ == "__main__":
#     folder_path = "house"
import os
import glob
import pandas as pd
import ast
import re
from itertools import combinations
from rapidfuzz import fuzz, process
from dotenv import load_dotenv
from openai import OpenAI
# Manual rename map to standardize some known variations
manual_rename_map = {
    "review": "text",
    "text": "text",
    "NumBedrooms": "bedrooms",
    "HousePrice": "price",
    "TARGET(PRICE_IN_LACS)": "price",
    "SquareFootage": "area",
    "SQUARE_FT": "area",
    "sentiment": "label",
    "target": "label",
    "type": "label",
    "variety": "label",
    "class": "label",
    "HeartDisease": "label",
    "Heart Attack Risk (Binary)": "label",
    "Heart Attack Risk": "label"
}
def normalize(col):
    # Lowercase and strip non-alphanumeric characters for fuzzy comparison
    return re.sub(r'[^a-z0-9]', '', col.lower())

def apply_manual_renaming(df, rename_map):
    renamed = {}
    for col in df.columns:
        if col in rename_map:
            renamed[col] = rename_map[col]
    return df.rename(columns=renamed)
def get_fuzzy_common_columns(cols_list, threshold=75):
    # Keep columns from the first set that have a fuzzy match (score >= threshold)
    # in every other set
    base = cols_list[0]
    common = set()
    for col in base:
        match_all = True
        for other in cols_list[1:]:
            match, score, _ = process.extractOne(col, other, scorer=fuzz.token_sort_ratio)
            if score < threshold:
                match_all = False
                break
        if match_all:
            common.add(col)
    return common
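# Hypothetical usage sketch for get_fuzzy_common_columns (column names invented for
# illustration): a column from the first set survives only if every other set contains
# a fuzzy match scoring at least `threshold`.
#
#     sets = [{"bedrooms", "price", "area"},
#             {"bedroom", "price", "sqft"}]
#     get_fuzzy_common_columns(sets)
#     # expected: {"price", "bedrooms"} -- "price" matches exactly and "bedroom" scores
#     # well above 75 against "bedrooms", while "area" has no close match and is dropped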
def sortFiles(dfs):
    # Despite the name, this removes exact duplicate dataframes (same shape and
    # identical contents), keeping the first occurrence of each
    unique_dfs = []
    seen = []
    for i, df1 in enumerate(dfs):
        duplicate = False
        for j in seen:
            df2 = dfs[j]
            if df1.shape != df2.shape:
                continue
            if df1.reset_index(drop=True).equals(df2.reset_index(drop=True)):
                duplicate = True
                break
        if not duplicate:
            unique_dfs.append(df1)
            seen.append(i)
    return unique_dfs
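# Hypothetical usage sketch for sortFiles (frame contents invented): an exact copy of an
# earlier dataframe is dropped, everything else is kept in its original order.
#
#     a = pd.DataFrame({"price": [1, 2]})
#     b = pd.DataFrame({"price": [1, 2]})   # identical to a -> dropped
#     c = pd.DataFrame({"price": [3, 4]})
#     sortFiles([a, b, c])                  # -> [a, c]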
def load_csv_files(folder_path):
    csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
    dfs = []
    column_sets = []
    paths = []
    for file in csv_files:
        try:
            df = pd.read_csv(file)
            dfs.append(df)
            column_sets.append(list(df.columns))
            paths.append(file)
            print(f"✅ Loaded: {os.path.basename(file)}")
        except Exception as e:
            print(f"❌ Failed to load {file}: {e}")
    return dfs, column_sets, paths
def generate_mapping_prompt(column_sets):
    prompt = (
        "You are a data scientist helping to merge multiple machine learning prediction datasets. "
        "Each CSV file may have different column names, even if they represent similar types of data. "
        "Your task is to identify and map these similar columns across datasets to a common, unified name. "
        "Columns with clearly similar features (e.g., 'Bedrooms' and 'BedroomsAbvGr') should be merged into one column with a relevant name like 'bedrooms'.\n\n"
        "Avoid keeping redundant or unique columns that do not have any logical counterpart in other datasets unless they are essential. "
        "The goal is not to maximize the number of columns or rows, but to create a clean, consistent dataset for training ML models.\n\n"
        "Examples:\n"
        "- Dataset1: 'Locality' -> Mumbai, Delhi\n"
        "- Dataset2: 'Places' -> Goa, Singapore\n"
        "→ Merge both into a common column like 'location'.\n\n"
        "Please also identify likely label or target columns that are typically used for prediction (e.g., price, sentiment, outcome, quality).\n\n"
    )
    for i, cols in enumerate(column_sets):
        prompt += f"CSV {i+1}: {cols}\n"
    prompt += "\nPlease return:\n```python\ncolumn_mapping = { ... }\nlabel_columns = [ ... ]\n```"
    return prompt
def get_column_mapping_from_openai(column_sets):
    load_dotenv()
    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        # Fall back to the default OpenAI endpoint when OPENAI_API_BASE is not set
        # (an empty-string base_url would produce invalid request URLs)
        base_url=os.getenv("OPENAI_API_BASE") or None
    )
    prompt = generate_mapping_prompt(column_sets)
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful data scientist."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.3
    )
    content = response.choices[0].message.content
    try:
        column_mapping_match = re.search(r"column_mapping\s*=\s*(\{.*?\})", content, re.DOTALL)
        label_columns_match = re.search(r"label_columns\s*=\s*(\[.*?\])", content, re.DOTALL)
        column_mapping = ast.literal_eval(column_mapping_match.group(1)) if column_mapping_match else {}
        label_columns = ast.literal_eval(label_columns_match.group(1)) if label_columns_match else []
    except Exception as e:
        print("⚠️ Error parsing OpenAI response:")
        print(content)
        raise e
    return column_mapping, label_columns
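# For reference, the regex parsing above looks for the assignments that the prompt asks
# the model to return; a hypothetical reply containing
#
#     column_mapping = {"HousePrice": "price", "SQUARE_FT": "area"}
#     label_columns = ["price"]
#
# would parse to ({"HousePrice": "price", "SQUARE_FT": "area"}, ["price"]). Note that the
# non-greedy pattern \{.*?\} stops at the first closing brace, so a nested dictionary in
# the reply would be truncated; the actual mapping depends entirely on the model output.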
def clean_and_merge(folder, query=None, use_ai=True):
    os.makedirs("./final", exist_ok=True)
    dfs, column_sets, csv_paths = load_csv_files(folder)
    if not dfs:
        print("No valid CSVs found.")
        return
    # Drop duplicate frames and keep csv_paths aligned with the survivors
    # (sortFiles returns the original objects, so identity comparison is safe)
    unique_dfs = sortFiles(dfs)
    csv_paths = [csv_paths[i] for i, df in enumerate(dfs) if any(df is kept for kept in unique_dfs)]
    dfs = unique_dfs
    dfs = [apply_manual_renaming(df, manual_rename_map) for df in dfs]
    if use_ai:
        try:
            column_mapping, label_columns = get_column_mapping_from_openai(column_sets)
            dfs = [df.rename(columns={col: column_mapping.get(col, col) for col in df.columns}) for df in dfs]
        except Exception as e:
            print("Falling back to fuzzy matching due to OpenAI error:", e)
            use_ai = False
    if not use_ai:
        # Normalize columns for the fuzzy match fallback
        normalized_cols = []
        for df in dfs:
            normalized_cols.append({normalize(col) for col in df.columns})
        # Find the combination of frames with the largest set of fuzzy-common columns
        max_common = set()
        best_combo = []
        for i in range(2, len(dfs) + 1):
            for combo in combinations(range(len(dfs)), i):
                selected = [normalized_cols[j] for j in combo]
                fuzzy_common = get_fuzzy_common_columns(selected)
                if len(fuzzy_common) >= len(max_common):
                    max_common = fuzzy_common
                    best_combo = combo
        if best_combo:
            # Harmonize and align the selected frames onto the fuzzy-common columns
            aligned_dfs = []
            for idx in best_combo:
                df = dfs[idx]
                col_map = {}
                for std_col in max_common:
                    match, _, _ = process.extractOne(std_col, [normalize(col) for col in df.columns])
                    for col in df.columns:
                        if normalize(col) == match:
                            col_map[col] = std_col
                            break
                df_subset = df[list(col_map.keys())].rename(columns=col_map)
                aligned_dfs.append(df_subset)
            combined_df = pd.concat(aligned_dfs, ignore_index=True)
        else:
            # Fewer than two frames to align: plain concatenation
            combined_df = pd.concat(dfs, ignore_index=True)
    else:
        combined_df = pd.concat(dfs, ignore_index=True)
    # Label assignment fallback (runs after the merge, so it only affects the
    # per-file frames considered for the single-file fallback below)
    for i, df in enumerate(dfs):
        if 'label' not in df.columns:
            name = os.path.basename(csv_paths[i]).split(".")[0].lower()
            name_cleaned = name
            if query:
                words = set(re.sub(r'[^a-z]', ' ', query.lower()).split())
                for word in words:
                    name_cleaned = name_cleaned.replace(word, "")
            df['label'] = name_cleaned
    # Decide whether the merge looks worse than simply keeping the largest single file
    largest_df = max(dfs, key=lambda df: len(df))
    flag = False
    if len(largest_df) > len(combined_df) and len(largest_df.columns) > 2:
        flag = True
    elif len(combined_df) > len(largest_df) and (len(largest_df.columns) - len(combined_df.columns)) > 3 and len(largest_df.columns) < 7:
        flag = True
    output_file = f"./final/{query or os.path.basename(folder)}.csv"
    if flag:
        largest_df.to_csv(output_file, index=False)
        print(f"⚠️ Saved fallback single file due to poor merge: {output_file}")
    else:
        combined_df.to_csv(output_file, index=False)
        print(f"✅ Saved merged file: {output_file}")
# Example usage:
clean_and_merge("house", query="house", use_ai=True)
# merge_csvs(folder_path)