Spaces:
Runtime error
Runtime error
| import json | |
| import yaml | |
| import pandas as pd | |
| import numpy as np | |
| from pathlib import Path | |
| from jsonargparse import ArgumentParser | |
def parse_args():
    """Parse command-line arguments.

    Every option is optional and falls back to the default listed below.
    (The options used to be declared ``required=True`` as well, which made
    the defaults unreachable and forced callers to spell out every flag.)

    Returns:
        dict: The parsed options keyed by option name (``rating_path``,
        ``book_path``, ``out_dir``, ``limit``), ready to be splatted into
        ``main(**options)``.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--rating_path",
        type=str,
        default="./dataset/ratings.csv",
        help="Path to the ratings CSV file.",
    )
    parser.add_argument(
        "--book_path",
        type=str,
        default="./dataset/books.csv",
        help="Path to the books CSV file.",
    )
    parser.add_argument(
        "--out_dir",
        type=str,
        default="./processed",
        help="Directory that receives the processed outputs.",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=1000,
        help="Maximum number of rating rows to read.",
    )
    return vars(parser.parse_args())
def _build_rating_matrix(data):
    """Pivot rating rows into a dense book x user matrix plus index maps.

    Args:
        data: DataFrame with ``ISBN``, ``User-ID`` and ``Book-Rating``
            columns, one row per (user, book) rating.

    Returns:
        tuple: ``(Y, book_id_map, user_id_map)`` where ``Y`` is the dense
        matrix (books on rows, users on columns, missing ratings filled
        with 0), ``book_id_map`` maps ``str(ISBN)`` to its row in ``Y`` and
        ``user_id_map`` maps ``str(User-ID)`` to its column in ``Y``.
    """
    pivot = data.pivot(index="ISBN", columns="User-ID", values="Book-Rating")
    pivot = pivot.fillna(0)

    # Derive the maps from the pivot's own labels: the pivot's row/column
    # order is not necessarily the order in which IDs first appear in the
    # file, so maps built from ``unique()`` would not address ``Y``.
    # Keys are stringified so they are valid JSON object keys.
    book_id_map = {str(isbn): row for row, isbn in enumerate(pivot.index)}
    user_id_map = {str(user): col for col, user in enumerate(pivot.columns)}
    return pivot.values, book_id_map, user_id_map


def _summarise_books(data, book_df, isbns):
    """Join book metadata with each rated book's mean rating and rating count.

    Args:
        data: Rating rows (``ISBN``, ``User-ID``, ``Book-Rating``).
        book_df: Book metadata with an ``ISBN`` column and the three
            ``Image-URL-*`` columns.
        isbns: ISBNs (as strings) of the books to keep.

    Returns:
        DataFrame: Metadata of the kept books plus ``Mean-Rating`` and
        ``Num-Rating`` columns, without the ``Image-URL-*`` columns.
    """
    rating_stats = (
        data.groupby("ISBN")
        .agg({"Book-Rating": "mean", "User-ID": "count"})
        .rename(columns={"Book-Rating": "Mean-Rating", "User-ID": "Num-Rating"})
        .reset_index()
    )
    rated_books = book_df[book_df["ISBN"].isin(list(isbns))]
    summary = rated_books.merge(rating_stats, how="left", on="ISBN")
    # NOTE: dropna runs before the image columns are removed, so a book
    # with a missing image URL (or any other missing field) is dropped.
    summary = summary.dropna()
    return summary.drop(columns=["Image-URL-S", "Image-URL-M", "Image-URL-L"])


def main(
    rating_path,
    book_path,
    out_dir,
    limit,
    **kwargs
):
    """Turn the raw rating/book CSV files into the processed artefacts.

    Files written to ``out_dir`` (created, with parents, if missing):

    * ``Y.npy`` - dense book x user rating matrix, 0 where no rating exists.
    * ``R.npy`` - indicator matrix, 1 where ``Y`` is non-zero, else 0.
    * ``book_id_map.json`` - ISBN -> row index of ``Y``/``R``.
    * ``user_id_map.json`` - User-ID (as string) -> column index of ``Y``/``R``.
    * ``summary_book.csv`` - book metadata with ``Mean-Rating`` and
      ``Num-Rating`` for every rated book found in the books file.

    Because missing ratings are filled with 0, a stored rating of 0 is
    indistinguishable from "not rated" in both ``Y`` and ``R``.

    Args:
        rating_path: Path to the ``;``-delimited, ISO-8859-1 ratings CSV.
        book_path: Path to the ``;``-delimited, ISO-8859-1 books CSV.
        out_dir: Output directory.
        limit: Maximum number of rating rows to read.
        **kwargs: Ignored; lets callers splat a larger options dict.

    Raises:
        FileExistsError: If ``out_dir`` exists and is not a directory.
        ValueError: Raised by the pivot if the same (ISBN, User-ID) pair
            occurs more than once in the rows that were read.
    """
    # Read ISBN as text: a sample containing only digit-only ISBNs would
    # otherwise be parsed as integers, losing leading zeros and no longer
    # matching the ISBNs in the books file.
    data = pd.read_csv(
        rating_path,
        delimiter=';',
        nrows=limit,
        encoding='ISO-8859-1',
        dtype={'ISBN': str},
    )

    Y, book_id_map, user_id_map = _build_rating_matrix(data)
    R = np.where(Y != 0, 1, 0)

    out_dir_path = Path(out_dir)
    out_dir_path.mkdir(parents=True, exist_ok=True)

    # Save Y and R as dense matrices.
    np.save(f'{out_dir_path}/Y.npy', Y)
    np.save(f'{out_dir_path}/R.npy', R)

    with open(f'{out_dir_path}/book_id_map.json', 'w', encoding='utf-8') as f:
        json.dump(book_id_map, f)
    with open(f'{out_dir_path}/user_id_map.json', 'w', encoding='utf-8') as f:
        json.dump(user_id_map, f)

    book_df = pd.read_csv(
        book_path,
        delimiter=';',
        encoding='ISO-8859-1',
        on_bad_lines='skip',
        dtype={'ISBN': str},
    )
    summary = _summarise_books(data, book_df, book_id_map)
    summary.to_csv(f"{out_dir_path}/summary_book.csv", index=False)
# Script entry point: parse the CLI options, then run the pipeline with them.
if __name__ == "__main__":
    cli_options = parse_args()
    main(**cli_options)