| # Model Loading and Testing Instructions | |
| This document provides step-by-step instructions on how to load our model from the Hugging Face Hub and evaluate it on a test dataset. | |
| The following code loads and tests the model in a Colab notebook. | |
| --- | |
| # Step 1: Prerequisites | |
| 1. Import the required Python packages: | |
| ```python | |
| from huggingface_hub import login | |
| import torch | |
| import torch.nn as nn | |
| from transformers import RobertaForSequenceClassification, RobertaTokenizer | |
| from torch.utils.data import Dataset, DataLoader | |
| import pandas as pd | |
| import numpy as np | |
| import re | |
| from sklearn.metrics import accuracy_score | |
| from transformers import AutoModel, AutoTokenizer | |
| from huggingface_hub import login | |
| ``` | |
| 2. Log in to the Hugging Face Hub using the access key for our account (see our private Ed post and the email sent to the TAs, thanks!): | |
| ```python | |
# Authenticate this session with the Hugging Face Hub. Substitute the real
# access key for the placeholder before running.
login(token="Replace with the key")
| ``` | |
| # Step 2: Define the preprocessing and dataset class | |
| Run the following class and function, which are used to preprocess the test data: | |
| ```python | |
class NewsDataset(Dataset):
    """Dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of integer class labels, aligned
            index-for-index with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True, return_tensors="pt")``;
            its result must support ``["input_ids"]`` and
            ``["attention_mask"]`` lookups.
        max_len: Length each encoded sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len: int = 128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self) -> int:
        """Return the number of examples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx) -> dict:
        """Tokenize the text at ``idx`` and return model-ready tensors.

        Returns:
            A dict with ``"input_ids"`` and ``"attention_mask"`` (the
            tokenizer output with its leading batch axis removed) and
            ``"labels"`` (a scalar ``torch.long`` tensor).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # With return_tensors="pt" a single text comes back with a leading
        # batch axis of size 1. Drop exactly that axis: a bare squeeze()
        # would also remove the sequence axis whenever max_len == 1.
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }
# Suffix contractions and the text each one is expanded to.
_CONTRACTIONS = {
    "n't": " not",
    "'s": " is",
    "'ll": " will",
    "'ve": " have",
}

# A dollar amount such as "$5", "$5.5" or "$5.5 million"; the optional
# magnitude word is captured so it can be kept after the digits are dropped.
_MONEY_RE = re.compile(
    r'\$\d+\.?\d*\s*(million|billion|trillion)?',
    flags=re.IGNORECASE,
)

# Anything that starts with "http" up to the next whitespace.
_URL_RE = re.compile(r'http\S+')


def preprocess_text(text):
    """Clean and normalise a piece of text.

    The steps run in this order: coerce to ``str``, expand the contractions
    in ``_CONTRACTIONS``, replace dollar amounts with ``"$ "`` plus the
    magnitude word (if any), remove URLs, turn hyphens into spaces,
    lowercase, and collapse runs of whitespace.

    Args:
        text: Value to clean; non-strings are converted with ``str()``.

    Returns:
        The cleaned, lowercased, single-spaced string.
    """
    # NOTE(review): the previous patterns were double-escaped (``\\d``,
    # ``\\S`` inside raw strings) and therefore never matched real digits
    # or URLs. Confirm the training-time preprocessing used the corrected
    # patterns so evaluation inputs match what the model saw.
    text = str(text)
    for contraction, expansion in _CONTRACTIONS.items():
        text = text.replace(contraction, expansion)
    text = _MONEY_RE.sub(r'$ \1', text)
    text = _URL_RE.sub('', text)
    text = text.replace('-', ' ')
    text = text.lower()
    # split() with no argument discards leading/trailing/repeated whitespace.
    return ' '.join(text.split())
| ``` | |
| # Step 3: Load the model and tokenizer from Hugging Face Hub | |
| This step loads the pre-trained model and tokenizer, which are hosted on the Hugging Face Hub. | |
| ```python | |
# Hub repository the model was pushed to.
REPO_NAME = "CIS5190GoGo/CustomModel"

print("Loading model and tokenizer...")
model = RobertaForSequenceClassification.from_pretrained(REPO_NAME)
tokenizer = RobertaTokenizer.from_pretrained(REPO_NAME)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)
print("Model and tokenizer loaded successfully!")
| ``` | |
| # Step 4: Load test dataset | |
| ```python | |
# Note: replace the placeholder below with the path to your test set CSV.
test_data_path = "Replace wit your test set path"

print("Loading test data...")
test_data = pd.read_csv(filepath_or_buffer=test_data_path)
| ``` | |
| # Step 5: Preprocess test data | |
| ```python | |
# Inputs are the cleaned "title" column; targets are the "labels" column.
X_test, y_test = (
    test_data["title"].apply(preprocess_text).values,
    test_data["labels"].values,
)
| ``` | |
| # Step 6: Prepare the dataset and dataloader | |
| ```python | |
# Wrap the arrays in the tokenizing dataset, then batch it for evaluation.
test_dataset = NewsDataset(texts=X_test, labels=y_test, tokenizer=tokenizer)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=16,
    num_workers=2,
)
| ``` | |
| # Step 7: Evaluate the model and calculate accuracy | |
| ```python | |
print("Evaluating the model...")
model.eval()  # Evaluation mode for inference.

all_preds, all_labels = [], []
# No gradients are needed for evaluation, so skip tracking them.
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # The predicted class is the index of the largest logit.
        preds = torch.argmax(outputs.logits, dim=-1)

        all_preds.extend(preds.cpu().tolist())
        # Labels are only compared on the CPU, so they are read straight
        # from the batch instead of being copied to the device and back.
        all_labels.extend(batch["labels"].tolist())

accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy:.4f}")
| ``` | |
| # Expected output: | |
| ```text | |
| Loading model and tokenizer... | |
| Model and tokenizer loaded successfully! | |
| Loading test data... | |
| Evaluating the model... | |
| Test Accuracy: 0.8500 | |
| ``` |