# EssAI — Hugging Face Spaces app for detecting AI-generated essays.
| # --- IMPORTS --- | |
| import gradio as gr | |
| import torch | |
| from datasets import Dataset | |
| from transformers import Trainer, TrainingArguments, DataCollatorWithPadding | |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
| from sklearn.metrics import accuracy_score, precision_recall_fscore_support | |
| from sklearn.model_selection import train_test_split | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import re | |
| import nltk | |
| from nltk.corpus import stopwords | |
# Fetch the NLTK English stopword list (downloads once, then cached locally).
nltk.download('stopwords')
# NOTE: this deliberately shadows the imported `nltk.corpus.stopwords` module
# with a plain set of English stopwords for O(1) membership tests below.
stopwords = set(stopwords.words('english'))
| # ------------------------- | |
| # --- USEFUL FUNCTIONS ---- | |
def clean_text(text):
    """
    Normalize raw essay text for the model.

    Replaces every non-alphabetical character with a space, lower-cases the
    text, and removes English stopwords.

    Args:
        text (str): The text to be cleaned.

    Returns:
        str: The cleaned text.

    Example:
        df['text'] = df['text'].apply(clean_text)
    """
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    words = text.split()
    # BUG FIX: the original built this filtered list and then joined the
    # unfiltered `words`, silently skipping stopword removal entirely.
    words = [word for word in words if word not in stopwords]
    return ' '.join(words)
def tokenize_function(dataframe):
    """
    Tokenize the 'text' field of a dataset batch with the module-level
    tokenizer, truncating to the model's maximum length.

    Args:
        dataframe: A batch (mapping with a "text" field) from a Hugging Face
            dataset, as supplied by `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output (input_ids, attention_mask, ...) for the batch.

    Example:
        train_dataset_token = train_dataset.map(tokenize_function, batched=True)
    """
    texts = dataframe["text"]
    return tokenizer(texts, truncation=True)
def compute_metrics(eval_pred):
    """
    Compute accuracy, precision, recall and F1 for a binary classifier.

    Passed to the Trainer; invoked during evaluation.

    Args:
        eval_pred (tuple): (logits, labels) as produced by the Trainer.

    Returns:
        dict: {'accuracy': ..., 'precision': ..., 'recall': ..., 'f1': ...}

    Example:
        >>> trainer.evaluate()
        {'accuracy': ..., 'precision': ..., 'recall': ..., 'f1': ...}
    """
    logits, labels = eval_pred
    # Collapse per-class logits to the predicted class index.
    preds = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary')
    return {
        'accuracy': accuracy_score(labels, preds),
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }
def predict(essay):
    """
    Classify an essay as human-written or AI-generated.

    Args:
        essay (str): The essay text to check (a single string, as delivered
            by the Gradio Textbox input).

    Returns:
        str: A line of the form "HUMAN with 97.31% confidence." or
        "AI with 97.31% confidence."
    """
    # --- DATA PREPROCESSING ---
    # Apply the same cleaning used at training time (strip non-alphabetical
    # characters, lower-case, drop stopwords).
    df = pd.DataFrame({'text': [essay]})
    df['text'] = df['text'].apply(clean_text)

    # --- TOKENIZE & PREDICT ---
    # The original code instantiated TrainingArguments + a full Trainer on
    # every request just to run inference — heavy, and it writes logs/output
    # into the working directory. A direct forward pass produces the same
    # logits without that overhead.
    inputs = tokenizer(df['text'].tolist(), truncation=True,
                       padding=True, return_tensors='pt')
    model.eval()  # from_pretrained models default to eval; made explicit here
    with torch.no_grad():
        logits = model(**inputs).logits
    probabilities = torch.nn.functional.softmax(logits, dim=-1)

    # --- FORMAT OUTPUT ---
    # Report the winning class and its softmax confidence for each input row
    # (exactly one row here; the original only ever reported predictions[0]).
    results = []
    for probs in probabilities:
        index = torch.argmax(probs)
        confidence = round(probs[index].item() * 100, 2)
        # Label convention from training: index 0 == human, index 1 == AI.
        label = "HUMAN" if index == 0 else "AI"
        results.append(f'{label} with {confidence}% confidence.')
    return "\n".join(results)
| # ------------------------- | |
| # ------------------------- | |
# --- LOADING THE MODEL ---
# Load the fine-tuned EssAI tokenizer and sequence-classification model from
# the Hugging Face Hub. (No num_labels override is passed here; the label
# count comes from the checkpoint's own config — presumably 2, matching the
# HUMAN/AI labels used in predict(). TODO confirm against the checkpoint.)
checkpoint = "diegovelilla/EssAI"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# -------------------------
# --- GRADIO APP ---
# Build the UI components first, then wire them into the Interface.
essay_input = gr.Textbox(
    lines=2,
    placeholder="Enter your essay here...",
    label="Your essay",
)
prediction_output = gr.Textbox(label="Prediction Result")

iface = gr.Interface(
    fn=predict,
    inputs=essay_input,
    outputs=prediction_output,
    title="EssAI",
    description="Detect AI-generated essays in a few seconds.",
)

# Launch the app only when executed as a script (not on import).
if __name__ == "__main__":
    iface.launch()