Spaces:

diegovelilla
/

EssAI-app

Sleeping

App Files Files Community

diegovelilla commited on Aug 16, 2024

Commit

b9c3ba7

verified ·

1 Parent(s): 786c2e2

first upload of app.py

Browse files

Files changed (1) hide show

app.py +183 -0

app.py ADDED Viewed

	@@ -0,0 +1,183 @@

+# --- IMPORTS ---
+import gradio as gr
+import torch
+from datasets import Dataset
+from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from sklearn.metrics import accuracy_score, precision_recall_fscore_support
+from sklearn.model_selection import train_test_split
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import re
+import nltk
+from nltk.corpus import stopwords
+# Download the NLTK stopword corpus when the module is imported.
+nltk.download('stopwords')
+# NOTE: this rebinds the name `stopwords` from the imported corpus module to a
+# set of English stopwords; after this line the module itself is no longer
+# reachable under that name. clean_text() reads this set.
+stopwords = set(stopwords.words('english'))
+# -------------------------
+# --- USEFUL FUNCTIONS ----
+def clean_text(text):
+    """
+    This function get's rid of nonalphabetical characters, stopwords and lower cases the text.
+    Args:
+    text (str): The text to be cleaned
+    Returns:
+    text (str): The cleaned text
+    Example:
+    df['text'] = df['text'].apply(clean_text)
+    """
+    text = re.sub(r'[^a-zA-Z]', ' ', text)
+    text = text.lower()
+    words = text.split()
+    text = [word for word in words if not word in stopwords]
+    text = ' '.join(words)
+    return text
+def tokenize_function(dataframe):
+    """
+    This function tokenizes the 'text' field of the dataframe.
+    Args:
+    dataframe (pandas.DataFrame): The dataframe to be tokenized
+    Returns:
+    dataframe (pandas.DataFrame): The tokenized dataframe
+    Example and output:
+    train_dataset_token = train_dataset.map(tokenize_function, batched=True)
+    """
+    return tokenizer(dataframe["text"], truncation=True)
+def compute_metrics(eval_pred):
+    """
+    This function computes the accuracy, precision, recall and f1 score of the model.
+    It'is passed to the trainer and it outputs when evaluating the model.
+    Args:
+    eval_pred (tuple): The predictions and labels of the model
+    Returns:
+    dict: The accuracy, precision, recall and f1 score of the model
+    Example:
+    >>> trainer.evaluate()
+    {
+        'accuracy': accuracy,
+        'precision': precision,
+        'recall': recall,
+        'f1': f1
+    }
+    """
+    predictions, labels = eval_pred
+    predictions = predictions.argmax(axis=-1)
+    accuracy = accuracy_score(labels, predictions)
+    precision, recall, f1, _ = precision_recall_fscore_support(
+        labels, predictions, average='binary')
+    return {
+        'accuracy': accuracy,
+        'precision': precision,
+        'recall': recall,
+        'f1': f1
+    }
+def predict(essay):
+    """
+    This function makes a prediction based on the text input.
+    Args:
+    text (list): List of all essays to check.
+    Returns:
+    Prediction
+    """
+    # --- DATA PREPROCESSING ---
+    # Now we convert the input to a dataset
+    df = pd.DataFrame({'text': [essay]})
+    # Get rid of nonalphatetical characters, stopwords and we lower case it.
+    df['text'] = df['text'].apply(clean_text)
+    # We convert the pandas dataframe into hugging face datasets and tokenize both of them
+    ds = Dataset.from_pandas(df)
+    ds_token = ds.map(tokenize_function, batched=True)
+    # Drop columns that are not necessary and set the dataset format to pytorch tensors
+    ds_token = ds_token.remove_columns(["text", "token_type_ids"])
+    ds_token.set_format(type='torch', columns=['input_ids', 'attention_mask'])
+    # -------------------------
+    # --- INSTANTIATING TRAINER ----
+    # We instantiate a DataCollatorWithPadding in order to pad the inputs in batches while training
+    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+    # Create the training arguments
+    training_args = TrainingArguments(".")
+    # Create the trainer
+    trainer = Trainer(
+        model,
+        training_args,
+        eval_dataset=ds_token,
+        data_collator=data_collator,
+        tokenizer=tokenizer,
+        compute_metrics=compute_metrics
+    )
+    # -------------------------
+    # --- PREDICT ---
+    # We predict and then format the output
+    predictions = trainer.predict(ds_token)
+    predictions = torch.from_numpy(predictions.predictions)
+    predictions = torch.nn.functional.softmax(predictions, dim=-1)
+    results = []
+    index = torch.argmax(predictions[0])
+    confidence = round(predictions[0][index].item() * 100, 2)
+    label = "HUMAN" if index == 0 else "AI"
+    results.append(f'{label} with {confidence}% confidence.')
+    return "\n".join(results)
+    # -------------------------
+# -------------------------
+# --- LOADING THE MODEL ---
+# Load the tokenizer and the sequence-classification model for the checkpoint
+# once, when the module is imported. tokenize_function() and predict() read
+# `tokenizer` and `model` as module-level names.
+checkpoint = "diegovelilla/EssAI"
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
+# -------------------------
+# Build the Gradio interface: a text box for the essay as input, predict() as
+# the handler, and a text box showing the string it returns.
+iface = gr.Interface(
+    fn=predict,
+    inputs=gr.Textbox(
+        lines=2, placeholder="Enter your essay here...", label="Your essay"),
+    outputs=gr.Textbox(label="Prediction Result"),
+    title="EssAI",
+    description="Detect AI-generated essays in a few seconds."
+)
+# Launch the app only when this file is run as a script, not when imported
+if __name__ == "__main__":
+    iface.launch()