Update app.py
- Reduced batch size from 16 to 8.
- Added garbage collection (gc.collect()) after each batch to free up memory.
- Moved Google categories loading into a separate function for lazy loading.
- Removed precomputation of all keyword embeddings at once, now computing them in batches.
- Set debug=False in the server run command to reduce memory usage in production.
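
The pattern behind the first four bullets, in miniature (a hedged sketch, not the app's actual code: get_model, encode_in_batches, and the model name are illustrative, and it assumes the sentence-transformers package):

import gc

from sentence_transformers import SentenceTransformer  # assumed dependency

_model = None  # module-level cache, as in the app's lazy loading

def get_model():
    # Load the heavy encoder only when first needed, not at import time.
    global _model
    if _model is None:
        _model = SentenceTransformer("all-MiniLM-L6-v2")  # model name is an assumption
    return _model

def encode_in_batches(keywords, batch_size=8):
    # Encode one small batch at a time instead of all keywords up front,
    # and collect garbage between batches so only one batch of
    # intermediates is ever resident.
    embeddings = []
    model = get_model()
    for i in range(0, len(keywords), batch_size):
        batch = keywords[i:i + batch_size]
        embeddings.extend(model.encode(batch, show_progress_bar=False))
        gc.collect()  # frees per-batch temporaries; trades speed for memory
    return embeddings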
app.py
CHANGED
@@ -11,14 +11,13 @@ from sklearn.metrics.pairwise import cosine_similarity
 from gliner_spacy.pipeline import GlinerSpacy
 import warnings
 import os
+import gc
 
 # Suppress specific warnings
 warnings.filterwarnings("ignore", message="The sentencepiece tokenizer")
 
-# Initialize Dash app
+# Initialize Dash app
 app = dash.Dash(__name__, external_stylesheets=[dbc.themes.DARKLY, 'https://use.fontawesome.com/releases/v5.8.1/css/all.css'])
-
-# Create server variable
 server = app.server
 
 # Reference absolute file path
@@ -36,6 +35,7 @@ custom_spacy_config = {
 # Model variables for lazy loading
 nlp = None
 sentence_model = None
+google_categories = []
 
 # Function to lazy load NLP model
 def get_nlp():
@@ -56,11 +56,15 @@ def get_sentence_model():
     return sentence_model
 
 # Load Google's content categories
-try:
-    with open(CATEGORIES_FILE, 'r') as f:
-        google_categories = [line.strip() for line in f]
-except Exception as e:
-    google_categories = []
+def load_google_categories():
+    global google_categories
+    if not google_categories:
+        try:
+            with open(CATEGORIES_FILE, 'r') as f:
+                google_categories = [line.strip() for line in f]
+        except Exception as e:
+            google_categories = []
+    return google_categories
 
 # Function to perform NER using GLiNER with spaCy
 def perform_ner(text):
@@ -82,17 +86,19 @@ def extract_entities(text):
 # Function to precompute category embeddings
 def compute_category_embeddings():
     try:
-        return get_sentence_model().encode(google_categories)
+        categories = load_google_categories()
+        return get_sentence_model().encode(categories)
     except Exception as e:
         return []
 
 # Function to perform topic modeling using sentence transformers
 def perform_topic_modeling_from_similarities(similarities):
     try:
+        categories = load_google_categories()
         top_indices = similarities.argsort()[-3:][::-1]
 
-        best_match = google_categories[top_indices[0]]
-        second_best = google_categories[top_indices[1]]
+        best_match = categories[top_indices[0]]
+        second_best = categories[top_indices[1]]
 
         if similarities[top_indices[0]] > similarities[top_indices[1]] * 1.1:
             return best_match
@@ -171,34 +177,26 @@ def sort_by_keyword_feature(f):
     return "other"
 
 # Optimized batch processing of keywords
-def batch_process_keywords(keywords, batch_size=16):
+def batch_process_keywords(keywords, batch_size=8):
     processed_data = {'Keywords': [], 'Intent': [], 'NER Entities': [], 'Google Content Topics': []}
 
     try:
-
-        keyword_embeddings = get_sentence_model().encode(keywords, batch_size=batch_size, show_progress_bar=False)
-
-        # Compute category embeddings
+        sentence_model = get_sentence_model()
         category_embeddings = compute_category_embeddings()
 
         for i in range(0, len(keywords), batch_size):
             batch = keywords[i:i+batch_size]
-            batch_embeddings = keyword_embeddings[i:i+batch_size]
+            batch_embeddings = sentence_model.encode(batch, batch_size=batch_size, show_progress_bar=False)
 
-            # Batch process intents
             intents = [sort_by_keyword_feature(kw) for kw in batch]
-
-            # Batch process entities
             entities = [extract_entities(kw) for kw in batch]
 
-            # Batch process topics
             similarities = cosine_similarity(batch_embeddings, category_embeddings)
             Google_Content_Topics = [perform_topic_modeling_from_similarities(sim) for sim in similarities]
 
             processed_data['Keywords'].extend(batch)
             processed_data['Intent'].extend(intents)
 
-            # Convert entities to strings, handling both tuples and strings
            processed_entities = []
            for entity_list in entities:
                entity_strings = []
@@ -211,6 +209,9 @@ def batch_process_keywords(keywords, batch_size=16):
 
            processed_data['NER Entities'].extend(processed_entities)
            processed_data['Google Content Topics'].extend(Google_Content_Topics)
+
+           # Force garbage collection
+           gc.collect()
 
     except Exception as e:
        pass
@@ -226,7 +227,7 @@ app.layout = dbc.Container([
         dbc.NavItem(dbc.NavLink("Contact", href="#contact")),
     ],
     brand="KeyIntentNER-T",
-    brand_href="https://
+    brand_href="https://github.com/jeredhiggins/KeyIntentNER-T",
     color="#151515",
     dark=True,
     brand_style={"background": "linear-gradient(to right, #ff7e5f, #feb47b)", "-webkit-background-clip": "text", "color": "transparent", "textShadow": "0 0 1px #ffffff, 0 0 3px #ff7e5f, 0 0 5px #ff7e5f"},
@@ -506,5 +507,5 @@ def download_csv(n_clicks, processed_data):
     return dict(content=csv_string, filename="KeyIntentNER-T_keyword_analysis.csv")
 
 if __name__ == "__main__":
-    port = int(os.environ.get("PORT", 10000))
-    app.run_server(debug=True, host='0.0.0.0', port=port)
+    port = int(os.environ.get("PORT", 10000))
+    app.run_server(debug=False, host='0.0.0.0', port=port)
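
One way to sanity-check the memory claims locally (a hypothetical harness, not part of the commit; tracemalloc only counts Python-heap allocations, so native tensor memory is not included):

import gc
import tracemalloc

def peak_python_heap_mb(fn, *args, **kwargs):
    # Run fn and report the peak Python-level allocation in MiB.
    gc.collect()
    tracemalloc.start()
    try:
        fn(*args, **kwargs)
        _, peak = tracemalloc.get_traced_memory()
    finally:
        tracemalloc.stop()
    return peak / (1024 * 1024)

# e.g. compare the old and new batch sizes on made-up keywords:
# peak_python_heap_mb(batch_process_keywords, ["sample keyword"] * 64, batch_size=8)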