j-higgins committed
Commit f792c47 · unverified · 1 Parent(s): 7528883

Update app.py


- Reduced batch size from 16 to 8.
- Added a garbage-collection pass (gc.collect()) after each batch to free memory promptly.
- Moved Google categories loading into a separate function so the file is loaded lazily, on first use.
- Removed the upfront precomputation of all keyword embeddings; they are now computed per batch (see the sketch below).
- Set debug=False in the server run command; Dash's debug mode enables dev tools and hot reloading, which add memory overhead in production.
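
The pattern these changes implement is easiest to see in isolation. The sketch below is illustrative rather than a copy of app.py: the model name, categories path, and helper names are placeholders, and the real app also runs NER and intent classification inside the same loop.

```python
# Illustrative sketch of the memory pattern in this commit: lazy-load shared
# data once, encode keywords per batch instead of all upfront, and run a GC
# pass after each batch. Model name and file path are placeholders.
import gc

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

_model = None
_categories = []

def get_model():
    # Lazy-load the embedding model on first use.
    global _model
    if _model is None:
        _model = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder model
    return _model

def load_categories(path="categories.txt"):  # placeholder path
    # Lazy-load the category list; an empty list is falsy, so a failed
    # read is simply retried on the next call.
    global _categories
    if not _categories:
        try:
            with open(path) as f:
                _categories = [line.strip() for line in f]
        except OSError:
            _categories = []
    return _categories

def process(keywords, batch_size=8):
    model = get_model()
    category_embeddings = model.encode(load_categories())
    topics = []
    for i in range(0, len(keywords), batch_size):
        batch = keywords[i:i + batch_size]
        # Encode only this batch: peak memory is ~batch_size embeddings
        # instead of one embedding per keyword in the whole input.
        batch_embeddings = model.encode(batch, batch_size=batch_size,
                                        show_progress_bar=False)
        sims = cosine_similarity(batch_embeddings, category_embeddings)
        topics.extend(load_categories()[row.argmax()] for row in sims)
        gc.collect()  # release the batch's intermediates before the next one
    return topics
```

Calling gc.collect() after every batch trades a little CPU time for a flatter memory profile, which is usually the right trade on a memory-constrained host.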

Files changed (1)
  app.py  +26 -25
app.py CHANGED
@@ -11,14 +11,13 @@ from sklearn.metrics.pairwise import cosine_similarity
 from gliner_spacy.pipeline import GlinerSpacy
 import warnings
 import os
+import gc
 
 # Suppress specific warnings
 warnings.filterwarnings("ignore", message="The sentencepiece tokenizer")
 
-# Initialize Dash app with Bootstrap theme and Font Awesome
+# Initialize Dash app
 app = dash.Dash(__name__, external_stylesheets=[dbc.themes.DARKLY, 'https://use.fontawesome.com/releases/v5.8.1/css/all.css'])
-
-# Create server variable
 server = app.server
 
 # Reference absolute file path
@@ -36,6 +35,7 @@ custom_spacy_config = {
 # Model variables for lazy loading
 nlp = None
 sentence_model = None
+google_categories = []
 
 # Function to lazy load NLP model
 def get_nlp():
@@ -56,11 +56,15 @@ def get_sentence_model():
     return sentence_model
 
 # Load Google's content categories
-try:
-    with open(CATEGORIES_FILE, 'r') as f:
-        google_categories = [line.strip() for line in f]
-except Exception as e:
-    google_categories = []
+def load_google_categories():
+    global google_categories
+    if not google_categories:
+        try:
+            with open(CATEGORIES_FILE, 'r') as f:
+                google_categories = [line.strip() for line in f]
+        except Exception as e:
+            google_categories = []
+    return google_categories
 
 # Function to perform NER using GLiNER with spaCy
 def perform_ner(text):
@@ -82,17 +86,19 @@ def extract_entities(text):
 # Function to precompute category embeddings
 def compute_category_embeddings():
     try:
-        return get_sentence_model().encode(google_categories)
+        categories = load_google_categories()
+        return get_sentence_model().encode(categories)
     except Exception as e:
         return []
 
 # Function to perform topic modeling using sentence transformers
 def perform_topic_modeling_from_similarities(similarities):
     try:
+        categories = load_google_categories()
         top_indices = similarities.argsort()[-3:][::-1]
 
-        best_match = google_categories[top_indices[0]]
-        second_best = google_categories[top_indices[1]]
+        best_match = categories[top_indices[0]]
+        second_best = categories[top_indices[1]]
 
         if similarities[top_indices[0]] > similarities[top_indices[1]] * 1.1:
             return best_match
@@ -171,34 +177,26 @@ def sort_by_keyword_feature(f):
     return "other"
 
 # Optimized batch processing of keywords
-def batch_process_keywords(keywords, batch_size=16):
+def batch_process_keywords(keywords, batch_size=8):
    processed_data = {'Keywords': [], 'Intent': [], 'NER Entities': [], 'Google Content Topics': []}
 
    try:
-        # Precompute keyword embeddings once
-        keyword_embeddings = get_sentence_model().encode(keywords, batch_size=batch_size, show_progress_bar=False)
-
-        # Compute category embeddings
+        sentence_model = get_sentence_model()
        category_embeddings = compute_category_embeddings()
 
        for i in range(0, len(keywords), batch_size):
            batch = keywords[i:i+batch_size]
-            batch_embeddings = keyword_embeddings[i:i+batch_size]
+            batch_embeddings = sentence_model.encode(batch, batch_size=batch_size, show_progress_bar=False)
 
-            # Batch process intents
            intents = [sort_by_keyword_feature(kw) for kw in batch]
-
-            # Batch process entities
            entities = [extract_entities(kw) for kw in batch]
 
-            # Batch process topics
            similarities = cosine_similarity(batch_embeddings, category_embeddings)
            Google_Content_Topics = [perform_topic_modeling_from_similarities(sim) for sim in similarities]
 
            processed_data['Keywords'].extend(batch)
            processed_data['Intent'].extend(intents)
 
-            # Convert entities to strings, handling both tuples and strings
            processed_entities = []
            for entity_list in entities:
                entity_strings = []
@@ -211,6 +209,9 @@ def batch_process_keywords(keywords, batch_size=16):
 
            processed_data['NER Entities'].extend(processed_entities)
            processed_data['Google Content Topics'].extend(Google_Content_Topics)
+
+            # Force garbage collection
+            gc.collect()
 
    except Exception as e:
        pass
@@ -226,7 +227,7 @@ app.layout = dbc.Container([
        dbc.NavItem(dbc.NavLink("Contact", href="#contact")),
    ],
    brand="KeyIntentNER-T",
-    brand_href="https://jeredhiggins.com/keyintentnert",
+    brand_href="https://github.com/jeredhiggins/KeyIntentNER-T",
    color="#151515",
    dark=True,
    brand_style={"background": "linear-gradient(to right, #ff7e5f, #feb47b)", "-webkit-background-clip": "text", "color": "transparent", "textShadow": "0 0 1px #ffffff, 0 0 3px #ff7e5f, 0 0 5px #ff7e5f"},
@@ -506,5 +507,5 @@ def download_csv(n_clicks, processed_data):
    return dict(content=csv_string, filename="KeyIntentNER-T_keyword_analysis.csv")
 
 if __name__ == "__main__":
-    port = int(os.environ.get("PORT", 10000))  # Default to 10000 if PORT is not set
-    app.run_server(debug=True, host='0.0.0.0', port=port)
+    port = int(os.environ.get("PORT", 10000))
+    app.run_server(debug=False, host='0.0.0.0', port=port)