michaellupo74 committed on
Commit
c66bdcc
·
1 Parent(s): 634f49c

feat(tags): global Matthew 25 tagging + expose tags in index

Browse files
Files changed (2) hide show
  1. app/ingest.py +51 -0
  2. app/tags.py +30 -0
app/ingest.py CHANGED
@@ -19,6 +19,8 @@ from bs4 import BeautifulSoup
19
  from datetime import datetime, timezone
20
  from difflib import SequenceMatcher
21
  from urllib.parse import urljoin
 
 
22
 
23
 
24
  # -------------------- Config --------------------
@@ -167,6 +169,21 @@ def _compute_is_active(deadline_iso: Optional[str]) -> bool:
167
  except Exception:
168
  return True
169
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  # -------------------- Grants.gov collector --------------------
171
 
172
  def _collect_from_grantsgov_api(src: Dict) -> List[Dict[str, Any]]:
@@ -281,6 +298,10 @@ def _normalize_web_record(
281
  "posted_date": extra.get("posted_date"),
282
  }
283
  rec["is_active"] = _compute_is_active(rec["deadline"])
 
 
 
 
284
  return rec
285
 
286
  # -------------------- Collectors: http_html / web_page --------------------
@@ -745,6 +766,7 @@ def _build_index_from_docstore() -> int:
745
  "title": title,
746
  "url": rec.get("url"),
747
  "source": rec.get("source"),
 
748
  "geo": rec.get("geo"),
749
  "categories": rec.get("categories"),
750
  "agency": agency,
@@ -867,9 +889,38 @@ def ingest(cfg_path: str = "config/sources.yaml", env: Dict | None = None):
867
  )
868
  rows = filtered
869
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
870
  print(f"[collect] {name} β†’ rows_after_filters={len(rows)}")
871
  all_rows.extend(rows)
872
 
 
 
873
  # ---- Cross-source DEDUPE + MERGE ----
874
  unique = _dedupe_and_merge(all_rows)
875
  print(f"[ingest] Unique records to index: {len(unique)}")
 
19
  from datetime import datetime, timezone
20
  from difflib import SequenceMatcher
21
  from urllib.parse import urljoin
22
+ from app.tags import infer_matt25_tags # ← NEW
23
+
24
 
25
 
26
  # -------------------- Config --------------------
 
169
  except Exception:
170
  return True
171
 
172
def _attach_matt25_tags(rec: Dict[str, Any]) -> None:
    """Infer Matthew 25 tags from the record's core text fields.

    Merges the inferred tags with any manually supplied ``rec["tags"]``
    (a scalar manual tag is tolerated and wrapped in a list), then stores
    the de-duplicated, sorted union back on ``rec["tags"]``. Mutates
    *rec* in place and returns nothing.
    """
    # Gather the searchable text fields; drop empty/None pieces before joining.
    pieces = [
        rec.get("title", ""),
        rec.get("synopsis") or rec.get("summary") or "",
        rec.get("eligibility", ""),
        rec.get("agency", ""),
    ]
    blob = " ".join(p for p in pieces if p)

    existing = rec.get("tags") or []
    if not isinstance(existing, list):
        existing = [existing]

    inferred = infer_matt25_tags(blob)
    rec["tags"] = sorted(set(existing) | set(inferred))
185
+
186
+
187
  # -------------------- Grants.gov collector --------------------
188
 
189
  def _collect_from_grantsgov_api(src: Dict) -> List[Dict[str, Any]]:
 
298
  "posted_date": extra.get("posted_date"),
299
  }
300
  rec["is_active"] = _compute_is_active(rec["deadline"])
301
+
302
+ # NEW: add Matthew 25 tags based on title/synopsis/eligibility text
303
+ _attach_matt25_tags(rec)
304
+
305
  return rec
306
 
307
  # -------------------- Collectors: http_html / web_page --------------------
 
766
  "title": title,
767
  "url": rec.get("url"),
768
  "source": rec.get("source"),
769
+ "tags": rec.get("tags"),
770
  "geo": rec.get("geo"),
771
  "categories": rec.get("categories"),
772
  "agency": agency,
 
889
  )
890
  rows = filtered
891
 
892
+ # ---- Apply capacity / geo filters BEFORE indexing (allow per-source bypass) ----
893
+ if rows:
894
+ if entry.get("skip_filters"):
895
+ print(f"[filter] {name}: skip_filters=true β†’ keeping all {len(rows)}")
896
+ else:
897
+ pre = len(rows)
898
+ filtered = []
899
+ for r in rows:
900
+ t = _doc_text_from_row(r)
901
+ if capacity_only and not _is_capacity_building_text(t):
902
+ continue
903
+ if pa_md_only and not _is_pa_md_text(t):
904
+ continue
905
+ filtered.append(r)
906
+ print(
907
+ f"[filter] {name}: kept {len(filtered)}/{pre} after filters "
908
+ f"(capacity_only={capacity_only}, pa_md_only={pa_md_only})"
909
+ )
910
+ rows = filtered
911
+
912
+ # >>> GLOBAL MATT 25 TAGGING (safer, one place) <<<
913
+ for r in rows:
914
+ # If you truly want to leave Grants.gov untouched, keep this guard:
915
+ if r.get("source") == "grants_gov":
916
+ continue
917
+ _attach_matt25_tags(r)
918
+
919
  print(f"[collect] {name} β†’ rows_after_filters={len(rows)}")
920
  all_rows.extend(rows)
921
 
922
+
923
+
924
  # ---- Cross-source DEDUPE + MERGE ----
925
  unique = _dedupe_and_merge(all_rows)
926
  print(f"[ingest] Unique records to index: {len(unique)}")
app/tags.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# app/tags.py
"""Matthew 25 thematic tagging: keyword/phrase inference over free text."""
import re

# Tag -> trigger terms (single words or multi-word phrases). A record gets a
# tag when ANY of that tag's terms appears as a whole word/phrase in its text.
MATT25 = {
    "hunger": ["hunger","hungry","food","food pantry","food bank","meals","meal program","feeding","nutrition"],
    "prison": ["prison","jail","incarcerated","incarceration","reentry","re-entry","returning citizens","parole","probation","corrections","detention"],
    "clothing": ["clothing","clothes","coat","coats","garments","apparel","outerwear","uniforms","shoes","socks","coat drive","clothing closet"],
    "hospital": ["hospital","sick","illness","health","healthcare","medical","clinic","patient","nursing","hospice","public health"],
    "stranger": ["stranger","refugee","immigrant","asylum","asylee","welcoming","hospitality","newcomer","migrant","resettlement","lonely","isolation","isolated"],
    "water": ["water","thirst","drink","drinking water","hydration","wells","borehole","rainwater","clean water","wash","w.a.s.h.","sanitation","hygiene","latrine"],
}


def _compile_tag_patterns(synonyms_map):
    """Compile each term into a case-insensitive whole-word/phrase regex.

    Multi-word phrases tolerate any run of whitespace between words.
    Word edges are enforced with ``(?<!\\w)``/``(?!\\w)`` lookarounds rather
    than ``\\b``: for terms made of word characters the two are equivalent,
    but ``\\b`` silently fails on terms that start or end with punctuation
    (e.g. "w.a.s.h." followed by a space never matched ``...\\.\\b``).

    Returns: dict mapping tag -> list of compiled patterns.
    """
    compiled = {}
    for tag, terms in synonyms_map.items():
        pats = []
        for term in terms:
            # Escape each word, then allow flexible whitespace between them
            # (a single word yields just its escaped form).
            body = r"\s+".join(map(re.escape, term.lower().split()))
            pats.append(re.compile(rf"(?<!\w){body}(?!\w)", re.I))
        compiled[tag] = pats
    return compiled


# Compiled once at import time; patterns are reused across every record.
_MATT25_PATTERNS = _compile_tag_patterns(MATT25)


def infer_matt25_tags(text: str) -> list[str]:
    """Return the sorted list of Matthew 25 tags whose terms occur in *text*.

    ``None`` (or any falsy value) is treated as the empty string, so the
    result is always a list — never an exception.
    """
    t = text or ""
    hits = {tag for tag, pats in _MATT25_PATTERNS.items() if any(p.search(t) for p in pats)}
    return sorted(hits)