Spaces:
Runtime error
Runtime error
Commit
·
c66bdcc
1
Parent(s):
634f49c
feat(tags): global Matthew 25 tagging + expose tags in index
Browse files- app/ingest.py +51 -0
- app/tags.py +30 -0
app/ingest.py
CHANGED
|
@@ -19,6 +19,8 @@ from bs4 import BeautifulSoup
|
|
| 19 |
from datetime import datetime, timezone
|
| 20 |
from difflib import SequenceMatcher
|
| 21 |
from urllib.parse import urljoin
|
|
|
|
|
|
|
| 22 |
|
| 23 |
|
| 24 |
# -------------------- Config --------------------
|
|
@@ -167,6 +169,21 @@ def _compute_is_active(deadline_iso: Optional[str]) -> bool:
|
|
| 167 |
except Exception:
|
| 168 |
return True
|
| 169 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
# -------------------- Grants.gov collector --------------------
|
| 171 |
|
| 172 |
def _collect_from_grantsgov_api(src: Dict) -> List[Dict[str, Any]]:
|
|
@@ -281,6 +298,10 @@ def _normalize_web_record(
|
|
| 281 |
"posted_date": extra.get("posted_date"),
|
| 282 |
}
|
| 283 |
rec["is_active"] = _compute_is_active(rec["deadline"])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
return rec
|
| 285 |
|
| 286 |
# -------------------- Collectors: http_html / web_page --------------------
|
|
@@ -745,6 +766,7 @@ def _build_index_from_docstore() -> int:
|
|
| 745 |
"title": title,
|
| 746 |
"url": rec.get("url"),
|
| 747 |
"source": rec.get("source"),
|
|
|
|
| 748 |
"geo": rec.get("geo"),
|
| 749 |
"categories": rec.get("categories"),
|
| 750 |
"agency": agency,
|
|
@@ -867,9 +889,38 @@ def ingest(cfg_path: str = "config/sources.yaml", env: Dict | None = None):
|
|
| 867 |
)
|
| 868 |
rows = filtered
|
| 869 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 870 |
print(f"[collect] {name} β rows_after_filters={len(rows)}")
|
| 871 |
all_rows.extend(rows)
|
| 872 |
|
|
|
|
|
|
|
| 873 |
# ---- Cross-source DEDUPE + MERGE ----
|
| 874 |
unique = _dedupe_and_merge(all_rows)
|
| 875 |
print(f"[ingest] Unique records to index: {len(unique)}")
|
|
|
|
| 19 |
from datetime import datetime, timezone
|
| 20 |
from difflib import SequenceMatcher
|
| 21 |
from urllib.parse import urljoin
|
| 22 |
+
from app.tags import infer_matt25_tags # β NEW
|
| 23 |
+
|
| 24 |
|
| 25 |
|
| 26 |
# -------------------- Config --------------------
|
|
|
|
| 169 |
except Exception:
|
| 170 |
return True
|
| 171 |
|
| 172 |
+
def _attach_matt25_tags(rec: Dict[str, Any]) -> None:
    """Merge inferred Matthew 25 tags into ``rec`` without discarding manual ones.

    Builds one searchable text blob from the record's title, synopsis (falling
    back to summary), eligibility, and agency fields, infers tags from it, and
    stores the sorted union of inferred and pre-existing tags back on
    ``rec["tags"]``. Mutates ``rec`` in place; returns None.
    """
    parts = [
        rec.get("title", ""),
        rec.get("synopsis") or rec.get("summary") or "",
        rec.get("eligibility", ""),
        rec.get("agency", ""),
    ]
    blob = " ".join(p for p in parts if p)
    # A manually curated tag may be a bare string; normalize to a list.
    existing = rec.get("tags") or []
    if not isinstance(existing, list):
        existing = [existing]
    inferred = infer_matt25_tags(blob)
    rec["tags"] = sorted(set(existing) | set(inferred))
|
| 185 |
+
|
| 186 |
+
|
| 187 |
# -------------------- Grants.gov collector --------------------
|
| 188 |
|
| 189 |
def _collect_from_grantsgov_api(src: Dict) -> List[Dict[str, Any]]:
|
|
|
|
| 298 |
"posted_date": extra.get("posted_date"),
|
| 299 |
}
|
| 300 |
rec["is_active"] = _compute_is_active(rec["deadline"])
|
| 301 |
+
|
| 302 |
+
# NEW: add Matthew 25 tags based on title/synopsis/eligibility text
|
| 303 |
+
_attach_matt25_tags(rec)
|
| 304 |
+
|
| 305 |
return rec
|
| 306 |
|
| 307 |
# -------------------- Collectors: http_html / web_page --------------------
|
|
|
|
| 766 |
"title": title,
|
| 767 |
"url": rec.get("url"),
|
| 768 |
"source": rec.get("source"),
|
| 769 |
+
"tags": rec.get("tags"),
|
| 770 |
"geo": rec.get("geo"),
|
| 771 |
"categories": rec.get("categories"),
|
| 772 |
"agency": agency,
|
|
|
|
| 889 |
)
|
| 890 |
rows = filtered
|
| 891 |
|
| 892 |
+
# ---- Apply capacity / geo filters BEFORE indexing (allow per-source bypass) ----
|
| 893 |
+
if rows:
|
| 894 |
+
if entry.get("skip_filters"):
|
| 895 |
+
print(f"[filter] {name}: skip_filters=true β keeping all {len(rows)}")
|
| 896 |
+
else:
|
| 897 |
+
pre = len(rows)
|
| 898 |
+
filtered = []
|
| 899 |
+
for r in rows:
|
| 900 |
+
t = _doc_text_from_row(r)
|
| 901 |
+
if capacity_only and not _is_capacity_building_text(t):
|
| 902 |
+
continue
|
| 903 |
+
if pa_md_only and not _is_pa_md_text(t):
|
| 904 |
+
continue
|
| 905 |
+
filtered.append(r)
|
| 906 |
+
print(
|
| 907 |
+
f"[filter] {name}: kept {len(filtered)}/{pre} after filters "
|
| 908 |
+
f"(capacity_only={capacity_only}, pa_md_only={pa_md_only})"
|
| 909 |
+
)
|
| 910 |
+
rows = filtered
|
| 911 |
+
|
| 912 |
+
# >>> GLOBAL MATT 25 TAGGING (safer, one place) <<<
|
| 913 |
+
for r in rows:
|
| 914 |
+
# If you truly want to leave Grants.gov untouched, keep this guard:
|
| 915 |
+
if r.get("source") == "grants_gov":
|
| 916 |
+
continue
|
| 917 |
+
_attach_matt25_tags(r)
|
| 918 |
+
|
| 919 |
print(f"[collect] {name} β rows_after_filters={len(rows)}")
|
| 920 |
all_rows.extend(rows)
|
| 921 |
|
| 922 |
+
|
| 923 |
+
|
| 924 |
# ---- Cross-source DEDUPE + MERGE ----
|
| 925 |
unique = _dedupe_and_merge(all_rows)
|
| 926 |
print(f"[ingest] Unique records to index: {len(unique)}")
|
app/tags.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app/tags.py
|
| 2 |
+
import re
|
| 3 |
+
|
| 4 |
+
# Matthew 25 ministry categories mapped to their keyword/phrase synonyms.
# Compiled into regex patterns by _compile_tag_patterns below and matched
# case-insensitively against record text by infer_matt25_tags.
MATT25 = {
    "hunger": ["hunger","hungry","food","food pantry","food bank","meals","meal program","feeding","nutrition"],
    "prison": ["prison","jail","incarcerated","incarceration","reentry","re-entry","returning citizens","parole","probation","corrections","detention"],
    "clothing": ["clothing","clothes","coat","coats","garments","apparel","outerwear","uniforms","shoes","socks","coat drive","clothing closet"],
    "hospital": ["hospital","sick","illness","health","healthcare","medical","clinic","patient","nursing","hospice","public health"],
    "stranger": ["stranger","refugee","immigrant","asylum","asylee","welcoming","hospitality","newcomer","migrant","resettlement","lonely","isolation","isolated"],
    "water": ["water","thirst","drink","drinking water","hydration","wells","borehole","rainwater","clean water","wash","w.a.s.h.","sanitation","hygiene","latrine"],
}
|
| 12 |
+
|
| 13 |
+
def _compile_tag_patterns(synonyms_map):
|
| 14 |
+
compiled = {}
|
| 15 |
+
for tag, terms in synonyms_map.items():
|
| 16 |
+
pats = []
|
| 17 |
+
for t in terms:
|
| 18 |
+
if " " in t:
|
| 19 |
+
pats.append(re.compile(r"\b" + r"\s+".join(map(re.escape, t.lower().split())) + r"\b", re.I))
|
| 20 |
+
else:
|
| 21 |
+
pats.append(re.compile(rf"\b{re.escape(t.lower())}\b", re.I))
|
| 22 |
+
compiled[tag] = pats
|
| 23 |
+
return compiled
|
| 24 |
+
|
| 25 |
+
# Compiled once at import time so per-record tagging reuses the same pattern objects.
_MATT25_PATTERNS = _compile_tag_patterns(MATT25)
|
| 26 |
+
|
| 27 |
+
def infer_matt25_tags(text: str) -> list[str]:
    """Return the sorted Matthew 25 tags whose synonym patterns occur in *text*.

    ``text`` may be None or empty; an empty list is returned in that case.
    """
    corpus = text or ""
    found = set()
    for tag, patterns in _MATT25_PATTERNS.items():
        if any(p.search(corpus) for p in patterns):
            found.add(tag)
    return sorted(found)
|