Spaces:
Runtime error
Runtime error
Commit
·
c66bdcc
1
Parent(s):
634f49c
feat(tags): global Matthew 25 tagging + expose tags in index
Browse files- app/ingest.py +51 -0
- app/tags.py +30 -0
app/ingest.py
CHANGED
|
@@ -19,6 +19,8 @@ from bs4 import BeautifulSoup
|
|
| 19 |
from datetime import datetime, timezone
|
| 20 |
from difflib import SequenceMatcher
|
| 21 |
from urllib.parse import urljoin
|
|
|
|
|
|
|
| 22 |
|
| 23 |
|
| 24 |
# -------------------- Config --------------------
|
|
@@ -167,6 +169,21 @@ def _compute_is_active(deadline_iso: Optional[str]) -> bool:
|
|
| 167 |
except Exception:
|
| 168 |
return True
|
| 169 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
# -------------------- Grants.gov collector --------------------
|
| 171 |
|
| 172 |
def _collect_from_grantsgov_api(src: Dict) -> List[Dict[str, Any]]:
|
|
@@ -281,6 +298,10 @@ def _normalize_web_record(
|
|
| 281 |
"posted_date": extra.get("posted_date"),
|
| 282 |
}
|
| 283 |
rec["is_active"] = _compute_is_active(rec["deadline"])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
return rec
|
| 285 |
|
| 286 |
# -------------------- Collectors: http_html / web_page --------------------
|
|
@@ -745,6 +766,7 @@ def _build_index_from_docstore() -> int:
|
|
| 745 |
"title": title,
|
| 746 |
"url": rec.get("url"),
|
| 747 |
"source": rec.get("source"),
|
|
|
|
| 748 |
"geo": rec.get("geo"),
|
| 749 |
"categories": rec.get("categories"),
|
| 750 |
"agency": agency,
|
|
@@ -867,9 +889,38 @@ def ingest(cfg_path: str = "config/sources.yaml", env: Dict | None = None):
|
|
| 867 |
)
|
| 868 |
rows = filtered
|
| 869 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 870 |
print(f"[collect] {name} β rows_after_filters={len(rows)}")
|
| 871 |
all_rows.extend(rows)
|
| 872 |
|
|
|
|
|
|
|
| 873 |
# ---- Cross-source DEDUPE + MERGE ----
|
| 874 |
unique = _dedupe_and_merge(all_rows)
|
| 875 |
print(f"[ingest] Unique records to index: {len(unique)}")
|
|
|
|
| 19 |
from datetime import datetime, timezone
|
| 20 |
from difflib import SequenceMatcher
|
| 21 |
from urllib.parse import urljoin
|
| 22 |
+
from app.tags import infer_matt25_tags # β NEW
|
| 23 |
+
|
| 24 |
|
| 25 |
|
| 26 |
# -------------------- Config --------------------
|
|
|
|
| 169 |
except Exception:
|
| 170 |
return True
|
| 171 |
|
| 172 |
+
def _attach_matt25_tags(rec: Dict[str, Any]) -> None:
    """Merge inferred Matthew 25 tags into ``rec`` without discarding manual ones.

    Builds one searchable text blob from the record's title, synopsis (falling
    back to summary), eligibility, and agency fields, infers tags from it, and
    stores the sorted union of inferred and pre-existing tags back on
    ``rec["tags"]``. Mutates ``rec`` in place; returns None.
    """
    parts = [
        rec.get("title", ""),
        rec.get("synopsis") or rec.get("summary") or "",
        rec.get("eligibility", ""),
        rec.get("agency", ""),
    ]
    blob = " ".join(p for p in parts if p)
    # A manually curated tag may be a bare string; normalize to a list.
    existing = rec.get("tags") or []
    if not isinstance(existing, list):
        existing = [existing]
    inferred = infer_matt25_tags(blob)
    rec["tags"] = sorted(set(existing) | set(inferred))
|
| 185 |
+
|
| 186 |
+
|
| 187 |
# -------------------- Grants.gov collector --------------------
|
| 188 |
|
| 189 |
def _collect_from_grantsgov_api(src: Dict) -> List[Dict[str, Any]]:
|
|
|
|
| 298 |
"posted_date": extra.get("posted_date"),
|
| 299 |
}
|
| 300 |
rec["is_active"] = _compute_is_active(rec["deadline"])
|
| 301 |
+
|
| 302 |
+
# NEW: add Matthew 25 tags based on title/synopsis/eligibility text
|
| 303 |
+
_attach_matt25_tags(rec)
|
| 304 |
+
|
| 305 |
return rec
|
| 306 |
|
| 307 |
# -------------------- Collectors: http_html / web_page --------------------
|
|
|
|
| 766 |
"title": title,
|
| 767 |
"url": rec.get("url"),
|
| 768 |
"source": rec.get("source"),
|
| 769 |
+
"tags": rec.get("tags"),
|
| 770 |
"geo": rec.get("geo"),
|
| 771 |
"categories": rec.get("categories"),
|
| 772 |
"agency": agency,
|
|
|
|
| 889 |
)
|
| 890 |
rows = filtered
|
| 891 |
|
| 892 |
+
# ---- Apply capacity / geo filters BEFORE indexing (allow per-source bypass) ----
|
| 893 |
+
if rows:
|
| 894 |
+
if entry.get("skip_filters"):
|
| 895 |
+
print(f"[filter] {name}: skip_filters=true β keeping all {len(rows)}")
|
| 896 |
+
else:
|
| 897 |
+
pre = len(rows)
|
| 898 |
+
filtered = []
|
| 899 |
+
for r in rows:
|
| 900 |
+
t = _doc_text_from_row(r)
|
| 901 |
+
if capacity_only and not _is_capacity_building_text(t):
|
| 902 |
+
continue
|
| 903 |
+
if pa_md_only and not _is_pa_md_text(t):
|
| 904 |
+
continue
|
| 905 |
+
filtered.append(r)
|
| 906 |
+
print(
|
| 907 |
+
f"[filter] {name}: kept {len(filtered)}/{pre} after filters "
|
| 908 |
+
f"(capacity_only={capacity_only}, pa_md_only={pa_md_only})"
|
| 909 |
+
)
|
| 910 |
+
rows = filtered
|
| 911 |
+
|
| 912 |
+
# >>> GLOBAL MATT 25 TAGGING (safer, one place) <<<
|
| 913 |
+
for r in rows:
|
| 914 |
+
# If you truly want to leave Grants.gov untouched, keep this guard:
|
| 915 |
+
if r.get("source") == "grants_gov":
|
| 916 |
+
continue
|
| 917 |
+
_attach_matt25_tags(r)
|
| 918 |
+
|
| 919 |
print(f"[collect] {name} β rows_after_filters={len(rows)}")
|
| 920 |
all_rows.extend(rows)
|
| 921 |
|
| 922 |
+
|
| 923 |
+
|
| 924 |
# ---- Cross-source DEDUPE + MERGE ----
|
| 925 |
unique = _dedupe_and_merge(all_rows)
|
| 926 |
print(f"[ingest] Unique records to index: {len(unique)}")
|
app/tags.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app/tags.py
|
| 2 |
+
import re
|
| 3 |
+
|
| 4 |
+
# Matthew 25 ministry categories mapped to their keyword/phrase synonyms.
# Compiled into regex patterns by _compile_tag_patterns below and matched
# case-insensitively against record text by infer_matt25_tags.
MATT25 = {
    "hunger": ["hunger","hungry","food","food pantry","food bank","meals","meal program","feeding","nutrition"],
    "prison": ["prison","jail","incarcerated","incarceration","reentry","re-entry","returning citizens","parole","probation","corrections","detention"],
    "clothing": ["clothing","clothes","coat","coats","garments","apparel","outerwear","uniforms","shoes","socks","coat drive","clothing closet"],
    "hospital": ["hospital","sick","illness","health","healthcare","medical","clinic","patient","nursing","hospice","public health"],
    "stranger": ["stranger","refugee","immigrant","asylum","asylee","welcoming","hospitality","newcomer","migrant","resettlement","lonely","isolation","isolated"],
    "water": ["water","thirst","drink","drinking water","hydration","wells","borehole","rainwater","clean water","wash","w.a.s.h.","sanitation","hygiene","latrine"],
}
|
| 12 |
+
|
| 13 |
+
def _compile_tag_patterns(synonyms_map):
|
| 14 |
+
compiled = {}
|
| 15 |
+
for tag, terms in synonyms_map.items():
|
| 16 |
+
pats = []
|
| 17 |
+
for t in terms:
|
| 18 |
+
if " " in t:
|
| 19 |
+
pats.append(re.compile(r"\b" + r"\s+".join(map(re.escape, t.lower().split())) + r"\b", re.I))
|
| 20 |
+
else:
|
| 21 |
+
pats.append(re.compile(rf"\b{re.escape(t.lower())}\b", re.I))
|
| 22 |
+
compiled[tag] = pats
|
| 23 |
+
return compiled
|
| 24 |
+
|
| 25 |
+
# Compiled once at import time so per-record tagging reuses the same pattern objects.
_MATT25_PATTERNS = _compile_tag_patterns(MATT25)
|
| 26 |
+
|
| 27 |
+
def infer_matt25_tags(text: str) -> list[str]:
    """Return the sorted Matthew 25 tags whose synonym patterns occur in *text*.

    ``text`` may be None or empty; an empty list is returned in that case.
    """
    corpus = text or ""
    found = set()
    for tag, patterns in _MATT25_PATTERNS.items():
        if any(p.search(corpus) for p in patterns):
            found.add(tag)
    return sorted(found)
|