diff --git "a/app.py" "b/app.py" --- "a/app.py" +++ "b/app.py" @@ -1,1378 +1,1378 @@ -import sys -import os -import re -import json -import math -import logging -from collections import Counter, defaultdict -from datetime import datetime -from typing import List, Dict, Tuple, Optional - -import nltk -import numpy as np -import pandas as pd -from flask import Flask, request, jsonify -try: - from langdetect import detect -except Exception: - # Fallback sederhana jika langdetect tidak tersedia - def detect(_text: str) -> str: - return "id" - -# --- LIBRARY BARU (Deep Learning & Emoji) --- -import emoji -import torch -from transformers import AutoTokenizer, AutoModel -from sklearn.cluster import KMeans -from sklearn.feature_extraction.text import TfidfVectorizer # Tetap butuh untuk fallback - -# NLTK & RAKE -from nltk.corpus import stopwords -from nltk.sentiment import SentimentIntensityAnalyzer -from rake_nltk import Rake -try: - # Optional Indonesian stemmer (improves recall) - from Sastrawi.Stemmer.StemmerFactory import StemmerFactory # type: ignore - _sastrawi_factory = StemmerFactory() - _sastrawi_stemmer = _sastrawi_factory.create_stemmer() - def _stem_id(word: str) -> str: - try: - return _sastrawi_stemmer.stem(word) - except Exception: - return word -except Exception: - _sastrawi_stemmer = None - def _stem_id(word: str) -> str: - return word - -# Setup Logging -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -# Download NLTK resources safely - SKIP jika tidak perlu -def ensure_nltk_safe(): - """Check NLTK packages, skip download if missing (offline mode).""" - needed = { - "punkt": "tokenizers/punkt", - "punkt_tab": "tokenizers/punkt_tab", - "stopwords": "corpora/stopwords", - } - - for pkg, path in needed.items(): - try: - nltk.data.find(path) - print(f"✅ {pkg} ready") - except LookupError: - print(f"⚠️ {pkg} not found - continuing in offline mode") - -# Panggil tanpa download otomatis -try: - ensure_nltk_safe() - print("=" * 60) -except Exception as e: - print(f"⚠️ NLTK check error: {e}") - -app = Flask(__name__) - -# Configuration -API_KEY = os.getenv("FLASK_API_KEY", "rahasia-negara-123") # Gunakan env var -SERVICE_VERSION = "1.2.0-bert-sarcasm" # Version bump - -# --- GLOBAL VARIABLES --- -# Initialize SentimentIntensityAnalyzer safely (skip jika vader_lexicon tidak ada) -try: - sia = SentimentIntensityAnalyzer() - print("✅ VADER sentiment analyzer ready") -except Exception as e: - print(f"⚠️ VADER not available, using custom lexicon only: {e}") - sia = None - -STOPWORDS_ID_CHAT = set(stopwords.words('indonesian')) | set(stopwords.words('english')) -_CHAT_FILLERS = { - "sih", "dong", "kok", "kan", "tuh", "deh", "lah", "yah", "ni", "tu", - "ya", "yak", "yuk", "loh", "masa", "mana", "tapi", "kalo", "kalau", - "biar", "buat", "bikin", "bilang", "gak", "ga", "nggak", "enggak", - "kagak", "tak", "ndak", "udah", "sudah", "blm", "belum", "pas", - "lagi", "lg", "td", "tadi", "km", "kamu", "aku", "saya", "gw", "gue", - "lu", "lo", "elu", "kita", "kalian", "mereka", "dia", "ini", "itu", - "sini", "situ", "sana", "bgt", "banget", "aja", "saja", "cuma", - "doang", "terus", "trs", "jd", "jadi", "karna", "karena", "krn", - "bisa", "bs", "mau", "mo", "pengen", "ingin", "ada", "tiada", - "sama", "dgn", "dengan", "dr", "dari", "ke", "di", "pd", "pada", - "kapan", "dimana", "siapa", "mengapa", "kenapa", "gimana", "bagaimana", - "wkwk", "haha", "hehe", "huhu", "anjir", "njir", "anjing", - "apalah", "apa", "aduh", "wah", "nah", "kek", "kayak", "macam" -} 
-STOPWORDS_ID_CHAT.update(_CHAT_FILLERS) - -# ==== Integrasi TALA Stopwords tambahan ==== -try: - _TALA_PATH = os.path.join(os.path.dirname(__file__), 'tala-stopwords-indonesia.txt') - if os.path.exists(_TALA_PATH): - with open(_TALA_PATH, 'r', encoding='utf-8') as _tf: - tala_words = {w.strip().lower() for w in _tf if w.strip() and not w.startswith('#')} - # Hindari kata yang terlalu pendek (1 huruf) agar tidak over-filter - tala_words = {w for w in tala_words if len(w) > 1} - STOPWORDS_ID_CHAT.update(tala_words) - logger.info(f"Loaded TALA stopwords: +{len(tala_words)} terms (total={len(STOPWORDS_ID_CHAT)})") - else: - logger.warning('TALA stopwords file not found, skipping integration.') -except Exception as e: - logger.warning(f'Failed loading TALA stopwords: {e}') - -# Lexicon sederhana untuk Indonesia/Kupang dalam range standar [-1, +1] -ID_EXTRA = { - # Emosi negatif umum - "capek": -0.7, "capai": -0.5, "pusing": -0.7, "marah": -0.8, "sedih": -0.7, - "murung": -0.7, "galau": -0.6, "bingung": -0.5, "takut": -0.7, "cemas": -0.7, - "kecewa": -0.7, "kesal": -0.6, "jengkel": -0.6, "frustasi": -0.8, "frustrasi": -0.8, "depresi": -0.9, - "stres": -0.8, "tegang": -0.6, "resah": -0.7, "gelisah": -0.7, "sendirian": -0.5, - # Emosi positif umum - "senang": 0.7, "bahagia": 0.8, "semangat": 0.7, "hepi": 0.7, "gembira": 0.8, - "excited": 0.7, "antusias": 0.7, "optimis": 0.6, "tenang": 0.5, "damai": 0.6, - "puas": 0.6, "lega": 0.6, "syukur": 0.7, "bangga": 0.7, - # Masalah sekolah - "telat": -0.6, "bolos": -0.8, "berantem": -0.9, "ribut": -0.7, "gaduh": -0.6, - "berkelahi": -0.9, "bertengkar": -0.8, "keributan": -0.7, "masalah": -0.5, - "PR": -0.3, "tugas": -0.2, "banyak": -0.2, "malas": -0.5, "rajin": 0.5, - "skip": -0.6, "cabut": -0.6, "pontang": -0.7, "mangkir": -0.7, - # Keluarga & rumah - "berantem": -0.9, "cekcok": -0.8, "bertengkar": -0.8, "marahan": -0.7, - "berisik": -0.5, "berantakan": -0.4, "kacau": -0.7, "chaos": -0.7, - "pisah": -0.7, "bercerai": -0.8, "kabur": -0.7, "minggat": -0.8, "pergi": -0.3, - # Kupang/Manado dialect dengan sentiment - "sonde": -0.3, "tara": -0.2, "teda": -0.2, "pigi": -0.1, # Kupang negation/pergi - "kaco": -0.5, "cungkel": -0.5, "bongkar": -0.2, "kobo": -0.4, "susa": -0.6, - "dolo": -0.4, "molo": -0.4, "so": -0.3, "nda": -0.3, # Manado negation - "bodo": -0.6, "bodoh": -0.7, "tolol": -0.8, "goblok": -0.8, # Insults - # Neutral pronouns (score 0 won't affect sentiment) - "beta": 0.0, "ko": 0.0, "torang": 0.0, "katong": 0.0, "deng": 0.0, - "dong": 0.0, "de": 0.0, "so": 0.0, "pe": 0.0, "pung": 0.0, - "tanta": 0.0, "oma": 0.0, "opa": 0.0, "mama": 0.0, "papa": 0.0, -} -# tambahkan ke VADER (jika available) -if sia: - sia.lexicon.update({k.lower(): v for k, v in ID_EXTRA.items()}) - -app = Flask(__name__) - -API_KEY = os.environ.get("ML_API_KEY") # optional -FEEDBACK_FILE = os.environ.get("ML_FEEDBACK_FILE", os.path.join(os.path.dirname(__file__), "feedback_weights.json")) -LEXICON_DIR = os.environ.get("ML_LEXICON_DIR", os.path.join(os.path.dirname(__file__), "lexicons")) -ENABLE_BERT = os.environ.get("ML_ENABLE_BERT", "false").lower() in ("1","true","yes") -BERT_MODEL_NAME = os.environ.get("ML_BERT_MODEL", "indobenchmark/indobert-base-p1") -ENABLE_BERT_WARMUP = os.environ.get("ML_BERT_WARMUP", "false").lower() in ("1","true","yes") -SERVICE_VERSION = os.environ.get("ML_VERSION", "ml-rasaya:2025.11.0") - -def check_key(): - if API_KEY: - # accept both header casings/variants for compatibility - key = request.headers.get("X-API-KEY") or 
request.headers.get("X-API-Key") - if key != API_KEY: - return False - return True - -def detect_lang(txt, hint=None): - if hint: - return hint - try: - return detect(txt) if txt and txt.strip() else "id" - except Exception: - return "id" - -def label_from_score(compound: float) -> str: - if compound >= 0.05: return "positif" - if compound <= -0.05: return "negatif" - return "netral" - -# Legacy default map removed in favor of taxonomy-derived categories - -def load_feedback_weights(): - try: - with open(FEEDBACK_FILE, 'r', encoding='utf-8') as f: - return json.load(f) - except Exception: - return {} - -def save_feedback_weights(weights: dict): - try: - with open(FEEDBACK_FILE, 'w', encoding='utf-8') as f: - json.dump(weights, f, ensure_ascii=False, indent=2) - except Exception: - pass - -def score_categories_for_text(txt: str, categories_map: dict, feedback: dict): - """Scoring kategori berbasis token & n-gram. - - Tokenize + optional stemming (Sastrawi) untuk generalisasi. - - Match unigram/bigram/trigram secara exact (bukan substring bebas). - - Bobot dasar dibagi oleh banyaknya kategori yang memakai keyword (1/n_cats). - - Boost n-gram (bi=1.4x, tri=1.6x), downweight token sangat pendek (<=3: 0.5x). - - Tambahkan feedback weight jika ada, lalu normalisasi ke proporsi total. - """ - clean = clean_text(txt) - toks = _tokenize_and_stem(clean) - uni, bi, tri = _build_ngram_sets(toks) - - # Invert index: keyword -> categories - inv = defaultdict(list) - for cat, kws in categories_map.items(): - for kw in kws: - k = (kw or '').strip().lower() - if k: - inv[k].append(cat) - - scores = {cat: 0.0 for cat in categories_map.keys()} - reasons = defaultdict(list) - - for kw, cats in inv.items(): - parts = [p for p in kw.split() if p] - parts_stem = [_stem_id(p) for p in parts] - gram = len(parts_stem) - present = False - if gram == 1: - present = parts_stem[0] in uni - elif gram == 2: - present = (parts_stem[0] + ' ' + parts_stem[1]) in bi - else: - seq = ' '.join(parts_stem[:3]) - present = seq in tri if len(parts_stem) >= 3 else False - if not present: - continue - - base = 1.0 / max(1, len(cats)) - if gram == 1 and len(parts_stem[0]) <= 3: - base *= 0.5 - if gram == 2: - base *= 1.4 - elif gram >= 3: - base *= 1.6 - - for cat in cats: - adj = base + float(feedback.get(kw, {}).get(cat, 0.0)) - scores[cat] += adj - reasons[cat].append(kw) - - total = sum(scores.values()) - if total > 0: - for k in scores.keys(): - scores[k] = round(scores[k] / total, 4) - return scores, {k: sorted(set(v))[:5] for k, v in reasons.items()} - -""" -Cleaning & Lexicon Loader (InSet + optional Barasa) -""" -# Regex patterns -_RE_URL = re.compile(r"https?://\S+|www\.\S+") -_RE_MENTION = re.compile(r"[@#]\w+") -_RE_REPEAT = re.compile(r"(.)\1{2,}") # 3 kali atau lebih -_RE_MULTISPACE = re.compile(r"\s+") - -def clean_text(t: str) -> str: - """ - Cleaning text tapi mempertahankan emoji dan tanda baca penting untuk sentimen. - """ - if not t: return "" - - # 1. Demojize: Ubah emoji jadi teks bahasa Indonesia (manual mapping dikit) - t = emoji.demojize(t, delimiters=(" ", " ")) - t = t.replace("loudly_crying_face", "menangis") \ - .replace("crying_face", "sedih") \ - .replace("pensive_face", "murung") \ - .replace("angry_face", "marah") \ - .replace("rolling_on_the_floor_laughing", "tertawa") \ - .replace("face_with_rolling_eyes", "bosan") \ - .replace("broken_heart", "patah hati") - - t = t.lower().strip() - - # 2. Remove URL & Mention - t = _RE_URL.sub(" ", t) - t = _RE_MENTION.sub(" ", t) - - # 3. 
Keep punctuation important for emotion (?!.,) - # Hapus karakter aneh selain alphanumeric dan tanda baca penting - t = re.sub(r"[^a-z0-9\?\!\.\,\s]", " ", t) - - # Pisahkan tanda baca biar jadi token terpisah - t = re.sub(r"([\?\!\.\,])", r" \1 ", t) - - # 4. Normalize Repeat (bangeeet -> banget) - t = _RE_REPEAT.sub(r"\1", t) - - # 5. Slang & Dialect Normalization (Indonesian + Kupang + Manado + Ambon) - dialect = { - # Standard Indonesian slang - "gw": "saya", "gue": "saya", "lu": "kamu", "lo": "kamu", "elu": "kamu", - "ak": "aku", "aq": "aku", "sy": "saya", "w": "saya", "ane": "saya", - "gak": "tidak", "ga": "tidak", "nggak": "tidak", "kaga": "tidak", "ndak": "tidak", - "enggak": "tidak", "engga": "tidak", "ngga": "tidak", "kagak": "tidak", - "krn": "karena", "karna": "karena", "bgt": "banget", "bgtt": "banget", - "tdk": "tidak", "jgn": "jangan", "udh": "sudah", "sdh": "sudah", - "blm": "belum", "trus": "terus", "jd": "jadi", "dgn": "dengan", - "sm": "sama", "yg": "yang", "kalo": "kalau", "kl": "kalau", - "mager": "malas gerak", "baper": "bawa perasaan", "gabut": "bosan", - "anjir": "kaget", "njir": "kaget", "anjay": "hebat", - "mantul": "mantap", "santuy": "santai", "sans": "santai", - "gajelas": "tidak jelas", "gaje": "tidak jelas", - # Kupang/NTT dialect - # --- KATA GANTI ORANG (PRONOUNS) --- - "beta": "saya", "b": "saya", "bt": "saya", # Kupang/Ambon - "kita": "saya", # Manado (konteks santai) - "ana": "saya", "awak": "saya", "sa": "saya", "sy": "saya", - "ak": "aku", "aq": "aku", "gw": "saya", "gue": "saya", - - "lu": "kamu", "lo": "kamu", "elu": "kamu", - "ose": "kamu", "os": "kamu", "ale": "kamu", # Ambon - "ngana": "kamu", "nga": "kamu", # Manado - "ko": "kamu", "kau": "kamu", "ju": "kamu", # Kupang/Papua - "bo": "kamu", # Bima/Dompu kadang masuk - - "dia": "dia", "de": "dia", "i": "dia", # Papua/Kupang (De pung rumah) - "antua": "beliau", # Ambon (respektif) - - "katong": "kita", "ketong": "kita", "ktg": "kita", # Kupang/Ambon - "torang": "kita", "tong": "kita", # Manado/Papua - - "dorang": "mereka", "dong": "mereka", "drg": "mereka", # Manado/Kupang/Ambon - "besong": "kalian", "basong": "kalian", "kamorang": "kalian", # Kupang/Papua - "ngoni": "kalian", # Manado - - # --- NEGASI (TIDAK/BUKAN) --- - "sonde": "tidak", "son": "tidak", "snd": "tidak", "sond": "tidak", # Kupang - "seng": "tidak", "sing": "tidak", "tra": "tidak", "trada": "tidak", # Ambon/Papua - "tara": "tidak", "tar": "tidak", - "nyanda": "tidak", "nda": "tidak", "ndak": "tidak", # Manado/Jawa - "gak": "tidak", "ga": "tidak", "nggak": "tidak", "kaga": "tidak", - "bukang": "bukan", - - # --- KATA KERJA & KETERANGAN (VERBS & ADVERBS) --- - "pi": "pergi", "p": "pergi", "pig": "pergi", # Kupang/Ambon (saya kabur 'pi'...) 
- "su": "sudah", "so": "sudah", # Kupang/Manado/Ambon - "sdh": "sudah", "udh": "sudah", "udah": "sudah", - "blm": "belum", "balom": "belum", - - "mo": "mau", "mau": "mau", - "kasi": "beri", "kase": "beri", "kas": "beri", # Kase tinggal -> Beri tinggal - "omong": "bicara", "baomong": "bicara", "bakata": "berkata", - "dapa": "dapat", "dap": "dapat", - "baku": "saling", # Baku pukul -> Saling pukul - "bae": "baik", "baek": "baik", - "ancor": "hancur", - "ambe": "ambil", "pigi": "pergi", - - # --- KEPEMILIKAN & PENGHUBUNG --- - "pung": "punya", "puny": "punya", "pu": "punya", "pe": "punya", # Beta pung -> Saya punya - "deng": "dengan", "dg": "dengan", "dng": "dengan", - "par": "untuk", "for": "untuk", # Ambon/Manado (For ngana) - "vor": "untuk", - "kek": "seperti", "mcam": "macam", "kek": "kayak", - - # --- KATA SIFAT & LAINNYA --- - "talalu": "terlalu", "tlalu": "terlalu", - "sadiki": "sedikit", "sadikit": "sedikit", - "banya": "banyak", - "skali": "sekali", - "samua": "semua", - "karna": "karena", "krn": "karena", "gara": "karena", - - # --- GENERAL SLANG INDONESIA --- - "bgt": "banget", "bgtt": "banget", - "trus": "terus", "trs": "terus", - "jd": "jadi", "jdi": "jadi", - "yg": "yang", "kalo": "kalau", "kl": "kalau", - "mager": "malas gerak", "baper": "bawa perasaan", "gabut": "bosan", - "anjir": "kaget", "njir": "kaget", "anjay": "hebat", - "mantul": "mantap", "santuy": "santai", "sans": "santai", - "gajelas": "tidak jelas", "gaje": "tidak jelas", - "ortu": "orang tua", "mksd": "maksud", - "knp": "kenapa", "np": "kenapa", "napa": "kenapa", - "utk": "untuk" - } - - toks = [] - for tk in t.split(): - toks.append(dialect.get(tk, tk)) - - t = " ".join(toks) - t = _RE_MULTISPACE.sub(" ", t).strip() - return t - -# Tokenization + optional stemming helpers - -def _tokenize_and_stem(t: str) -> list[str]: - toks = [w for w in t.split() if w] - if _sastrawi_stemmer is None: - return toks - return [_stem_id(w) for w in toks] - - -def _build_ngram_sets(tokens: list[str]) -> tuple[set[str], set[str], set[str]]: - uni = set(tokens) - bi = set([tokens[i] + " " + tokens[i+1] for i in range(len(tokens)-1)]) if len(tokens) >= 2 else set() - tri = set([tokens[i] + " " + tokens[i+1] + " " + tokens[i+2] for i in range(len(tokens)-2)]) if len(tokens) >= 3 else set() - return uni, bi, tri - -def detect_sarcasm_heuristic(text_clean, raw_text, current_sentiment): - """ - Mendeteksi potensi sarkasme berdasarkan kontras sentimen, emoji, dan tanda baca. - Returns: (is_sarcasm: bool, confidence: float) - """ - is_sarcasm = False - confidence = 0.0 - text_clean = text_clean.lower() - - # Kamus Heuristik - intensifiers = ["banget", "bgt", "kali", "sumpah", "bener", "bet", "parah", "amat"] - positives = ["hebat", "bagus", "pinter", "jenius", "mantap", "enak", "keren", "rajin", "suci"] - negatives = ["pusing", "capek", "stres", "gila", "mati", "rusak", "hancur", "sebel", "benci", "malas", "bodoh", "tolol"] - - # Fitur - has_pos = any(p in text_clean for p in positives) - has_neg = any(n in text_clean for n in negatives) - has_intensifier = any(i in text_clean for i in intensifiers) - has_exclamation = "!" in raw_text or "?" in raw_text - - # LOGIC 1: Kalimat mengandung Positif DAN Negatif ("Hebat banget lo bikin gue stres") - if has_pos and has_neg: - return True, 0.75 - - # LOGIC 2: Kalimat Positif + Tanda baca agresif + Konteks ambigu ("Pinter ya lo??") - # Biasanya kalau muji beneran jarang pake '??' - if has_pos and ("??" in raw_text or "!!" 
in raw_text): - return True, 0.6 - - # LOGIC 3: Positif + Emoji Negatif (Manual check raw text for common sarcastic emojis) - # Emoji: Rolling eyes, Unamused face, Upside-down face - sarcastic_emojis = ["🙄", "😒", "🙃", "😤", "🤡"] - if has_pos and any(e in raw_text for e in sarcastic_emojis): - return True, 0.9 - - return False, 0.0 - -def load_inset_lexicon(base_dir: str) -> dict[str, float]: - """Load InSet format: lexicons/inset/{positive.tsv,negative.tsv}.""" - out: dict[str, float] = {} - inset_dir = os.path.join(base_dir, "inset") - pos = os.path.join(inset_dir, "positive.tsv") - neg = os.path.join(inset_dir, "negative.tsv") - if os.path.exists(pos): - with open(pos, "r", encoding="utf-8") as f: - for line in f: - w = line.strip().split("\t")[0] - if w: - out[w.lower()] = 1.0 - if os.path.exists(neg): - with open(neg, "r", encoding="utf-8") as f: - for line in f: - w = line.strip().split("\t")[0] - if w: - out[w.lower()] = -1.0 - return out - - -def load_barasa_csv(path: str) -> dict[str, float]: - """Load Barasa CSV with headers; expects at least a 'lemma' column and - either a 'score' column (float, negative to positive) or separate - 'pos'/'neg' columns that can be combined (score = pos - neg). - Values are clamped to [-1, 1]. - """ - lex: dict[str, float] = {} - try: - import csv - with open(path, encoding="utf-8") as f: - r = csv.DictReader(f) - for row in r: - lemma = (row.get("lemma") or row.get("word") or row.get("token") or "").strip().lower() - if not lemma: - continue - score_val = None - # Prefer unified score - if row.get("score") not in (None, ""): - try: - score_val = float(row.get("score")) - except Exception: - score_val = None - # Else try pos/neg columns - if score_val is None: - try: - pos = float(row.get("pos") or row.get("positive") or 0) - neg = float(row.get("neg") or row.get("negative") or 0) - score_val = pos - neg - except Exception: - score_val = 0.0 - score_val = max(-1.0, min(1.0, float(score_val))) - lex[lemma] = score_val - except Exception: - pass - return lex - - -def load_barasa_optional(base_dir: str) -> dict[str, float]: - """ - Try to read Barasa resources if available. The provided file wn-msa-all.tab - is a WordNet-style tab file (no explicit polarity). We don't assign scores - from it directly; instead we just return empty dict so it doesn't affect - sentiment unless in the future we add mapping rules. - If you later provide barasa.csv (word,score), we can extend this loader. - """ - barasa_dir = os.path.join(base_dir, "barasa") - wn_file = os.path.join(barasa_dir, "wn-msa-all.tab") - # Placeholder: no direct sentiment; return empty for now. 
- # Future: map synonyms of existing sentiment words and inherit score * 0.8 - if os.path.exists(wn_file): - return {} - # also support barasa.csv if added by user - csv_file = os.path.join(base_dir, "barasa.csv") - if os.path.exists(csv_file): - out: dict[str, float] = {} - with open(csv_file, "r", encoding="utf-8") as f: - for line in f: - if "," in line: - w, sc = line.strip().split(",", 1) - try: - out[w.lower()] = max(-1.0, min(1.0, float(sc))) - except Exception: - continue - return out - return {} - - -def build_lexicon() -> dict[str, float]: - # Start from InSet if available - lex = load_inset_lexicon(LEXICON_DIR) - # Merge Barasa if CSV provided; else try optional WordNet source (no polarity) - barasa_csv = os.path.join(LEXICON_DIR, "barasa", "barasa_lexicon.csv") - if os.path.exists(barasa_csv): - lex.update(load_barasa_csv(barasa_csv)) - else: - bar = load_barasa_optional(LEXICON_DIR) - lex.update(bar) - # Add custom Kupang/ID extra (sudah dalam range [-1, +1]) - for k, v in ID_EXTRA.items(): - lex[k.lower()] = max(-1.0, min(1.0, float(v))) - return lex - - -LEXICON_ID = build_lexicon() - - -def score_with_lexicon(text: str, lex: Dict[str, float]) -> float: - toks = clean_text(text).split() - if not toks: - return 0.0 - - # Context-aware scoring: handle negation (pre & post), intensifiers - negation_words = {"tidak", "bukan", "belum", "jangan", "tanpa", "sonde", "tara", "teda", "nda", "tra"} - intensifiers = {"banget", "sangat", "amat", "sekali", "parah", "bener", "pisan"} - - s = 0.0 - neg_window = 0 # number of next tokens to negate - intensify = 1.0 - # track last scored token to handle patterns like "paham ... belum" - last_score_val = 0.0 - last_score_idx = -10 - - for i, tok in enumerate(toks): - # Negation token: start negation window and optionally flip previous positive nearby - if tok in negation_words: - # If a positive word occurred recently (within 2 tokens), flip it retroactively - if last_score_val > 0 and (i - last_score_idx) <= 2: - # subtract a bit more than added to reflect negation of previous positive - s -= last_score_val * 1.2 - last_score_val = 0.0 - neg_window = 3 - continue - - # Intensifier affects next scored word only - if tok in intensifiers: - intensify = 1.5 - continue - - # Base lexical score - score = lex.get(tok, 0.0) - - # Apply active negation window - if neg_window > 0 and score != 0.0: - score = -score * 0.8 - neg_window -= 1 - elif neg_window > 0: - # consume window even if current token has no score - neg_window -= 1 - - # Apply intensifier - if intensify > 1.0 and score != 0.0: - score = score * intensify - intensify = 1.0 - - s += score - - if score != 0.0: - last_score_val = score - last_score_idx = i - - # Dampen by sqrt length to avoid bias for long texts - normalized = s / max(1.0, math.sqrt(len(toks))) - return max(-1.0, min(1.0, normalized)) - -INTENSIFIERS = {"banget": 1.0, "sangat": 0.8, "parah": 0.9, "amat": 0.5} - -def negative_gate(aggregate: float, raw_txt: str) -> tuple[bool, float]: - # severity from magnitude + intensifiers + punctuation and repeats - clean = clean_text(raw_txt) - toks = clean.split() - intens = sum(INTENSIFIERS.get(t, 0.0) for t in toks) - exclam = min(raw_txt.count("!"), 3) * 0.1 - repeat = 0.1 if _RE_REPEAT.search(raw_txt) else 0.0 - sev = max(0.0, min(1.0, (-aggregate) * 0.7 + intens * 0.2 + exclam + repeat)) - return (aggregate <= -0.05), round(sev, 3) - -# ===================== -# Taxonomy (topics/subtopics) for semi-supervised labeling -# ===================== -TAXONOMY_PATH = 
os.path.join(os.path.dirname(__file__), "taxonomy.json") -try: - with open(TAXONOMY_PATH, "r", encoding="utf-8") as _f: - _TAX = json.load(_f) -except Exception: - _TAX = {"topics": []} - -def _taxonomy_keywords(): - buckets = {} - subtopics = {} - for tp in _TAX.get("topics", []): - bucket = tp.get("bucket") or "" - topic_id = tp.get("id") or bucket or "TOPIC" - topic_name = tp.get("name") or topic_id - buckets.setdefault(bucket, set()).update([str(w).lower() for w in tp.get("keywords", []) if w]) - for st in tp.get("subtopics", []) or []: - # Maintain internal id (taxonomy id) and external 'code' matching kategori_masalahs.kode - st_id = st.get("id") or st.get("code") or st.get("name") - st_code = st.get("code") or st_id - if not st_id: - continue - subtopics[st_id] = { - "name": st.get("name") or st_id, - "bucket": bucket, - "topic_id": topic_id, - "topic_name": topic_name, - "code": st_code, - "keywords": set([str(w).lower() for w in st.get("keywords", []) if w]), - "examples": st.get("examples", []) or [] - } - return buckets, subtopics - -BUCKET_KW, SUBTOPICS = _taxonomy_keywords() - -def build_topic_index_and_categories_map(): - """HYBRID APPROACH (OLD METHOD + NEW DATA): - Builds multi-level keyword matching dengan data dari database. - - Returns: (topic_index, categories_map, bucket_map) - - topic_index: metadata per kategori kecil {UPPER(name): {id, name, bucket, kode}} - - categories_map: keywords per kategori kecil {UPPER(name): [keywords]} - - bucket_map: keywords per kategori besar {UPPER(bucket): [aggregated keywords]} - - WHY THIS IS BETTER: - - Multi-level matching: Check keywords di kategori kecil DAN kategori besar - - Redundancy: Jika miss di kategori kecil, bisa match di bucket agregat - - Better coverage: Keywords dari semua kategori kecil teragregasi ke bucket - """ - topic_index = {} - categories_map = {} - bucket_map = defaultdict(set) # Agregasi keywords per bucket - - # Process topics (kategori kecil) dari database - for tp in _TAX.get("topics", []): - topic_id = tp.get("id") or tp.get("code") or "TOPIC" - topic_name = tp.get("name") or topic_id - bucket = tp.get("bucket") or "" - key = str(topic_name).upper() - - # Collect keywords from topic level (kategori kecil) - kw = set([str(w).lower().strip() for w in (tp.get("keywords") or []) if w]) - - # Legacy support: subtopics (backward compatibility) - for st in tp.get("subtopics", []) or []: - for w in st.get("keywords", []) or []: - if w: - kw.add(str(w).lower().strip()) - - # Store kategori kecil metadata & keywords - topic_index[key] = { - "id": topic_id, - "name": topic_name, - "bucket": bucket, - "kode": topic_id # Match dengan kategori_masalahs.kode - } - categories_map[key] = sorted(list(kw)) - - # AGGREGATE keywords ke bucket (kategori besar) - # Ini yang bikin metode lama lebih akurat! - if bucket: - bucket_map[bucket.upper()].update(kw) - - # Convert bucket_map sets to sorted lists - bucket_keywords = {k: sorted(list(v)) for k, v in bucket_map.items()} - - return topic_index, categories_map, bucket_keywords - -def extract_keyphrases(texts, lang="id"): - # RAKE pakai stopwords bhs Inggris default; untuk id sederhana kita kasih stopwords id juga - sw = set(stopwords.words('indonesian')) | set(stopwords.words('english')) - r = Rake(stopwords=sw) - joined = " . 
".join(texts) - r.extract_keywords_from_text(joined) - ranked = r.get_ranked_phrases_with_scores() - out = [] - for score, phrase in ranked[:20]: - out.append({"term": phrase, "weight": float(score)}) - return out - -def extract_core_tokens(texts): - """Ambil token inti dengan pembersihan: - - lower & clean_text - - buang stopwords (ID + EN) & filler umum - - buang token panjang < 3 - - hitung frekuensi, ambil top 10 - """ - freq = Counter() - try: - sw_id = set(stopwords.words('indonesian')) - except Exception: - sw_id = set() - try: - sw_en = set(stopwords.words('english')) - except Exception: - sw_en = set() - filler = { - 'dan','atau','yang','di','ke','dengan','pada','untuk','dari','lagi','sih','deh','lah','ya','kok','kan','udah','aja','pun','itu','ini','jadi','kalau','kalo','bahwa','sementara','sering','kayak','kayakny','nih','tuh','dong','de','si','mungkin','masih','bisa','harus','karena','seperti','kaya','gitu','buat' - } - for t in texts: - for tok in clean_text(t).split(): - if len(tok) < 3: continue - if tok in sw_id or tok in sw_en or tok in filler: continue - freq[tok] += 1 - return [w for w,_ in freq.most_common(10)] - -def _build_cluster_vectorizer(): - """Vectorizer for clustering top-terms: single-word tokens, heavy stopwords cleanup.""" - try: - sw_id = set(stopwords.words('indonesian')) - except Exception: - sw_id = set() - try: - sw_en = set(stopwords.words('english')) - except Exception: - sw_en = set() - extra = { - # connectors/intensifiers/pronouns/common fillers - 'dan','atau','yang','di','ke','dengan','pada','untuk','dari','lagi','banget','sekali','paling','sih','deh','dong','lah','ya', - 'aku','saya','gue','gua','dia','kamu','kau','ko','kami','kita','mereka', - 'punya','dengar','dng','sm','nih','tuh','kok','kan','udah','lagi','aja','de','si', - } - stopset = sw_id | sw_en | extra - # Use our cleaner as preprocessor; single-word tokens only - vec = TfidfVectorizer( - preprocessor=clean_text, - tokenizer=str.split, - token_pattern=None, - lowercase=True, - stop_words=list(stopset), - ngram_range=(1,1), - max_df=0.95, - min_df=1, - max_features=1000, - ) - return vec - -@app.get("/health") -def health(): - return jsonify({"status": "ok", "version": SERVICE_VERSION, "bert": ENABLE_BERT}) - -# ===================== -# IndoBERT caching & optional warmup -# ===================== -BERT_CACHE = {"tok": None, "mdl": None, "device": "cpu"} - -# --- GLOBAL BERT VARIABLES --- -_bert_tokenizer = None -_bert_model = None -_bert_device = None - -def get_bert(): - global _bert_tokenizer, _bert_model, _bert_device - if _bert_tokenizer is None: - print("⏳ Loading IndoBERT model... 
(First run might take a while)") - try: - model_name = "indobenchmark/indobert-base-p1" - _bert_tokenizer = AutoTokenizer.from_pretrained(model_name) - _bert_model = AutoModel.from_pretrained(model_name) - _bert_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - _bert_model.to(_bert_device) - _bert_model.eval() - print(f"✅ IndoBERT loaded on {_bert_device}") - except Exception as e: - print(f"❌ Failed to load IndoBERT: {e}") - return None, None, None - return _bert_tokenizer, _bert_model, _bert_device - -# Warmup at startup if requested (download/load once) -if ENABLE_BERT and ENABLE_BERT_WARMUP: - tok, mdl, dev = get_bert() - try: - if tok is not None and mdl is not None: - import torch # type: ignore - with torch.no_grad(): - enc = tok(["warmup"], padding=True, truncation=True, max_length=16, return_tensors="pt") - _ = mdl(**enc.to(dev)) - except Exception: - pass - -@app.get("/warmup") -def warmup(): - """Optionally trigger BERT load and a tiny forward pass to avoid first-request latency.""" - if not ENABLE_BERT: - return jsonify({"bert": "disabled"}) - tok, mdl, dev = get_bert() - if tok is None or mdl is None: - return jsonify({"bert": "unavailable"}), 500 - try: - import torch # type: ignore - with torch.no_grad(): - enc = tok(["warmup"], padding=True, truncation=True, max_length=16, return_tensors="pt") - _ = mdl(**enc.to(dev)) - return jsonify({"bert": "ready", "device": dev}) - except Exception as e: - return jsonify({"bert": "error", "message": str(e)}), 500 - -@app.post("/analyze") -@app.post("/analyze") -# (Load helpers lain seperti check_key, load_feedback, taxonomy, dll biarkan seperti file lama Anda) -# ... (Pastikan functions: check_key, load_feedback_weights, build_topic_index..., load_inset_lexicon ada) ... - -@app.post("/analyze") -def analyze(): - if not check_key(): - return jsonify({"error": "unauthorized"}), 401 - - data = request.get_json(force=True) or {} - items = data.get("items") - - if items is None: - items = [{ - "id": data.get("id") or "item-1", - "text": data.get("text") or "", - "lang_hint": (data.get("context") or {}).get("lang_hint") if isinstance(data.get("context"), dict) else None - }] - - if not isinstance(items, list) or not items: - return jsonify({"error": "items required"}), 422 - - # Setup Taxonomy & Feedback (HYBRID APPROACH) - categories_override = data.get("categories") - TOPIC_INDEX, TAXONOMY_CATEGORIES, BUCKET_KEYWORDS = build_topic_index_and_categories_map() - - categories_map = {} - bucket_map = {} - - if isinstance(categories_override, dict) and categories_override: - for k, v in categories_override.items(): - if isinstance(v, list): - categories_map[str(k).upper()] = [str(x) for x in v if isinstance(x, (str, int))] - - if not categories_map: - categories_map = TAXONOMY_CATEGORIES - bucket_map = BUCKET_KEYWORDS - - feedback = load_feedback_weights() - - # Setup Variables - results = [] - per_legacy = [] - all_texts = [] - negatives = [] - per_entry_cats = {} - - # Load IndoBERT Model (only if enabled) - tok, mdl, dev = get_bert() if ENABLE_BERT else (None, None, None) - - # --- PROCESS PER ITEM --- - for it in items: - item_id = it.get("id") - raw_txt = (it.get("text") or "").strip() - lang_hint = it.get("lang_hint") - - # 1. Text Cleaning (New Logic) - clean = clean_text(raw_txt) - if not clean: - continue - - # 2. 
Sentiment Scoring (Hybrid) - s_lex = score_with_lexicon(clean, LEXICON_ID) - s_vad = sia.polarity_scores(raw_txt).get("compound", 0.0) if sia else 0.0 - aggregate = float(0.7 * s_lex + 0.3 * s_vad) if sia else s_lex - - # Fallback: keyword-based detection if aggregate is neutral (0) - if abs(aggregate) < 0.05: - negative_keywords = [ - "berkelahi", "bertengkar", "murung", "sedih", "marah", "kabur", - "masalah", "ribut", "berantem", "stress", "stres", "pusing", - "takut", "cemas", "galau", "kecewa", "frustrasi", "frustasi", - "jelek", "drop", "sendiri", "sendirian", "tidak paham" - ] - positive_keywords = ["senang", "bahagia", "gembira", "semangat", "excited", "bagus", "oke", "mantap", "suka", "hebat"] - - neg_count = sum(1 for kw in negative_keywords if kw in clean) - pos_count = sum(1 for kw in positive_keywords if kw in clean) - - if neg_count > pos_count and neg_count > 0: - aggregate = -0.35 # Set mild negative - elif pos_count > neg_count and pos_count > 0: - aggregate = 0.3 # Set mild positive - - # 3. Sarcasm Detection (New Logic) - is_sarcasm, sarc_conf = detect_sarcasm_heuristic(clean, raw_txt, aggregate) - - if is_sarcasm: - # Flip score: Positive -> Negative - if aggregate > 0: - aggregate = -0.5 * aggregate - 0.3 - elif aggregate == 0: - aggregate = -0.4 - lbl = "negatif" - else: - lbl = label_from_score(aggregate) - - # 4. Negative Gate & Severity - # Check severity based on flipped score - neg_flag, severity = negative_gate(aggregate, raw_txt) - if is_sarcasm: - neg_flag = True - severity = max(severity, 0.6) # Sarkasme biasanya sakit - - # 5. Category Scoring (ONLY FOR NEGATIVE CONTENT) - # Skip kategorisasi jika semua input positif (aggregate > 0 dan tidak ada sarkasme) - cat_scores = {} - reasons = {} - bucket_scores = defaultdict(float) - best_cat = None - best_bucket = None - cluster = None - - if neg_flag or aggregate <= 0: - # HYBRID: Kategori Kecil + Bucket Agregat (ONLY FOR NEGATIVE) - cat_scores, reasons = score_categories_for_text(clean, categories_map, feedback) - - # BOOST: Aggregate bucket scores dari kategori kecil - for cat, score in cat_scores.items(): - tp_meta = TOPIC_INDEX.get(str(cat).upper()) - if tp_meta and tp_meta.get("bucket"): - bucket_scores[tp_meta["bucket"]] += score * 0.8 # Slightly dampen aggregated - - # Also score directly against bucket keywords (OLD METHOD) - if bucket_map: - bucket_direct, _ = score_categories_for_text(clean, bucket_map, feedback) - for bucket, score in bucket_direct.items(): - bucket_scores[bucket] += score * 1.2 # Boost direct matches - - # Find best kategori kecil - best_cat = max(cat_scores, key=cat_scores.get) if cat_scores else None - best_bucket = max(bucket_scores, key=bucket_scores.get) if bucket_scores else None - - # Apply minimum confidence thresholds to reduce false positives - if best_cat and cat_scores.get(best_cat, 0.0) < 0.22: - best_cat = None - if best_bucket and bucket_scores.get(best_bucket, 0.0) < 0.25: - best_bucket = None - - # 6. 
Cluster Labeling (Prioritize Kategori Kecil, fallback to Bucket) - if best_cat: - tp_meta = TOPIC_INDEX.get(str(best_cat).upper()) - if tp_meta: - cluster = { - "id": tp_meta.get("kode"), # Match dengan kategori_masalahs.kode - "label": tp_meta.get("name"), - "bucket": tp_meta.get("bucket"), - "topic_id": tp_meta.get("kode"), - "topic_name": tp_meta.get("name"), - "confidence": round(cat_scores[best_cat], 3) - } - elif best_bucket: - # Fallback: Use bucket if no specific kategori kecil matched - cluster = { - "id": best_bucket, - "label": best_bucket, - "bucket": best_bucket, - "topic_id": None, - "topic_name": None, - "confidence": round(bucket_scores[best_bucket], 3) - } - # Else: Skip kategorisasi untuk input positif - - # 7. Keywords Extraction - try: - rk = Rake(stopwords=STOPWORDS_ID_CHAT, min_length=1, max_length=3) - rk.extract_keywords_from_text(clean) # Use clean text - raw_phrases = [p.lower() for p in rk.get_ranked_phrases()[:8]] - except Exception: - raw_phrases = [] - - # Filter phrases - phrases = sorted(list(set(raw_phrases)), key=len)[:5] - - # 8. Summary Text - if is_sarcasm: - summary_text = f"Terdeteksi sarkasme/sindiran. Inti keluhan: {', '.join(phrases[:3])}." - elif neg_flag and cluster: - summary_text = f"Masalah utama: {cluster['label']}. Gejala: {', '.join(phrases[:3])}." - elif neg_flag: - summary_text = f"Inti keluhan: {', '.join(phrases[:3])}." - else: - # Positive input - no categorization needed - summary_text = f"Ekspresi positif. Kata kunci: {', '.join(phrases[:3]) if phrases else 'tidak ada keluhan'}." - - results.append({ - "id": item_id, - "clean_text": clean, - "sentiment": { - "barasa": s_lex, "english": s_vad, "aggregate": aggregate, "label": lbl - }, - "negative_flag": neg_flag, - "is_sarcasm": is_sarcasm, # Field Baru - "severity": severity, - "cluster": cluster, - "summary": summary_text, - "key_phrases": phrases, - "recommendations": [], - "cat_scores": cat_scores, - "cat_reasons": reasons, - }) - - per_legacy.append({ - "id": item_id, "text": raw_txt, "sentiment": aggregate, - "label": lbl, "keywords": phrases - }) - - all_texts.append(clean) - - # Collect negatives for clustering - if neg_flag: - negatives.append(clean) - ranked = sorted([(c, s) for c, s in cat_scores.items() if s > 0], key=lambda x: x[1], reverse=True) - per_entry_cats[item_id] = { - "ranked": ranked[:3], - "reasons": {c: reasons.get(c, []) for c, _ in ranked[:3]} - } - - # --- AGGREGATION & CLUSTERING --- - - # Global Keywords - keyphrases = extract_keyphrases(all_texts) if all_texts else [] - - # Clustering with IndoBERT - clusters = [] - if len(negatives) >= 2: - used_engine = "tfidf" - X = None - - # Try BERT - if tok and mdl: - try: - with torch.no_grad(): - enc = tok(negatives, padding=True, truncation=True, max_length=128, return_tensors="pt").to(dev) - out = mdl(**enc) - cls = out.last_hidden_state[:, 0, :] - X = cls.detach().cpu().numpy() - used_engine = "bert" - except Exception as e: - print(f"⚠️ BERT error, falling back: {e}") - X = None - - # Fallback TF-IDF - if X is None: - vec = _build_cluster_vectorizer() # Pastikan fungsi ini ada (helper lama) - X = vec.fit_transform(negatives) - - k = 2 if len(negatives) == 2 else min(4, max(2, len(negatives)//2)) - km = KMeans(n_clusters=k, n_init='auto', random_state=42) - y = km.fit_predict(X) - - for ci in range(k): - ex = [negatives[i] for i in range(len(negatives)) if y[i] == ci][:5] - clusters.append({ - "cluster": int(ci), - "engine": used_engine, - "examples": ex - }) - - # Overview Weighted by Severity & Sarcasm 
(KATEGORI KECIL - NEGATIVE ONLY) - cat_counter = Counter() - for r in results: - # ONLY count negative items for categorization - if not r.get("negative_flag"): - continue - - sev = r.get("severity", 0.0) - weight = 1.0 + sev - - # Aggregate by kategori kecil (topic) - cluster = r.get("cluster") or {} - topic_name = cluster.get("topic_name") or cluster.get("label") - if topic_name: - # Use cluster confidence as base score - score = cluster.get("confidence", 0.5) - cat_counter[topic_name] += score * weight - - categories_overview = [ - {"category": cat, "score": round(val, 4)} for cat, val in cat_counter.most_common() - ] - - # Summary Stats - avg = sum([x["sentiment"] for x in per_legacy]) / len(per_legacy) if per_legacy else 0.0 - summary = { - "avg_sentiment": round(avg, 3), - "negative_ratio": round(sum(1 for x in per_legacy if x["label"]=="negatif")/len(per_legacy), 3) if per_legacy else 0.0 - } - - # NEW: Recommendations Generation PER KATEGORI KECIL (Granular) - # Laravel akan filter lebih lanjut berdasarkan master_rekomendasis.rules - def recommend_by_topic(topic_id: str, topic_name: str, bucket: str, severity_val: float, negative: bool, sarcasm: bool): - """Generate recommendations based on kategori kecil (topic). - Returns structured data yang bisa di-match dengan master_rekomendasis di Laravel. - - Format return: - { - "kategori_kode": topic_id, # Match dengan kategori_masalahs.kode - "kategori_nama": topic_name, - "bucket": bucket, - "severity": severity_val, - "negative": negative, - "sarcasm": sarcasm, - "suggested_actions": [...] # Heuristic suggestions (optional) - } - """ - rec = { - "kategori_kode": topic_id, - "kategori_nama": topic_name, - "bucket": bucket, - "severity": severity_val, - "negative": negative, - "sarcasm": sarcasm, - "suggested_actions": [] - } - - # Heuristic suggestions (Laravel akan filter sesuai master_rekomendasis) - if (negative or sarcasm) and severity_val >= 0.6: - rec["suggested_actions"].append({ - "type": "URGENT", - "reason": "Severity tinggi atau terdeteksi sarkasme" - }) - elif negative and severity_val >= 0.4: - rec["suggested_actions"].append({ - "type": "MODERATE", - "reason": "Indikasi masalah perlu perhatian" - }) - - return rec - - # Assign Recs per item (GRANULAR: Per Kategori Kecil) - for r in results: - cluster = r.get("cluster") or {} - topic_id = cluster.get("topic_id") or cluster.get("id") - topic_name = cluster.get("topic_name") or cluster.get("label") - bucket = cluster.get("bucket", "") - - if topic_id: - # Return kategori kecil info untuk Laravel matching - r["recommendations"] = [recommend_by_topic( - topic_id, - topic_name, - bucket, - r.get("severity", 0), - r.get("negative_flag", False), - r.get("is_sarcasm", False) - )] - else: - # Fallback: No specific kategori detected - r["recommendations"] = [] - - # Global Recs (PER KATEGORI KECIL - Granular) - abs_sent = abs(avg) - global_recommendations = [] - valid_cats = [c for c in categories_overview if c["score"] >= 0.05] - is_neg_avg = avg < -0.05 - - for cat in valid_cats: - cname = cat["category"] - meta = TOPIC_INDEX.get(cname.upper()) or {} - topic_id = meta.get("kode") or meta.get("id") - topic_name = meta.get("name", cname) - bucket = meta.get("bucket", "") - - if topic_id: - rec_data = recommend_by_topic( - topic_id, - topic_name, - bucket, - max(0.3, abs_sent), - is_neg_avg, - False # No global sarcasm flag - ) - global_recommendations.append({ - "category": cname, - "kategori_kode": topic_id, - "score": cat["score"], - "recommendation": rec_data - }) - - return 
jsonify({ - "version": SERVICE_VERSION, - "items": results, - "summary": summary, - "keyphrases": keyphrases, - "clusters": clusters, - "categories_overview": categories_overview, - "global_recommendations": global_recommendations, - }) - -@app.post("/feedback") -def feedback(): - if not check_key(): - return jsonify({"error": "unauthorized"}), 401 - - data = request.get_json(force=True) or {} - # expected: { keywords: ["telat","bolos"], from_category?: "AKADEMIK", to_category?: "DISIPLIN", delta?: 0.2 } - kws = data.get("keywords") or [] - from_cat = str(data.get("from_category") or "").upper() - to_cat = str(data.get("to_category") or "").upper() - delta = float(data.get("delta") or 0.2) - if not kws or (not from_cat and not to_cat): - return jsonify({"error": "invalid payload"}), 422 - - weights = load_feedback_weights() - for kw in kws: - k = str(kw).lower().strip() - if not k: - continue - entry = weights.get(k, {}) - # penalize from_cat slightly, reward to_cat (if provided) - if from_cat: - entry[from_cat] = float(entry.get(from_cat, 0.0)) - (delta / 2.0) - if to_cat: - entry[to_cat] = float(entry.get(to_cat, 0.0)) + delta - weights[k] = entry - save_feedback_weights(weights) - return jsonify({"ok": True, "updated": len(kws)}) - -@app.route("/feedback", methods=["POST"]) -def receive_feedback(): - """ - Receive teacher revision feedback for continuous learning. - - Expected payload: - { - "revision_id": 123, - "original_text": "...", - "original_kategori": "AKADEMIK", - "original_rekomendasi": [...], - "revised_kategori": "DISIPLIN", - "revised_rekomendasi": [...], - "revision_notes": "..." (optional) - } - - This endpoint will: - 1. Extract keywords from original text - 2. Penalize weights for original_kategori - 3. Reward weights for revised_kategori - 4. 
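Reviewer note (annotation, not part of the diff): the /analyze handler later in this file expects a JSON body with an "items" list and, when key checking is enabled, an X-API-KEY header, and it returns per-item sentiment, a sarcasm flag, severity, cluster and recommendations. The smoke-test sketch below is illustrative only; it assumes the dev-server settings at the bottom of this file (port 5001), the requests package, and an API key exported as ML_API_KEY.

    # Illustrative client for POST /analyze; field names mirror the handler in this file.
    import os
    import requests  # assumed to be available in the test environment

    BASE_URL = "http://localhost:5001"          # dev server port from the __main__ block
    API_KEY = os.environ.get("ML_API_KEY", "")  # only enforced when the server has a key set

    payload = {
        "items": [
            {"id": "item-1", "text": "beta su capek, sonde mau sekolah lai", "lang_hint": "id"},
            {"id": "item-2", "text": "hebat banget lo bikin gue stres", "lang_hint": "id"},
        ]
    }

    resp = requests.post(
        f"{BASE_URL}/analyze",
        json=payload,
        headers={"X-API-KEY": API_KEY} if API_KEY else {},
        timeout=60,
    )
    resp.raise_for_status()

    for item in resp.json()["items"]:
        print(item["id"], item["sentiment"]["label"], item["is_sarcasm"], item["severity"])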
+import sys
+import os
+import re
+import json
+import math
+import logging
+from collections import Counter, defaultdict
+from datetime import datetime
+from typing import List, Dict, Tuple, Optional
+
+import nltk
+import numpy as np
+import pandas as pd
+from flask import Flask, request, jsonify
+try:
+    from langdetect import detect
+except Exception:
+    # Simple fallback if langdetect is not available
+    def detect(_text: str) -> str:
+        return "id"
+
+# --- NEW LIBRARIES (Deep Learning & Emoji) ---
+import emoji
+import torch
+from transformers import AutoTokenizer, AutoModel
+from sklearn.cluster import KMeans
+from sklearn.feature_extraction.text import TfidfVectorizer  # Still needed for the fallback
+
+# NLTK & RAKE
+from nltk.corpus import stopwords
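Note (annotation, not part of the diff): ensure_nltk_safe() below only verifies that the NLTK data packages are present and never downloads them, so a one-time provisioning step outside the service is assumed. A minimal sketch; the package list is taken from that check plus the VADER and tokenizer usage elsewhere in this file.

    # One-time NLTK data provisioning (run once per environment, not at service startup).
    # "stopwords" is read at module import, "vader_lexicon" backs SentimentIntensityAnalyzer,
    # and "punkt"/"punkt_tab" back the NLTK tokenization used by rake_nltk and word_tokenize.
    import nltk

    for pkg in ("stopwords", "punkt", "punkt_tab", "vader_lexicon"):
        nltk.download(pkg)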
+from nltk.sentiment import SentimentIntensityAnalyzer +from rake_nltk import Rake +try: + # Optional Indonesian stemmer (improves recall) + from Sastrawi.Stemmer.StemmerFactory import StemmerFactory # type: ignore + _sastrawi_factory = StemmerFactory() + _sastrawi_stemmer = _sastrawi_factory.create_stemmer() + def _stem_id(word: str) -> str: + try: + return _sastrawi_stemmer.stem(word) + except Exception: + return word +except Exception: + _sastrawi_stemmer = None + def _stem_id(word: str) -> str: + return word + +# Setup Logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# Download NLTK resources safely - SKIP jika tidak perlu +def ensure_nltk_safe(): + """Check NLTK packages, skip download if missing (offline mode).""" + needed = { + "punkt": "tokenizers/punkt", + "punkt_tab": "tokenizers/punkt_tab", + "stopwords": "corpora/stopwords", + } + + for pkg, path in needed.items(): + try: + nltk.data.find(path) + print(f"✅ {pkg} ready") + except LookupError: + print(f"⚠️ {pkg} not found - continuing in offline mode") + +# Panggil tanpa download otomatis +try: + ensure_nltk_safe() + print("=" * 60) +except Exception as e: + print(f"⚠️ NLTK check error: {e}") + +app = Flask(__name__) + +# Configuration +API_KEY = os.getenv("FLASK_API_KEY", "rahasia-negara-123") # Gunakan env var +SERVICE_VERSION = "1.2.0-bert-sarcasm" # Version bump + +# --- GLOBAL VARIABLES --- +# Initialize SentimentIntensityAnalyzer safely (skip jika vader_lexicon tidak ada) +try: + sia = SentimentIntensityAnalyzer() + print("✅ VADER sentiment analyzer ready") +except Exception as e: + print(f"⚠️ VADER not available, using custom lexicon only: {e}") + sia = None + +STOPWORDS_ID_CHAT = set(stopwords.words('indonesian')) | set(stopwords.words('english')) +_CHAT_FILLERS = { + "sih", "dong", "kok", "kan", "tuh", "deh", "lah", "yah", "ni", "tu", + "ya", "yak", "yuk", "loh", "masa", "mana", "tapi", "kalo", "kalau", + "biar", "buat", "bikin", "bilang", "gak", "ga", "nggak", "enggak", + "kagak", "tak", "ndak", "udah", "sudah", "blm", "belum", "pas", + "lagi", "lg", "td", "tadi", "km", "kamu", "aku", "saya", "gw", "gue", + "lu", "lo", "elu", "kita", "kalian", "mereka", "dia", "ini", "itu", + "sini", "situ", "sana", "bgt", "banget", "aja", "saja", "cuma", + "doang", "terus", "trs", "jd", "jadi", "karna", "karena", "krn", + "bisa", "bs", "mau", "mo", "pengen", "ingin", "ada", "tiada", + "sama", "dgn", "dengan", "dr", "dari", "ke", "di", "pd", "pada", + "kapan", "dimana", "siapa", "mengapa", "kenapa", "gimana", "bagaimana", + "wkwk", "haha", "hehe", "huhu", "anjir", "njir", "anjing", + "apalah", "apa", "aduh", "wah", "nah", "kek", "kayak", "macam" +} +STOPWORDS_ID_CHAT.update(_CHAT_FILLERS) + +# ==== Integrasi TALA Stopwords tambahan ==== +try: + _TALA_PATH = os.path.join(os.path.dirname(__file__), 'tala-stopwords-indonesia.txt') + if os.path.exists(_TALA_PATH): + with open(_TALA_PATH, 'r', encoding='utf-8') as _tf: + tala_words = {w.strip().lower() for w in _tf if w.strip() and not w.startswith('#')} + # Hindari kata yang terlalu pendek (1 huruf) agar tidak over-filter + tala_words = {w for w in tala_words if len(w) > 1} + STOPWORDS_ID_CHAT.update(tala_words) + logger.info(f"Loaded TALA stopwords: +{len(tala_words)} terms (total={len(STOPWORDS_ID_CHAT)})") + else: + logger.warning('TALA stopwords file not found, skipping integration.') +except Exception as e: + logger.warning(f'Failed loading TALA stopwords: {e}') + +# Lexicon sederhana untuk Indonesia/Kupang dalam range standar [-1, +1] +ID_EXTRA 
= { + # Emosi negatif umum + "capek": -0.7, "capai": -0.5, "pusing": -0.7, "marah": -0.8, "sedih": -0.7, + "murung": -0.7, "galau": -0.6, "bingung": -0.5, "takut": -0.7, "cemas": -0.7, + "kecewa": -0.7, "kesal": -0.6, "jengkel": -0.6, "frustasi": -0.8, "frustrasi": -0.8, "depresi": -0.9, + "stres": -0.8, "tegang": -0.6, "resah": -0.7, "gelisah": -0.7, "sendirian": -0.5, + # Emosi positif umum + "senang": 0.7, "bahagia": 0.8, "semangat": 0.7, "hepi": 0.7, "gembira": 0.8, + "excited": 0.7, "antusias": 0.7, "optimis": 0.6, "tenang": 0.5, "damai": 0.6, + "puas": 0.6, "lega": 0.6, "syukur": 0.7, "bangga": 0.7, + # Masalah sekolah + "telat": -0.6, "bolos": -0.8, "berantem": -0.9, "ribut": -0.7, "gaduh": -0.6, + "berkelahi": -0.9, "bertengkar": -0.8, "keributan": -0.7, "masalah": -0.5, + "PR": -0.3, "tugas": -0.2, "banyak": -0.2, "malas": -0.5, "rajin": 0.5, + "skip": -0.6, "cabut": -0.6, "pontang": -0.7, "mangkir": -0.7, + # Keluarga & rumah + "berantem": -0.9, "cekcok": -0.8, "bertengkar": -0.8, "marahan": -0.7, + "berisik": -0.5, "berantakan": -0.4, "kacau": -0.7, "chaos": -0.7, + "pisah": -0.7, "bercerai": -0.8, "kabur": -0.7, "minggat": -0.8, "pergi": -0.3, + # Kupang/Manado dialect dengan sentiment + "sonde": -0.3, "tara": -0.2, "teda": -0.2, "pigi": -0.1, # Kupang negation/pergi + "kaco": -0.5, "cungkel": -0.5, "bongkar": -0.2, "kobo": -0.4, "susa": -0.6, + "dolo": -0.4, "molo": -0.4, "so": -0.3, "nda": -0.3, # Manado negation + "bodo": -0.6, "bodoh": -0.7, "tolol": -0.8, "goblok": -0.8, # Insults + # Neutral pronouns (score 0 won't affect sentiment) + "beta": 0.0, "ko": 0.0, "torang": 0.0, "katong": 0.0, "deng": 0.0, + "dong": 0.0, "de": 0.0, "so": 0.0, "pe": 0.0, "pung": 0.0, + "tanta": 0.0, "oma": 0.0, "opa": 0.0, "mama": 0.0, "papa": 0.0, +} +# tambahkan ke VADER (jika available) +if sia: + sia.lexicon.update({k.lower(): v for k, v in ID_EXTRA.items()}) + +app = Flask(__name__) + +API_KEY = os.environ.get("ML_API_KEY") # optional +FEEDBACK_FILE = os.environ.get("ML_FEEDBACK_FILE", os.path.join(os.path.dirname(__file__), "feedback_weights.json")) +LEXICON_DIR = os.environ.get("ML_LEXICON_DIR", os.path.join(os.path.dirname(__file__), "lexicons")) +ENABLE_BERT = os.environ.get("ML_ENABLE_BERT", "false").lower() in ("1","true","yes") +BERT_MODEL_NAME = os.environ.get("ML_BERT_MODEL", "indobenchmark/indobert-base-p1") +ENABLE_BERT_WARMUP = os.environ.get("ML_BERT_WARMUP", "false").lower() in ("1","true","yes") +SERVICE_VERSION = os.environ.get("ML_VERSION", "ml-rasaya:2025.11.0") + +def check_key(): + if API_KEY: + # accept both header casings/variants for compatibility + key = request.headers.get("X-API-KEY") or request.headers.get("X-API-Key") + if key != API_KEY: + return False + return True + +def detect_lang(txt, hint=None): + if hint: + return hint + try: + return detect(txt) if txt and txt.strip() else "id" + except Exception: + return "id" + +def label_from_score(compound: float) -> str: + if compound >= 0.05: return "positif" + if compound <= -0.05: return "negatif" + return "netral" + +# Legacy default map removed in favor of taxonomy-derived categories + +def load_feedback_weights(): + try: + with open(FEEDBACK_FILE, 'r', encoding='utf-8') as f: + return json.load(f) + except Exception: + return {} + +def save_feedback_weights(weights: dict): + try: + with open(FEEDBACK_FILE, 'w', encoding='utf-8') as f: + json.dump(weights, f, ensure_ascii=False, indent=2) + except Exception: + pass + +def score_categories_for_text(txt: str, categories_map: dict, feedback: dict): + """Scoring 
kategori berbasis token & n-gram. + - Tokenize + optional stemming (Sastrawi) untuk generalisasi. + - Match unigram/bigram/trigram secara exact (bukan substring bebas). + - Bobot dasar dibagi oleh banyaknya kategori yang memakai keyword (1/n_cats). + - Boost n-gram (bi=1.4x, tri=1.6x), downweight token sangat pendek (<=3: 0.5x). + - Tambahkan feedback weight jika ada, lalu normalisasi ke proporsi total. + """ + clean = clean_text(txt) + toks = _tokenize_and_stem(clean) + uni, bi, tri = _build_ngram_sets(toks) + + # Invert index: keyword -> categories + inv = defaultdict(list) + for cat, kws in categories_map.items(): + for kw in kws: + k = (kw or '').strip().lower() + if k: + inv[k].append(cat) + + scores = {cat: 0.0 for cat in categories_map.keys()} + reasons = defaultdict(list) + + for kw, cats in inv.items(): + parts = [p for p in kw.split() if p] + parts_stem = [_stem_id(p) for p in parts] + gram = len(parts_stem) + present = False + if gram == 1: + present = parts_stem[0] in uni + elif gram == 2: + present = (parts_stem[0] + ' ' + parts_stem[1]) in bi + else: + seq = ' '.join(parts_stem[:3]) + present = seq in tri if len(parts_stem) >= 3 else False + if not present: + continue + + base = 1.0 / max(1, len(cats)) + if gram == 1 and len(parts_stem[0]) <= 3: + base *= 0.5 + if gram == 2: + base *= 1.4 + elif gram >= 3: + base *= 1.6 + + for cat in cats: + adj = base + float(feedback.get(kw, {}).get(cat, 0.0)) + scores[cat] += adj + reasons[cat].append(kw) + + total = sum(scores.values()) + if total > 0: + for k in scores.keys(): + scores[k] = round(scores[k] / total, 4) + return scores, {k: sorted(set(v))[:5] for k, v in reasons.items()} + +""" +Cleaning & Lexicon Loader (InSet + optional Barasa) +""" +# Regex patterns +_RE_URL = re.compile(r"https?://\S+|www\.\S+") +_RE_MENTION = re.compile(r"[@#]\w+") +_RE_REPEAT = re.compile(r"(.)\1{2,}") # 3 kali atau lebih +_RE_MULTISPACE = re.compile(r"\s+") + +def clean_text(t: str) -> str: + """ + Cleaning text tapi mempertahankan emoji dan tanda baca penting untuk sentimen. + """ + if not t: return "" + + # 1. Demojize: Ubah emoji jadi teks bahasa Indonesia (manual mapping dikit) + t = emoji.demojize(t, delimiters=(" ", " ")) + t = t.replace("loudly_crying_face", "menangis") \ + .replace("crying_face", "sedih") \ + .replace("pensive_face", "murung") \ + .replace("angry_face", "marah") \ + .replace("rolling_on_the_floor_laughing", "tertawa") \ + .replace("face_with_rolling_eyes", "bosan") \ + .replace("broken_heart", "patah hati") + + t = t.lower().strip() + + # 2. Remove URL & Mention + t = _RE_URL.sub(" ", t) + t = _RE_MENTION.sub(" ", t) + + # 3. Keep punctuation important for emotion (?!.,) + # Hapus karakter aneh selain alphanumeric dan tanda baca penting + t = re.sub(r"[^a-z0-9\?\!\.\,\s]", " ", t) + + # Pisahkan tanda baca biar jadi token terpisah + t = re.sub(r"([\?\!\.\,])", r" \1 ", t) + + # 4. Normalize Repeat (bangeeet -> banget) + t = _RE_REPEAT.sub(r"\1", t) + + # 5. 
Slang & Dialect Normalization (Indonesian + Kupang + Manado + Ambon) + dialect = { + # Standard Indonesian slang + "gw": "saya", "gue": "saya", "lu": "kamu", "lo": "kamu", "elu": "kamu", + "ak": "aku", "aq": "aku", "sy": "saya", "w": "saya", "ane": "saya", + "gak": "tidak", "ga": "tidak", "nggak": "tidak", "kaga": "tidak", "ndak": "tidak", + "enggak": "tidak", "engga": "tidak", "ngga": "tidak", "kagak": "tidak", + "krn": "karena", "karna": "karena", "bgt": "banget", "bgtt": "banget", + "tdk": "tidak", "jgn": "jangan", "udh": "sudah", "sdh": "sudah", + "blm": "belum", "trus": "terus", "jd": "jadi", "dgn": "dengan", + "sm": "sama", "yg": "yang", "kalo": "kalau", "kl": "kalau", + "mager": "malas gerak", "baper": "bawa perasaan", "gabut": "bosan", + "anjir": "kaget", "njir": "kaget", "anjay": "hebat", + "mantul": "mantap", "santuy": "santai", "sans": "santai", + "gajelas": "tidak jelas", "gaje": "tidak jelas", + # Kupang/NTT dialect + # --- KATA GANTI ORANG (PRONOUNS) --- + "beta": "saya", "b": "saya", "bt": "saya", # Kupang/Ambon + "kita": "saya", # Manado (konteks santai) + "ana": "saya", "awak": "saya", "sa": "saya", "sy": "saya", + "ak": "aku", "aq": "aku", "gw": "saya", "gue": "saya", + + "lu": "kamu", "lo": "kamu", "elu": "kamu", + "ose": "kamu", "os": "kamu", "ale": "kamu", # Ambon + "ngana": "kamu", "nga": "kamu", # Manado + "ko": "kamu", "kau": "kamu", "ju": "kamu", # Kupang/Papua + "bo": "kamu", # Bima/Dompu kadang masuk + + "dia": "dia", "de": "dia", "i": "dia", # Papua/Kupang (De pung rumah) + "antua": "beliau", # Ambon (respektif) + + "katong": "kita", "ketong": "kita", "ktg": "kita", # Kupang/Ambon + "torang": "kita", "tong": "kita", # Manado/Papua + + "dorang": "mereka", "dong": "mereka", "drg": "mereka", # Manado/Kupang/Ambon + "besong": "kalian", "basong": "kalian", "kamorang": "kalian", # Kupang/Papua + "ngoni": "kalian", # Manado + + # --- NEGASI (TIDAK/BUKAN) --- + "sonde": "tidak", "son": "tidak", "snd": "tidak", "sond": "tidak", # Kupang + "seng": "tidak", "sing": "tidak", "tra": "tidak", "trada": "tidak", # Ambon/Papua + "tara": "tidak", "tar": "tidak", + "nyanda": "tidak", "nda": "tidak", "ndak": "tidak", # Manado/Jawa + "gak": "tidak", "ga": "tidak", "nggak": "tidak", "kaga": "tidak", + "bukang": "bukan", + + # --- KATA KERJA & KETERANGAN (VERBS & ADVERBS) --- + "pi": "pergi", "p": "pergi", "pig": "pergi", # Kupang/Ambon (saya kabur 'pi'...) 
+ "su": "sudah", "so": "sudah", # Kupang/Manado/Ambon + "sdh": "sudah", "udh": "sudah", "udah": "sudah", + "blm": "belum", "balom": "belum", + + "mo": "mau", "mau": "mau", + "kasi": "beri", "kase": "beri", "kas": "beri", # Kase tinggal -> Beri tinggal + "omong": "bicara", "baomong": "bicara", "bakata": "berkata", + "dapa": "dapat", "dap": "dapat", + "baku": "saling", # Baku pukul -> Saling pukul + "bae": "baik", "baek": "baik", + "ancor": "hancur", + "ambe": "ambil", "pigi": "pergi", + + # --- KEPEMILIKAN & PENGHUBUNG --- + "pung": "punya", "puny": "punya", "pu": "punya", "pe": "punya", # Beta pung -> Saya punya + "deng": "dengan", "dg": "dengan", "dng": "dengan", + "par": "untuk", "for": "untuk", # Ambon/Manado (For ngana) + "vor": "untuk", + "kek": "seperti", "mcam": "macam", "kek": "kayak", + + # --- KATA SIFAT & LAINNYA --- + "talalu": "terlalu", "tlalu": "terlalu", + "sadiki": "sedikit", "sadikit": "sedikit", + "banya": "banyak", + "skali": "sekali", + "samua": "semua", + "karna": "karena", "krn": "karena", "gara": "karena", + + # --- GENERAL SLANG INDONESIA --- + "bgt": "banget", "bgtt": "banget", + "trus": "terus", "trs": "terus", + "jd": "jadi", "jdi": "jadi", + "yg": "yang", "kalo": "kalau", "kl": "kalau", + "mager": "malas gerak", "baper": "bawa perasaan", "gabut": "bosan", + "anjir": "kaget", "njir": "kaget", "anjay": "hebat", + "mantul": "mantap", "santuy": "santai", "sans": "santai", + "gajelas": "tidak jelas", "gaje": "tidak jelas", + "ortu": "orang tua", "mksd": "maksud", + "knp": "kenapa", "np": "kenapa", "napa": "kenapa", + "utk": "untuk" + } + + toks = [] + for tk in t.split(): + toks.append(dialect.get(tk, tk)) + + t = " ".join(toks) + t = _RE_MULTISPACE.sub(" ", t).strip() + return t + +# Tokenization + optional stemming helpers + +def _tokenize_and_stem(t: str) -> list[str]: + toks = [w for w in t.split() if w] + if _sastrawi_stemmer is None: + return toks + return [_stem_id(w) for w in toks] + + +def _build_ngram_sets(tokens: list[str]) -> tuple[set[str], set[str], set[str]]: + uni = set(tokens) + bi = set([tokens[i] + " " + tokens[i+1] for i in range(len(tokens)-1)]) if len(tokens) >= 2 else set() + tri = set([tokens[i] + " " + tokens[i+1] + " " + tokens[i+2] for i in range(len(tokens)-2)]) if len(tokens) >= 3 else set() + return uni, bi, tri + +def detect_sarcasm_heuristic(text_clean, raw_text, current_sentiment): + """ + Mendeteksi potensi sarkasme berdasarkan kontras sentimen, emoji, dan tanda baca. + Returns: (is_sarcasm: bool, confidence: float) + """ + is_sarcasm = False + confidence = 0.0 + text_clean = text_clean.lower() + + # Kamus Heuristik + intensifiers = ["banget", "bgt", "kali", "sumpah", "bener", "bet", "parah", "amat"] + positives = ["hebat", "bagus", "pinter", "jenius", "mantap", "enak", "keren", "rajin", "suci"] + negatives = ["pusing", "capek", "stres", "gila", "mati", "rusak", "hancur", "sebel", "benci", "malas", "bodoh", "tolol"] + + # Fitur + has_pos = any(p in text_clean for p in positives) + has_neg = any(n in text_clean for n in negatives) + has_intensifier = any(i in text_clean for i in intensifiers) + has_exclamation = "!" in raw_text or "?" in raw_text + + # LOGIC 1: Kalimat mengandung Positif DAN Negatif ("Hebat banget lo bikin gue stres") + if has_pos and has_neg: + return True, 0.75 + + # LOGIC 2: Kalimat Positif + Tanda baca agresif + Konteks ambigu ("Pinter ya lo??") + # Biasanya kalau muji beneran jarang pake '??' + if has_pos and ("??" in raw_text or "!!" 
in raw_text): + return True, 0.6 + + # LOGIC 3: Positif + Emoji Negatif (Manual check raw text for common sarcastic emojis) + # Emoji: Rolling eyes, Unamused face, Upside-down face + sarcastic_emojis = ["🙄", "😒", "🙃", "😤", "🤡"] + if has_pos and any(e in raw_text for e in sarcastic_emojis): + return True, 0.9 + + return False, 0.0 + +def load_inset_lexicon(base_dir: str) -> dict[str, float]: + """Load InSet format: lexicons/inset/{positive.tsv,negative.tsv}.""" + out: dict[str, float] = {} + inset_dir = os.path.join(base_dir, "inset") + pos = os.path.join(inset_dir, "positive.tsv") + neg = os.path.join(inset_dir, "negative.tsv") + if os.path.exists(pos): + with open(pos, "r", encoding="utf-8") as f: + for line in f: + w = line.strip().split("\t")[0] + if w: + out[w.lower()] = 1.0 + if os.path.exists(neg): + with open(neg, "r", encoding="utf-8") as f: + for line in f: + w = line.strip().split("\t")[0] + if w: + out[w.lower()] = -1.0 + return out + + +def load_barasa_csv(path: str) -> dict[str, float]: + """Load Barasa CSV with headers; expects at least a 'lemma' column and + either a 'score' column (float, negative to positive) or separate + 'pos'/'neg' columns that can be combined (score = pos - neg). + Values are clamped to [-1, 1]. + """ + lex: dict[str, float] = {} + try: + import csv + with open(path, encoding="utf-8") as f: + r = csv.DictReader(f) + for row in r: + lemma = (row.get("lemma") or row.get("word") or row.get("token") or "").strip().lower() + if not lemma: + continue + score_val = None + # Prefer unified score + if row.get("score") not in (None, ""): + try: + score_val = float(row.get("score")) + except Exception: + score_val = None + # Else try pos/neg columns + if score_val is None: + try: + pos = float(row.get("pos") or row.get("positive") or 0) + neg = float(row.get("neg") or row.get("negative") or 0) + score_val = pos - neg + except Exception: + score_val = 0.0 + score_val = max(-1.0, min(1.0, float(score_val))) + lex[lemma] = score_val + except Exception: + pass + return lex + + +def load_barasa_optional(base_dir: str) -> dict[str, float]: + """ + Try to read Barasa resources if available. The provided file wn-msa-all.tab + is a WordNet-style tab file (no explicit polarity). We don't assign scores + from it directly; instead we just return empty dict so it doesn't affect + sentiment unless in the future we add mapping rules. + If you later provide barasa.csv (word,score), we can extend this loader. + """ + barasa_dir = os.path.join(base_dir, "barasa") + wn_file = os.path.join(barasa_dir, "wn-msa-all.tab") + # Placeholder: no direct sentiment; return empty for now. 
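For reference, here is a minimal, self-contained sketch of the CSV shape that load_barasa_csv() above accepts: a `lemma` column plus either a unified `score` column or separate `pos`/`neg` columns combined as `pos - neg` and clamped to [-1, 1]. The lemmas and weights below are made-up examples, not real Barasa data.

```python
# Sketch of the barasa CSV format parsed by load_barasa_csv(); example values only.
import csv
import io

sample = io.StringIO()
writer = csv.DictWriter(sample, fieldnames=["lemma", "pos", "neg"])
writer.writeheader()
writer.writerow({"lemma": "senang", "pos": "0.8", "neg": "0.0"})
writer.writerow({"lemma": "kecewa", "pos": "0.0", "neg": "0.7"})
sample.seek(0)

lex = {}
for row in csv.DictReader(sample):
    score = float(row["pos"]) - float(row["neg"])      # combine pos/neg columns
    lex[row["lemma"]] = max(-1.0, min(1.0, score))      # clamp to [-1, 1]

print(lex)  # {'senang': 0.8, 'kecewa': -0.7}
```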
+ # Future: map synonyms of existing sentiment words and inherit score * 0.8 + if os.path.exists(wn_file): + return {} + # also support barasa.csv if added by user + csv_file = os.path.join(base_dir, "barasa.csv") + if os.path.exists(csv_file): + out: dict[str, float] = {} + with open(csv_file, "r", encoding="utf-8") as f: + for line in f: + if "," in line: + w, sc = line.strip().split(",", 1) + try: + out[w.lower()] = max(-1.0, min(1.0, float(sc))) + except Exception: + continue + return out + return {} + + +def build_lexicon() -> dict[str, float]: + # Start from InSet if available + lex = load_inset_lexicon(LEXICON_DIR) + # Merge Barasa if CSV provided; else try optional WordNet source (no polarity) + barasa_csv = os.path.join(LEXICON_DIR, "barasa", "barasa_lexicon.csv") + if os.path.exists(barasa_csv): + lex.update(load_barasa_csv(barasa_csv)) + else: + bar = load_barasa_optional(LEXICON_DIR) + lex.update(bar) + # Add custom Kupang/ID extra (sudah dalam range [-1, +1]) + for k, v in ID_EXTRA.items(): + lex[k.lower()] = max(-1.0, min(1.0, float(v))) + return lex + + +LEXICON_ID = build_lexicon() + + +def score_with_lexicon(text: str, lex: Dict[str, float]) -> float: + toks = clean_text(text).split() + if not toks: + return 0.0 + + # Context-aware scoring: handle negation (pre & post), intensifiers + negation_words = {"tidak", "bukan", "belum", "jangan", "tanpa", "sonde", "tara", "teda", "nda", "tra"} + intensifiers = {"banget", "sangat", "amat", "sekali", "parah", "bener", "pisan"} + + s = 0.0 + neg_window = 0 # number of next tokens to negate + intensify = 1.0 + # track last scored token to handle patterns like "paham ... belum" + last_score_val = 0.0 + last_score_idx = -10 + + for i, tok in enumerate(toks): + # Negation token: start negation window and optionally flip previous positive nearby + if tok in negation_words: + # If a positive word occurred recently (within 2 tokens), flip it retroactively + if last_score_val > 0 and (i - last_score_idx) <= 2: + # subtract a bit more than added to reflect negation of previous positive + s -= last_score_val * 1.2 + last_score_val = 0.0 + neg_window = 3 + continue + + # Intensifier affects next scored word only + if tok in intensifiers: + intensify = 1.5 + continue + + # Base lexical score + score = lex.get(tok, 0.0) + + # Apply active negation window + if neg_window > 0 and score != 0.0: + score = -score * 0.8 + neg_window -= 1 + elif neg_window > 0: + # consume window even if current token has no score + neg_window -= 1 + + # Apply intensifier + if intensify > 1.0 and score != 0.0: + score = score * intensify + intensify = 1.0 + + s += score + + if score != 0.0: + last_score_val = score + last_score_idx = i + + # Dampen by sqrt length to avoid bias for long texts + normalized = s / max(1.0, math.sqrt(len(toks))) + return max(-1.0, min(1.0, normalized)) + +INTENSIFIERS = {"banget": 1.0, "sangat": 0.8, "parah": 0.9, "amat": 0.5} + +def negative_gate(aggregate: float, raw_txt: str) -> tuple[bool, float]: + # severity from magnitude + intensifiers + punctuation and repeats + clean = clean_text(raw_txt) + toks = clean.split() + intens = sum(INTENSIFIERS.get(t, 0.0) for t in toks) + exclam = min(raw_txt.count("!"), 3) * 0.1 + repeat = 0.1 if _RE_REPEAT.search(raw_txt) else 0.0 + sev = max(0.0, min(1.0, (-aggregate) * 0.7 + intens * 0.2 + exclam + repeat)) + return (aggregate <= -0.05), round(sev, 3) + +# ===================== +# Taxonomy (topics/subtopics) for semi-supervised labeling +# ===================== +TAXONOMY_PATH = 
os.path.join(os.path.dirname(__file__), "taxonomy.json") +try: + with open(TAXONOMY_PATH, "r", encoding="utf-8") as _f: + _TAX = json.load(_f) +except Exception: + _TAX = {"topics": []} + +def _taxonomy_keywords(): + buckets = {} + subtopics = {} + for tp in _TAX.get("topics", []): + bucket = tp.get("bucket") or "" + topic_id = tp.get("id") or bucket or "TOPIC" + topic_name = tp.get("name") or topic_id + buckets.setdefault(bucket, set()).update([str(w).lower() for w in tp.get("keywords", []) if w]) + for st in tp.get("subtopics", []) or []: + # Maintain internal id (taxonomy id) and external 'code' matching kategori_masalahs.kode + st_id = st.get("id") or st.get("code") or st.get("name") + st_code = st.get("code") or st_id + if not st_id: + continue + subtopics[st_id] = { + "name": st.get("name") or st_id, + "bucket": bucket, + "topic_id": topic_id, + "topic_name": topic_name, + "code": st_code, + "keywords": set([str(w).lower() for w in st.get("keywords", []) if w]), + "examples": st.get("examples", []) or [] + } + return buckets, subtopics + +BUCKET_KW, SUBTOPICS = _taxonomy_keywords() + +def build_topic_index_and_categories_map(): + """HYBRID APPROACH (OLD METHOD + NEW DATA): + Builds multi-level keyword matching dengan data dari database. + + Returns: (topic_index, categories_map, bucket_map) + - topic_index: metadata per kategori kecil {UPPER(name): {id, name, bucket, kode}} + - categories_map: keywords per kategori kecil {UPPER(name): [keywords]} + - bucket_map: keywords per kategori besar {UPPER(bucket): [aggregated keywords]} + + WHY THIS IS BETTER: + - Multi-level matching: Check keywords di kategori kecil DAN kategori besar + - Redundancy: Jika miss di kategori kecil, bisa match di bucket agregat + - Better coverage: Keywords dari semua kategori kecil teragregasi ke bucket + """ + topic_index = {} + categories_map = {} + bucket_map = defaultdict(set) # Agregasi keywords per bucket + + # Process topics (kategori kecil) dari database + for tp in _TAX.get("topics", []): + topic_id = tp.get("id") or tp.get("code") or "TOPIC" + topic_name = tp.get("name") or topic_id + bucket = tp.get("bucket") or "" + key = str(topic_name).upper() + + # Collect keywords from topic level (kategori kecil) + kw = set([str(w).lower().strip() for w in (tp.get("keywords") or []) if w]) + + # Legacy support: subtopics (backward compatibility) + for st in tp.get("subtopics", []) or []: + for w in st.get("keywords", []) or []: + if w: + kw.add(str(w).lower().strip()) + + # Store kategori kecil metadata & keywords + topic_index[key] = { + "id": topic_id, + "name": topic_name, + "bucket": bucket, + "kode": topic_id # Match dengan kategori_masalahs.kode + } + categories_map[key] = sorted(list(kw)) + + # AGGREGATE keywords ke bucket (kategori besar) + # Ini yang bikin metode lama lebih akurat! + if bucket: + bucket_map[bucket.upper()].update(kw) + + # Convert bucket_map sets to sorted lists + bucket_keywords = {k: sorted(list(v)) for k, v in bucket_map.items()} + + return topic_index, categories_map, bucket_keywords + +def extract_keyphrases(texts, lang="id"): + # RAKE pakai stopwords bhs Inggris default; untuk id sederhana kita kasih stopwords id juga + sw = set(stopwords.words('indonesian')) | set(stopwords.words('english')) + r = Rake(stopwords=sw) + joined = " . 
".join(texts) + r.extract_keywords_from_text(joined) + ranked = r.get_ranked_phrases_with_scores() + out = [] + for score, phrase in ranked[:20]: + out.append({"term": phrase, "weight": float(score)}) + return out + +def extract_core_tokens(texts): + """Ambil token inti dengan pembersihan: + - lower & clean_text + - buang stopwords (ID + EN) & filler umum + - buang token panjang < 3 + - hitung frekuensi, ambil top 10 + """ + freq = Counter() + try: + sw_id = set(stopwords.words('indonesian')) + except Exception: + sw_id = set() + try: + sw_en = set(stopwords.words('english')) + except Exception: + sw_en = set() + filler = { + 'dan','atau','yang','di','ke','dengan','pada','untuk','dari','lagi','sih','deh','lah','ya','kok','kan','udah','aja','pun','itu','ini','jadi','kalau','kalo','bahwa','sementara','sering','kayak','kayakny','nih','tuh','dong','de','si','mungkin','masih','bisa','harus','karena','seperti','kaya','gitu','buat' + } + for t in texts: + for tok in clean_text(t).split(): + if len(tok) < 3: continue + if tok in sw_id or tok in sw_en or tok in filler: continue + freq[tok] += 1 + return [w for w,_ in freq.most_common(10)] + +def _build_cluster_vectorizer(): + """Vectorizer for clustering top-terms: single-word tokens, heavy stopwords cleanup.""" + try: + sw_id = set(stopwords.words('indonesian')) + except Exception: + sw_id = set() + try: + sw_en = set(stopwords.words('english')) + except Exception: + sw_en = set() + extra = { + # connectors/intensifiers/pronouns/common fillers + 'dan','atau','yang','di','ke','dengan','pada','untuk','dari','lagi','banget','sekali','paling','sih','deh','dong','lah','ya', + 'aku','saya','gue','gua','dia','kamu','kau','ko','kami','kita','mereka', + 'punya','dengar','dng','sm','nih','tuh','kok','kan','udah','lagi','aja','de','si', + } + stopset = sw_id | sw_en | extra + # Use our cleaner as preprocessor; single-word tokens only + vec = TfidfVectorizer( + preprocessor=clean_text, + tokenizer=str.split, + token_pattern=None, + lowercase=True, + stop_words=list(stopset), + ngram_range=(1,1), + max_df=0.95, + min_df=1, + max_features=1000, + ) + return vec + +@app.get("/health") +def health(): + return jsonify({"status": "ok", "version": SERVICE_VERSION, "bert": ENABLE_BERT}) + +# ===================== +# IndoBERT caching & optional warmup +# ===================== +BERT_CACHE = {"tok": None, "mdl": None, "device": "cpu"} + +# --- GLOBAL BERT VARIABLES --- +_bert_tokenizer = None +_bert_model = None +_bert_device = None + +def get_bert(): + global _bert_tokenizer, _bert_model, _bert_device + if _bert_tokenizer is None: + print("⏳ Loading IndoBERT model... 
(First run might take a while)") +        try: +            model_name = BERT_MODEL_NAME  # honor ML_BERT_MODEL instead of hardcoding the checkpoint +            _bert_tokenizer = AutoTokenizer.from_pretrained(model_name) +            _bert_model = AutoModel.from_pretrained(model_name) +            _bert_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +            _bert_model.to(_bert_device) +            _bert_model.eval() +            print(f"✅ IndoBERT loaded on {_bert_device}") +        except Exception as e: +            print(f"❌ Failed to load IndoBERT: {e}") +            return None, None, None +    return _bert_tokenizer, _bert_model, _bert_device + +# Warmup at startup if requested (download/load once) +if ENABLE_BERT and ENABLE_BERT_WARMUP: +    tok, mdl, dev = get_bert() +    try: +        if tok is not None and mdl is not None: +            import torch  # type: ignore +            with torch.no_grad(): +                enc = tok(["warmup"], padding=True, truncation=True, max_length=16, return_tensors="pt") +                _ = mdl(**enc.to(dev)) +    except Exception: +        pass + +@app.get("/warmup") +def warmup(): +    """Optionally trigger BERT load and a tiny forward pass to avoid first-request latency.""" +    if not ENABLE_BERT: +        return jsonify({"bert": "disabled"}) +    tok, mdl, dev = get_bert() +    if tok is None or mdl is None: +        return jsonify({"bert": "unavailable"}), 500 +    try: +        import torch  # type: ignore +        with torch.no_grad(): +            enc = tok(["warmup"], padding=True, truncation=True, max_length=16, return_tensors="pt") +            _ = mdl(**enc.to(dev)) +        return jsonify({"bert": "ready", "device": str(dev)}) +    except Exception as e: +        return jsonify({"bert": "error", "message": str(e)}), 500 + +@app.post("/analyze") +def analyze(): +    if not check_key(): +        return jsonify({"error": "unauthorized"}), 401 + +    data = request.get_json(force=True) or {} +    items = data.get("items") + +    if items is None: +        items = [{ +            "id": data.get("id") or "item-1", +            "text": data.get("text") or "", +            "lang_hint": (data.get("context") or {}).get("lang_hint") if isinstance(data.get("context"), dict) else None +        }] + +    if not isinstance(items, list) or not items: +        return jsonify({"error": "items required"}), 422 + +    # Setup Taxonomy & Feedback (HYBRID APPROACH) +    categories_override = data.get("categories") +    TOPIC_INDEX, TAXONOMY_CATEGORIES, BUCKET_KEYWORDS = build_topic_index_and_categories_map() + +    categories_map = {} +    bucket_map = {} + +    if isinstance(categories_override, dict) and categories_override: +        for k, v in categories_override.items(): +            if isinstance(v, list): +                categories_map[str(k).upper()] = [str(x) for x in v if isinstance(x, (str, int))] + +    if not categories_map: +        categories_map = TAXONOMY_CATEGORIES +        bucket_map = BUCKET_KEYWORDS + +    feedback = load_feedback_weights() + +    # Setup Variables +    results = [] +    per_legacy = [] +    all_texts = [] +    negatives = [] +    per_entry_cats = {} + +    # Load IndoBERT Model (only if enabled) +    tok, mdl, dev = get_bert() if ENABLE_BERT else (None, None, None) + +    # --- PROCESS PER ITEM --- +    for it in items: +        item_id = it.get("id") +        raw_txt = (it.get("text") or "").strip() +        lang_hint = it.get("lang_hint") + +        # 1. Text Cleaning (New Logic) +        clean = clean_text(raw_txt) +        if not clean: +            continue + +        # 2. 
Sentiment Scoring (Hybrid) + s_lex = score_with_lexicon(clean, LEXICON_ID) + s_vad = sia.polarity_scores(raw_txt).get("compound", 0.0) if sia else 0.0 + aggregate = float(0.7 * s_lex + 0.3 * s_vad) if sia else s_lex + + # Fallback: keyword-based detection if aggregate is neutral (0) + if abs(aggregate) < 0.05: + negative_keywords = [ + "berkelahi", "bertengkar", "murung", "sedih", "marah", "kabur", + "masalah", "ribut", "berantem", "stress", "stres", "pusing", + "takut", "cemas", "galau", "kecewa", "frustrasi", "frustasi", + "jelek", "drop", "sendiri", "sendirian", "tidak paham" + ] + positive_keywords = ["senang", "bahagia", "gembira", "semangat", "excited", "bagus", "oke", "mantap", "suka", "hebat"] + + neg_count = sum(1 for kw in negative_keywords if kw in clean) + pos_count = sum(1 for kw in positive_keywords if kw in clean) + + if neg_count > pos_count and neg_count > 0: + aggregate = -0.35 # Set mild negative + elif pos_count > neg_count and pos_count > 0: + aggregate = 0.3 # Set mild positive + + # 3. Sarcasm Detection (New Logic) + is_sarcasm, sarc_conf = detect_sarcasm_heuristic(clean, raw_txt, aggregate) + + if is_sarcasm: + # Flip score: Positive -> Negative + if aggregate > 0: + aggregate = -0.5 * aggregate - 0.3 + elif aggregate == 0: + aggregate = -0.4 + lbl = "negatif" + else: + lbl = label_from_score(aggregate) + + # 4. Negative Gate & Severity + # Check severity based on flipped score + neg_flag, severity = negative_gate(aggregate, raw_txt) + if is_sarcasm: + neg_flag = True + severity = max(severity, 0.6) # Sarkasme biasanya sakit + + # 5. Category Scoring (ONLY FOR NEGATIVE CONTENT) + # Skip kategorisasi jika semua input positif (aggregate > 0 dan tidak ada sarkasme) + cat_scores = {} + reasons = {} + bucket_scores = defaultdict(float) + best_cat = None + best_bucket = None + cluster = None + + if neg_flag or aggregate <= 0: + # HYBRID: Kategori Kecil + Bucket Agregat (ONLY FOR NEGATIVE) + cat_scores, reasons = score_categories_for_text(clean, categories_map, feedback) + + # BOOST: Aggregate bucket scores dari kategori kecil + for cat, score in cat_scores.items(): + tp_meta = TOPIC_INDEX.get(str(cat).upper()) + if tp_meta and tp_meta.get("bucket"): + bucket_scores[tp_meta["bucket"]] += score * 0.8 # Slightly dampen aggregated + + # Also score directly against bucket keywords (OLD METHOD) + if bucket_map: + bucket_direct, _ = score_categories_for_text(clean, bucket_map, feedback) + for bucket, score in bucket_direct.items(): + bucket_scores[bucket] += score * 1.2 # Boost direct matches + + # Find best kategori kecil + best_cat = max(cat_scores, key=cat_scores.get) if cat_scores else None + best_bucket = max(bucket_scores, key=bucket_scores.get) if bucket_scores else None + + # Apply minimum confidence thresholds to reduce false positives + if best_cat and cat_scores.get(best_cat, 0.0) < 0.22: + best_cat = None + if best_bucket and bucket_scores.get(best_bucket, 0.0) < 0.25: + best_bucket = None + + # 6. 
Cluster Labeling (Prioritize Kategori Kecil, fallback to Bucket) + if best_cat: + tp_meta = TOPIC_INDEX.get(str(best_cat).upper()) + if tp_meta: + cluster = { + "id": tp_meta.get("kode"), # Match dengan kategori_masalahs.kode + "label": tp_meta.get("name"), + "bucket": tp_meta.get("bucket"), + "topic_id": tp_meta.get("kode"), + "topic_name": tp_meta.get("name"), + "confidence": round(cat_scores[best_cat], 3) + } + elif best_bucket: + # Fallback: Use bucket if no specific kategori kecil matched + cluster = { + "id": best_bucket, + "label": best_bucket, + "bucket": best_bucket, + "topic_id": None, + "topic_name": None, + "confidence": round(bucket_scores[best_bucket], 3) + } + # Else: Skip kategorisasi untuk input positif + + # 7. Keywords Extraction + try: + rk = Rake(stopwords=STOPWORDS_ID_CHAT, min_length=1, max_length=3) + rk.extract_keywords_from_text(clean) # Use clean text + raw_phrases = [p.lower() for p in rk.get_ranked_phrases()[:8]] + except Exception: + raw_phrases = [] + + # Filter phrases + phrases = sorted(list(set(raw_phrases)), key=len)[:5] + + # 8. Summary Text + if is_sarcasm: + summary_text = f"Terdeteksi sarkasme/sindiran. Inti keluhan: {', '.join(phrases[:3])}." + elif neg_flag and cluster: + summary_text = f"Masalah utama: {cluster['label']}. Gejala: {', '.join(phrases[:3])}." + elif neg_flag: + summary_text = f"Inti keluhan: {', '.join(phrases[:3])}." + else: + # Positive input - no categorization needed + summary_text = f"Ekspresi positif. Kata kunci: {', '.join(phrases[:3]) if phrases else 'tidak ada keluhan'}." + + results.append({ + "id": item_id, + "clean_text": clean, + "sentiment": { + "barasa": s_lex, "english": s_vad, "aggregate": aggregate, "label": lbl + }, + "negative_flag": neg_flag, + "is_sarcasm": is_sarcasm, # Field Baru + "severity": severity, + "cluster": cluster, + "summary": summary_text, + "key_phrases": phrases, + "recommendations": [], + "cat_scores": cat_scores, + "cat_reasons": reasons, + }) + + per_legacy.append({ + "id": item_id, "text": raw_txt, "sentiment": aggregate, + "label": lbl, "keywords": phrases + }) + + all_texts.append(clean) + + # Collect negatives for clustering + if neg_flag: + negatives.append(clean) + ranked = sorted([(c, s) for c, s in cat_scores.items() if s > 0], key=lambda x: x[1], reverse=True) + per_entry_cats[item_id] = { + "ranked": ranked[:3], + "reasons": {c: reasons.get(c, []) for c, _ in ranked[:3]} + } + + # --- AGGREGATION & CLUSTERING --- + + # Global Keywords + keyphrases = extract_keyphrases(all_texts) if all_texts else [] + + # Clustering with IndoBERT + clusters = [] + if len(negatives) >= 2: + used_engine = "tfidf" + X = None + + # Try BERT + if tok and mdl: + try: + with torch.no_grad(): + enc = tok(negatives, padding=True, truncation=True, max_length=128, return_tensors="pt").to(dev) + out = mdl(**enc) + cls = out.last_hidden_state[:, 0, :] + X = cls.detach().cpu().numpy() + used_engine = "bert" + except Exception as e: + print(f"⚠️ BERT error, falling back: {e}") + X = None + + # Fallback TF-IDF + if X is None: + vec = _build_cluster_vectorizer() # Pastikan fungsi ini ada (helper lama) + X = vec.fit_transform(negatives) + + k = 2 if len(negatives) == 2 else min(4, max(2, len(negatives)//2)) + km = KMeans(n_clusters=k, n_init='auto', random_state=42) + y = km.fit_predict(X) + + for ci in range(k): + ex = [negatives[i] for i in range(len(negatives)) if y[i] == ci][:5] + clusters.append({ + "cluster": int(ci), + "engine": used_engine, + "examples": ex + }) + + # Overview Weighted by Severity & Sarcasm 
(KATEGORI KECIL - NEGATIVE ONLY) + cat_counter = Counter() + for r in results: + # ONLY count negative items for categorization + if not r.get("negative_flag"): + continue + + sev = r.get("severity", 0.0) + weight = 1.0 + sev + + # Aggregate by kategori kecil (topic) + cluster = r.get("cluster") or {} + topic_name = cluster.get("topic_name") or cluster.get("label") + if topic_name: + # Use cluster confidence as base score + score = cluster.get("confidence", 0.5) + cat_counter[topic_name] += score * weight + + categories_overview = [ + {"category": cat, "score": round(val, 4)} for cat, val in cat_counter.most_common() + ] + + # Summary Stats + avg = sum([x["sentiment"] for x in per_legacy]) / len(per_legacy) if per_legacy else 0.0 + summary = { + "avg_sentiment": round(avg, 3), + "negative_ratio": round(sum(1 for x in per_legacy if x["label"]=="negatif")/len(per_legacy), 3) if per_legacy else 0.0 + } + + # NEW: Recommendations Generation PER KATEGORI KECIL (Granular) + # Laravel akan filter lebih lanjut berdasarkan master_rekomendasis.rules + def recommend_by_topic(topic_id: str, topic_name: str, bucket: str, severity_val: float, negative: bool, sarcasm: bool): + """Generate recommendations based on kategori kecil (topic). + Returns structured data yang bisa di-match dengan master_rekomendasis di Laravel. + + Format return: + { + "kategori_kode": topic_id, # Match dengan kategori_masalahs.kode + "kategori_nama": topic_name, + "bucket": bucket, + "severity": severity_val, + "negative": negative, + "sarcasm": sarcasm, + "suggested_actions": [...] # Heuristic suggestions (optional) + } + """ + rec = { + "kategori_kode": topic_id, + "kategori_nama": topic_name, + "bucket": bucket, + "severity": severity_val, + "negative": negative, + "sarcasm": sarcasm, + "suggested_actions": [] + } + + # Heuristic suggestions (Laravel akan filter sesuai master_rekomendasis) + if (negative or sarcasm) and severity_val >= 0.6: + rec["suggested_actions"].append({ + "type": "URGENT", + "reason": "Severity tinggi atau terdeteksi sarkasme" + }) + elif negative and severity_val >= 0.4: + rec["suggested_actions"].append({ + "type": "MODERATE", + "reason": "Indikasi masalah perlu perhatian" + }) + + return rec + + # Assign Recs per item (GRANULAR: Per Kategori Kecil) + for r in results: + cluster = r.get("cluster") or {} + topic_id = cluster.get("topic_id") or cluster.get("id") + topic_name = cluster.get("topic_name") or cluster.get("label") + bucket = cluster.get("bucket", "") + + if topic_id: + # Return kategori kecil info untuk Laravel matching + r["recommendations"] = [recommend_by_topic( + topic_id, + topic_name, + bucket, + r.get("severity", 0), + r.get("negative_flag", False), + r.get("is_sarcasm", False) + )] + else: + # Fallback: No specific kategori detected + r["recommendations"] = [] + + # Global Recs (PER KATEGORI KECIL - Granular) + abs_sent = abs(avg) + global_recommendations = [] + valid_cats = [c for c in categories_overview if c["score"] >= 0.05] + is_neg_avg = avg < -0.05 + + for cat in valid_cats: + cname = cat["category"] + meta = TOPIC_INDEX.get(cname.upper()) or {} + topic_id = meta.get("kode") or meta.get("id") + topic_name = meta.get("name", cname) + bucket = meta.get("bucket", "") + + if topic_id: + rec_data = recommend_by_topic( + topic_id, + topic_name, + bucket, + max(0.3, abs_sent), + is_neg_avg, + False # No global sarcasm flag + ) + global_recommendations.append({ + "category": cname, + "kategori_kode": topic_id, + "score": cat["score"], + "recommendation": rec_data + }) + + return 
jsonify({ +        "version": SERVICE_VERSION, +        "items": results, +        "summary": summary, +        "keyphrases": keyphrases, +        "clusters": clusters, +        "categories_overview": categories_overview, +        "global_recommendations": global_recommendations, +    }) + +@app.post("/feedback") +def feedback(): +    if not check_key(): +        return jsonify({"error": "unauthorized"}), 401 + +    data = request.get_json(force=True) or {} +    # Teacher-revision payloads are delegated to receive_feedback() below so that +    # both payload shapes share the single POST /feedback route. +    if data.get("revision_id") is not None or data.get("original_text"): +        return receive_feedback() +    # expected: { keywords: ["telat","bolos"], from_category?: "AKADEMIK", to_category?: "DISIPLIN", delta?: 0.2 } +    kws = data.get("keywords") or [] +    from_cat = str(data.get("from_category") or "").upper() +    to_cat = str(data.get("to_category") or "").upper() +    delta = float(data.get("delta") or 0.2) +    if not kws or (not from_cat and not to_cat): +        return jsonify({"error": "invalid payload"}), 422 + +    weights = load_feedback_weights() +    for kw in kws: +        k = str(kw).lower().strip() +        if not k: +            continue +        entry = weights.get(k, {}) +        # penalize from_cat slightly, reward to_cat (if provided) +        if from_cat: +            entry[from_cat] = float(entry.get(from_cat, 0.0)) - (delta / 2.0) +        if to_cat: +            entry[to_cat] = float(entry.get(to_cat, 0.0)) + delta +        weights[k] = entry +    save_feedback_weights(weights) +    return jsonify({"ok": True, "updated": len(kws)}) + +def receive_feedback(): +    """ +    Handle teacher revision feedback for continuous learning (dispatched from /feedback). + +    Expected payload: +    { +        "revision_id": 123, +        "original_text": "...", +        "original_kategori": "AKADEMIK", +        "original_rekomendasi": [...], +        "revised_kategori": "DISIPLIN", +        "revised_rekomendasi": [...], +        "revision_notes": "..." (optional) +    } + +    This handler will: +    1. Extract keywords from original text +    2. Penalize weights for original_kategori +    3. Reward weights for revised_kategori +    4. 
Learn from the correction pattern + """ + if not check_key(): + return jsonify({"error": "unauthorized"}), 401 + + try: + data = request.get_json(force=True) or {} + + revision_id = data.get("revision_id") + original_text = data.get("original_text", "") + original_kategori = str(data.get("original_kategori", "")).upper() + revised_kategori = str(data.get("revised_kategori", "")).upper() + + if not original_text or not revised_kategori: + return jsonify({"error": "Missing required fields"}), 422 + + # Only learn if kategori was changed (not just rekomendasi) + if original_kategori == revised_kategori: + logger.info(f"Revision #{revision_id}: Kategori unchanged, skipping weight update") + return jsonify({ + "ok": True, + "message": "Kategori unchanged, no weight update needed", + "revision_id": revision_id + }) + + # Extract keywords from original text + keywords = [] + try: + # Simple keyword extraction - tokenize and filter stopwords + tokens = nltk.word_tokenize(original_text.lower()) + filtered_tokens = [ + t for t in tokens + if t.isalnum() and len(t) > 2 + and t not in STOPWORDS_ID_CHAT + and t not in _CHAT_FILLERS + ] + # Get top 10 most meaningful words + word_counts = Counter(filtered_tokens) + keywords = [word for word, _ in word_counts.most_common(10)] + + logger.info(f"Revision #{revision_id}: Extracted keywords: {keywords}") + except Exception as e: + logger.warning(f"Failed to extract keywords: {e}") + # Fallback: split by space + keywords = [w for w in original_text.lower().split() if len(w) > 2][:10] + + if not keywords: + return jsonify({ + "ok": False, + "error": "Could not extract keywords from text" + }), 422 + + # Update feedback weights + weights = load_feedback_weights() + delta = 0.3 # Learning rate + + for kw in keywords: + k = str(kw).lower().strip() + entry = weights.get(k, {}) + + # Penalize original (wrong) kategori + if original_kategori: + entry[original_kategori] = float(entry.get(original_kategori, 0.0)) - (delta / 2.0) + + # Reward revised (correct) kategori + entry[revised_kategori] = float(entry.get(revised_kategori, 0.0)) + delta + + weights[k] = entry + + save_feedback_weights(weights) + + logger.info(f"Revision #{revision_id}: Updated weights for {len(keywords)} keywords " + f"from {original_kategori} → {revised_kategori}") + + return jsonify({ + "ok": True, + "message": "Feedback learned successfully", + "revision_id": revision_id, + "keywords_updated": len(keywords), + "correction": f"{original_kategori} → {revised_kategori}" + }) + + except Exception as e: + logger.error(f"Error processing feedback: {e}", exc_info=True) + return jsonify({ + "ok": False, + "error": str(e) + }), 500 + +if __name__ == '__main__': + # HUGGING FACE WAJIB PAKAI PORT 7860 & HOST 0.0.0.0 + app.run(debug=False, host='0.0.0.0', port=7860)
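For completeness, a minimal client sketch for exercising the service once it is running. The base URL, API key fallback, and example texts are assumptions; the `/analyze` payload and the response fields read below follow the handlers above.

```python
# Hypothetical client for the Flask service above; URL, key, and texts are example values.
import os
import requests

BASE_URL = os.environ.get("ML_BASE_URL", "http://localhost:7860")
HEADERS = {"X-API-KEY": os.environ.get("ML_API_KEY", "")}

payload = {
    "items": [
        {"id": "chat-1", "text": "Beta su capek, di rumah dong baku ribut terus", "lang_hint": "id"},
        {"id": "chat-2", "text": "Senang banget hari ini, semua tugas selesai"},
    ]
}

# POST the batch to /analyze and print per-item sentiment, severity, and cluster info
resp = requests.post(f"{BASE_URL}/analyze", json=payload, headers=HEADERS, timeout=60)
resp.raise_for_status()
for item in resp.json()["items"]:
    print(item["id"], item["sentiment"]["label"], item["severity"], item.get("cluster"))
```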