#!/usr/bin/env python3
"""
Feedback Weights Cleanup Utility
- Splits long-phrase keys into individual words
- Clips extreme weight values to [-1.0, +1.2]
- Applies optional decay (default 5%) to prevent stale weights
- Removes noise keywords (very generic, used by >6 categories)
- Generates a clean backup before modifying
"""

import json
import os
import sys
from datetime import datetime
from collections import defaultdict

FEEDBACK_FILE = os.path.join(os.path.dirname(__file__), "feedback_weights.json")
BACKUP_DIR = os.path.join(os.path.dirname(__file__), "backups")
DECAY_FACTOR = 0.95   # 5% decay per run
MIN_WEIGHT = -1.0
MAX_WEIGHT = 1.2
NOISE_THRESHOLD = 6   # if keyword appears in >6 categories, flag as noise

# Generic stopwords (too common to be signals)
NOISE_KEYWORDS = {
    "banget", "aja", "saja", "dulu", "tapi", "kalo", "kalau", "yang", "sama",
    "juga", "dan", "atau", "untuk", "dari", "dengan", "ke", "di", "pada",
    "ini", "itu", "nya", "kan", "sih", "dong", "ya", "yah", "lah", "deh",
    "kek", "kayak", "masuk", "minggu", "hari", "kelas", "siswa", "guru", "sekolah"
}


def load_weights():
    try:
        with open(FEEDBACK_FILE, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        print(f"Error loading {FEEDBACK_FILE}: {e}")
        sys.exit(1)


def save_weights(weights, filepath):
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(weights, f, ensure_ascii=False, indent=2)


def backup_weights():
    os.makedirs(BACKUP_DIR, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    backup_path = os.path.join(BACKUP_DIR, f"feedback_weights_{timestamp}.json")
    weights = load_weights()
    save_weights(weights, backup_path)
    print(f"✅ Backup created: {backup_path}")
    return backup_path


def split_phrase_keys(weights):
    """Split long-phrase keys (>3 words) into individual words, merge weights."""
    new_weights = defaultdict(dict)
    splits = 0
    for key, cats in weights.items():
        words = [w.strip() for w in key.split() if w.strip() and len(w.strip()) >= 3]
        if len(words) <= 3:
            # Keep original key if <=3 words (good for bigrams/trigrams)
            for cat, val in cats.items():
                new_weights[key][cat] = val
        else:
            # Split into individual words
            splits += 1
            for word in words:
                if word.lower() in NOISE_KEYWORDS:
                    continue
                for cat, val in cats.items():
                    # Average if word already has weight for this category
                    if cat in new_weights[word]:
                        new_weights[word][cat] = (new_weights[word][cat] + val) / 2.0
                    else:
                        new_weights[word][cat] = val
    print(f"📝 Split {splits} long phrases into individual words")
    return dict(new_weights)


def clip_weights(weights):
    """Clip all weights to [MIN_WEIGHT, MAX_WEIGHT]."""
    clipped = 0
    for key, cats in weights.items():
        for cat in cats:
            old_val = cats[cat]
            new_val = max(MIN_WEIGHT, min(MAX_WEIGHT, old_val))
            if old_val != new_val:
                clipped += 1
            cats[cat] = round(new_val, 4)
    print(f"✂️ Clipped {clipped} extreme weights to [{MIN_WEIGHT}, {MAX_WEIGHT}]")
    return weights


def apply_decay(weights, decay=DECAY_FACTOR):
    """Apply decay to all weights to prevent old feedback from dominating."""
    for key, cats in weights.items():
        for cat in cats:
            cats[cat] = round(cats[cat] * decay, 4)
    print(f"📉 Applied {int((1 - decay) * 100)}% decay to all weights")
    return weights


def remove_noise(weights):
    """Remove keywords that appear in too many categories (likely generic)."""
    cat_counts = defaultdict(int)
    for key, cats in weights.items():
        cat_counts[key] = len(cats)
    noise_keys = [k for k, count in cat_counts.items()
                  if count > NOISE_THRESHOLD or k.lower() in NOISE_KEYWORDS]
    for key in noise_keys:
        del weights[key]
    print(f"🗑️ Removed {len(noise_keys)} noisy keywords (>{NOISE_THRESHOLD} categories or in stoplist)")
    return weights


def consolidate_duplicates(weights):
    """Merge case-insensitive duplicates."""
    lower_map = {}
    for key in list(weights.keys()):
        lower_key = key.lower()
        if lower_key in lower_map:
            # Merge with existing key, averaging overlapping categories
            orig_key = lower_map[lower_key]
            for cat, val in weights[key].items():
                if cat in weights[orig_key]:
                    weights[orig_key][cat] = (weights[orig_key][cat] + val) / 2.0
                else:
                    weights[orig_key][cat] = val
            del weights[key]
        else:
            lower_map[lower_key] = key
    print("🔗 Consolidated case-insensitive duplicates")
    return weights


def main():
    print("=" * 60)
    print("Feedback Weights Cleanup Utility")
    print("=" * 60)

    # Step 1: Backup
    backup_path = backup_weights()

    # Step 2: Load
    weights = load_weights()
    original_count = len(weights)
    print(f"📂 Loaded {original_count} keywords")

    # Step 3: Process
    weights = split_phrase_keys(weights)
    weights = consolidate_duplicates(weights)
    weights = clip_weights(weights)
    weights = apply_decay(weights)
    weights = remove_noise(weights)

    final_count = len(weights)
    print(f"📊 Final: {final_count} keywords (Δ {final_count - original_count:+d})")

    # Step 4: Save
    save_weights(weights, FEEDBACK_FILE)
    print(f"💾 Saved to {FEEDBACK_FILE}")

    print("=" * 60)
    print("✅ Cleanup complete!")
    print(f"   Backup: {backup_path}")
    print("=" * 60)


if __name__ == "__main__":
    main()
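
# ---------------------------------------------------------------------------
# Data-shape note: a minimal sketch of what the steps above assume about
# feedback_weights.json, namely a mapping of keyword -> {category: weight}.
# The keyword and category names below are hypothetical, for illustration only:
#
#   {
#     "terlambat": {"disiplin": 0.8, "akademik": -0.2},
#     "rajin mengerjakan tugas tepat waktu": {"akademik": 1.0}
#   }
#
# A key longer than three words (like the second entry) is split into its
# individual words by split_phrase_keys(); every surviving key then passes
# through the clip, decay, and noise-removal steps before being written back.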