orva06 committed on
Commit a11a937 · verified · 1 Parent(s): 620e204

Upload 6 files

app.py ADDED
@@ -0,0 +1,95 @@
+ # app.py
+ import os, time, io, logging
+ import pandas as pd
+ import streamlit as st
+ from inference import InferenceEngine
+
+ # --------- CONFIG ----------
+ MODEL_CKPT = "best_indobert_multipredict.pt"
+ TOKENIZER_NAME = "indobenchmark/indobert-base-p2"
+ CSV_INPUT_COL = "soal"  # expected text column in uploaded CSV
+ LOG_PATH = "inference_log.csv"
+
+ st.set_page_config(page_title="IndoBERT Multi-Predict (Topic + Taxonomy)", layout="wide")
+
+ # basic logging to csv
+ if not os.path.exists(LOG_PATH):
+     pd.DataFrame(columns=["timestamp","input_sample","topic_pred","topic_conf","tax_pred","tax_conf","runtime_s"]).to_csv(LOG_PATH, index=False)
+
+ @st.cache_resource
+ def load_engine():
+     eng = InferenceEngine(ckpt_path=MODEL_CKPT, tokenizer_name=TOKENIZER_NAME)
+     return eng
+
+ st.title("IndoBERT — Multi-Predict (Topic & Taxonomy)")
+ st.caption("Shared encoder → 2 output heads. Fast inference (CPU/GPU).")
+
+ eng = load_engine()
+
+ # Left column: input
+ c1, c2 = st.columns([1,1.2])
+
+ with c1:
+     st.header("Single prediction")
+     text = st.text_area("Paste your question / soal here:", height=160, placeholder="Tulis soal / pernyataan ...")
+     st.write("Light cleaning applied (lowercase, trim, normalize spaces).")
+     if st.button("Predict single"):
+         start = time.time()
+         res = eng.predict_texts([text])[0]
+         runtime = time.time() - start
+         # show result
+         st.subheader("Result")
+         st.metric("Topic", f"{res['topic_label']} ({res['topic_idx']})", delta=f"{res['topic_conf']:.3f}")
+         st.metric("Taxonomy", f"{res['tax_label']} ({res['tax_idx']})", delta=f"{res['tax_conf']:.3f}")
+         # optional: probability bar
+         st.write("Topic confidence:", f"{res['topic_conf']:.3f}")
+         st.write("Taxonomy confidence:", f"{res['tax_conf']:.3f}")
+         st.write("Raw probs (topic head): — first 8 shown")
+         st.write(res["topic_probs"][:8])
+         # logging
+         # logging disabled on HF Spaces
+         pass
+
+ with c2:
+     st.header("Batch prediction (CSV)")
+     st.write("CSV format: must contain column called:", f"`{CSV_INPUT_COL}`")
+     uploaded = st.file_uploader("Upload CSV file", type=["csv"])
+     if uploaded is not None:
+         try:
+             df = pd.read_csv(uploaded)
+         except Exception:
+             df = pd.read_csv(uploaded, encoding="latin1")
+         st.write("Preview uploaded data (first 5 rows):")
+         st.dataframe(df.head())
+         if CSV_INPUT_COL not in df.columns:
+             st.error(f"CSV must contain column `{CSV_INPUT_COL}`. Rename your text column accordingly.")
+         else:
+             if st.button("Predict batch"):
+                 texts = df[CSV_INPUT_COL].astype(str).tolist()
+                 t0 = time.time()
+                 results = eng.predict_texts(texts)
+                 elapsed = time.time() - t0
+                 # join results into dataframe
+                 out = pd.DataFrame(results)
+                 out = out.rename(columns={
+                     "topic_label":"pred_topic",
+                     "topic_conf":"pred_topic_conf",
+                     "tax_label":"pred_tax",
+                     "tax_conf":"pred_tax_conf"
+                 })
+                 # attach to original
+                 df_out = pd.concat([df.reset_index(drop=True), out[["pred_topic","pred_topic_conf","pred_tax","pred_tax_conf"]]], axis=1)
+                 st.success(f"Done — {len(df_out)} rows in {elapsed:.2f}s")
+                 st.dataframe(df_out.head(50))
+                 # allow download
+                 csv_bytes = df_out.to_csv(index=False).encode("utf-8")
+                 st.download_button("Download predictions (CSV)", csv_bytes, file_name="predictions.csv", mime="text/csv")
+                 # append logs
+                 # logging disabled on HF Spaces
+                 pass  # logging disabled on HF Spaces
+
+ # logging disabled on HF Spaces
+
+ st.write("---")
+ st.markdown("**Model info:** IndoBERT shared encoder (multi-head). Checkpoint: `" + MODEL_CKPT + "`")
+ st.markdown("**Notes:** For best label names ensure `le_topic_classes.npy` and `le_tax_classes.npy` are present in the app folder.")
best_indobert_multipredict.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e24acc8a8a4af4b79019ec97b7b921ec5512e5e62b9001f7946daa150e2c3054
+ size 43092
inference.py ADDED
@@ -0,0 +1,95 @@
le_tax_classes.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fa81876133d73c7c61dcb583cfa4e74f6dc72a5b1d30fe25f800576333a279be
+ size 160
le_topic_classes.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8a3c127a53576a488189251889d52c07bb3d0bb33ff8b8a5a25b26e8f0ad90dc
+ size 535
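Per the note at the bottom of app.py, these two .npy files (stored via Git LFS) are expected to hold the label-encoder class arrays that map the topic and taxonomy head indices to readable names. A quick sanity check, assuming they were saved from scikit-learn LabelEncoder `.classes_` arrays:

# assumed: LabelEncoder.classes_ dumps; allow_pickle handles string arrays
import numpy as np
topic_classes = np.load("le_topic_classes.npy", allow_pickle=True)
tax_classes = np.load("le_tax_classes.npy", allow_pickle=True)
print(len(topic_classes), "topic labels,", len(tax_classes), "taxonomy labels")
print(topic_classes[0])  # the label that topic index 0 would map to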
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ torch==2.2.1
+ transformers==4.44.2
+ streamlit==1.38.0
+ numpy==1.26.4
+ accelerate
+ protobuf==3.20.3
+ huggingface-hub==0.17.4
+ joblib
+ scikit-learn
+ sentencepiece
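With these pins the app can also be run locally with `pip install -r requirements.txt` followed by `streamlit run app.py`. Note that the `huggingface-hub==0.17.4` pin is older than what `transformers==4.44.2` typically requires; if pip reports a dependency-resolver conflict, relaxing or removing that pin is the likely fix.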