zm-f21 committed on
Commit
ec6f784
·
verified ·
1 Parent(s): 0790a5c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -72
app.py CHANGED
@@ -1,18 +1,15 @@
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
3
-
4
- import gradio as gr
5
- import os
6
- import zipfile
7
- import pandas as pd
8
- import numpy as np
9
  from transformers import pipeline
10
  from sentence_transformers import SentenceTransformer
 
 
 
 
11
  import torch
12
 
13
- # ----------------------------- #
14
- # Load Mistral model
15
- # ----------------------------- #
16
  llm = pipeline(
17
  "text-generation",
18
  model="mistralai/Mistral-7B-Instruct-v0.2",
@@ -20,58 +17,77 @@ llm = pipeline(
20
  device_map="auto"
21
  )
22
 
 
 
 
23
  embedding_model = SentenceTransformer("nlpaueb/legal-bert-base-uncased")
24
 
25
- # ----------------------------- #
26
- # Extract and load Yukon dataset
27
- # ----------------------------- #
28
- extract_folder = "yukon_texts"
29
- zip_path = "yukon.zip"
30
-
31
- if not os.path.exists(extract_folder):
32
- with zipfile.ZipFile(zip_path, "r") as zip_ref:
33
- zip_ref.extractall(extract_folder)
34
-
35
- # ----------------------------- #
36
- # Parse files and create embeddings
37
- # ----------------------------- #
38
- def parse_metadata_and_content(raw_text):
39
- """
40
- Replace this with your actual parsing function from Colab.
41
- Should return metadata dict and content string.
42
- """
43
  metadata = {}
44
- content = raw_text
 
 
 
 
 
 
 
 
 
 
45
  return metadata, content
46
 
 
47
  documents = []
 
48
  for root, dirs, files in os.walk(extract_folder):
49
  for filename in files:
50
- if filename.startswith("._") or not filename.endswith(".txt"):
51
  continue
52
- filepath = os.path.join(root, filename)
53
- with open(filepath, "r", encoding="latin-1") as f:
54
- raw = f.read()
55
- metadata, content = parse_metadata_and_content(raw)
56
- paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]
57
- for p in paragraphs:
58
- documents.append({
59
- "source_title": metadata.get("SOURCE_TITLE", "Unknown"),
60
- "province": metadata.get("PROVINCE", "Unknown"),
61
- "last_updated": metadata.get("LAST_UPDATED", "Unknown"),
62
- "url": metadata.get("URL", "N/A"),
63
- "pdf_links": metadata.get("PDF_LINKS", ""),
64
- "text": p
65
- })
 
66
 
67
  texts = [d["text"] for d in documents]
68
  embeddings = embedding_model.encode(texts).astype("float32")
 
69
  df = pd.DataFrame(documents)
70
  df["Embedding"] = list(embeddings)
71
 
72
- # ----------------------------- #
73
- # RAG Retrieval function
74
- # ----------------------------- #
 
 
75
  def retrieve_with_pandas(query, top_k=2):
76
  query_emb = embedding_model.encode([query])[0]
77
  df["Similarity"] = df["Embedding"].apply(
@@ -79,6 +95,9 @@ def retrieve_with_pandas(query, top_k=2):
79
  )
80
  return df.sort_values("Similarity", ascending=False).head(top_k)
81
 
 
 
 
82
  def generate_with_rag(query, top_k=2):
83
  top_docs = retrieve_with_pandas(query, top_k)
84
  context = " ".join(top_docs["text"].tolist())
@@ -92,7 +111,8 @@ Context:
92
 
93
  Question: {query}
94
  """
95
- response = llm(input_text, max_new_tokens=200, num_return_sequences=1)[0]["generated_text"]
 
96
 
97
  meta = []
98
  for _, row in top_docs.iterrows():
@@ -103,31 +123,22 @@ Question: {query}
103
  f" URL: {row['url']}\n"
104
  )
105
  metadata_block = "\n".join(meta)
106
- return f"{response.strip()}\n\nSources Used:\n{metadata_block}"
107
-
108
- # ----------------------------- #
109
- # Gradio ChatInterface
110
- # ----------------------------- #
111
- def respond(message, history: list[dict[str, str]], system_message, max_tokens, temperature, top_p, hf_token: gr.OAuthToken):
112
- # We ignore the system_message, max_tokens, temperature, top_p for simplicity; adjust if needed
113
- response = generate_with_rag(message)
114
- yield response
115
-
116
- chatbot = gr.ChatInterface(
117
- respond,
118
- type="messages",
119
- additional_inputs=[
120
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
121
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
122
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
123
- gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
124
- ],
125
- )
126
 
127
  with gr.Blocks() as demo:
128
- with gr.Sidebar():
129
- gr.LoginButton()
130
- chatbot.render()
 
131
 
132
  if __name__ == "__main__":
133
- demo.launch()
 
1
  import gradio as gr
 
 
 
 
 
 
 
2
  from transformers import pipeline
3
  from sentence_transformers import SentenceTransformer
4
+ import pandas as pd
5
+ import numpy as np
6
+ import zipfile
7
+ import os
8
  import torch
9
 
10
+ # -----------------------------
11
+ # Load Mistral pipeline
12
+ # -----------------------------
13
  llm = pipeline(
14
  "text-generation",
15
  model="mistralai/Mistral-7B-Instruct-v0.2",
 
17
  device_map="auto"
18
  )
19
 
20
+ # -----------------------------
21
+ # Load SentenceTransformer embeddings
22
+ # -----------------------------
23
  embedding_model = SentenceTransformer("nlpaueb/legal-bert-base-uncased")
24
 
25
+ # -----------------------------
26
+ # Extract Yukon ZIP
27
+ # -----------------------------
28
+ zip_path = "/app/yukon.zip" # make sure the ZIP archive has been uploaded to this path
29
+ extract_folder = "/app/yukon_texts"
30
+
31
+ # Remove old folder if exists
32
+ if os.path.exists(extract_folder):
33
+ import shutil
34
+ shutil.rmtree(extract_folder)
35
+
36
+ with zipfile.ZipFile(zip_path, "r") as zip_ref:
37
+ zip_ref.extractall(extract_folder)
38
+
39
+ # -----------------------------
40
+ # Parse TXT files and create dataframe
41
+ # -----------------------------
42
def parse_metadata_and_content(raw):
    """Split a raw document into ``KEY: value`` metadata and body text.

    A line counts as metadata when it contains a colon and the text before
    the first colon is non-empty after stripping. The key is stripped and
    upper-cased, the value is stripped, and a later duplicate key overrides
    an earlier one. Every other line is kept, in order, as content.

    Only the lines actually consumed as metadata are removed from the
    content. Filtering by "line contains a key" would also discard ordinary
    sentences that merely mention a key (for example "curl" contains "URL"),
    and an empty key would match, and therefore drop, every line.

    Args:
        raw: Full text of one document.

    Returns:
        A ``(metadata, content)`` tuple: ``metadata`` is a dict mapping
        upper-cased keys to values, ``content`` is the remaining lines
        joined with ``"\\n"``.
    """
    metadata = {}
    content_lines = []

    for line in raw.split("\n"):
        key, separator, value = line.partition(":")
        key = key.strip().upper()
        if separator and key:
            metadata[key] = value.strip()
        else:
            # No colon, or nothing before it: treat as body text.
            content_lines.append(line)

    return metadata, "\n".join(content_lines)
56
 
57
+
58
  documents = []
59
+
60
  for root, dirs, files in os.walk(extract_folder):
61
  for filename in files:
62
+ if filename.startswith("._"):
63
  continue
64
+ if filename.endswith(".txt"):
65
+ filepath = os.path.join(root, filename)
66
+ with open(filepath, "r", encoding="latin-1") as f:
67
+ raw = f.read()
68
+ metadata, content = parse_metadata_and_content(raw)
69
+ paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]
70
+ for p in paragraphs:
71
+ documents.append({
72
+ "source_title": metadata.get("SOURCE_TITLE", "Unknown"),
73
+ "province": metadata.get("PROVINCE", "Unknown"),
74
+ "last_updated": metadata.get("LAST_UPDATED", "Unknown"),
75
+ "url": metadata.get("URL", "N/A"),
76
+ "pdf_links": metadata.get("PDF_LINKS", ""),
77
+ "text": p
78
+ })
79
 
80
  texts = [d["text"] for d in documents]
81
  embeddings = embedding_model.encode(texts).astype("float32")
82
+
83
  df = pd.DataFrame(documents)
84
  df["Embedding"] = list(embeddings)
85
 
86
+ print("Loaded documents:", len(df))
87
+
88
+ # -----------------------------
89
+ # Retrieval function
90
+ # -----------------------------
91
  def retrieve_with_pandas(query, top_k=2):
92
  query_emb = embedding_model.encode([query])[0]
93
  df["Similarity"] = df["Embedding"].apply(
 
95
  )
96
  return df.sort_values("Similarity", ascending=False).head(top_k)
97
 
98
+ # -----------------------------
99
+ # RAG generation
100
+ # -----------------------------
101
  def generate_with_rag(query, top_k=2):
102
  top_docs = retrieve_with_pandas(query, top_k)
103
  context = " ".join(top_docs["text"].tolist())
 
111
 
112
  Question: {query}
113
  """
114
+
115
+ response = llm(input_text, max_new_tokens=150, num_return_sequences=1)[0]['generated_text']
116
 
117
  meta = []
118
  for _, row in top_docs.iterrows():
 
123
  f" URL: {row['url']}\n"
124
  )
125
  metadata_block = "\n".join(meta)
126
+ final = f"{response.strip()}\n\nSources Used:\n{metadata_block}"
127
+ return final
128
+
129
+ # -----------------------------
130
+ # Gradio Chat
131
+ # -----------------------------
132
def respond(message, history):
    """Answer one chat message and return the updated conversation.

    Args:
        message: The user's question text.
        history: Existing list of ``(user_message, answer)`` pairs, or
            ``None`` when there is no conversation yet.

    Returns:
        The updated history twice, as a two-element tuple, matching the
        two outputs this handler is wired to.
    """
    answer = generate_with_rag(message)
    # Build a new list so the caller's history object is not mutated, and
    # so a missing (None) history does not raise on append.
    updated_history = list(history or [])
    updated_history.append((message, answer))
    return updated_history, updated_history
 
 
 
 
 
 
 
 
 
 
136
 
137
  with gr.Blocks() as demo:
138
+ chatbot = gr.Chatbot()
139
+ msg = gr.Textbox(label="Your question")
140
+ msg.submit(respond, [msg, chatbot], [chatbot, chatbot])
141
+ gr.Markdown("Ask questions about Yukon rental rules and landlord responsibilities.")
142
 
143
  if __name__ == "__main__":
144
+ demo.launch(share=True)