import gradio as gr
import onnxruntime as ort
import numpy as np
import cv2
from huggingface_hub import hf_hub_download  # <-- IMPORT THE DOWNLOADER

# --- 1. GLOBAL SETUP: DOWNLOAD AND LOAD MODELS AT STARTUP ---
# This is the recommended way to use models in a Space.
try:
    print("Downloading and loading ONNX models from the Hub...")

    # Define your model repository ID
    MODEL_REPO = "rtr46/meiki.text.detect.v0"

    # hf_hub_download fetches the file (or reuses the local cache) and
    # returns the path to the cached copy.
    tiny_model_path = hf_hub_download(repo_id=MODEL_REPO, filename="meiki.text.detect.tiny.v0.onnx")
    small_model_path = hf_hub_download(repo_id=MODEL_REPO, filename="meiki.text.detect.small.v0.onnx")

    # Use CPUExecutionProvider for broad compatibility
    providers = ['CPUExecutionProvider']
    ort_session_tiny = ort.InferenceSession(tiny_model_path, providers=providers)
    ort_session_small = ort.InferenceSession(small_model_path, providers=providers)
    print("Models loaded successfully.")
except Exception as e:
    print(f"Error loading models: {e}")
    # If models fail to load, the app will not work; detect_text checks for
    # these None sentinels and surfaces a user-facing error.
    ort_session_tiny = None
    ort_session_small = None


# --- 2. HELPER FUNCTION: PREPROCESSING ---
def resize_and_pad(image: np.ndarray, size: int, is_color: bool):
    """Letterbox *image* into a ``size`` x ``size`` square.

    The image is scaled with its aspect ratio preserved so it fits inside
    the square, then centered on a black canvas. Works for both grayscale
    (H, W) and color (H, W, 3) inputs.

    Returns:
        tuple: (padded_image, ratio, pad_w, pad_h) — the scale factor and
        padding offsets let callers map detections back to the original
        image's coordinate space.
    """
    h, w = image.shape[:2]

    scale = min(size / w, size / h)
    new_w, new_h = int(w * scale), int(h * scale)
    scaled = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LINEAR)

    # Black canvas matching the model's expected channel layout.
    canvas_shape = (size, size, 3) if is_color else (size, size)
    canvas = np.zeros(canvas_shape, dtype=np.uint8)

    # Center the scaled image on the canvas.
    off_x, off_y = (size - new_w) // 2, (size - new_h) // 2
    canvas[off_y:off_y + new_h, off_x:off_x + new_w] = scaled

    return canvas, scale, off_x, off_y
# --- 3. CORE INFERENCE FUNCTION ---
def detect_text(model_name, input_image, confidence_threshold):
    """
    Performs text detection on the input image using the selected model.

    Args:
        model_name: "tiny" (320px grayscale input) or "small" (640px color input).
        input_image: numpy array from the Gradio Image component, or None
            if the user has not uploaded anything.
        confidence_threshold: detections scoring below this are discarded.

    Returns:
        A copy of the input image with the surviving boxes drawn on it.

    Raises:
        gr.Error: if the models failed to load at startup, or if no image
            was provided.
    """
    if ort_session_tiny is None or ort_session_small is None:
        raise gr.Error("Models are not loaded. Please check the console logs for errors.")

    # FIX: Gradio passes None when the button is clicked with no image
    # uploaded; the original code crashed with AttributeError on .copy().
    if input_image is None:
        raise gr.Error("Please upload an image first.")

    if model_name == "tiny":
        session = ort_session_tiny
        model_size = 320
        is_color = False
    else:  # "small"
        session = ort_session_small
        model_size = 640
        is_color = True

    output_image = input_image.copy()

    if is_color:
        image_for_model = input_image
    else:
        # NOTE(review): Gradio delivers RGB arrays, while COLOR_BGR2GRAY
        # assumes BGR order, so the R/B luminance weights are swapped here.
        # Kept as-is to preserve existing behavior — confirm against the
        # model's training preprocessing before changing.
        image_for_model = cv2.cvtColor(input_image, cv2.COLOR_BGR2GRAY)

    padded_image, ratio, pad_w, pad_h = resize_and_pad(image_for_model, model_size, is_color)

    # Normalize to [0, 1]; add batch (and, for grayscale, channel) dims.
    img_normalized = padded_image.astype(np.float32) / 255.0
    if is_color:
        img_transposed = np.transpose(img_normalized, (2, 0, 1))  # HWC -> CHW
        input_tensor = np.expand_dims(img_transposed, axis=0)
    else:
        input_tensor = np.expand_dims(np.expand_dims(img_normalized, axis=0), axis=0)

    sizes_tensor = np.array([[model_size, model_size]], dtype=np.int64)

    # Bind inputs by position so the code works with either model's naming.
    input_names = [inp.name for inp in session.get_inputs()]
    inputs = {input_names[0]: input_tensor, input_names[1]: sizes_tensor}
    outputs = session.run(None, inputs)

    if model_name == "tiny":
        # Tiny model emits boxes only; treat every box as fully confident.
        boxes = outputs[0]
        scores = [1.0] * len(boxes)
    else:
        _, boxes, scores = outputs
        boxes, scores = boxes[0], scores[0]  # strip the batch dimension

    box_count = 0
    for box, score in zip(boxes, scores):
        if score < confidence_threshold:
            continue
        box_count += 1
        x_min, y_min, x_max, y_max = box
        # Undo the letterbox padding/scaling to recover coordinates in the
        # original image's space.
        final_x_min = int((x_min - pad_w) / ratio)
        final_y_min = int((y_min - pad_h) / ratio)
        final_x_max = int((x_max - pad_w) / ratio)
        final_y_max = int((y_max - pad_h) / ratio)
        color = (0, 255, 0) if model_name == "small" else (0, 0, 255)
        cv2.rectangle(output_image, (final_x_min, final_y_min), (final_x_max, final_y_max), color, 2)

    print(f"Processed with '{model_name}' model. Found {box_count} boxes with confidence > {confidence_threshold}.")
    return output_image


# --- 4. GRADIO INTERFACE ---
with gr.Blocks() as demo:
    gr.Markdown("# meiki text detect v0")
    gr.Markdown(
        "upload an image and choose a model to detect horizontal and vertical text lines. "
        "the **small** model is more accurate, especially for images with many text lines like manga, while the **tiny** model is much faster."
    )
    with gr.Row():
        with gr.Column():
            input_image = gr.Image(type="numpy", label="upload image")
            model_name = gr.Radio(
                ["tiny", "small"], label="choose model", value="small"
            )
            confidence_threshold = gr.Slider(
                minimum=0.1, maximum=1.0, value=0.4, step=0.1, label="confidence threshold"
            )
            detect_button = gr.Button("detect text", variant="primary")
        with gr.Column():
            output_image = gr.Image(type="numpy", label="result")

    detect_button.click(
        fn=detect_text,
        inputs=[model_name, input_image, confidence_threshold],
        outputs=output_image
    )

# --- 5. LAUNCH THE APP ---
demo.launch()