import json
import logging
import re
from typing import List, Optional, Tuple

from PIL import Image

from model import Model
from types_io import ImageData

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

SCHEMA_JSON = json.dumps(ImageData.model_json_schema(), ensure_ascii=False)

LAND_USE_PROMPT = f"""
You are a structured image analysis agent.
Think **only** inside markers: ◁think▷ ... ◁/think▷
After ◁/think▷, output **ONLY** a JSON object that validates against this JSON Schema:
{SCHEMA_JSON}
Guidelines:
- Return 3–5 categories.
- Confidence is in [0, 1], where 1.0 means highest confidence (be consistent).
Categories:
- Residenciales: Buildings intended for housing: houses, PH buildings, condominiums.
- Comerciales1: Buildings for the storage, distribution, or exchange of products, goods, or services with a commercial interest.
- Comerciales2: Buildings where activities aimed at providing services are carried out.
- Comerciales3: Buildings used for artisanal activities where raw materials are transformed on a local scale.
- Comerciales4: Hotels, motels, and restaurants.
- Comerciales5: Operational offices and warehouses.
- Centros_Comerciales: Commercial premises located on properties with one or more buildings.
- Bodegas: Warehouse-type buildings dedicated to commercial, industrial, or storage activities.
- Parqueaderos: Buildings designed for vehicle parking.
- Dotacionales1: Buildings where activities aimed at the welfare or service of a community are carried out.
- Dotacionales2: Buildings designed to carry out educational or training activities.
- Dotacionales3: Buildings with the necessary infrastructure to provide surgical and/or hospitalization services.
- Dotacionales4: Buildings for religious worship owned by communities or religious congregations.
- Dotacionales5: Theaters, cinemas, swimming pools, museums, sports, events, or shows.
- Especiales: Military administrative areas, cemeteries, airport runways.
- Moles: Buildings that are large in height (>4 floors) or area (>10,000 m²), usually under construction.
- Rurales: Sheds, kiosks, shelters, barns, stables, silos, etc.
- Mixto1: (Residencial + Comercial1) Housing and commercial premises.
- Mixto2: (Residencial + Comercial2) Housing and offices.
- Mixto3: (Comercial1 + Comercial2) Commercial premises and offices.
Return ONLY the JSON object (no prose, no backticks) after ◁/think▷.
"""

class Classifier:
    def __init__(self, max_new_tokens: int = 1024):
        self.max_new_tokens = max_new_tokens
        logger.info("Initializing Classifier")
        logger.info("Loading model...")
        self.model = Model.load_model()
        logger.info("Loading processor...")
        self.processor = Model.load_processor()
        logger.info("Classifier initialization complete")

    def get_response(self, images: List[Image.Image], saved_image_paths: Optional[List[str]] = None) -> dict:
        """Run the full pipeline: preprocess, prompt, generate, parse, and validate."""
        logger.info(f"Processing classification request for {len(images)} images")
        logger.info("Loading and preprocessing images...")
        images = self.get_input_tensor(images)
        logger.debug("Successfully preprocessed images")
        logger.info("Preparing input messages...")
        messages = self.prepare_messages(saved_image_paths)
        response = self.generate_model_response(images, messages)
        # Separate the model's ◁think▷ reasoning from the JSON payload.
        think, json_text = self._split_think_and_json(response)
        data = json.loads(json_text)
        if isinstance(data, dict) and "think" not in data and think:
            data["think"] = think
        # Validate against the Pydantic schema; raises on failure.
        validated = ImageData.model_validate(data)
        return {"output": validated.model_dump()}

    def get_input_tensor(self, images: List[Image.Image]) -> List[Image.Image]:
        """
        Preprocess a list of PIL images.

        Args:
            images (List[Image.Image]): List of PIL images to be processed.

        Returns:
            List[Image.Image]: List of preprocessed images ready for classification.
        """
        if not images:
            raise ValueError("No images provided for classification.")
        logger.info(f"Preprocessing {len(images)} images...")
        processed_images = []
        for idx, img in enumerate(images):
            logger.debug(f"Processing image at index: {idx}")
            try:
                img = self.resize_image(img)
                processed_images.append(img)
                logger.debug(f"Successfully processed image at index: {idx}")
            except Exception as e:
                logger.error(f"Error processing image at index {idx}: {str(e)}")
                raise
        return processed_images

    def generate_model_response(self, images: List[Image.Image], messages: List[dict]) -> str:
        """
        Generate a response from the model.

        Args:
            images (List[Image.Image]): List of preprocessed images.
            messages (List[dict]): Messages for the processor.

        Returns:
            str: Decoded response from the model.
        """
        logger.info("Applying chat template...")
        try:
            # Render the chat template to a plain string; the processor call
            # below handles tokenization.
            text = self.processor.apply_chat_template(
                messages, add_generation_prompt=True, tokenize=False
            )
            logger.info(f"Text length: {len(text)} characters")
            inputs = self.processor(
                images=images, text=text, return_tensors="pt", padding=True, truncation=True
            ).to(self.model.device)
        except Exception as e:
            logger.error(f"Error applying chat template: {str(e)}")
            raise

        logger.info("Generating response...")
        # Sampling must be enabled for `temperature` to take effect.
        generated_ids = self.model.generate(
            **inputs, max_new_tokens=self.max_new_tokens, do_sample=True, temperature=0.1
        )
        # Strip the prompt tokens so only newly generated tokens are decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        response = self.processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]
        logger.debug("Successfully generated response")
        return response

    @staticmethod
    def resize_image(image: Image.Image, max_size: int = 224) -> Image.Image:
        """
        Resize an image while maintaining aspect ratio.

        Args:
            image: PIL Image object to resize.
            max_size: Maximum dimension (width or height) of the output image.

        Returns:
            PIL Image: Resized image with maintained aspect ratio.
        """
        # Get current dimensions
        width, height = image.size
        # Calculate scaling factor to fit within max_size
        scale = min(max_size / width, max_size / height)
        # Only downscale; images already within max_size are returned unchanged
        if scale < 1:
            new_width = int(width * scale)
            new_height = int(height * scale)
            image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
        return image

    @staticmethod
    def prepare_messages(saved_image_paths: List[str]) -> List[dict]:
        """
        Prepare messages for the processor.

        Args:
            saved_image_paths (List[str]): List of paths to saved images.

        Returns:
            List[dict]: Messages for the processor.
        """
        if not saved_image_paths:
            raise ValueError("No image paths provided for message preparation.")
        return [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image_path} for image_path in saved_image_paths
                ] + [{"type": "text", "text": LAND_USE_PROMPT}],
            },
        ]

    @staticmethod
    def _split_think_and_json(text: str) -> Tuple[str, str]:
        """Split a raw model response into its ◁think▷ block and the JSON text."""
        start, end = "◁think▷", "◁/think▷"
        think = ""
        after = text
        if start in text and end in text:
            s = text.find(start) + len(start)
            e = text.find(end, s)
            think = text[s:e].strip()
            after = text[e + len(end):].strip()
        # Extract the outermost {...} span; fall back to the full tail.
        m = re.search(r"\{.*\}", after, flags=re.S)
        json_text = m.group(0) if m else after
        return think, json_text
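
# Minimal usage sketch (illustrative only): "example.jpg" is a placeholder
# path, and running this requires whatever weights Model.load_model() expects
# to be available locally.
if __name__ == "__main__":
    path = "example.jpg"  # hypothetical input image
    classifier = Classifier()
    result = classifier.get_response(images=[Image.open(path)], saved_image_paths=[path])
    print(json.dumps(result, ensure_ascii=False, indent=2))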