google/owlv2-base-patch16-ensemble

@@ -51,23 +51,12 @@ inputs = processor(text=texts, images=image, return_tensors="pt")
 with torch.no_grad():
     outputs = model(**inputs)
-# Note: boxes need to be visualized on the padded, unnormalized image
-# hence we'll set the target image sizes (height, width) based on that
-def get_preprocessed_image(pixel_values):
-    pixel_values = pixel_values.squeeze().numpy()
-    unnormalized_image = (pixel_values * np.array(OPENAI_CLIP_STD)[:, None, None]) + np.array(OPENAI_CLIP_MEAN)[:, None, None]
-    unnormalized_image = (unnormalized_image * 255).astype(np.uint8)
-    unnormalized_image = np.moveaxis(unnormalized_image, 0, -1)
-    unnormalized_image = Image.fromarray(unnormalized_image)
-    return unnormalized_image
-unnormalized_image = get_preprocessed_image(inputs.pixel_values)
-target_sizes = torch.Tensor([unnormalized_image.size[::-1]])
 # Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
 results = processor.post_process_object_detection(
-    outputs=outputs, threshold=0.2, target_sizes=target_sizes
 )
 i = 0  # Retrieve predictions for the first image for the corresponding text queries
@@ -77,6 +66,20 @@ boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["l
 for box, score, label in zip(boxes, scores, labels):
     box = [round(i, 2) for i in box.tolist()]
     print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
 ```

 with torch.no_grad():
     outputs = model(**inputs)
+# Target sizes must be (height, width); PIL's image.size is (width, height), so reverse it
+original_size = torch.Tensor([image.size[::-1]])
 # Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
 results = processor.post_process_object_detection(
+    outputs=outputs, threshold=0.2, target_sizes=original_size
 )
 i = 0  # Retrieve predictions for the first image for the corresponding text queries
 for box, score, label in zip(boxes, scores, labels):
     box = [round(i, 2) for i in box.tolist()]
     print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
+# Draw each predicted box, with its label and score, directly onto the original image
+draw = ImageDraw.Draw(image)
+for box, score, label in zip(boxes, scores, labels):
+    box = [round(i, 2) for i in box.tolist()]
+    draw.rectangle(box, outline="red", width=2)
+    draw.text(
+        (box[0], box[1]),
+        f"{text[label]}: {round(score.item(), 3)}",
+        fill="red",
+    )
+image.show()
 ```