{ "architectures": [ "SiglipForImageClassification" ], "id2label": { "0": "barn", "1": "baseball_bat", "2": "basket", "3": "beach", "4": "bear", "5": "beard", "6": "bee", "7": "bird", "8": "blueberry", "9": "bowtie", "10": "bracelet", "11": "brain", "12": "bread", "13": "broccoli", "14": "bus", "15": "butterfly", "16": "circle", "17": "cloud", "18": "cruise_ship", "19": "dolphin", "20": "dumbbell", "21": "elephant", "22": "eye", "23": "eyeglasses", "24": "feather", "25": "fish", "26": "flower", "27": "foot", "28": "frog", "29": "giraffe", "30": "goatee", "31": "golf_club", "32": "grapes", "33": "grass", "34": "guitar", "35": "hamburger", "36": "hand", "37": "hat", "38": "headphones", "39": "helicopter", "40": "hexagon", "41": "hockey_stick", "42": "horse", "43": "hourglass", "44": "house", "45": "ice_cream", "46": "jacket", "47": "ladder", "48": "leg", "49": "lipstick", "50": "megaphone", "51": "monkey", "52": "moon", "53": "mushroom", "54": "necklace", "55": "owl", "56": "panda", "57": "pear", "58": "peas", "59": "penguin", "60": "pig", "61": "pillow", "62": "pineapple", "63": "pizza", "64": "pool", "65": "popsicle", "66": "rabbit", "67": "rhinoceros", "68": "rifle", "69": "river", "70": "sailboat", "71": "sandwich", "72": "sea_turtle", "73": "shark", "74": "shoe", "75": "skyscraper", "76": "snorkel", "77": "snowman", "78": "soccer_ball", "79": "speedboat", "80": "spider", "81": "spoon", "82": "square", "83": "squirrel", "84": "stethoscope", "85": "strawberry", "86": "streetlight", "87": "submarine", "88": "suitcase", "89": "sun", "90": "sweater", "91": "sword", "92": "table", "93": "teapot", "94": "teddy-bear", "95": "telephone", "96": "tent", "97": "The_Eiffel_Tower", "98": "The_Great_Wall_of_China", "99": "The_Mona_Lisa", "100": "tiger", "101": "toaster", "102": "tooth", "103": "tornado", "104": "tractor", "105": "train", "106": "tree", "107": "triangle", "108": "trombone", "109": "truck", "110": "trumpet", "111": "umbrella", "112": "vase", "113": "violin", "114": "watermelon", "115": "whale", "116": "windmill", "117": "wine_glass", "118": "yoga", "119": "zebra", "120": "zigzag" }, "initializer_factor": 1.0, "label2id": { "The_Eiffel_Tower": 97, "The_Great_Wall_of_China": 98, "The_Mona_Lisa": 99, "barn": 0, "baseball_bat": 1, "basket": 2, "beach": 3, "bear": 4, "beard": 5, "bee": 6, "bird": 7, "blueberry": 8, "bowtie": 9, "bracelet": 10, "brain": 11, "bread": 12, "broccoli": 13, "bus": 14, "butterfly": 15, "circle": 16, "cloud": 17, "cruise_ship": 18, "dolphin": 19, "dumbbell": 20, "elephant": 21, "eye": 22, "eyeglasses": 23, "feather": 24, "fish": 25, "flower": 26, "foot": 27, "frog": 28, "giraffe": 29, "goatee": 30, "golf_club": 31, "grapes": 32, "grass": 33, "guitar": 34, "hamburger": 35, "hand": 36, "hat": 37, "headphones": 38, "helicopter": 39, "hexagon": 40, "hockey_stick": 41, "horse": 42, "hourglass": 43, "house": 44, "ice_cream": 45, "jacket": 46, "ladder": 47, "leg": 48, "lipstick": 49, "megaphone": 50, "monkey": 51, "moon": 52, "mushroom": 53, "necklace": 54, "owl": 55, "panda": 56, "pear": 57, "peas": 58, "penguin": 59, "pig": 60, "pillow": 61, "pineapple": 62, "pizza": 63, "pool": 64, "popsicle": 65, "rabbit": 66, "rhinoceros": 67, "rifle": 68, "river": 69, "sailboat": 70, "sandwich": 71, "sea_turtle": 72, "shark": 73, "shoe": 74, "skyscraper": 75, "snorkel": 76, "snowman": 77, "soccer_ball": 78, "speedboat": 79, "spider": 80, "spoon": 81, "square": 82, "squirrel": 83, "stethoscope": 84, "strawberry": 85, "streetlight": 86, "submarine": 87, "suitcase": 88, "sun": 89, "sweater": 90, "sword": 91, "table": 92, "teapot": 93, "teddy-bear": 94, "telephone": 95, "tent": 96, "tiger": 100, "toaster": 101, "tooth": 102, "tornado": 103, "tractor": 104, "train": 105, "tree": 106, "triangle": 107, "trombone": 108, "truck": 109, "trumpet": 110, "umbrella": 111, "vase": 112, "violin": 113, "watermelon": 114, "whale": 115, "windmill": 116, "wine_glass": 117, "yoga": 118, "zebra": 119, "zigzag": 120 }, "model_type": "siglip", "problem_type": "single_label_classification", "text_config": { "attention_dropout": 0.0, "hidden_act": "gelu_pytorch_tanh", "hidden_size": 768, "intermediate_size": 3072, "layer_norm_eps": 1e-06, "max_position_embeddings": 64, "model_type": "siglip_text_model", "num_attention_heads": 12, "num_hidden_layers": 12, "projection_size": 768, "torch_dtype": "float32", "vocab_size": 256000 }, "torch_dtype": "float32", "transformers_version": "4.51.0.dev0", "vision_config": { "attention_dropout": 0.0, "hidden_act": "gelu_pytorch_tanh", "hidden_size": 768, "image_size": 224, "intermediate_size": 3072, "layer_norm_eps": 1e-06, "model_type": "siglip_vision_model", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "patch_size": 16, "torch_dtype": "float32" } }