File size: 8,261 Bytes
1633fcc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed.nn
import torch.distributed as dist
from torch.nn.init import trunc_normal_
from torch.nn.utils import weight_norm
import models_dinov2
from models_IB import IF_Module
import math


class MetaArch(nn.Module):
    """Student/teacher distillation architecture with an information bottleneck.

    A trainable student ViT (built from the local ``models_dinov2`` registry)
    is distilled from a frozen, pretrained DINOv2 teacher loaded via
    ``torch.hub``. Student patch tokens pass through an ``IF_Module``
    information bottleneck whose likelihoods yield a bits-per-pixel (bpp)
    rate term; three MSE distillation objectives (CLS token, full feature
    map, masked patches) plus the rate term form the training loss.
    """

    # Maps cfg.teacher_model to the corresponding DINOv2 torch.hub entry.
    _TEACHER_HUB_NAMES = {
        'vit_small': 'dinov2_vits14_lc',
        'vit_base': 'dinov2_vitb14_lc',
        'vit_large': 'dinov2_vitl14_lc',
        'vit_giant': 'dinov2_vitg14_lc',
    }

    def __init__(self, cfg):
        """Build student, frozen teacher, projection heads, and bottleneck.

        Args:
            cfg: configuration object; fields used here: ``target_model``,
                ``patch_size``, ``teacher_model``, ``batch_size``, and (in
                ``forward``) ``mask_probability``, ``lambda_token``,
                ``lambda_fea``, ``lambda_patch``.

        Raises:
            ValueError: if ``cfg.teacher_model`` is not a recognized name.
        """
        super().__init__()
        self.cfg = cfg

        student_model_dict = dict()
        teacher_model_dict = dict()

        # Instantiate the student backbone from the local registry by name.
        import_student = getattr(models_dinov2, cfg.target_model)
        student = import_student(img_size=224,
            patch_size=cfg.patch_size,
            init_values=1.0,
            ffn_layer='mlp',
            block_chunks=0,
            num_register_tokens=0,
            interpolate_antialias=False,
            interpolate_offset=0.1)

        embed_dim = student.embed_dim

        # Resolve and load the pretrained teacher. Fail loudly on an unknown
        # name instead of dying later with an UnboundLocalError.
        try:
            hub_name = self._TEACHER_HUB_NAMES[cfg.teacher_model]
        except KeyError:
            raise ValueError(
                f"Unknown teacher_model {cfg.teacher_model!r}; expected one of "
                f"{sorted(self._TEACHER_HUB_NAMES)}"
            ) from None
        teacher_backbone = torch.hub.load('facebookresearch/dinov2', hub_name)
        teacher_backbone.eval()

        student_model_dict['backbone'] = student
        # The '_lc' hub models wrap a linear classifier; keep only the backbone.
        teacher_model_dict['backbone'] = teacher_backbone.backbone

        self.embed_dim = embed_dim

        # initialize parameters and checks
        self.total_n_global_crops = cfg.batch_size

        self.student = nn.ModuleDict(student_model_dict)
        self.teacher = nn.ModuleDict(teacher_model_dict)

        # Projection heads mapping student width -> teacher width, one per
        # distillation objective (masked patches / CLS token / full features).
        teacher_embed_dim = teacher_backbone.backbone.embed_dim
        self.ibot_head = nn.Sequential(
                  nn.LayerNorm(embed_dim),
                  nn.Linear(embed_dim, teacher_embed_dim))

        self.token_head = nn.Sequential(
                  nn.LayerNorm(embed_dim),
                  nn.Linear(embed_dim, teacher_embed_dim))

        self.fea_head = nn.Sequential(
                  nn.LayerNorm(embed_dim),
                  nn.Linear(embed_dim, teacher_embed_dim))

        self.soft_criterion = torch.nn.MSELoss()

        # Information bottleneck over patch tokens; returns (tokens, likelihoods).
        self.info_bottleneck = IF_Module(embed_dim=embed_dim, num_heads=12, mlp_ratio=4, depth=4)

        # Freeze the teacher: it is used for targets only, never updated.
        for param in self.teacher.backbone.parameters():
            param.requires_grad = False

    def cal_bpp(self, image, unmask_likelihood, mask_likelihood):
        """Estimated bits-per-pixel rate from bottleneck likelihoods.

        Args:
            image: input batch, shape (B, C, H, W); only B/H/W are read.
            unmask_likelihood: likelihoods for the unmasked branch tokens.
            mask_likelihood: likelihoods for the masked branch tokens.

        Returns:
            Scalar tensor: -(sum of log2-likelihoods) / (num_pixels * 1.5).
            The 1.5 factor rescales for the two branches sharing one image
            batch — TODO confirm against the training recipe.
        """
        b, _, h, w = image.size()
        num_pixels = b * h * w
        log_unmask_likelihoods = torch.log(unmask_likelihood)
        log_mask_likelihoods = torch.log(mask_likelihood)
        bpp = (log_unmask_likelihoods.sum() + log_mask_likelihoods.sum()) / (-math.log(2) * num_pixels * 1.5)
        return bpp

    def forward(self, inputs):
        """Compute distillation + rate losses for one collated batch.

        Args:
            inputs: dict with keys ``collated_global_crops`` (image batch),
                ``collated_masks`` (boolean patch masks),
                ``mask_indices_list`` (flat indices of masked patches), and
                ``upperbound`` (max number of masked patches, for buffers).

        Returns:
            dict of scalar loss tensors: ``bpp_loss``, ``patch_loss``,
            ``fea_loss``, ``token_loss``, ``loss`` (total, optimized), and
            ``task_loss`` (unweighted distillation sum, for monitoring).
            NOTE(review): ``token_loss`` is logged *weighted* while
            ``patch_loss``/``fea_loss`` are logged unweighted — kept as-is
            for log compatibility, but worth unifying.
        """
        global_crops = inputs["collated_global_crops"]

        masks = inputs["collated_masks"]
        mask_indices_list = inputs["mask_indices_list"]
        n_masked_patches = mask_indices_list.shape[0]
        upperbound = inputs["upperbound"]

        def compute_teacher_output():
            # Teacher is frozen; no gradients through its forward pass.
            with torch.no_grad():
                teacher_backbone_output_dict = self.teacher.backbone(global_crops, is_training=True)
            teacher_cls_tokens = teacher_backbone_output_dict["x_norm_clstoken"]
            teacher_patch_tokens = teacher_backbone_output_dict["x_norm_patchtokens"]
            _dim = teacher_patch_tokens.shape[-1]

            # Gather only the masked patch tokens into a pre-sized buffer
            # (sized by `upperbound` so the allocation shape is static).
            buffer_tensor_teacher = teacher_patch_tokens.new_zeros(upperbound, _dim)
            torch.index_select(
                teacher_patch_tokens.flatten(0, 1),
                dim=0,
                index=mask_indices_list,
                out=buffer_tensor_teacher[:n_masked_patches],
            )
            teacher_patch_tokens_masked = buffer_tensor_teacher[:n_masked_patches]

            return teacher_cls_tokens, teacher_patch_tokens, teacher_patch_tokens_masked

        (
            teacher_cls_tokens,
            teacher_patch_tokens,
            teacher_patch_tokens_masked
        ) = compute_teacher_output()

        cur_masks = masks if self.cfg.mask_probability > 0 else None

        # Two student passes over the same crops: masked and unmasked.
        student_backbone_output_dict, student_backbone_output_dict_unmask = self.student.backbone(
            [global_crops, global_crops], masks=[cur_masks, None], is_training=True
        )

        student_cls_token_unmask = student_backbone_output_dict_unmask["x_norm_clstoken"]
        student_patch_tokens_unmask = student_backbone_output_dict_unmask["x_norm_patchtokens"]
        student_patch_tokens = student_backbone_output_dict["x_norm_patchtokens"]

        # Pass patch tokens through the bottleneck and compute the rate term.
        student_patch_tokens_unmask, unmask_likelihood = self.info_bottleneck(student_patch_tokens_unmask, is_training=True)
        student_patch_tokens, mask_likelihood = self.info_bottleneck(student_patch_tokens, is_training=True)
        bpp = self.cal_bpp(global_crops, unmask_likelihood, mask_likelihood)

        # Gather the student's masked patch tokens (mirrors the teacher path).
        _dim = student_patch_tokens.shape[-1]
        buffer_tensor_student = student_patch_tokens.new_zeros(upperbound, _dim)
        buffer_tensor_student[:n_masked_patches].copy_(
            torch.index_select(student_patch_tokens.flatten(0, 1),
                                dim=0,
                                index=mask_indices_list)
        )

        # Project student features into the teacher's embedding width.
        student_patch_tokens_unmask = self.fea_head(student_patch_tokens_unmask)

        student_cls_token_unmask = self.token_head(student_cls_token_unmask)

        tokens_after_head = self.ibot_head(buffer_tensor_student)
        student_patch_tokens_masked = tokens_after_head[:n_masked_patches]

        # CLS-token objective.
        distillation_loss_token = self.soft_criterion(student_cls_token_unmask, teacher_cls_tokens)

        # Full-feature objective: CLS prepended to patch tokens on both sides.
        student_whole_fea = torch.cat((student_cls_token_unmask.unsqueeze(1), student_patch_tokens_unmask), dim=1)
        teacher_whole_fea = torch.cat((teacher_cls_tokens.unsqueeze(1), teacher_patch_tokens), dim=1)
        distillation_loss_fea = self.soft_criterion(student_whole_fea, teacher_whole_fea)

        # Masked-patch (iBOT-style) objective.
        patch_loss = self.soft_criterion(student_patch_tokens_masked, teacher_patch_tokens_masked)

        # Apply per-objective coefficients from the config.
        token_loss = self.cfg.lambda_token * distillation_loss_token
        fea_loss = self.cfg.lambda_fea * distillation_loss_fea
        patch_loss_weighted = self.cfg.lambda_patch * patch_loss

        # Total optimized loss; 0.48 is the rate-distortion trade-off weight
        # for the bpp term — presumably tuned empirically, TODO confirm.
        total_loss = patch_loss_weighted + fea_loss + token_loss + 0.48 * bpp
        # Unweighted distillation sum, tracked for monitoring only.
        task_loss = patch_loss + distillation_loss_fea + distillation_loss_token

        loss_dict = {"bpp_loss": bpp,
                     "patch_loss": patch_loss,
                     "fea_loss": distillation_loss_fea,
                     "token_loss": token_loss,
                     "loss": total_loss,
                     "task_loss": task_loss,
                     }

        return loss_dict