Upload 6 files

- models/classifiers.py +172 -0
- models/efficientnet.onnx +3 -0
- models/image.py +195 -0
- models/links.txt +1 -0
- models/model.pth +3 -0
- models/rawnet.py +360 -0
models/classifiers.py
ADDED
@@ -0,0 +1,172 @@
from functools import partial

import numpy as np
import torch
from timm.models.efficientnet import tf_efficientnet_b4_ns, tf_efficientnet_b3_ns, \
    tf_efficientnet_b5_ns, tf_efficientnet_b2_ns, tf_efficientnet_b6_ns, tf_efficientnet_b7_ns
from torch import nn
from torch.nn.modules.dropout import Dropout
from torch.nn.modules.linear import Linear
from torch.nn.modules.pooling import AdaptiveAvgPool2d

encoder_params = {
    "tf_efficientnet_b3_ns": {
        "features": 1536,
        "init_op": partial(tf_efficientnet_b3_ns, pretrained=True, drop_path_rate=0.2)
    },
    "tf_efficientnet_b2_ns": {
        "features": 1408,
        "init_op": partial(tf_efficientnet_b2_ns, pretrained=False, drop_path_rate=0.2)
    },
    "tf_efficientnet_b4_ns": {
        "features": 1792,
        "init_op": partial(tf_efficientnet_b4_ns, pretrained=True, drop_path_rate=0.5)
    },
    "tf_efficientnet_b5_ns": {
        "features": 2048,
        "init_op": partial(tf_efficientnet_b5_ns, pretrained=True, drop_path_rate=0.2)
    },
    "tf_efficientnet_b4_ns_03d": {
        "features": 1792,
        "init_op": partial(tf_efficientnet_b4_ns, pretrained=True, drop_path_rate=0.3)
    },
    "tf_efficientnet_b5_ns_03d": {
        "features": 2048,
        "init_op": partial(tf_efficientnet_b5_ns, pretrained=True, drop_path_rate=0.3)
    },
    "tf_efficientnet_b5_ns_04d": {
        "features": 2048,
        "init_op": partial(tf_efficientnet_b5_ns, pretrained=True, drop_path_rate=0.4)
    },
    "tf_efficientnet_b6_ns": {
        "features": 2304,
        "init_op": partial(tf_efficientnet_b6_ns, pretrained=True, drop_path_rate=0.2)
    },
    "tf_efficientnet_b7_ns": {
        "features": 2560,
        "init_op": partial(tf_efficientnet_b7_ns, pretrained=False, drop_path_rate=0.2)
    },
    "tf_efficientnet_b6_ns_04d": {
        "features": 2304,
        "init_op": partial(tf_efficientnet_b6_ns, pretrained=True, drop_path_rate=0.4)
    },
}


def setup_srm_weights(input_channels: int = 3) -> torch.Tensor:
    """Creates the SRM kernels for noise analysis."""
    # note: values taken from Zhou et al., "Learning Rich Features for Image Manipulation Detection", CVPR 2018
    srm_kernel = torch.from_numpy(np.array([
        [  # srm 1/2 horiz
            [0., 0., 0., 0., 0.],
            [0., 0., 0., 0., 0.],
            [0., 1., -2., 1., 0.],
            [0., 0., 0., 0., 0.],
            [0., 0., 0., 0., 0.],
        ], [  # srm 1/4
            [0., 0., 0., 0., 0.],
            [0., -1., 2., -1., 0.],
            [0., 2., -4., 2., 0.],
            [0., -1., 2., -1., 0.],
            [0., 0., 0., 0., 0.],
        ], [  # srm 1/12
            [-1., 2., -2., 2., -1.],
            [2., -6., 8., -6., 2.],
            [-2., 8., -12., 8., -2.],
            [2., -6., 8., -6., 2.],
            [-1., 2., -2., 2., -1.],
        ]
    ])).float()
    srm_kernel[0] /= 2
    srm_kernel[1] /= 4
    srm_kernel[2] /= 12
    return srm_kernel.view(3, 1, 5, 5).repeat(1, input_channels, 1, 1)


def setup_srm_layer(input_channels: int = 3) -> torch.nn.Module:
    """Creates an SRM convolution layer for noise analysis."""
    weights = setup_srm_weights(input_channels)
    conv = torch.nn.Conv2d(input_channels, out_channels=3, kernel_size=5, stride=1, padding=2, bias=False)
    with torch.no_grad():
        conv.weight = torch.nn.Parameter(weights, requires_grad=False)
    return conv


class DeepFakeClassifierSRM(nn.Module):
    def __init__(self, encoder, dropout_rate=0.5) -> None:
        super().__init__()
        self.encoder = encoder_params[encoder]["init_op"]()
        self.avg_pool = AdaptiveAvgPool2d((1, 1))
        self.srm_conv = setup_srm_layer(3)
        self.dropout = Dropout(dropout_rate)
        self.fc = Linear(encoder_params[encoder]["features"], 1)

    def forward(self, x):
        noise = self.srm_conv(x)
        x = self.encoder.forward_features(noise)
        x = self.avg_pool(x).flatten(1)
        x = self.dropout(x)
        x = self.fc(x)
        return x


class GlobalWeightedAvgPool2d(nn.Module):
    """
    Global Weighted Average Pooling from the paper "Global Weighted Average
    Pooling Bridges Pixel-level Localization and Image-level Classification"
    """

    def __init__(self, features: int, flatten=False):
        super().__init__()
        self.conv = nn.Conv2d(features, 1, kernel_size=1, bias=True)
        self.flatten = flatten

    def fscore(self, x):
        m = self.conv(x)
        m = m.sigmoid().exp()
        return m

    def norm(self, x: torch.Tensor):
        return x / x.sum(dim=[2, 3], keepdim=True)

    def forward(self, x):
        input_x = x
        x = self.fscore(x)
        x = self.norm(x)
        x = x * input_x
        x = x.sum(dim=[2, 3], keepdim=not self.flatten)
        return x


class DeepFakeClassifier(nn.Module):
    def __init__(self, encoder, dropout_rate=0.0) -> None:
        super().__init__()
        self.encoder = encoder_params[encoder]["init_op"]()
        self.avg_pool = AdaptiveAvgPool2d((1, 1))
        self.dropout = Dropout(dropout_rate)
        self.fc = Linear(encoder_params[encoder]["features"], 1)

    def forward(self, x):
        x = self.encoder.forward_features(x)
        x = self.avg_pool(x).flatten(1)
        x = self.dropout(x)
        x = self.fc(x)
        return x


class DeepFakeClassifierGWAP(nn.Module):
    def __init__(self, encoder, dropout_rate=0.5) -> None:
        super().__init__()
        self.encoder = encoder_params[encoder]["init_op"]()
        self.avg_pool = GlobalWeightedAvgPool2d(encoder_params[encoder]["features"])
        self.dropout = Dropout(dropout_rate)
        self.fc = Linear(encoder_params[encoder]["features"], 1)

    def forward(self, x):
        x = self.encoder.forward_features(x)
        x = self.avg_pool(x).flatten(1)
        x = self.dropout(x)
        x = self.fc(x)
        return x
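
For orientation, a minimal usage sketch of the DeepFakeClassifier defined above. The "tf_efficientnet_b7_ns" entry in encoder_params is configured with pretrained=False, so construction needs no weight download; the 380x380 input resolution is an illustrative choice, not something this file fixes:

import torch
from models.classifiers import DeepFakeClassifier

# "tf_efficientnet_b7_ns" maps to pretrained=False in encoder_params above.
model = DeepFakeClassifier(encoder="tf_efficientnet_b7_ns").eval()

batch = torch.randn(2, 3, 380, 380)   # NCHW; resolution is an illustrative choice
with torch.no_grad():
    logits = model(batch)             # shape (2, 1): one raw logit per image
    probs = torch.sigmoid(logits)     # per-image fake probability
print(probs.shape)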
models/efficientnet.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:39774e1cc878ac2b587fd4dc1c96fba084c9fe5ee3106a43b560f6054a69ba26
size 133
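
This is a Git LFS pointer stub; after `git lfs pull` resolves it to the real model, the file could be run with onnxruntime. A minimal sketch, assuming the resolved object is a standard single-input image-classification export (the input shape of 1x3x380x380 is a guess):

import numpy as np
import onnxruntime as ort  # assumes onnxruntime is installed

sess = ort.InferenceSession("models/efficientnet.onnx",
                            providers=["CPUExecutionProvider"])
inp = sess.get_inputs()[0]            # input name/shape depend on how the model was exported
dummy = np.random.rand(1, 3, 380, 380).astype(np.float32)
outputs = sess.run(None, {inp.name: dummy})
print(outputs[0].shape)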
models/image.py
ADDED
@@ -0,0 +1,195 @@
import re
import os
import wget
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from models.rawnet import SincConv, Residual_block
from models.classifiers import DeepFakeClassifier


class ImageEncoder(nn.Module):
    def __init__(self, args):
        super(ImageEncoder, self).__init__()
        self.device = args.device
        self.args = args
        self.flatten = nn.Flatten()
        self.sigmoid = nn.Sigmoid()
        # self.fc = nn.Linear(in_features=2560, out_features=2)
        self.pretrained_image_encoder = args.pretrained_image_encoder
        self.freeze_image_encoder = args.freeze_image_encoder

        if not self.pretrained_image_encoder:
            self.model = DeepFakeClassifier(encoder="tf_efficientnet_b7_ns").to(self.device)
        else:
            self.pretrained_ckpt = torch.load(
                os.path.join('pretrained', 'final_999_DeepFakeClassifier_tf_efficientnet_b7_ns_0_23'),
                map_location=torch.device(self.args.device))
            self.state_dict = self.pretrained_ckpt.get("state_dict", self.pretrained_ckpt)

            self.model = DeepFakeClassifier(encoder="tf_efficientnet_b7_ns").to(self.device)
            print("Loading pretrained image encoder...")
            # strip the "module." prefix left over from DataParallel training
            self.model.load_state_dict(
                {re.sub(r"^module\.", "", k): v for k, v in self.state_dict.items()}, strict=True)
            print("Loaded pretrained image encoder.")

        if self.freeze_image_encoder:
            for name, param in self.model.named_parameters():
                param.requires_grad = False

        # self.model.fc = nn.Identity()

    def forward(self, x):
        x = self.model(x)
        out = self.sigmoid(x)
        # x = self.flatten(x)
        # out = self.fc(x)
        return out


class RawNet(nn.Module):
    def __init__(self, args):
        super(RawNet, self).__init__()

        self.device = args.device
        self.filts = [20, [20, 20], [20, 128], [128, 128]]

        self.Sinc_conv = SincConv(device=self.device,
                                  out_channels=self.filts[0],
                                  kernel_size=1024,
                                  in_channels=args.in_channels)

        self.first_bn = nn.BatchNorm1d(num_features=self.filts[0])
        self.selu = nn.SELU(inplace=True)
        self.block0 = nn.Sequential(Residual_block(nb_filts=self.filts[1], first=True))
        self.block1 = nn.Sequential(Residual_block(nb_filts=self.filts[1]))
        self.block2 = nn.Sequential(Residual_block(nb_filts=self.filts[2]))
        self.filts[2][0] = self.filts[2][1]
        self.block3 = nn.Sequential(Residual_block(nb_filts=self.filts[2]))
        self.block4 = nn.Sequential(Residual_block(nb_filts=self.filts[2]))
        self.block5 = nn.Sequential(Residual_block(nb_filts=self.filts[2]))
        self.avgpool = nn.AdaptiveAvgPool1d(1)

        self.fc_attention0 = self._make_attention_fc(in_features=self.filts[1][-1],
                                                     l_out_features=self.filts[1][-1])
        self.fc_attention1 = self._make_attention_fc(in_features=self.filts[1][-1],
                                                     l_out_features=self.filts[1][-1])
        self.fc_attention2 = self._make_attention_fc(in_features=self.filts[2][-1],
                                                     l_out_features=self.filts[2][-1])
        self.fc_attention3 = self._make_attention_fc(in_features=self.filts[2][-1],
                                                     l_out_features=self.filts[2][-1])
        self.fc_attention4 = self._make_attention_fc(in_features=self.filts[2][-1],
                                                     l_out_features=self.filts[2][-1])
        self.fc_attention5 = self._make_attention_fc(in_features=self.filts[2][-1],
                                                     l_out_features=self.filts[2][-1])

        self.bn_before_gru = nn.BatchNorm1d(num_features=self.filts[2][-1])
        self.gru = nn.GRU(input_size=self.filts[2][-1],
                          hidden_size=args.gru_node,
                          num_layers=args.nb_gru_layer,
                          batch_first=True)

        self.fc1_gru = nn.Linear(in_features=args.gru_node,
                                 out_features=args.nb_fc_node)
        self.fc2_gru = nn.Linear(in_features=args.nb_fc_node,
                                 out_features=args.nb_classes, bias=True)

        self.sig = nn.Sigmoid()
        self.logsoftmax = nn.LogSoftmax(dim=1)
        self.pretrained_audio_encoder = args.pretrained_audio_encoder
        self.freeze_audio_encoder = args.freeze_audio_encoder

        if self.pretrained_audio_encoder:
            print("Loading pretrained audio encoder")
            ckpt = torch.load(os.path.join('pretrained', 'RawNet.pth'),
                              map_location=torch.device(self.device))
            print("Loaded pretrained audio encoder")
            self.load_state_dict(ckpt, strict=True)

        if self.freeze_audio_encoder:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, x, y=None):
        nb_samp = x.shape[0]
        len_seq = x.shape[1]
        x = x.view(nb_samp, 1, len_seq)

        x = self.Sinc_conv(x)
        x = F.max_pool1d(torch.abs(x), 3)
        x = self.first_bn(x)
        x = self.selu(x)

        # six residual blocks, each gated by an FC attention branch
        x0 = self.block0(x)
        y0 = self.avgpool(x0).view(x0.size(0), -1)  # (batch, filter)
        y0 = self.fc_attention0(y0)
        y0 = self.sig(y0).view(y0.size(0), y0.size(1), -1)  # (batch, filter, 1)
        x = x0 * y0 + y0  # (batch, filter, time) x (batch, filter, 1)

        x1 = self.block1(x)
        y1 = self.avgpool(x1).view(x1.size(0), -1)
        y1 = self.fc_attention1(y1)
        y1 = self.sig(y1).view(y1.size(0), y1.size(1), -1)
        x = x1 * y1 + y1

        x2 = self.block2(x)
        y2 = self.avgpool(x2).view(x2.size(0), -1)
        y2 = self.fc_attention2(y2)
        y2 = self.sig(y2).view(y2.size(0), y2.size(1), -1)
        x = x2 * y2 + y2

        x3 = self.block3(x)
        y3 = self.avgpool(x3).view(x3.size(0), -1)
        y3 = self.fc_attention3(y3)
        y3 = self.sig(y3).view(y3.size(0), y3.size(1), -1)
        x = x3 * y3 + y3

        x4 = self.block4(x)
        y4 = self.avgpool(x4).view(x4.size(0), -1)
        y4 = self.fc_attention4(y4)
        y4 = self.sig(y4).view(y4.size(0), y4.size(1), -1)
        x = x4 * y4 + y4

        x5 = self.block5(x)
        y5 = self.avgpool(x5).view(x5.size(0), -1)
        y5 = self.fc_attention5(y5)
        y5 = self.sig(y5).view(y5.size(0), y5.size(1), -1)
        x = x5 * y5 + y5

        x = self.bn_before_gru(x)
        x = self.selu(x)
        x = x.permute(0, 2, 1)  # (batch, filt, time) >> (batch, time, filt)
        self.gru.flatten_parameters()
        x, _ = self.gru(x)
        x = x[:, -1, :]
        x = self.fc1_gru(x)
        x = self.fc2_gru(x)
        output = self.logsoftmax(x)

        return output

    def _make_attention_fc(self, in_features, l_out_features):
        l_fc = []
        l_fc.append(nn.Linear(in_features=in_features,
                              out_features=l_out_features))
        return nn.Sequential(*l_fc)

    def _make_layer(self, nb_blocks, nb_filts, first=False):
        layers = []
        for i in range(nb_blocks):
            first = first if i == 0 else False
            layers.append(Residual_block(nb_filts=nb_filts,
                                         first=first))
            if i == 0:
                nb_filts[0] = nb_filts[1]
        return nn.Sequential(*layers)
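
A hypothetical wiring sketch for the two encoders above. The attribute names match what the constructors read from `args`; the values (GRU sizes, class count, waveform length, image resolution) are illustrative guesses rather than settings taken from this repository:

from types import SimpleNamespace
import torch
from models.image import ImageEncoder, RawNet

args = SimpleNamespace(
    device="cpu",
    pretrained_image_encoder=False, freeze_image_encoder=False,   # True requires checkpoints under pretrained/
    pretrained_audio_encoder=False, freeze_audio_encoder=False,
    in_channels=1,
    gru_node=1024, nb_gru_layer=3, nb_fc_node=1024, nb_classes=2,  # illustrative sizes
)

image_model = ImageEncoder(args).eval()
audio_model = RawNet(args).eval()

with torch.no_grad():
    img_score = image_model(torch.randn(1, 3, 380, 380))  # sigmoid score, shape (1, 1)
    audio_score = audio_model(torch.randn(1, 64600))      # log-softmax, shape (1, nb_classes)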
models/links.txt
CHANGED
@@ -0,0 +1 @@
models/model.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0315e9ad76374c2e0f91249847d4b1c8ad8c2b20ac334836e8e79657daa4b63a
size 134
models/rawnet.py
ADDED
@@ -0,0 +1,360 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
import numpy as np
from torch.utils import data
from collections import OrderedDict
from torch.nn.parameter import Parameter


class SincConv(nn.Module):
    @staticmethod
    def to_mel(hz):
        return 2595 * np.log10(1 + hz / 700)

    @staticmethod
    def to_hz(mel):
        return 700 * (10 ** (mel / 2595) - 1)

    def __init__(self, device, out_channels, kernel_size, in_channels=1, sample_rate=16000,
                 stride=1, padding=0, dilation=1, bias=False, groups=1):
        super(SincConv, self).__init__()

        if in_channels != 1:
            msg = "SincConv only supports one input channel (here, in_channels = %i)" % in_channels
            raise ValueError(msg)

        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.sample_rate = sample_rate

        # Forcing the filters to be odd (i.e., perfectly symmetric)
        if kernel_size % 2 == 0:
            self.kernel_size = self.kernel_size + 1

        self.device = device
        self.stride = stride
        self.padding = padding
        self.dilation = dilation

        if bias:
            raise ValueError('SincConv does not support bias.')
        if groups > 1:
            raise ValueError('SincConv does not support groups.')

        # initialize filterbanks using the Mel scale
        NFFT = 512
        f = int(self.sample_rate / 2) * np.linspace(0, 1, int(NFFT / 2) + 1)
        fmel = self.to_mel(f)  # Hz to mel conversion
        fmelmax = np.max(fmel)
        fmelmin = np.min(fmel)
        filbandwidthsmel = np.linspace(fmelmin, fmelmax, self.out_channels + 1)
        filbandwidthsf = self.to_hz(filbandwidthsmel)  # mel to Hz conversion
        self.mel = filbandwidthsf
        self.hsupp = torch.arange(-(self.kernel_size - 1) / 2, (self.kernel_size - 1) / 2 + 1)
        self.band_pass = torch.zeros(self.out_channels, self.kernel_size)

    def forward(self, x):
        # build one Hamming-windowed ideal band-pass filter per Mel band
        for i in range(len(self.mel) - 1):
            fmin = self.mel[i]
            fmax = self.mel[i + 1]
            hHigh = (2 * fmax / self.sample_rate) * np.sinc(2 * fmax * self.hsupp / self.sample_rate)
            hLow = (2 * fmin / self.sample_rate) * np.sinc(2 * fmin * self.hsupp / self.sample_rate)
            hideal = hHigh - hLow

            self.band_pass[i, :] = Tensor(np.hamming(self.kernel_size)) * Tensor(hideal)

        band_pass_filter = self.band_pass.to(self.device)

        self.filters = band_pass_filter.view(self.out_channels, 1, self.kernel_size)

        return F.conv1d(x, self.filters, stride=self.stride,
                        padding=self.padding, dilation=self.dilation,
                        bias=None, groups=1)


class Residual_block(nn.Module):
    def __init__(self, nb_filts, first=False):
        super(Residual_block, self).__init__()
        self.first = first

        if not self.first:
            self.bn1 = nn.BatchNorm1d(num_features=nb_filts[0])

        self.lrelu = nn.LeakyReLU(negative_slope=0.3)

        self.conv1 = nn.Conv1d(in_channels=nb_filts[0],
                               out_channels=nb_filts[1],
                               kernel_size=3,
                               padding=1,
                               stride=1)

        self.bn2 = nn.BatchNorm1d(num_features=nb_filts[1])
        self.conv2 = nn.Conv1d(in_channels=nb_filts[1],
                               out_channels=nb_filts[1],
                               padding=1,
                               kernel_size=3,
                               stride=1)

        if nb_filts[0] != nb_filts[1]:
            self.downsample = True
            self.conv_downsample = nn.Conv1d(in_channels=nb_filts[0],
                                             out_channels=nb_filts[1],
                                             padding=0,
                                             kernel_size=1,
                                             stride=1)
        else:
            self.downsample = False
        self.mp = nn.MaxPool1d(3)

    def forward(self, x):
        identity = x
        if not self.first:
            out = self.bn1(x)
            out = self.lrelu(out)
        else:
            out = x

        out = self.conv1(out)  # convolve the pre-activated input
        out = self.bn2(out)
        out = self.lrelu(out)
        out = self.conv2(out)

        if self.downsample:
            identity = self.conv_downsample(identity)

        out += identity
        out = self.mp(out)
        return out


class RawNet(nn.Module):
    def __init__(self, d_args, device):
        super(RawNet, self).__init__()

        self.device = device

        self.Sinc_conv = SincConv(device=self.device,
                                  out_channels=d_args['filts'][0],
                                  kernel_size=d_args['first_conv'],
                                  in_channels=d_args['in_channels'])

        self.first_bn = nn.BatchNorm1d(num_features=d_args['filts'][0])
        self.selu = nn.SELU(inplace=True)
        self.block0 = nn.Sequential(Residual_block(nb_filts=d_args['filts'][1], first=True))
        self.block1 = nn.Sequential(Residual_block(nb_filts=d_args['filts'][1]))
        self.block2 = nn.Sequential(Residual_block(nb_filts=d_args['filts'][2]))
        d_args['filts'][2][0] = d_args['filts'][2][1]
        self.block3 = nn.Sequential(Residual_block(nb_filts=d_args['filts'][2]))
        self.block4 = nn.Sequential(Residual_block(nb_filts=d_args['filts'][2]))
        self.block5 = nn.Sequential(Residual_block(nb_filts=d_args['filts'][2]))
        self.avgpool = nn.AdaptiveAvgPool1d(1)

        self.fc_attention0 = self._make_attention_fc(in_features=d_args['filts'][1][-1],
                                                     l_out_features=d_args['filts'][1][-1])
        self.fc_attention1 = self._make_attention_fc(in_features=d_args['filts'][1][-1],
                                                     l_out_features=d_args['filts'][1][-1])
        self.fc_attention2 = self._make_attention_fc(in_features=d_args['filts'][2][-1],
                                                     l_out_features=d_args['filts'][2][-1])
        self.fc_attention3 = self._make_attention_fc(in_features=d_args['filts'][2][-1],
                                                     l_out_features=d_args['filts'][2][-1])
        self.fc_attention4 = self._make_attention_fc(in_features=d_args['filts'][2][-1],
                                                     l_out_features=d_args['filts'][2][-1])
        self.fc_attention5 = self._make_attention_fc(in_features=d_args['filts'][2][-1],
                                                     l_out_features=d_args['filts'][2][-1])

        self.bn_before_gru = nn.BatchNorm1d(num_features=d_args['filts'][2][-1])
        self.gru = nn.GRU(input_size=d_args['filts'][2][-1],
                          hidden_size=d_args['gru_node'],
                          num_layers=d_args['nb_gru_layer'],
                          batch_first=True)

        self.fc1_gru = nn.Linear(in_features=d_args['gru_node'],
                                 out_features=d_args['nb_fc_node'])
        self.fc2_gru = nn.Linear(in_features=d_args['nb_fc_node'],
                                 out_features=d_args['nb_classes'], bias=True)

        self.sig = nn.Sigmoid()
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, x, y=None):
        nb_samp = x.shape[0]
        len_seq = x.shape[1]
        x = x.view(nb_samp, 1, len_seq)

        x = self.Sinc_conv(x)
        x = F.max_pool1d(torch.abs(x), 3)
        x = self.first_bn(x)
        x = self.selu(x)

        # six residual blocks, each gated by an FC attention branch
        x0 = self.block0(x)
        y0 = self.avgpool(x0).view(x0.size(0), -1)  # (batch, filter)
        y0 = self.fc_attention0(y0)
        y0 = self.sig(y0).view(y0.size(0), y0.size(1), -1)  # (batch, filter, 1)
        x = x0 * y0 + y0  # (batch, filter, time) x (batch, filter, 1)

        x1 = self.block1(x)
        y1 = self.avgpool(x1).view(x1.size(0), -1)
        y1 = self.fc_attention1(y1)
        y1 = self.sig(y1).view(y1.size(0), y1.size(1), -1)
        x = x1 * y1 + y1

        x2 = self.block2(x)
        y2 = self.avgpool(x2).view(x2.size(0), -1)
        y2 = self.fc_attention2(y2)
        y2 = self.sig(y2).view(y2.size(0), y2.size(1), -1)
        x = x2 * y2 + y2

        x3 = self.block3(x)
        y3 = self.avgpool(x3).view(x3.size(0), -1)
        y3 = self.fc_attention3(y3)
        y3 = self.sig(y3).view(y3.size(0), y3.size(1), -1)
        x = x3 * y3 + y3

        x4 = self.block4(x)
        y4 = self.avgpool(x4).view(x4.size(0), -1)
        y4 = self.fc_attention4(y4)
        y4 = self.sig(y4).view(y4.size(0), y4.size(1), -1)
        x = x4 * y4 + y4

        x5 = self.block5(x)
        y5 = self.avgpool(x5).view(x5.size(0), -1)
        y5 = self.fc_attention5(y5)
        y5 = self.sig(y5).view(y5.size(0), y5.size(1), -1)
        x = x5 * y5 + y5

        x = self.bn_before_gru(x)
        x = self.selu(x)
        x = x.permute(0, 2, 1)  # (batch, filt, time) >> (batch, time, filt)
        self.gru.flatten_parameters()
        x, _ = self.gru(x)
        x = x[:, -1, :]
        x = self.fc1_gru(x)
        x = self.fc2_gru(x)
        output = self.logsoftmax(x)
        print(f"Spec output shape: {output.shape}")

        return output

    def _make_attention_fc(self, in_features, l_out_features):
        l_fc = []
        l_fc.append(nn.Linear(in_features=in_features,
                              out_features=l_out_features))
        return nn.Sequential(*l_fc)

    def _make_layer(self, nb_blocks, nb_filts, first=False):
        layers = []
        for i in range(nb_blocks):
            first = first if i == 0 else False
            layers.append(Residual_block(nb_filts=nb_filts,
                                         first=first))
            if i == 0:
                nb_filts[0] = nb_filts[1]
        return nn.Sequential(*layers)

    def summary(self, input_size, batch_size=-1, device="cuda", print_fn=None):
        if print_fn is None:
            print_fn = print
        model = self

        def register_hook(module):
            def hook(module, input, output):
                class_name = str(module.__class__).split(".")[-1].split("'")[0]
                module_idx = len(summary)

                m_key = "%s-%i" % (class_name, module_idx + 1)
                summary[m_key] = OrderedDict()
                summary[m_key]["input_shape"] = list(input[0].size())
                summary[m_key]["input_shape"][0] = batch_size
                if isinstance(output, (list, tuple)):
                    summary[m_key]["output_shape"] = [
                        [-1] + list(o.size())[1:] for o in output
                    ]
                else:
                    summary[m_key]["output_shape"] = list(output.size())
                    if len(summary[m_key]["output_shape"]) != 0:
                        summary[m_key]["output_shape"][0] = batch_size

                params = 0
                if hasattr(module, "weight") and hasattr(module.weight, "size"):
                    params += torch.prod(torch.LongTensor(list(module.weight.size())))
                    summary[m_key]["trainable"] = module.weight.requires_grad
                if hasattr(module, "bias") and hasattr(module.bias, "size"):
                    params += torch.prod(torch.LongTensor(list(module.bias.size())))
                summary[m_key]["nb_params"] = params

            if (
                not isinstance(module, nn.Sequential)
                and not isinstance(module, nn.ModuleList)
                and not (module == model)
            ):
                hooks.append(module.register_forward_hook(hook))

        device = device.lower()
        assert device in [
            "cuda",
            "cpu",
        ], "Input device is not valid, please specify 'cuda' or 'cpu'"

        if device == "cuda" and torch.cuda.is_available():
            dtype = torch.cuda.FloatTensor
        else:
            dtype = torch.FloatTensor
        if isinstance(input_size, tuple):
            input_size = [input_size]
        x = [torch.rand(2, *in_size).type(dtype) for in_size in input_size]
        summary = OrderedDict()
        hooks = []
        model.apply(register_hook)
        model(*x)
        for h in hooks:
            h.remove()

        print_fn("----------------------------------------------------------------")
        line_new = "{:>20} {:>25} {:>15}".format("Layer (type)", "Output Shape", "Param #")
        print_fn(line_new)
        print_fn("================================================================")
        total_params = 0
        total_output = 0
        trainable_params = 0
        for layer in summary:
            # input_shape, output_shape, trainable, nb_params
            line_new = "{:>20} {:>25} {:>15}".format(
                layer,
                str(summary[layer]["output_shape"]),
                "{0:,}".format(summary[layer]["nb_params"]),
            )
            total_params += summary[layer]["nb_params"]
            total_output += np.prod(summary[layer]["output_shape"])
            if "trainable" in summary[layer]:
                if summary[layer]["trainable"]:
                    trainable_params += summary[layer]["nb_params"]
            print_fn(line_new)
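
A usage sketch for this dict-configured RawNet. The 'filts' and 'first_conv' values mirror the configuration hard-coded in models/image.py; the GRU/FC sizes and the roughly 4-second waveform length are illustrative choices:

import torch
from models.rawnet import RawNet

d_args = {
    'filts': [20, [20, 20], [20, 128], [128, 128]],  # as hard-coded in models/image.py
    'first_conv': 1024,                              # Sinc filter length, ditto
    'in_channels': 1,
    'gru_node': 1024, 'nb_gru_layer': 3,             # illustrative sizes
    'nb_fc_node': 1024, 'nb_classes': 2,
}

model = RawNet(d_args, device='cpu').eval()
audio = torch.randn(2, 64600)          # batch of raw waveforms (~4 s at 16 kHz)
with torch.no_grad():
    log_probs = model(audio)           # (2, nb_classes) log-softmax scores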