diff --git a/models.py b/models.py
index 3302431..40ea5c1 100644
--- a/models.py
+++ b/models.py
@@ -435,6 +435,53 @@ class TuckER(torch.nn.Module):
         return pred

+class PatchMerging(nn.Module):
+    r""" Patch Merging Layer (adapted from Swin Transformer).
+
+    Args:
+        dim (int): Number of input channels.
+        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
+    """
+
+    def __init__(self, dim, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
+        self.norm = norm_layer(2 * dim)
+
+    def forward(self, x):
+        """
+        x: B, C, H, W
+        """
+        B, C, H, W = x.shape
+        assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) is not even."
+
+        x = x.permute(0, 2, 3, 1)  # B H W C (view() would scramble the layout)
+
+        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
+        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
+        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
+        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
+        x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
+        x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C
+
+        x = self.reduction(x)
+        x = self.norm(x)
+
+        # back to channels-first so the layer is a drop-in replacement
+        # for the conv-based PatchEmbed downsampler: B 2*C H/2 W/2
+        x = x.view(B, H // 2, W // 2, 2 * C).permute(0, 3, 1, 2).contiguous()
+
+        return x
+
+    def extra_repr(self) -> str:
+        return f"dim={self.dim}"
+
+    def flops(self, input_resolution):
+        H, W = input_resolution
+        flops = (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
+        flops += H * W * self.dim // 2  # LayerNorm over 2*dim at (H/2)*(W/2) tokens
+        return flops


 class FouriER(torch.nn.Module):
     def __init__(self, params, hid_drop = None, embed_dim = None):
@@ -519,11 +566,14 @@ class FouriER(torch.nn.Module):
             if downsamples[i] or embed_dims[i] != embed_dims[i+1]:
                 # downsampling between two stages
                 network.append(
-                    PatchEmbed(
-                        patch_size=down_patch_size, stride=down_stride,
-                        padding=down_pad,
-                        in_chans=embed_dims[i], embed_dim=embed_dims[i+1]
-                    )
+                    # PatchEmbed(
+                    #     patch_size=down_patch_size, stride=down_stride,
+                    #     padding=down_pad,
+                    #     in_chans=embed_dims[i], embed_dim=embed_dims[i+1]
+                    # )
+                    # dim is the *input* channel count; the merge outputs
+                    # 2*embed_dims[i] == embed_dims[i+1] when dims double
+                    PatchMerging(dim=embed_dims[i])
                 )

         self.network = nn.ModuleList(network)
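
Sanity check (not part of the diff): a minimal sketch, assuming models.py is on PYTHONPATH and exposes the PatchMerging layer above. It confirms the merge halves H and W and doubles C, so its output shape lines up with the replaced PatchEmbed when embed_dims double between stages.

# smoke_test_patch_merging.py -- hypothetical sanity check, not part of the PR
import torch
from models import PatchMerging  # assumes models.py is importable

x = torch.randn(2, 64, 32, 32)   # B, C, H, W with even H and W
merge = PatchMerging(dim=64)     # dim = number of *input* channels
y = merge(x)
print(y.shape)                   # expected: torch.Size([2, 128, 16, 16])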