Source code for flowvision.models.resnest

"""
Modified from https://github.com/zhanghang1989/ResNeSt/blob/master/resnest/torch/models/resnest.py
"""
import math

import oneflow as flow
import oneflow.nn as nn
import oneflow.nn.functional as F

from .utils import load_state_dict_from_url
from .registry import ModelCreator


__all__ = ["resnest50", "resnest101", "resnest200", "resnest269"]

model_urls = {
    "resnest50": "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNeSt/ResNeSt_50.zip",
    "resnest101": "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNeSt/ResNeSt_101.zip",
    "resnest200": "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNeSt/ResNeSt_200.zip",
    "resnest269": "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNeSt/ResNeSt_269.zip",
}


class SplAtConv2d(nn.Module):
    r""" Split-Attention Conv2d. 
    
    Args:
        in_channels (int): Number of input channels.
        channels (int): Number of channels in each split within a cardinal group.
        kernel_size (int): Size of convolutional kernel. 
        stride (tuple(int)): Stride of convolution. Default: (1, 1)
        padding (tuple(int)): Padding of convolution. Default: (0, 0)
        dilation (tuple(int)): Dilation of convolution. Default: (1, 1)
        groups (int): Number of feature map cardinal groups. Default: 1
        bias (bool): Default: True
        radix (int): Number of splits within a cardinal group. Default: 2
        reduction_factor (int): Reduction factor. Default: 4
        norm_layer: Normalization layer used in backbone network. Default: nn.BatchNorm2d
    """

    def __init__(
        self,
        in_channels,
        channels,
        kernel_size,
        stride=(1, 1),
        padding=(0, 0),
        dilation=(1, 1),
        groups=1,
        bias=True,
        radix=2,
        reduction_factor=4,
        norm_layer=None,
        **kwargs
    ):
        super(SplAtConv2d, self).__init__()
        padding = nn.modules.utils._pair(padding)
        inter_channels = max(in_channels * radix // reduction_factor, 32)
        self.radix = radix

        self.cardinality = groups
        self.channels = channels
        self.conv = nn.Conv2d(
            in_channels,
            channels * radix,
            kernel_size,
            stride,
            padding,
            dilation,
            groups=groups * radix,
            bias=bias,
            **kwargs
        )
        self.use_bn = norm_layer is not None
        if self.use_bn:
            self.bn0 = norm_layer(channels * radix)
        self.relu = nn.ReLU(inplace=True)
        self.fc1 = nn.Conv2d(channels, inter_channels, 1, groups=self.cardinality)
        if self.use_bn:
            self.bn1 = norm_layer(inter_channels)
        self.fc2 = nn.Conv2d(
            inter_channels, channels * radix, 1, groups=self.cardinality
        )
        self.rsoftmax = rSoftMax(radix, groups)

    def forward(self, x):
        x = self.conv(x)
        if self.use_bn:
            x = self.bn0(x)
        x = self.relu(x)

        batch, rchannel = x.shape[:2]
        if self.radix > 1:
            # sum the radix splits to form the input of the attention branch
            splits = flow.split(x, rchannel // self.radix, dim=1)
            gap = sum(splits)
        else:
            gap = x
        gap = F.adaptive_avg_pool2d(gap, 1)
        gap = self.fc1(gap)

        if self.use_bn:
            gap = self.bn1(gap)
        gap = self.relu(gap)

        atten = self.fc2(gap)
        atten = self.rsoftmax(atten).view(batch, -1, 1, 1)

        if self.radix > 1:
            attens = flow.split(atten, rchannel // self.radix, dim=1)
            out = sum([att * split for (att, split) in zip(attens, splits)])
        else:
            out = atten * x
        return out.contiguous()
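
# A minimal usage sketch (illustrative; not part of the original module):
# with radix=2 the layer internally produces channels * radix feature maps,
# then fuses them back down to `channels` via split attention.
#
#   conv = SplAtConv2d(32, 64, kernel_size=3, padding=1,
#                      radix=2, norm_layer=nn.BatchNorm2d)
#   y = conv(flow.randn(1, 32, 56, 56))  # y.shape: (1, 64, 56, 56)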


class rSoftMax(nn.Module):
    def __init__(self, radix, cardinality):
        super().__init__()
        self.radix = radix
        self.cardinality = cardinality

    def forward(self, x):
        batch = x.size(0)
        if self.radix > 1:
            x = x.view(batch, self.cardinality, self.radix, -1).transpose(1, 2)
            x = F.softmax(x, dim=1)
            x = x.reshape(batch, -1)
        else:
            x = flow.sigmoid(x)
        return x
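
# Shape walkthrough (illustrative comment): fc2 emits an attention tensor of
# shape (B, cardinality * radix * c, 1, 1). The view/transpose above moves the
# radix axis to dim 1 so that softmax normalizes across the radix splits of
# each channel group:
#   (B, card, radix, c) -> transpose(1, 2) -> (B, radix, card, c) -> softmax(dim=1)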


class GlobalAvgPool2d(nn.Module):
    def __init__(self):
        """Global average pooling over the input's spatial dimensions"""
        super(GlobalAvgPool2d, self).__init__()

    def forward(self, inputs):
        return nn.functional.adaptive_avg_pool2d(inputs, 1).view(inputs.size(0), -1)


class ResNestBottleneck(nn.Module):
    """ResNest Bottleneck
    """

    expansion = 4

    def __init__(
        self,
        inplanes,
        planes,
        stride=1,
        downsample=None,
        radix=1,
        cardinality=1,
        bottleneck_width=64,
        avd=False,
        avd_first=False,
        dilation=1,
        is_first=False,
        norm_layer=None,
        last_gamma=False,
    ):
        super(ResNestBottleneck, self).__init__()
        group_width = int(planes * (bottleneck_width / 64.0)) * cardinality
        self.conv1 = nn.Conv2d(inplanes, group_width, kernel_size=1, bias=False)
        self.bn1 = norm_layer(group_width)
        self.radix = radix
        self.avd = avd and (stride > 1 or is_first)
        self.avd_first = avd_first

        if self.avd:
            self.avd_layer = nn.AvgPool2d(3, stride, padding=1)
            stride = 1

        if radix >= 1:
            self.conv2 = SplAtConv2d(
                group_width,
                group_width,
                kernel_size=3,
                stride=stride,
                padding=dilation,
                dilation=dilation,
                groups=cardinality,
                bias=False,
                radix=radix,
                norm_layer=norm_layer,
            )
        else:
            self.conv2 = nn.Conv2d(
                group_width,
                group_width,
                kernel_size=3,
                stride=stride,
                padding=dilation,
                dilation=dilation,
                groups=cardinality,
                bias=False,
            )
            self.bn2 = norm_layer(group_width)

        self.conv3 = nn.Conv2d(group_width, planes * 4, kernel_size=1, bias=False)
        self.bn3 = norm_layer(planes * 4)

        if last_gamma:
            nn.init.zeros_(self.bn3.weight)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.dilation = dilation
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        if self.avd and self.avd_first:
            out = self.avd_layer(out)

        out = self.conv2(out)
        if self.radix == 0:
            out = self.bn2(out)
            out = self.relu(out)

        if self.avd and not self.avd_first:
            out = self.avd_layer(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out
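
# A minimal sketch of a standalone bottleneck (illustrative; shapes assume the
# defaults used by the resnest50 factory below):
#
#   block = ResNestBottleneck(
#       inplanes=256, planes=64, radix=2, cardinality=1,
#       bottleneck_width=64, avd=True, avd_first=False,
#       norm_layer=nn.BatchNorm2d,
#   )
#   y = block(flow.randn(1, 256, 56, 56))  # y.shape: (1, 256, 56, 56)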


class ResNest(nn.Module):
    """ResNest: 
    The OneFlow impl of : 'ResNeSt: Split-Attention Networks' - 
        https://arxiv.org/pdf/2004.08955.pdf
        
    Args:
        block: Class for the residual block. Default: ResNestBottleneck
        layers (list(int)) : Numbers of layers in each block.
        radix (int): Number of splits within a cardinal group. Default: 2
        groups (int): Number of feature map cardinal groups. Default: 1
    """

    def __init__(
        self,
        block,
        layers,
        radix=2,
        groups=1,
        bottleneck_width=64,
        num_classes=1000,
        dilated=False,
        dilation=1,
        deep_stem=False,
        stem_width=64,
        avg_down=False,
        avd=False,
        avd_first=False,
        final_drop=0.0,
        last_gamma=False,
        norm_layer=nn.BatchNorm2d,
    ):
        self.cardinality = groups
        self.bottleneck_width = bottleneck_width
        # ResNet-D params
        self.inplanes = stem_width * 2 if deep_stem else 64
        self.avg_down = avg_down
        self.last_gamma = last_gamma
        # ResNeSt params
        self.radix = radix
        self.avd = avd
        self.avd_first = avd_first

        super(ResNest, self).__init__()

        conv_layer = nn.Conv2d

        if deep_stem:
            self.conv1 = nn.Sequential(
                conv_layer(
                    3, stem_width, kernel_size=3, stride=2, padding=1, bias=False
                ),
                norm_layer(stem_width),
                nn.ReLU(inplace=True),
                conv_layer(
                    stem_width,
                    stem_width,
                    kernel_size=3,
                    stride=1,
                    padding=1,
                    bias=False,
                ),
                norm_layer(stem_width),
                nn.ReLU(inplace=True),
                conv_layer(
                    stem_width,
                    stem_width * 2,
                    kernel_size=3,
                    stride=1,
                    padding=1,
                    bias=False,
                ),
            )
        else:
            self.conv1 = conv_layer(
                3, 64, kernel_size=7, stride=2, padding=3, bias=False
            )
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(
            block, 64, layers[0], norm_layer=norm_layer, is_first=False
        )
        self.layer2 = self._make_layer(
            block, 128, layers[1], stride=2, norm_layer=norm_layer
        )
        if dilated or dilation == 4:
            self.layer3 = self._make_layer(
                block, 256, layers[2], stride=1, dilation=2, norm_layer=norm_layer
            )
            self.layer4 = self._make_layer(
                block, 512, layers[3], stride=1, dilation=4, norm_layer=norm_layer
            )
        elif dilation == 2:
            self.layer3 = self._make_layer(
                block, 256, layers[2], stride=2, dilation=1, norm_layer=norm_layer
            )
            self.layer4 = self._make_layer(
                block, 512, layers[3], stride=1, dilation=2, norm_layer=norm_layer
            )
        else:
            self.layer3 = self._make_layer(
                block, 256, layers[2], stride=2, norm_layer=norm_layer
            )
            self.layer4 = self._make_layer(
                block, 512, layers[3], stride=2, norm_layer=norm_layer
            )
        self.avgpool = GlobalAvgPool2d()
        self.drop = nn.Dropout(final_drop) if final_drop > 0.0 else None
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2.0 / n))
            elif isinstance(m, norm_layer):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(
        self,
        block,
        planes,
        blocks,
        stride=1,
        dilation=1,
        norm_layer=None,
        is_first=True,
    ):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            down_layers = []
            if self.avg_down:
                if dilation == 1:
                    down_layers.append(
                        nn.AvgPool2d(
                            kernel_size=stride,
                            stride=stride,
                            ceil_mode=True,
                            count_include_pad=False,
                        )
                    )
                else:
                    down_layers.append(
                        nn.AvgPool2d(
                            kernel_size=1,
                            stride=1,
                            ceil_mode=True,
                            count_include_pad=False,
                        )
                    )
                down_layers.append(
                    nn.Conv2d(
                        self.inplanes,
                        planes * block.expansion,
                        kernel_size=1,
                        stride=1,
                        bias=False,
                    )
                )
            else:
                down_layers.append(
                    nn.Conv2d(
                        self.inplanes,
                        planes * block.expansion,
                        kernel_size=1,
                        stride=stride,
                        bias=False,
                    )
                )
            down_layers.append(norm_layer(planes * block.expansion))
            downsample = nn.Sequential(*down_layers)

        layers = []
        if dilation == 1 or dilation == 2:
            layers.append(
                block(
                    self.inplanes,
                    planes,
                    stride,
                    downsample=downsample,
                    radix=self.radix,
                    cardinality=self.cardinality,
                    bottleneck_width=self.bottleneck_width,
                    avd=self.avd,
                    avd_first=self.avd_first,
                    dilation=1,
                    is_first=is_first,
                    norm_layer=norm_layer,
                    last_gamma=self.last_gamma,
                )
            )
        elif dilation == 4:
            layers.append(
                block(
                    self.inplanes,
                    planes,
                    stride,
                    downsample=downsample,
                    radix=self.radix,
                    cardinality=self.cardinality,
                    bottleneck_width=self.bottleneck_width,
                    avd=self.avd,
                    avd_first=self.avd_first,
                    dilation=2,
                    is_first=is_first,
                    norm_layer=norm_layer,
                    last_gamma=self.last_gamma,
                )
            )
        else:
            raise RuntimeError("=> unknown dilation size: {}".format(dilation))

        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(
                block(
                    self.inplanes,
                    planes,
                    radix=self.radix,
                    cardinality=self.cardinality,
                    bottleneck_width=self.bottleneck_width,
                    avd=self.avd,
                    avd_first=self.avd_first,
                    dilation=dilation,
                    norm_layer=norm_layer,
                    last_gamma=self.last_gamma,
                )
            )

        return nn.Sequential(*layers)
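
    # Note (illustrative): _make_layer stacks `blocks` bottleneck modules; only
    # the first receives the stride and the downsample path, while the rest run
    # at stride 1 on the expanded channel count. For example,
    #   self._make_layer(block, 128, 4, stride=2, norm_layer=norm_layer)
    # yields one strided block followed by three stride-1 blocks.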

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = flow.flatten(x, 1)
        if self.drop is not None:
            x = self.drop(x)
        x = self.fc(x)
        return x
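
# A minimal construction sketch (illustrative; the hyper-parameters mirror the
# resnest50 factory below):
#
#   net = ResNest(
#       ResNestBottleneck, [3, 4, 6, 3], radix=2, groups=1,
#       bottleneck_width=64, deep_stem=True, stem_width=32,
#       avg_down=True, avd=True, avd_first=False,
#   )
#   logits = net(flow.randn(1, 3, 224, 224))  # logits.shape: (1, 1000)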


def _create_resnest(arch, pretrained=False, progress=True, **model_kwargs):
    model = ResNest(**model_kwargs)
    if pretrained:
        state_dict = load_state_dict_from_url(model_urls[arch], progress=progress)
        model.load_state_dict(state_dict)
    return model


@ModelCreator.register_model
def resnest50(pretrained=False, progress=True, **kwargs):
    """
    Constructs the ResNeSt-50 model trained on ImageNet2012.

    .. note::
        ResNeSt-50 model from `"ResNeSt: Split-Attention Networks" <https://arxiv.org/abs/2004.08955>`_.
        The required input size of the model is 224x224.

    Args:
        pretrained (bool): Whether to download the pre-trained model on ImageNet. Default: ``False``
        progress (bool): If True, displays a progress bar of the download to stderr. Default: ``True``

    For example:

    .. code-block:: python

        >>> import flowvision
        >>> resnest50 = flowvision.models.resnest50(pretrained=False, progress=True)

    """
    model_kwargs = dict(
        block=ResNestBottleneck,
        layers=[3, 4, 6, 3],
        radix=2,
        groups=1,
        bottleneck_width=64,
        deep_stem=True,
        stem_width=32,
        avg_down=True,
        avd=True,
        avd_first=False,
        **kwargs
    )
    return _create_resnest(
        "resnest50", pretrained=pretrained, progress=progress, **model_kwargs
    )


@ModelCreator.register_model
def resnest101(pretrained=False, progress=True, **kwargs):
    """
    Constructs the ResNeSt-101 model trained on ImageNet2012.

    .. note::
        ResNeSt-101 model from `"ResNeSt: Split-Attention Networks" <https://arxiv.org/abs/2004.08955>`_.
        The required input size of the model is 256x256.

    Args:
        pretrained (bool): Whether to download the pre-trained model on ImageNet. Default: ``False``
        progress (bool): If True, displays a progress bar of the download to stderr. Default: ``True``

    For example:

    .. code-block:: python

        >>> import flowvision
        >>> resnest101 = flowvision.models.resnest101(pretrained=False, progress=True)

    """
    model_kwargs = dict(
        block=ResNestBottleneck,
        layers=[3, 4, 23, 3],
        radix=2,
        groups=1,
        bottleneck_width=64,
        deep_stem=True,
        stem_width=64,
        avg_down=True,
        avd=True,
        avd_first=False,
        **kwargs
    )
    return _create_resnest(
        "resnest101", pretrained=pretrained, progress=progress, **model_kwargs
    )


@ModelCreator.register_model
def resnest200(pretrained=False, progress=True, **kwargs):
    """
    Constructs the ResNeSt-200 model trained on ImageNet2012.

    .. note::
        ResNeSt-200 model from `"ResNeSt: Split-Attention Networks" <https://arxiv.org/abs/2004.08955>`_.
        The required input size of the model is 320x320.

    Args:
        pretrained (bool): Whether to download the pre-trained model on ImageNet. Default: ``False``
        progress (bool): If True, displays a progress bar of the download to stderr. Default: ``True``

    For example:

    .. code-block:: python

        >>> import flowvision
        >>> resnest200 = flowvision.models.resnest200(pretrained=False, progress=True)

    """
    model_kwargs = dict(
        block=ResNestBottleneck,
        layers=[3, 24, 36, 3],
        radix=2,
        groups=1,
        bottleneck_width=64,
        deep_stem=True,
        stem_width=64,
        avg_down=True,
        avd=True,
        avd_first=False,
        **kwargs
    )
    return _create_resnest(
        "resnest200", pretrained=pretrained, progress=progress, **model_kwargs
    )


@ModelCreator.register_model
def resnest269(pretrained=False, progress=True, **kwargs):
    """
    Constructs the ResNeSt-269 model trained on ImageNet2012.

    .. note::
        ResNeSt-269 model from `"ResNeSt: Split-Attention Networks" <https://arxiv.org/abs/2004.08955>`_.
        The required input size of the model is 416x416.

    Args:
        pretrained (bool): Whether to download the pre-trained model on ImageNet. Default: ``False``
        progress (bool): If True, displays a progress bar of the download to stderr. Default: ``True``

    For example:

    .. code-block:: python

        >>> import flowvision
        >>> resnest269 = flowvision.models.resnest269(pretrained=False, progress=True)

    """
    model_kwargs = dict(
        block=ResNestBottleneck,
        layers=[3, 30, 48, 8],
        radix=2,
        groups=1,
        bottleneck_width=64,
        deep_stem=True,
        stem_width=64,
        avg_down=True,
        avd=True,
        avd_first=False,
        **kwargs
    )
    return _create_resnest(
        "resnest269", pretrained=pretrained, progress=progress, **model_kwargs
    )