Commit 3ece3c41 by Chrol-Cannon, Joseph Dr (Computer Science)

add initial i3d features

parent 5d9ab943
# Borrowed from: https://github.com/yjxiong/tsn-pytorch/blob/master/transforms.py
import torchvision
import random
from PIL import Image
import numbers
import torch
import torchvision.transforms.functional as F
class GroupResize(object):
def __init__(self, size, interpolation=Image.BILINEAR):
self.worker = torchvision.transforms.Resize(size, interpolation)
def __call__(self, img_group):
return [self.worker(img) for img in img_group]
class GroupRandomCrop(object):
def __init__(self, size):
if isinstance(size, numbers.Number):
self.size = (int(size), int(size))
else:
self.size = size
def __call__(self, img_group):
w, h = img_group[0].size
th, tw = self.size
out_images = list()
x1 = random.randint(0, w - tw)
y1 = random.randint(0, h - th)
for img in img_group:
assert (img.size[0] == w and img.size[1] == h)
if w == tw and h == th:
out_images.append(img)
else:
out_images.append(img.crop((x1, y1, x1 + tw, y1 + th)))
return out_images
class GroupCenterCrop(object):
def __init__(self, size):
self.worker = torchvision.transforms.CenterCrop(size)
def __call__(self, img_group):
return [self.worker(img) for img in img_group]
class GroupRandomHorizontalFlip(object):
def __call__(self, img_group):
if random.random() < 0.5:
img_group = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group]
return img_group
class GroupNormalize(object):
def __init__(self, mean, std):
self.mean = mean
self.std = std
def __call__(self, tensor): # (T, 3, 224, 224)
for b in range(tensor.size(0)):
for t, m, s in zip(tensor[b], self.mean, self.std):
t.sub_(m).div_(s)
return tensor
class LoopPad(object):
def __init__(self, max_len):
self.max_len = max_len
def __call__(self, tensor):
length = tensor.size(0)
if length == self.max_len:
return tensor
# repeat the clip as many times as is necessary
n_pad = self.max_len - length
pad = [tensor] * (n_pad // length)
if n_pad % length > 0:
pad += [tensor[0:n_pad % length]]
tensor = torch.cat([tensor] + pad, 0)
return tensor
# NOTE: Returns [0-255] rather than torchvision's [0-1]
class ToTensor(object):
def __init__(self):
self.worker = lambda x: F.to_tensor(x) * 255
def __call__(self, img_group):
img_group = [self.worker(img) for img in img_group]
return torch.stack(img_group, 0)
class GroupMultiScaleCrop(object):
def __init__(self, output_size, scales=None, max_distort=1, fix_crop=True, more_fix_crop=True,
center_crop_only=False):
self.scales = scales if scales is not None else [1, .875, .75, .66]
self.max_distort = max_distort
self.fix_crop = fix_crop
self.more_fix_crop = more_fix_crop
self.center_crop_only = center_crop_only
        assert (not center_crop_only) or (max_distort == 0 and len(self.scales) == 1), \
            'Center crop should only be performed at test time (single scale, no distortion).'
self.output_size = output_size if not isinstance(output_size, int) else [output_size, output_size]
self.interpolation = Image.BILINEAR
def __call__(self, img_group):
im_size = img_group[0].size
crop_w, crop_h, offset_w, offset_h = self._sample_crop_size(im_size)
crop_img_group = [img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h)) for img in img_group]
ret_img_group = [img.resize((self.output_size[0], self.output_size[1]), self.interpolation)
for img in crop_img_group]
return ret_img_group, (offset_h, offset_w, crop_h, crop_w)
def _sample_crop_size(self, im_size):
image_w, image_h = im_size[0], im_size[1]
# find a crop size
base_size = min(image_w, image_h)
crop_sizes = [int(base_size * x) for x in self.scales]
crop_h = [self.output_size[1] if abs(x - self.output_size[1]) < 3 else x for x in crop_sizes]
crop_w = [self.output_size[0] if abs(x - self.output_size[0]) < 3 else x for x in crop_sizes]
pairs = []
for i, h in enumerate(crop_h):
for j, w in enumerate(crop_w):
if abs(i - j) <= self.max_distort:
pairs.append((w, h))
crop_pair = random.choice(pairs)
if not self.fix_crop:
w_offset = random.randint(0, image_w - crop_pair[0])
h_offset = random.randint(0, image_h - crop_pair[1])
else:
w_offset, h_offset = self._sample_fix_offset(image_w, image_h, crop_pair[0], crop_pair[1])
return crop_pair[0], crop_pair[1], w_offset, h_offset
def _sample_fix_offset(self, image_w, image_h, crop_w, crop_h):
offsets = self.fill_fix_offset(self.center_crop_only, self.more_fix_crop, image_w, image_h, crop_w, crop_h)
return random.choice(offsets)
@staticmethod
def fill_fix_offset(center_crop_only, more_fix_crop, image_w, image_h, crop_w, crop_h):
w_step = (image_w - crop_w) // 4
h_step = (image_h - crop_h) // 4
ret = list()
ret.append((0, 0)) # upper left
ret.append((2 * w_step, 2 * h_step)) # center
if center_crop_only:
return ret
ret.append((4 * w_step, 0)) # upper right
ret.append((0, 4 * h_step)) # lower left
ret.append((4 * w_step, 4 * h_step)) # lower right
if more_fix_crop:
ret.append((0, 2 * h_step)) # center left
ret.append((4 * w_step, 2 * h_step)) # center right
ret.append((2 * w_step, 4 * h_step)) # lower center
ret.append((2 * w_step, 0 * h_step)) # upper center
ret.append((1 * w_step, 1 * h_step)) # upper left quarter
ret.append((3 * w_step, 1 * h_step)) # upper right quarter
ret.append((1 * w_step, 3 * h_step)) # lower left quarter
            ret.append((3 * w_step, 3 * h_step))  # lower right quarter
return ret
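

# --- Hedged usage sketch (not part of the original file): composing the group
# --- transforms above on a short clip of PIL frames.  The frame count, crop
# --- size and normalization statistics below are illustrative assumptions.
if __name__ == '__main__':
    frames = [Image.new('RGB', (320, 240)) for _ in range(8)]  # dummy 8-frame clip
    transform = torchvision.transforms.Compose([
        GroupResize(256),
        GroupCenterCrop(224),
        ToTensor(),                                    # (T, 3, 224, 224), values in [0, 255]
        GroupNormalize(mean=[114.75, 114.75, 114.75],  # assumed per-channel stats on the 0-255 scale
                       std=[57.375, 57.375, 57.375]),
        LoopPad(max_len=16),                           # loop the clip up to 16 frames
    ])
    clip = transform(frames)
    print(clip.shape)  # torch.Size([16, 3, 224, 224])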
import torch
import torch.nn as nn
from i3D.resnet3d_xl import Net
import torch.nn.functional as F
'''
Video Classification Model library.
'''
class TrainingScheduleError(Exception):
pass
class VideoModel(nn.Module):
def __init__(self,
num_classes,
num_boxes,
num_videos=16,
restore_dict=None,
freeze_weights=None,
device=None,
loss_type='softmax'):
super(VideoModel, self).__init__()
self.device = device
self.num_frames = num_videos
self.num_classes = num_classes
# Network loads kinetic pre-trained weights in initialization
self.i3D = Net(num_classes, extract_features=True, loss_type=loss_type)
try:
# Restore weights
if restore_dict:
self.restore(restore_dict)
# Freeze weights
if freeze_weights:
self.freeze_weights(freeze_weights)
else:
print(" > No weights are freezed")
except Exception as e:
print(" > Exception {}".format(e))
def restore(self, restore=None):
# Load pre-trained I3D + Graph weights for fine-tune (replace the last FC)
restore_finetuned = restore.get("restore_finetuned", None)
if restore_finetuned:
            self._restore_finetuned(restore_finetuned)
print(" > Restored I3D + Graph weights")
return
# Load pre-trained I3D weights
restore_i3d = restore.get("restore_i3d", None)
if restore_i3d:
self._restore_i3d(restore_i3d)
print(" > Restored only I3D weights")
return
# Load pre-trained I3D + Graph weights without replacing anything
restore_predict = restore.get("restore_predict", None)
if restore_predict:
self._restore_predict(restore_predict)
print(" > Restored the model with strict weights")
return
def _restore_predict(self, path):
if path is None:
raise TrainingScheduleError('You should pre-train the video model on your training data first')
weights = torch.load(path, map_location=self.device)['state_dict']
new_weights = {}
for k, v in weights.items():
new_weights[k.replace('module.', '')] = v
self.load_state_dict(new_weights, strict=True)
print(" > Weights {} loaded".format(path))
def _restore_i3d(self, path):
if path is None:
raise TrainingScheduleError('You should pre-train the video model on your training data first')
weights = torch.load(path, map_location=self.device)['state_dict']
new_weights = {}
for k, v in weights.items():
if not k.startswith('module.fc') and not k.startswith('module.i3D.classifier'):
new_weights[k.replace('module.', '')] = v
self.load_state_dict(new_weights, strict=False)
    def _restore_finetuned(self, path):
if path is None:
raise TrainingScheduleError('You should pre-train the video model on your training data first')
weights = torch.load(path, map_location=self.device)['state_dict']
new_weights = {}
for k, v in weights.items():
# Don't load classifiers (different classes 88 vs 86)
if not k.startswith('module.fc'):
if not k.startswith('module.i3D.classifier'):
new_weights[k.replace('module.', '')] = v
self.load_state_dict(new_weights, strict=False)
print(" > Weights {} loaded".format(path))
def freeze_weights(self, module):
if module == 'i3d':
print(" > Freeze I3D module")
for param in self.i3D.parameters():
param.requires_grad = False
elif module == 'fine_tuned':
print(" > Freeze Graph + I3D module, only last FC is training")
            # Freeze all parameters except the last FC layer
for name, param in self.i3D.named_parameters():
if not name.startswith('classifier'):
param.requires_grad = False
for param in self.graph_embedding.parameters():
param.requires_grad = False
for param in self.conv.parameters():
param.requires_grad = False
else:
            raise NotImplementedError('Unrecognized option, you can freeze either graph module or I3D module')
def _get_i3d_features(self, videos, output_video_features=False):
# org_features - [V x 2048 x T / 2 x 14 x 14]
_, org_features = self.i3D(videos)
# Reduce dimension video_features - [V x 512 x T / 2 x 14 x 14]
videos_features = self.conv(org_features)
bs, d, t, h, w = videos_features.size()
# Get global features
videos_features_rs = videos_features.permute(0, 2, 1, 3, 4) # [V x T / 2 x 512 x h x w]
videos_features_rs = videos_features_rs.reshape(-1, d, h, w) # [V * T / 2 x 512 x h x w]
global_features = self.avgpool(videos_features_rs) # [V * T / 2 x 512 x 1 x 1]
global_features = self.dropout(global_features)
global_features = global_features.reshape(bs, t, d) # [V x T / 2 x 512]
if output_video_features:
return global_features, videos_features
else:
return global_features
def flatten(self, x):
return [item for sublist in x for item in sublist]
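

# --- Hedged usage sketch (not part of the original file): constructing the I3D
# --- wrapper above.  The class counts and checkpoint path are placeholders; only
# --- the restore_dict keys ('restore_predict', 'restore_i3d', 'restore_finetuned')
# --- and the freeze_weights options ('i3d', 'fine_tuned') come from the code above.
# --- Note that building the model also constructs the Kinetics-pretrained backbone
# --- via i3D.resnet3d_xl.Net, so this assumes that import works in your environment.
if __name__ == '__main__':
    model = VideoModel(
        num_classes=174,                                      # assumed number of action classes
        num_boxes=4,
        num_videos=16,                                        # frames per clip
        restore_dict={'restore_i3d': 'i3d_checkpoint.pth'},   # placeholder path; load errors are caught and printed
        freeze_weights='i3d',
        device='cpu',
    )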
import torch
import torch.nn as nn
import torch.nn.functional as F
from i3D.resnet3d_xl import Net
from i3D.nonlocal_helper import Nonlocal
class VideoModelCoord(nn.Module):
def __init__(self, opt):
super(VideoModelCoord, self).__init__()
self.nr_boxes = opt.num_boxes
self.nr_actions = opt.num_classes
self.nr_frames = opt.num_frames // 2
self.coord_feature_dim = opt.coord_feature_dim
self.coord_to_feature = nn.Sequential(
nn.Linear(4, self.coord_feature_dim//2, bias=False),
nn.BatchNorm1d(self.coord_feature_dim//2),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim//2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.spatial_node_fusion = nn.Sequential(
nn.Linear(self.coord_feature_dim*2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.box_feature_fusion = nn.Sequential(
nn.Linear(self.nr_frames*self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.classifier = nn.Sequential(
nn.Linear(self.coord_feature_dim, self.coord_feature_dim),
# nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, 512), #self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(512, self.nr_actions)
)
if opt.fine_tune:
self.fine_tune(opt.fine_tune)
def fine_tune(self, restore_path, parameters_to_train=['classifier']):
weights = torch.load(restore_path)['state_dict']
new_weights = {}
#import pdb
for k, v in weights.items():
if not 'classifier.4' in k:
new_weights[k.replace('module.', '')] = v
#pdb.set_trace()
self.load_state_dict(new_weights, strict=False)
print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
frozen_weights = 0
for name, param in self.named_parameters():
if not 'classifier.4' in name:
param.requires_grad = False
frozen_weights += 1
else:
print('Training : {}'.format(name))
print('Number of frozen weights {}'.format(frozen_weights))
assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
'Check the naming convention of the parameters'
def forward(self, global_img_input, box_categories, box_input, video_label, is_inference=False):
# local_img_tensor is (b, nr_frames, nr_boxes, 3, h, w)
# global_img_tensor is (b, nr_frames, 3, h, w)
# box_input is (b, nr_frames, nr_boxes, 4)
b, _, _, _h, _w = global_img_input.size()
# global_imgs = global_img_input.view(b*self.nr_frames, 3, _h, _w)
# local_imgs = local_img_input.view(b*self.nr_frames*self.nr_boxes, 3, _h, _w)
box_input = box_input.transpose(2, 1).contiguous()
box_input = box_input.view(b*self.nr_boxes*self.nr_frames, 4)
bf = self.coord_to_feature(box_input)
bf = bf.view(b, self.nr_boxes, self.nr_frames, self.coord_feature_dim)
# spatial message passing (graph)
spatial_message = bf.sum(dim=1, keepdim=True) # (b, 1, self.nr_frames, coord_feature_dim)
        # subtract each node's own feature from the aggregate and normalize by the number of neighbours
        spatial_message = (spatial_message - bf) / (self.nr_boxes - 1)
bf_and_message = torch.cat([bf, spatial_message], dim=3) # (b, nr_boxes, nr_frames, 2*coord_feature_dim)
# (b*nr_boxes*nr_frames, coord_feature_dim)
bf_spatial = self.spatial_node_fusion(bf_and_message.view(b*self.nr_boxes*self.nr_frames, -1))
bf_spatial = bf_spatial.view(b, self.nr_boxes, self.nr_frames, self.coord_feature_dim)
bf_temporal_input = bf_spatial.view(b, self.nr_boxes, self.nr_frames*self.coord_feature_dim)
box_features = self.box_feature_fusion(bf_temporal_input.view(b*self.nr_boxes, -1)) # (b*nr_boxes, coord_feature_dim)
box_features = torch.mean(box_features.view(b, self.nr_boxes, -1), dim=1) # (b, coord_feature_dim)
# video_features = torch.cat([global_features, local_features, box_features], dim=1)
video_features = box_features
cls_output = self.classifier(video_features) # (b, num_classes)
return cls_output
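

# --- Hedged usage sketch (not part of the original file): running the coordinate-only
# --- graph model on random inputs.  The opt fields and tensor shapes are assumptions
# --- inferred from the shape comments in forward(); box coordinates are expected at
# --- half the clip frame rate (num_frames // 2).
if __name__ == '__main__':
    from argparse import Namespace
    opt = Namespace(num_boxes=4, num_classes=174, num_frames=16,
                    coord_feature_dim=256, fine_tune=None)
    model = VideoModelCoord(opt).eval()   # eval() so the BatchNorm layers use their running stats
    b, t, nb = 2, opt.num_frames // 2, opt.num_boxes
    global_img = torch.zeros(b, opt.num_frames, 3, 224, 224)  # only its batch size is used here
    box_categories = torch.zeros(b, t, nb)                    # unused by this coordinate-only variant
    box_input = torch.rand(b, t, nb, 4)                       # normalized box coordinates
    logits = model(global_img, box_categories, box_input, video_label=None)
    print(logits.shape)  # torch.Size([2, 174])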
class VideoModelCoordLatent(nn.Module):
def __init__(self, opt):
super(VideoModelCoordLatent, self).__init__()
self.nr_boxes = opt.num_boxes
self.nr_actions = opt.num_classes
self.nr_frames = opt.num_frames // 2
self.img_feature_dim = opt.img_feature_dim
self.coord_feature_dim = opt.coord_feature_dim
self.category_embed_layer = nn.Embedding(3, opt.coord_feature_dim // 2, padding_idx=0, scale_grad_by_freq=True)
self.coord_to_feature = nn.Sequential(
nn.Linear(4, self.coord_feature_dim//2, bias=False),
nn.BatchNorm1d(self.coord_feature_dim//2),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim//2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.coord_category_fusion = nn.Sequential(
nn.Linear(self.coord_feature_dim+self.coord_feature_dim//2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
)
self.spatial_node_fusion = nn.Sequential(
nn.Linear(self.coord_feature_dim*2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.box_feature_fusion = nn.Sequential(
nn.Linear(self.nr_frames*self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.classifier = nn.Sequential(
nn.Linear(self.coord_feature_dim, self.coord_feature_dim),
# nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, 512), #self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(512, self.nr_actions)
)
if opt.fine_tune:
self.fine_tune(opt.fine_tune)
def fine_tune(self, restore_path, parameters_to_train=['classifier']):
weights = torch.load(restore_path)['state_dict']
new_weights = {}
for k, v in weights.items():
if not 'classifier.4' in k:
new_weights[k.replace('module.', '')] = v
self.load_state_dict(new_weights, strict=False)
print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
frozen_weights = 0
for name, param in self.named_parameters():
if not 'classifier.4' in name:
param.requires_grad = False
frozen_weights += 1
else:
print('Training : {}'.format(name))
print('Number of frozen weights {}'.format(frozen_weights))
assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
'Check the naming convention of the parameters'
def forward(self, global_img_input, box_categories, box_input, video_label, is_inference=False):
# local_img_tensor is (b, nr_frames, nr_boxes, 3, h, w)
# global_img_tensor is (b, nr_frames, 3, h, w)
# box_input is (b, nr_frames, nr_boxes, 4)
b, _, _, _h, _w = global_img_input.size()
box_input = box_input.transpose(2, 1).contiguous()
box_input = box_input.view(b*self.nr_boxes*self.nr_frames, 4)
box_categories = box_categories.long()
box_categories = box_categories.transpose(2, 1).contiguous()
box_categories = box_categories.view(b*self.nr_boxes*self.nr_frames)
box_category_embeddings = self.category_embed_layer(box_categories) # (b*nr_b*nr_f, coord_feature_dim//2)
bf = self.coord_to_feature(box_input)
bf = torch.cat([bf, box_category_embeddings], dim=1) # (b*nr_b*nr_f, coord_feature_dim + coord_feature_dim//2)
bf = self.coord_category_fusion(bf) # (b*nr_b*nr_f, coord_feature_dim)
bf = bf.view(b, self.nr_boxes, self.nr_frames, self.coord_feature_dim)
# spatial message passing (graph)
spatial_message = bf.sum(dim=1, keepdim=True) # (b, 1, self.nr_frames, coord_feature_dim)
        # subtract each node's own feature from the aggregate and normalize by the number of neighbours
        spatial_message = (spatial_message - bf) / (self.nr_boxes - 1)
bf_and_message = torch.cat([bf, spatial_message], dim=3) # (b, nr_boxes, nr_frames, 2*coord_feature_dim)
# (b*nr_boxes*nr_frames, coord_feature_dim)
bf_spatial = self.spatial_node_fusion(bf_and_message.view(b*self.nr_boxes*self.nr_frames, -1))
bf_spatial = bf_spatial.view(b, self.nr_boxes, self.nr_frames, self.coord_feature_dim)
bf_temporal_input = bf_spatial.view(b, self.nr_boxes, self.nr_frames*self.coord_feature_dim)
box_features = self.box_feature_fusion(bf_temporal_input.view(b*self.nr_boxes, -1)) # (b*nr_boxes, coord_feature_dim)
box_features = torch.mean(box_features.view(b, self.nr_boxes, -1), dim=1) # (b, coord_feature_dim)
# video_features = torch.cat([global_features, local_features, box_features], dim=1)
video_features = box_features
cls_output = self.classifier(video_features) # (b, num_classes)
return cls_output
class VideoModelCoordLatentNL(nn.Module):
def __init__(self, opt):
super(VideoModelCoordLatentNL, self).__init__()
self.nr_boxes = opt.num_boxes
self.nr_actions = opt.num_classes
self.nr_frames = opt.num_frames // 2
self.img_feature_dim = opt.img_feature_dim
self.coord_feature_dim = opt.coord_feature_dim
self.category_embed_layer = nn.Embedding(3, opt.coord_feature_dim // 2, padding_idx=0, scale_grad_by_freq=True)
self.coord_to_feature = nn.Sequential(
nn.Linear(4, self.coord_feature_dim // 2, bias=False),
nn.BatchNorm1d(self.coord_feature_dim // 2),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim // 2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.coord_category_fusion = nn.Sequential(
nn.Linear(self.coord_feature_dim + self.coord_feature_dim // 2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
)
self.spatial_node_fusion = nn.Sequential(
nn.Linear(self.coord_feature_dim * 2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.nr_nonlocal_layers = 3
self.nonlocal_fusion = []
for i in range(self.nr_nonlocal_layers):
self.nonlocal_fusion.append(nn.Sequential(
Nonlocal(dim=self.coord_feature_dim, dim_inner=self.coord_feature_dim // 2),
nn.Conv1d(self.coord_feature_dim, self.coord_feature_dim, kernel_size=1, stride=1, padding=0,
bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
))
self.nonlocal_fusion = nn.ModuleList(self.nonlocal_fusion)
self.box_feature_fusion = nn.Sequential(
nn.Linear(self.nr_frames * self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.classifier = nn.Sequential(
nn.Linear(self.coord_feature_dim, self.coord_feature_dim),
# nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, 512), # self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(512, self.nr_actions)
)
if opt.fine_tune:
self.fine_tune(opt.fine_tune)
def train(self, mode=True): # overriding default train function
super(VideoModelCoordLatentNL, self).train(mode)
for m in self.modules(): # or self.modules(), if freezing all bn layers
if isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d):
m.eval()
# shutdown update in frozen mode
m.weight.requires_grad = False
m.bias.requires_grad = False
def fine_tune(self, restore_path, parameters_to_train=['classifier']):
weights = torch.load(restore_path)['state_dict']
new_weights = {}
# import pdb
for k, v in weights.items():
if not 'classifier.4' in k:
new_weights[k.replace('module.', '')] = v
# pdb.set_trace()
self.load_state_dict(new_weights, strict=False)
print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
frozen_weights = 0
for name, param in self.named_parameters():
if not 'classifier.4' in name:
param.requires_grad = False
frozen_weights += 1
else:
print('Training : {}'.format(name))
print('Number of frozen weights {}'.format(frozen_weights))
assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
'Check the naming convention of the parameters'
def forward(self, global_img_input, box_categories, box_input, video_label, is_inference=False):
# local_img_tensor is (b, nr_frames, nr_boxes, 3, h, w)
# global_img_tensor is (b, nr_frames, 3, h, w)
# box_input is (b, nr_frames, nr_boxes, 4)
b, _, _, _h, _w = global_img_input.size()
box_input = box_input.transpose(2, 1).contiguous()
box_input = box_input.view(b * self.nr_boxes * self.nr_frames, 4)
box_categories = box_categories.long()
box_categories = box_categories.transpose(2, 1).contiguous()
box_categories = box_categories.view(b * self.nr_boxes * self.nr_frames)
box_category_embeddings = self.category_embed_layer(box_categories) # (b*nr_b*nr_f, coord_feature_dim//2)
bf = self.coord_to_feature(box_input)
bf = torch.cat([bf, box_category_embeddings], dim=1) # (b*nr_b*nr_f, coord_feature_dim + coord_feature_dim//2)
bf = self.coord_category_fusion(bf) # (b*nr_b*nr_f, coord_feature_dim)
bf = bf.view(b, self.nr_boxes, self.nr_frames, self.coord_feature_dim)
# spatial message passing (graph)
spatial_message = bf.sum(dim=1, keepdim=True) # (b, 1, self.nr_frames, coord_feature_dim)
        # subtract each node's own feature from the aggregate and normalize by the number of neighbours
        spatial_message = (spatial_message - bf) / (self.nr_boxes - 1)
bf_and_message = torch.cat([bf, spatial_message], dim=3) # (b, nr_boxes, nr_frames, 2*coord_feature_dim)
# (b*nr_boxes*nr_frames, coord_feature_dim)
bf_spatial = self.spatial_node_fusion(bf_and_message.view(b * self.nr_boxes * self.nr_frames, -1))
bf_spatial = bf_spatial.view(b, self.nr_boxes, self.nr_frames, self.coord_feature_dim)
bf_temporal_input = bf_spatial.view(b, self.nr_boxes, self.nr_frames * self.coord_feature_dim)
bf_nonlocal = self.box_feature_fusion(
bf_temporal_input.view(b * self.nr_boxes, -1)) # (b*nr_boxes, coord_feature_dim)
bf_nonlocal = bf_nonlocal.view(b, self.nr_boxes, self.coord_feature_dim).permute(0, 2,
1).contiguous() # (N, C, NB)
for i in range(self.nr_nonlocal_layers):
bf_nonlocal = self.nonlocal_fusion[i](bf_nonlocal)
box_features = torch.mean(bf_nonlocal, dim=2) # (b, coord_feature_dim)
# video_features = torch.cat([global_features, local_features, box_features], dim=1)
video_features = box_features
cls_output = self.classifier(video_features) # (b, num_classes)
return cls_output
class VideoModelGlobalCoordLatent(nn.Module):
"""
This model contains only global pooling without any graph.
"""
def __init__(self, opt,
):
super(VideoModelGlobalCoordLatent, self).__init__()
self.nr_boxes = opt.num_boxes
self.nr_actions = opt.num_classes
self.nr_frames = opt.num_frames
self.img_feature_dim = opt.img_feature_dim
self.coord_feature_dim = opt.coord_feature_dim
self.i3D = Net(self.nr_actions, extract_features=True, loss_type='softmax')
self.dropout = nn.Dropout(0.3)
self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1))
self.conv = nn.Conv3d(2048, 512, kernel_size=(1, 1, 1), stride=1)
self.category_embed_layer = nn.Embedding(3, opt.coord_feature_dim // 2, padding_idx=0, scale_grad_by_freq=True)
self.c_coord_category_fusion = nn.Sequential(
nn.Linear(self.coord_feature_dim+self.coord_feature_dim//2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
)
self.c_coord_to_feature = nn.Sequential(
nn.Linear(4, self.coord_feature_dim // 2, bias=False),
nn.BatchNorm1d(self.coord_feature_dim // 2),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim // 2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.c_spatial_node_fusion = nn.Sequential(
nn.Linear(self.coord_feature_dim * 2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.c_box_feature_fusion = nn.Sequential(
nn.Linear((self.nr_frames // 2) * self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.classifier = nn.Sequential(
nn.Linear(self.coord_feature_dim + 2*self.img_feature_dim, self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, 512),
nn.ReLU(inplace=True),
nn.Linear(512, self.nr_actions)
)
if opt.fine_tune:
self.fine_tune(opt.fine_tune)
if opt.restore_i3d:
self.restore_i3d(opt.restore_i3d)
if opt.restore_custom:
self.restore_custom(opt.restore_custom)
def train(self, mode=True): # overriding default train function
super(VideoModelGlobalCoordLatent, self).train(mode)
for m in self.modules(): # or self.modules(), if freezing all bn layers
if isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d):
m.eval()
# shutdown update in frozen mode
m.weight.requires_grad = False
m.bias.requires_grad = False
def restore_custom(self, restore_path):
print("restoring path {}".format(restore_path))
weights = torch.load(restore_path)
ks = list(weights.keys())
print('\n\n BEFORE', weights[ks[0]][0,0,0])
new_weights = {}
# import pdb
for k, v in weights.items():
new_weights[k.replace('module.', '')] = v
self.load_state_dict(new_weights, strict=False)
print('\n\n AFTER', self.state_dict()[ks[0]][0,0, 0])
print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
frozen_weights = 0
for name, param in self.named_parameters():
if not name.startswith('classifier') :
param.requires_grad = False
frozen_weights += 1
else:
print('Training : {}'.format(name))
print('Number of frozen weights {}'.format(frozen_weights))
assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
'Check the naming convention of the parameters'
def restore_i3d(self, restore_path, parameters_to_train=['classifier']):
weights = torch.load(restore_path)['state_dict']
new_weights = {}
# import pdb
for k, v in weights.items():
if 'i3D' in k :
new_weights[k.replace('module.', '')] = v
# pdb.set_trace()
self.load_state_dict(new_weights, strict=False)
print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
for m in self.i3D.modules():
if isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d):
m.eval()
# shutdown update in frozen mode
m.weight.requires_grad = False
m.bias.requires_grad = False
frozen_weights = 0
for name, param in self.named_parameters():
if 'i3D' in name:
param.requires_grad = False
frozen_weights += 1
else:
print('Training : {}'.format(name))
print('Number of frozen weights {}'.format(frozen_weights))
assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
'Check the naming convention of the parameters'
def fine_tune(self, restore_path, parameters_to_train=['classifier']):
weights = torch.load(restore_path)['state_dict']
new_weights = {}
# import pdb
for k, v in weights.items():
            if 'classifier.4' not in k and 'i3D.classifier' not in k:
new_weights[k.replace('module.', '')] = v
# pdb.set_trace()
self.load_state_dict(new_weights, strict=False)
print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
frozen_weights = 0
for name, param in self.named_parameters():
if not 'classifier.4' in name:
param.requires_grad = False
frozen_weights += 1
else:
print('Training : {}'.format(name))
print('Number of frozen weights {}'.format(frozen_weights))
assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
'Check the naming convention of the parameters'
def forward(self, global_img_input, box_categories, box_input, video_label, is_inference=False):
"""
V: num of videos
T: num of frames
P: num of proposals
:param videos: [V x 3 x T x 224 x 224]
:param proposals_t: [V x T] List of BoxList (size of num_boxes each)
:return:
"""
# org_features - [V x 2048 x T / 2 x 14 x 14]
bs, _, _, _, _ = global_img_input.shape
y_i3d, org_features = self.i3D(global_img_input)
# Reduce dimension video_features - [V x 512 x T / 2 x 14 x 14]
videos_features = self.conv(org_features)
b = bs
box_input = box_input.transpose(2, 1).contiguous()
box_input = box_input.view(b * self.nr_boxes * (self.nr_frames//2), 4)
box_categories = box_categories.long()
box_categories = box_categories.transpose(2, 1).contiguous()
box_categories = box_categories.view(b * self.nr_boxes * (self.nr_frames // 2))
box_category_embeddings = self.category_embed_layer(box_categories) # (b*nr_b*nr_f, coord_feature_dim//2)
bf = self.c_coord_to_feature(box_input)
bf = torch.cat([bf, box_category_embeddings], dim=1) # (b*nr_b*nr_f, coord_feature_dim + coord_feature_dim//2)
bf = self.c_coord_category_fusion(bf) # (b*nr_b*nr_f, coord_feature_dim)
bf = bf.view(b, self.nr_boxes, self.nr_frames // 2, self.coord_feature_dim)
# spatial message passing (graph)
spatial_message = bf.sum(dim=1, keepdim=True) # (b, 1, self.nr_frames, coord_feature_dim)
        # subtract each node's own feature from the aggregate and normalize by the number of neighbours
        spatial_message = (spatial_message - bf) / (self.nr_boxes - 1)
bf_message_gf = torch.cat([bf, spatial_message], dim=3) # (b, nr_boxes, nr_frames, 2*coord_feature_dim)
# (b*nr_boxes*nr_frames, coord_feature_dim)
bf_spatial = self.c_spatial_node_fusion(bf_message_gf.view(b * self.nr_boxes * (self.nr_frames // 2), -1))
bf_spatial = bf_spatial.view(b, self.nr_boxes, self.nr_frames // 2, self.coord_feature_dim)
bf_temporal_input = bf_spatial.view(b, self.nr_boxes, (self.nr_frames // 2) * self.coord_feature_dim)
box_features = self.c_box_feature_fusion(
bf_temporal_input.view(b * self.nr_boxes, -1)) # (b*nr_boxes, img_feature_dim)
coord_ft = torch.mean(box_features.view(b, self.nr_boxes, -1), dim=1) # (b, coord_feature_dim)
# video_features = torch.cat([global_features, local_features, box_features], dim=1)
# _gf = self.global_new_fc(_gf)
_gf = videos_features.mean(-1).mean(-1).view(b, (self.nr_frames//2), 2*self.img_feature_dim)
_gf = _gf.mean(1)
video_features = torch.cat([_gf.view(b, -1), coord_ft], dim=-1)
cls_output = self.classifier(video_features) # (b, num_classes)
return cls_output
class VideoModelGlobalCoordLatentNL(nn.Module):
"""
This model contains only global pooling without any graph.
"""
def __init__(self, base_net, opt,
):
super(VideoModelGlobalCoordLatentNL, self).__init__()
self.nr_boxes = opt.num_boxes
self.nr_actions = opt.num_classes
self.nr_frames = opt.num_frames
self.img_feature_dim = opt.img_feature_dim
self.coord_feature_dim = opt.coord_feature_dim
self.i3D = Net(self.nr_actions, extract_features=True, loss_type='softmax')
self.dropout = nn.Dropout(0.3)
self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1))
self.conv = nn.Conv3d(2048, 512, kernel_size=(1, 1, 1), stride=1)
self.category_embed_layer = nn.Embedding(3, opt.coord_feature_dim // 2, padding_idx=0, scale_grad_by_freq=True)
self.c_coord_category_fusion = nn.Sequential(
nn.Linear(self.coord_feature_dim+self.coord_feature_dim//2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
)
self.c_coord_to_feature = nn.Sequential(
nn.Linear(4, self.coord_feature_dim // 2, bias=False),
nn.BatchNorm1d(self.coord_feature_dim // 2),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim // 2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.c_spatial_node_fusion = nn.Sequential(
nn.Linear(self.coord_feature_dim * 2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.nr_nonlocal_layers = 3
self.c_nonlocal_fusion = []
for i in range(self.nr_nonlocal_layers):
self.c_nonlocal_fusion.append(nn.Sequential(
Nonlocal(dim=self.coord_feature_dim, dim_inner=self.coord_feature_dim // 2),
nn.Conv1d(self.coord_feature_dim, self.coord_feature_dim, kernel_size=1, stride=1, padding=0, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
))
self.c_nonlocal_fusion = nn.ModuleList(self.c_nonlocal_fusion)
self.c_box_feature_fusion = nn.Sequential(
nn.Linear((self.nr_frames // 2) * self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.classifier = nn.Sequential(
nn.Linear(self.coord_feature_dim + 2*self.img_feature_dim, self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, 512),
nn.ReLU(inplace=True),
nn.Linear(512, self.nr_actions)
)
if opt.fine_tune:
self.fine_tune(opt.fine_tune)
if opt.restore_i3d:
self.restore_i3d(opt.restore_i3d)
if opt.restore_custom:
self.restore_custom(opt.restore_custom)
def restore_custom(self, restore_path):
print("restoring path {}".format(restore_path))
weights = torch.load(restore_path)
ks = list(weights.keys())
print('\n\n BEFORE', weights[ks[0]][0,0,0])
new_weights = {}
# import pdb
for k, v in weights.items():
new_weights[k.replace('module.', '')] = v
self.load_state_dict(new_weights, strict=False)
print('\n\n AFTER', self.state_dict()[ks[0]][0,0, 0])
print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
frozen_weights = 0
for name, param in self.named_parameters():
if not name.startswith('classifier') :
param.requires_grad = False
frozen_weights += 1
else:
print('Training : {}'.format(name))
print('Number of frozen weights {}'.format(frozen_weights))
assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
'Check the naming convention of the parameters'
def restore_i3d(self, restore_path, parameters_to_train=['classifier']):
weights = torch.load(restore_path)['state_dict']
new_weights = {}
# import pdb
for k, v in weights.items():
if 'i3D' in k or k.startswith('conv.'):
new_weights[k.replace('module.', '')] = v
# pdb.set_trace()
self.load_state_dict(new_weights, strict=False)
print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
for m in self.i3D.modules():
if isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d):
m.eval()
# shutdown update in frozen mode
m.weight.requires_grad = False
m.bias.requires_grad = False
frozen_weights = 0
for name, param in self.named_parameters():
            if 'i3D' in name or name.startswith('conv.'):
param.requires_grad = False
frozen_weights += 1
else:
print('Training : {}'.format(name))
print('Number of frozen weights {}'.format(frozen_weights))
assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
'Check the naming convention of the parameters'
def train(self, mode=True): # overriding default train function
super(VideoModelGlobalCoordLatentNL, self).train(mode)
for m in self.i3D.modules(): # or self.modules(), if freezing all bn layers
if isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d):
m.eval()
# shutdown update in frozen mode
m.weight.requires_grad = False
m.bias.requires_grad = False
def fine_tune(self, restore_path, parameters_to_train=['classifier']):
weights = torch.load(restore_path)['state_dict']
new_weights = {}
        # import pdb
for k, v in weights.items():
if not 'classifier.4' in k and 'i3D.classifier' not in k:
new_weights[k.replace('module.', '')] = v
        # pdb.set_trace()
self.load_state_dict(new_weights, strict=False)
print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
frozen_weights = 0
for name, param in self.named_parameters():
if not 'classifier.4' in name:
param.requires_grad = False
frozen_weights += 1
else:
print('Training : {}'.format(name))
print('Number of frozen weights {}'.format(frozen_weights))
assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
'Check the naming convention of the parameters'
def forward(self, global_img_input, box_categories, box_input, video_label, is_inference=False):
"""
V: num of videos
T: num of frames
P: num of proposals
:param videos: [V x 3 x T x 224 x 224]
:param proposals_t: [V x T] List of BoxList (size of num_boxes each)
:return:
"""
# org_features - [V x 2048 x T / 2 x 14 x 14]
bs, _, _, _, _ = global_img_input.shape
y_i3d, org_features = self.i3D(global_img_input)
# Reduce dimension video_features - [V x 512 x T / 2 x 14 x 14]
videos_features = self.conv(org_features)
b = bs
box_input = box_input.transpose(2, 1).contiguous()
box_input = box_input.view(b * self.nr_boxes * (self.nr_frames//2), 4)
box_categories = box_categories.long()
box_categories = box_categories.transpose(2, 1).contiguous()
box_categories = box_categories.view(b * self.nr_boxes * (self.nr_frames // 2))
box_category_embeddings = self.category_embed_layer(box_categories) # (b*nr_b*nr_f, coord_feature_dim//2)
bf = self.c_coord_to_feature(box_input)
bf = torch.cat([bf, box_category_embeddings], dim=1) # (b*nr_b*nr_f, coord_feature_dim + coord_feature_dim//2)
bf = self.c_coord_category_fusion(bf) # (b*nr_b*nr_f, coord_feature_dim)
bf = bf.view(b, self.nr_boxes, self.nr_frames // 2, self.coord_feature_dim)
# spatial message passing (graph)
spatial_message = bf.sum(dim=1, keepdim=True) # (b, 1, self.nr_frames, coord_feature_dim)
        # subtract each node's own feature from the aggregate and normalize by the number of neighbours
        spatial_message = (spatial_message - bf) / (self.nr_boxes - 1)
bf_message_gf = torch.cat([bf, spatial_message], dim=3) # (b, nr_boxes, nr_frames, 2*coord_feature_dim)
# (b*nr_boxes*nr_frames, coord_feature_dim)
bf_spatial = self.c_spatial_node_fusion(bf_message_gf.view(b * self.nr_boxes * (self.nr_frames // 2), -1))
bf_spatial = bf_spatial.view(b, self.nr_boxes, self.nr_frames // 2, self.coord_feature_dim)
bf_temporal_input = bf_spatial.view(b, self.nr_boxes, (self.nr_frames // 2) * self.coord_feature_dim)
bf_nonlocal = self.c_box_feature_fusion(
bf_temporal_input.view(b * self.nr_boxes, -1)) # (b*nr_boxes, img_feature_dim)
bf_nonlocal = bf_nonlocal.view(b, self.nr_boxes, self.coord_feature_dim).permute(0, 2, 1).contiguous() # (N, C, NB)
for i in range(self.nr_nonlocal_layers):
bf_nonlocal = self.c_nonlocal_fusion[i](bf_nonlocal)
coord_ft = torch.mean(bf_nonlocal, dim=2) # (b, coord_feature_dim)
# video_features = torch.cat([global_features, local_features, box_features], dim=1)
_gf = videos_features.mean(-1).mean(-1).view(b, (self.nr_frames//2), 2*self.img_feature_dim)
_gf = _gf.mean(1)
video_features = torch.cat([_gf.view(b, -1), coord_ft], dim=-1)
cls_output = self.classifier(video_features) # (b, num_classes)
return cls_output
class VideoGlobalModel(nn.Module):
"""
This model contains only global pooling without any graph.
"""
def __init__(self, opt,
):
super(VideoGlobalModel, self).__init__()
self.nr_boxes = opt.num_boxes
self.nr_actions = opt.num_classes
self.nr_frames = opt.num_frames
self.img_feature_dim = opt.img_feature_dim
self.coord_feature_dim = opt.coord_feature_dim
self.i3D = Net(self.nr_actions, extract_features=True, loss_type='softmax')
self.dropout = nn.Dropout(0.3)
self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1))
self.conv = nn.Conv3d(2048, 512, kernel_size=(1, 1, 1), stride=1)
self.fc = nn.Linear(512, self.nr_actions)
self.crit = nn.CrossEntropyLoss()
if opt.fine_tune:
self.fine_tune(opt.fine_tune)
def fine_tune(self, restore_path, parameters_to_train=['classifier']):
weights = torch.load(restore_path)['state_dict']
new_weights = {}
for k, v in weights.items():
if not 'fc' in k and not 'classifier' in k:
new_weights[k.replace('module.', '')] = v
self.load_state_dict(new_weights, strict=False)
print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
frozen_weights = 0
for name, param in self.named_parameters():
if not 'fc' in name:
param.requires_grad = False
frozen_weights += 1
else:
print('Training : {}'.format(name))
print('Number of frozen weights {}'.format(frozen_weights))
assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
'Check the naming convention of the parameters'
def forward(self, global_img_input, local_img_input, box_input, video_label, is_inference=False):
"""
V: num of videos
T: num of frames
P: num of proposals
:param videos: [V x 3 x T x 224 x 224]
:param proposals_t: [V x T] List of BoxList (size of num_boxes each)
:return:
"""
# org_features - [V x 2048 x T / 2 x 14 x 14]
y_i3d, org_features = self.i3D(global_img_input)
# Reduce dimension video_features - [V x 512 x T / 2 x 14 x 14]
videos_features = self.conv(org_features)
# Get global features - [V x 512]
global_features = self.avgpool(videos_features).squeeze()
global_features = self.dropout(global_features)
cls_output = self.fc(global_features)
return cls_output
class VideoModelGlobalCoord(nn.Module):
"""
This model contains only global pooling without any graph.
"""
def __init__(self, opt):
super(VideoModelGlobalCoord, self).__init__()
self.nr_boxes = opt.num_boxes
self.nr_actions = opt.num_classes
self.nr_frames = opt.num_frames
self.img_feature_dim = opt.img_feature_dim
self.coord_feature_dim = opt.coord_feature_dim
self.i3D = Net(self.nr_actions, extract_features=True, loss_type='softmax')
self.dropout = nn.Dropout(0.3)
self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1))
self.conv = nn.Conv3d(2048, 256, kernel_size=(1, 1, 1), stride=1)
self.global_new_fc = nn.Sequential(
nn.Linear(256, self.img_feature_dim, bias=False),
nn.BatchNorm1d(self.img_feature_dim),
nn.ReLU(inplace=True)
)
self.c_coord_to_feature = nn.Sequential(
nn.Linear(4, self.coord_feature_dim // 2, bias=False),
nn.BatchNorm1d(self.coord_feature_dim // 2),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim // 2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.c_spatial_node_fusion = nn.Sequential(
nn.Linear(self.coord_feature_dim * 2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.c_box_feature_fusion = nn.Sequential(
nn.Linear((self.nr_frames // 2) * self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.classifier = nn.Sequential(
nn.Linear(self.coord_feature_dim + self.img_feature_dim, self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, 512),
nn.ReLU(inplace=True),
nn.Linear(512, self.nr_actions)
)
if opt.fine_tune:
self.fine_tune(opt.fine_tune)
if opt.restore_i3d:
self.restore_i3d(opt.restore_i3d)
def train(self, mode=True): # overriding default train function
super(VideoModelGlobalCoord, self).train(mode)
for m in self.i3D.modules(): # or self.modules(), if freezing all bn layers
if isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d):
m.eval()
# shutdown update in frozen mode
m.weight.requires_grad = False
m.bias.requires_grad = False
def restore_i3d(self, restore_path, parameters_to_train=['classifier']):
weights = torch.load(restore_path)['state_dict']
new_weights = {}
# import pdb
for k, v in weights.items():
if 'i3D' in k :
new_weights[k.replace('module.', '')] = v
# pdb.set_trace()
self.load_state_dict(new_weights, strict=False)
print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
frozen_weights = 0
for name, param in self.named_parameters():
if 'i3D' in name:
param.requires_grad = False
frozen_weights += 1
else:
print('Training : {}'.format(name))
print('Number of frozen weights {}'.format(frozen_weights))
assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
'Check the naming convention of the parameters'
def fine_tune(self, restore_path, parameters_to_train=['classifier']):
weights = torch.load(restore_path)['state_dict']
new_weights = {}
# import pdb
for k, v in weights.items():
            if 'classifier.4' not in k and 'i3D.classifier' not in k:
new_weights[k.replace('module.', '')] = v
# pdb.set_trace()
self.load_state_dict(new_weights, strict=False)
print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
frozen_weights = 0
for name, param in self.named_parameters():
if not 'classifier.4' in name:
param.requires_grad = False
frozen_weights += 1
else:
print('Training : {}'.format(name))
print('Number of frozen weights {}'.format(frozen_weights))
assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
'Check the naming convention of the parameters'
def forward(self, global_img_input, box_categories, box_input, video_label, is_inference=False):
"""
V: num of videos
T: num of frames
P: num of proposals
:param videos: [V x 3 x T x 224 x 224]
:param proposals_t: [V x T] List of BoxList (size of num_boxes each)
:return:
"""
# org_features - [V x 2048 x T / 2 x 14 x 14]
bs, _, _, _, _ = global_img_input.shape
y_i3d, org_features = self.i3D(global_img_input)
# Reduce dimension video_features - [V x 512 x T / 2 x 14 x 14]
videos_features = self.conv(org_features)
b = bs
box_input = box_input.transpose(2, 1).contiguous()
box_input = box_input.view(b * self.nr_boxes * (self.nr_frames//2), 4)
bf = self.c_coord_to_feature(box_input)
bf = bf.view(b, self.nr_boxes, self.nr_frames // 2, self.coord_feature_dim)
# spatial message passing (graph)
spatial_message = bf.sum(dim=1, keepdim=True) # (b, 1, self.nr_frames, coord_feature_dim)
        # subtract each node's own feature from the aggregate and normalize by the number of neighbours
        spatial_message = (spatial_message - bf) / (self.nr_boxes - 1)
bf_message_gf = torch.cat([bf, spatial_message], dim=3) # (b, nr_boxes, nr_frames, 2*coord_feature_dim)
# (b*nr_boxes*nr_frames, coord_feature_dim)
bf_spatial = self.c_spatial_node_fusion(bf_message_gf.view(b * self.nr_boxes * (self.nr_frames // 2), -1))
bf_spatial = bf_spatial.view(b, self.nr_boxes, self.nr_frames // 2, self.coord_feature_dim)
bf_temporal_input = bf_spatial.view(b, self.nr_boxes, (self.nr_frames // 2) * self.coord_feature_dim)
box_features = self.c_box_feature_fusion(
bf_temporal_input.view(b * self.nr_boxes, -1)) # (b*nr_boxes, img_feature_dim)
coord_ft = torch.mean(box_features.view(b, self.nr_boxes, -1), dim=1) # (b, coord_feature_dim)
# video_features = torch.cat([global_features, local_features, box_features], dim=1)
_gf = videos_features.mean(-1).mean(-1).view(b*(self.nr_frames//2), self.img_feature_dim)
_gf = self.global_new_fc(_gf)
_gf = _gf.view(b, self.nr_frames // 2, self.img_feature_dim).mean(1)
video_features = torch.cat([_gf.view(b, -1), coord_ft], dim=-1)
cls_output = self.classifier(video_features) # (b, num_classes)
return cls_output
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
"""Non-local helper"""
import torch
import torch.nn as nn
class Nonlocal(nn.Module):
"""
Builds Non-local Neural Networks as a generic family of building
blocks for capturing long-range dependencies. Non-local Network
computes the response at a position as a weighted sum of the
features at all positions. This building block can be plugged into
many computer vision architectures.
More details in the paper: https://arxiv.org/pdf/1711.07971.pdf
"""
def __init__(
self,
dim,
dim_inner,
pool_size=None,
instantiation="softmax",
norm_type="layernorm",
zero_init_final_conv=True,
zero_init_final_norm=False,
norm_eps=1e-5,
norm_momentum=0.1,
):
"""
Args:
dim (int): number of dimension for the input.
dim_inner (int): number of dimension inside of the Non-local block.
            pool_size (list): the kernel sizes of spatio-temporal pooling,
                given in order as [temporal kernel, spatial kernel, spatial
                kernel]. By default pool_size is None and no pooling is used.
instantiation (string): supports two different instantiation method:
"dot_product": normalizing correlation matrix with L2.
"softmax": normalizing correlation matrix with Softmax.
norm_type (string): support BatchNorm and LayerNorm for
normalization.
"batchnorm": using BatchNorm for normalization.
"layernorm": using LayerNorm for normalization.
"none": not using any normalization.
zero_init_final_conv (bool): If true, zero initializing the final
convolution of the Non-local block.
zero_init_final_norm (bool):
If true, zero initializing the final batch norm of the Non-local
block.
"""
super(Nonlocal, self).__init__()
self.dim = dim
self.dim_inner = dim_inner
self.pool_size = pool_size
self.instantiation = instantiation
self.norm_type = norm_type
self.use_pool = (
False
if pool_size is None
else any((size > 1 for size in pool_size))
)
self.norm_eps = norm_eps
self.norm_momentum = norm_momentum
self._construct_nonlocal(zero_init_final_conv, zero_init_final_norm)
def _construct_nonlocal(self, zero_init_final_conv, zero_init_final_norm):
# Three convolution heads: theta, phi, and g.
self.conv_theta = nn.Conv1d(
self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0
)
self.conv_phi = nn.Conv1d(
self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0
)
self.conv_g = nn.Conv1d(
self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0
)
# Final convolution output.
self.conv_out = nn.Conv1d(
self.dim_inner, self.dim, kernel_size=1, stride=1, padding=0
)
# Zero initializing the final convolution output.
self.conv_out.zero_init = zero_init_final_conv
if self.norm_type == "batchnorm":
self.bn = nn.BatchNorm1d(
self.dim, eps=self.norm_eps, momentum=self.norm_momentum
)
# Zero initializing the final bn.
self.bn.transform_final_bn = zero_init_final_norm
elif self.norm_type == "layernorm":
            # In Caffe2 the LayerNorm op does not contain the scale and bias
# terms described in the paper:
# https://caffe2.ai/docs/operators-catalogue.html#layernorm
# Builds LayerNorm as GroupNorm with one single group.
# Setting Affine to false to align with Caffe2.
self.ln = nn.GroupNorm(1, self.dim, eps=self.norm_eps, affine=False)
elif self.norm_type == "none":
# Does not use any norm.
pass
else:
raise NotImplementedError(
"Norm type {} is not supported".format(self.norm_type)
)
# Optional to add the spatial-temporal pooling.
if self.use_pool:
self.pool = nn.MaxPool1d(
kernel_size=self.pool_size,
stride=self.pool_size,
padding=[0, 0, 0],
)
def forward(self, x):
x_identity = x
N, C, NB = x.size()
theta = self.conv_theta(x)
# Perform temporal-spatial pooling to reduce the computation.
if self.use_pool:
x = self.pool(x)
phi = self.conv_phi(x)
g = self.conv_g(x)
theta = theta.view(N, self.dim_inner, -1)
phi = phi.view(N, self.dim_inner, -1)
g = g.view(N, self.dim_inner, -1)
# (N, C, NB) * (N, C, NB) => (N, NB, NB).
theta_phi = torch.einsum("nct,ncp->ntp", (theta, phi))
# For original Non-local paper, there are two main ways to normalize
# the affinity tensor:
# 1) Softmax normalization (norm on exp).
# 2) dot_product normalization.
if self.instantiation == "softmax":
# Normalizing the affinity tensor theta_phi before softmax.
theta_phi = theta_phi * (self.dim_inner ** -0.5)
theta_phi = nn.functional.softmax(theta_phi, dim=2)
elif self.instantiation == "dot_product":
spatial_temporal_dim = theta_phi.shape[2]
theta_phi = theta_phi / spatial_temporal_dim
else:
raise NotImplementedError(
"Unknown norm type {}".format(self.instantiation)
)
# (N, NB, NB) * (N, C, NB) => (N, C, NB).
theta_phi_g = torch.einsum("ntg,ncg->nct", (theta_phi, g))
# (N, C, NB) => (N, C, NB).
theta_phi_g = theta_phi_g.view(N, self.dim_inner, NB)
p = self.conv_out(theta_phi_g)
if self.norm_type == "batchnorm":
p = self.bn(p)
elif self.norm_type == "layernorm":
p = self.ln(p)
return x_identity + p
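

# --- Hedged usage sketch (not part of the original file): applying the 1-D Non-local
# --- block to a (batch, channels, nodes) tensor, as the graph models above do over
# --- per-box features.  Dimensions are illustrative.
if __name__ == "__main__":
    block = Nonlocal(dim=256, dim_inner=128)  # default "softmax" instantiation with LayerNorm
    x = torch.randn(2, 256, 4)                # (N, C, number of boxes)
    y = block(x)
    print(y.shape)                            # torch.Size([2, 256, 4])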
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import math
import numpy as np
from functools import partial
__all__ = [
'ResNet', 'resnet10', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
'resnet152', 'resnet200',
]
def conv3x3x3(in_planes, out_planes, stride=1):
# 3x3x3 convolution with padding
return nn.Conv3d(
in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=1,
bias=False)
def downsample_basic_block(x, planes, stride):
out = F.avg_pool3d(x, kernel_size=1, stride=stride)
zero_pads = torch.Tensor(
out.size(0), planes - out.size(1), out.size(2), out.size(3),
out.size(4)).zero_()
if isinstance(out.data, torch.cuda.FloatTensor):
zero_pads = zero_pads.cuda()
out = Variable(torch.cat([out.data, zero_pads], dim=1))
return out
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm3d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3x3(planes, planes)
self.bn2 = nn.BatchNorm3d(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
conv_op = None
offset_groups = 1
def __init__(self, dim_in, dim_out, stride, dim_inner, group=1, use_temp_conv=1, temp_stride=1, dcn=False,
shortcut_type='B'):
super(Bottleneck, self).__init__()
# 1 x 1 layer
self.with_dcn = dcn
self.conv1 = self.Conv3dBN(dim_in, dim_inner, (1 + use_temp_conv * 2, 1, 1), (temp_stride, 1, 1),
(use_temp_conv, 0, 0))
self.relu = nn.ReLU(inplace=True)
# 3 x 3 layer
self.conv2 = self.Conv3dBN(dim_inner, dim_inner, (1, 3, 3), (1, stride, stride), (0, 1, 1))
# 1 x 1 layer
self.conv3 = self.Conv3dBN(dim_inner, dim_out, (1, 1, 1), (1, 1, 1), (0, 0, 0))
self.shortcut_type = shortcut_type
self.dim_in = dim_in
self.dim_out = dim_out
self.temp_stride = temp_stride
self.stride = stride
# nn.Conv3d(dim_in, dim_out, (1,1,1),(temp_stride,stride,stride),(0,0,0))
if self.shortcut_type == 'B':
if self.dim_in == self.dim_out and self.temp_stride == 1 and self.stride == 1: # or (self.dim_in == self.dim_out and self.dim_in == 64 and self.stride ==1):
pass
else:
# pass
self.shortcut = self.Conv3dBN(dim_in, dim_out, (1, 1, 1), (temp_stride, stride, stride), (0, 0, 0))
# nn.Conv3d(dim_in,dim_inner,kernel_size=(1+use_temp_conv*2,1,1),stride = (temp_stride,1,1),padding = )
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.relu(out)
out = self.conv2(out)
out = self.relu(out)
out = self.conv3(out)
if self.dim_in == self.dim_out and self.temp_stride == 1 and self.stride == 1:
pass
else:
residual = self.shortcut(residual)
out += residual
out = self.relu(out)
return out
def Conv3dBN(self, dim_in, dim_out, kernels, strides, pads, group=1):
if self.with_dcn and kernels[0] > 1:
# use deformable conv
return nn.Sequential(
self.conv_op(dim_in, dim_out, kernel_size=kernels, stride=strides, padding=pads, bias=False,
offset_groups=self.offset_groups),
nn.BatchNorm3d(dim_out)
)
else:
return nn.Sequential(
nn.Conv3d(dim_in, dim_out, kernel_size=kernels, stride=strides, padding=pads, bias=False),
nn.BatchNorm3d(dim_out)
)
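# Illustrative shape flow through one Bottleneck (example values: dim_in=256,
# dim_out=512, dim_inner=128, stride=2, use_temp_conv=1, temp_stride=1) on an input
# of shape (N, 256, T, 56, 56):
#     conv1, 3x1x1 temporal: (N, 128, T, 56, 56)
#     conv2, 1x3x3 spatial, stride 2: (N, 128, T, 28, 28)
#     conv3, 1x1x1: (N, 512, T, 28, 28), added to a 1x1x1 strided shortcut of the input.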
class ResNet(nn.Module):
def __init__(self,
block,
layers,
use_temp_convs_set,
temp_strides_set,
sample_size,
sample_duration,
shortcut_type='B',
num_classes=400,
stage_with_dcn=(False, False, False, False),
extract_features=False,
loss_type='softmax'):
super(ResNet, self).__init__()
self.extract_features = extract_features
self.stage_with_dcn = stage_with_dcn
self.group = 1
self.width_per_group = 64
self.dim_inner = self.group * self.width_per_group
# self.shortcut_type = shortcut_type
self.conv1 = nn.Conv3d(
3,
64,
kernel_size=(1 + use_temp_convs_set[0][0] * 2, 7, 7),
stride=(temp_strides_set[0][0], 2, 2),
padding=(use_temp_convs_set[0][0], 3, 3),
bias=False)
self.bn1 = nn.BatchNorm3d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool1 = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 0, 0))
with_dcn = bool(self.stage_with_dcn[0])
self.layer1 = self._make_layer(block, 64, 256, shortcut_type, stride=1, num_blocks=layers[0],
dim_inner=self.dim_inner, group=self.group, use_temp_convs=use_temp_convs_set[1],
temp_strides=temp_strides_set[1], dcn=with_dcn)
self.maxpool2 = nn.MaxPool3d(kernel_size=(2, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
with_dcn = bool(self.stage_with_dcn[1])
self.layer2 = self._make_layer(block, 256, 512, shortcut_type, stride=2, num_blocks=layers[1],
dim_inner=self.dim_inner * 2, group=self.group,
use_temp_convs=use_temp_convs_set[2], temp_strides=temp_strides_set[2],
dcn=with_dcn)
with_dcn = bool(self.stage_with_dcn[2])
self.layer3 = self._make_layer(block, 512, 1024, shortcut_type, stride=2, num_blocks=layers[2],
dim_inner=self.dim_inner * 4, group=self.group,
use_temp_convs=use_temp_convs_set[3], temp_strides=temp_strides_set[3],
dcn=with_dcn)
with_dcn = bool(self.stage_with_dcn[3])
self.layer4 = self._make_layer(block, 1024, 2048, shortcut_type, stride=1, num_blocks=layers[3],
dim_inner=self.dim_inner * 8, group=self.group,
use_temp_convs=use_temp_convs_set[4], temp_strides=temp_strides_set[4],
dcn=with_dcn)
last_duration = int(math.ceil(sample_duration / 2)) # int(math.ceil(sample_duration / 8))
last_size = int(math.ceil(sample_size / 16))
# self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) #nn.AdaptiveAvgPool3d((1, 1, 1)) #
self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1))
self.dropout = torch.nn.Dropout(p=0.5)
self.classifier = nn.Linear(2048, num_classes)
for m in self.modules():
# if isinstance(m, nn.Conv3d):
# m.weight = nn.init.kaiming_normal_(m.weight, mode='fan_out')
# elif isinstance(m,nn.Linear):
# m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out')
# elif
if isinstance(m, nn.BatchNorm3d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, block, dim_in, dim_out, shortcut_type, stride, num_blocks, dim_inner=None, group=None,
use_temp_convs=None, temp_strides=None, dcn=False):
if use_temp_convs is None:
use_temp_convs = np.zeros(num_blocks).astype(int)
if temp_strides is None:
temp_strides = np.ones(num_blocks).astype(int)
if len(use_temp_convs) < num_blocks:
for _ in range(num_blocks - len(use_temp_convs)):
use_temp_convs.append(0)
temp_strides.append(1)
layers = []
for idx in range(num_blocks):
block_stride = 2 if (idx == 0 and stride == 2) else 1
layers.append(
block(dim_in, dim_out, block_stride, dim_inner, group, use_temp_convs[idx], temp_strides[idx], dcn))
dim_in = dim_out
return nn.Sequential(*layers)
def forward_single(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool1(x)
x = self.layer1(x)
x = self.maxpool2(x)
x = self.layer2(x)
x = self.layer3(x)
features = self.layer4(x)
x = self.avgpool(features)
x = x.view(x.size(0), -1)
x = self.dropout(x)
y = self.classifier(x)
if self.extract_features:
return y, features
else:
return y
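# Illustrative shape flow for forward_single (i3d ResNet-50 arc, input N x 3 x T x 224 x 224):
# conv1 halves the spatial size, maxpool1 roughly halves it again, layer1 keeps it,
# maxpool2 halves the temporal length, layer2 and layer3 each halve the spatial size,
# and layer4 runs at stride 1, giving roughly (N, 2048, T/2, 14, 14) before the
# adaptive average pool, dropout and linear classifier produce (N, num_classes).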
def forward_multi(self, x):
clip_preds = []
# import ipdb;ipdb.set_trace()
for clip_idx in range(x.shape[1]): # B, 10, 3, 3, 32, 224, 224
spatial_crops = []
for crop_idx in range(x.shape[2]):
clip = x[:, clip_idx, crop_idx]
clip = self.forward_single(clip)
spatial_crops.append(clip)
spatial_crops = torch.stack(spatial_crops, 1).mean(1) # (B, 400)
clip_preds.append(spatial_crops)
clip_preds = torch.stack(clip_preds, 1).mean(1) # (B, 400)
return clip_preds
def forward(self, x):
# 5D tensor == single clip
if x.dim() == 5:
pred = self.forward_single(x)
# 7D tensor == 3 crops/10 clips
elif x.dim() == 7:
pred = self.forward_multi(x)
else:
raise ValueError('expected a 5D or 7D input, got a {}D tensor'.format(x.dim()))
# loss_dict = {}
# if 'label' in batch:
# loss = F.cross_entropy(pred, batch['label'], reduction='none')
# loss_dict = {'clf': loss}
return pred
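# Usage sketch (illustrative; `model` is a ResNet instance built with the helpers
# below): a 5D input is a single clip, a 7D input is the multi-clip / multi-crop
# evaluation layout handled by forward_multi.
#
#     clip = torch.randn(2, 3, 32, 224, 224)          # (B, C, T, H, W)
#     logits = model(clip)                             # (2, num_classes)
#     logits, feats = model(clip)                      # when extract_features=True
#
#     clips = torch.randn(2, 10, 3, 3, 32, 224, 224)   # (B, clips, crops, C, T, H, W)
#     logits = model(clips)                            # averaged over clips and crops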
def get_fine_tuning_parameters(model, ft_begin_index):
if ft_begin_index == 0:
return model.parameters()
ft_module_names = []
for i in range(ft_begin_index, 5):
ft_module_names.append('layer{}'.format(i))
# The classification head of this model is named 'classifier' (see ResNet above), not 'fc'.
ft_module_names.append('classifier')
parameters = []
for k, v in model.named_parameters():
for ft_module in ft_module_names:
if ft_module in k:
parameters.append({'params': v})
break
else:
parameters.append({'params': v, 'lr': 0.0})
return parameters
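# Usage sketch (illustrative): fine-tune layer3/layer4 and the head while keeping
# all earlier parameters at learning rate 0.
#
#     params = get_fine_tuning_parameters(model, ft_begin_index=3)
#     optimizer = torch.optim.SGD(params, lr=1e-3, momentum=0.9)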
def obtain_arc(arc_type):
# c2d, ResNet50
if arc_type == 1:
use_temp_convs_1 = [0]
temp_strides_1 = [2]
use_temp_convs_2 = [0, 0, 0]
temp_strides_2 = [1, 1, 1]
use_temp_convs_3 = [0, 0, 0, 0]
temp_strides_3 = [1, 1, 1, 1]
use_temp_convs_4 = [0, ] * 6
temp_strides_4 = [1, ] * 6
use_temp_convs_5 = [0, 0, 0]
temp_strides_5 = [1, 1, 1]
# i3d, ResNet50
if arc_type == 2:
use_temp_convs_1 = [2]
temp_strides_1 = [1]
use_temp_convs_2 = [1, 1, 1]
temp_strides_2 = [1, 1, 1]
use_temp_convs_3 = [1, 0, 1, 0]
temp_strides_3 = [1, 1, 1, 1]
use_temp_convs_4 = [1, 0, 1, 0, 1, 0]
temp_strides_4 = [1, 1, 1, 1, 1, 1]
use_temp_convs_5 = [0, 1, 0]
temp_strides_5 = [1, 1, 1]
# c2d, ResNet101
if arc_type == 3:
use_temp_convs_1 = [0]
temp_strides_1 = [2]
use_temp_convs_2 = [0, 0, 0]
temp_strides_2 = [1, 1, 1]
use_temp_convs_3 = [0, 0, 0, 0]
temp_strides_3 = [1, 1, 1, 1]
use_temp_convs_4 = [0, ] * 23
temp_strides_4 = [1, ] * 23
use_temp_convs_5 = [0, 0, 0]
temp_strides_5 = [1, 1, 1]
# i3d, ResNet101
if arc_type == 4:
use_temp_convs_1 = [2]
temp_strides_1 = [2]
use_temp_convs_2 = [1, 1, 1]
temp_strides_2 = [1, 1, 1]
use_temp_convs_3 = [1, 0, 1, 0]
temp_strides_3 = [1, 1, 1, 1]
use_temp_convs_4 = []
for i in range(23):
if i % 2 == 0:
use_temp_convs_4.append(1)
else:
use_temp_convs_4.append(0)
temp_strides_4 = [1, ] * 23
use_temp_convs_5 = [0, 1, 0]
temp_strides_5 = [1, 1, 1]
use_temp_convs_set = [use_temp_convs_1, use_temp_convs_2, use_temp_convs_3, use_temp_convs_4, use_temp_convs_5]
temp_strides_set = [temp_strides_1, temp_strides_2, temp_strides_3, temp_strides_4, temp_strides_5]
return use_temp_convs_set, temp_strides_set
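# Example (from the definitions above): obtain_arc(2) is the i3d ResNet-50 setting.
# The stem uses a 5x7x7 convolution (use_temp_convs_1=[2] -> temporal kernel 1+2*2),
# every block of layer1 and alternating blocks of layer2/layer3 use 3x1x1 temporal
# convolutions, only the middle block of layer4 does, and all temporal strides are 1.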
def resnet10(**kwargs):
"""Constructs a ResNet-10 model.
"""
use_temp_convs_set = []
temp_strides_set = []
model = ResNet(BasicBlock, [1, 1, 1, 1], use_temp_convs_set, temp_strides_set, **kwargs)
return model
def resnet18(**kwargs):
"""Constructs a ResNet-18 model.
"""
use_temp_convs_set = []
temp_strides_set = []
model = ResNet(BasicBlock, [2, 2, 2, 2], use_temp_convs_set, temp_strides_set, **kwargs)
return model
def resnet34(**kwargs):
"""Constructs a ResNet-34 model.
"""
use_temp_convs_set = []
temp_strides_set = []
model = ResNet(BasicBlock, [3, 4, 6, 3], use_temp_convs_set, temp_strides_set, **kwargs)
return model
def resnet50(extract_features, **kwargs):
"""Constructs a ResNet-50 model.
"""
use_temp_convs_set, temp_strides_set = obtain_arc(2)
model = ResNet(Bottleneck, [3, 4, 6, 3], use_temp_convs_set, temp_strides_set,
extract_features=extract_features, **kwargs)
return model
def resnet101(**kwargs):
"""Constructs a ResNet-101 model.
"""
use_temp_convs_set, temp_strides_set = obtain_arc(4)
model = ResNet(Bottleneck, [3, 4, 23, 3], use_temp_convs_set, temp_strides_set, **kwargs)
return model
def resnet152(**kwargs):
"""Constructs a ResNet-152 model.
"""
use_temp_convs_set = []
temp_strides_set = []
model = ResNet(Bottleneck, [3, 8, 36, 3], use_temp_convs_set, temp_strides_set, **kwargs)
return model
def resnet200(**kwargs):
"""Constructs a ResNet-200 model.
"""
use_temp_convs_set = []
temp_strides_set = []
model = ResNet(Bottleneck, [3, 24, 36, 3], use_temp_convs_set, temp_strides_set, **kwargs)
return model
def Net(num_classes, extract_features=False, loss_type='softmax',
weights=None, freeze_all_but_cls=False):
net = resnet50(
num_classes=num_classes,
sample_size=50,
sample_duration=32,
extract_features=extract_features,
loss_type=loss_type,
)
if weights is not None:
kinetics_weights = torch.load(weights)['state_dict']
print("Found weights in {}.".format(weights))
cls_name = 'fc'
else:
kinetics_weights = torch.load('i3D/kinetics-res50.pth')
cls_name = 'fc'
print('\n Restoring Kinetics weights \n')
new_weights = {}
for k, v in kinetics_weights.items():
if not k.startswith('module.' + cls_name):
new_weights[k.replace('module.', '')] = v
net.load_state_dict(new_weights, strict=False)
if freeze_all_but_cls:
for name, par in net.named_parameters():
if not name.startswith('classifier'):
par.requires_grad = False
return net
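# Usage sketch (illustrative; num_classes and paths are placeholders):
#
#     model = Net(num_classes=174, extract_features=True)
#     model.eval()
#     with torch.no_grad():
#         logits, feats = model(torch.randn(1, 3, 32, 224, 224))
#
# With weights=None this loads 'i3D/kinetics-res50.pth', so that checkpoint must be
# available; pass weights=<path> to restore a different state_dict instead.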
# -*- coding: utf-8 -*-
import os
import sys
import glob
import datetime
import argparse
import random
import numpy as np
from pathlib import Path
filepath = Path.cwd()
sys.path.append(str(filepath))
from video_loaders import load_av
from se_bb_from_np import annot_np
from SmthSequence import SmthSequence
from SmthFrameRelations import frame_relations
from PIL import Image
import cv2
import torch
from i3D.model import VideoModel
from i3D.model_lib import VideoModelGlobalCoordLatent
import i3D.gtransforms as gtransforms
class FrameFV:
def __init__(self,path,args):
self.anno = annot_np(path)
self.net = VideoModelGlobalCoordLatent(args)
self.pre_resize_shape = (224, 224)
self.random_crop = gtransforms.GroupMultiScaleCrop(output_size=224,
scales=[1],
max_distort=0,
center_crop_only=True)
def process_video(self,finput,verbose=False):
# get video id
vidnum = int(os.path.splitext(os.path.basename(finput))[0])
# load video to ndarray list
img_array = load_av(finput)
print(img_array[0].shape)
#for i in range(len(img_array)):
# img_array[i] = cv2.resize(img_array[i],self.pre_resize_shape)
img_array = [cv2.resize(img, (self.pre_resize_shape[1], self.pre_resize_shape[0])) for img in img_array]
# Split each frame into channel planes so the clip can be laid out as
# (channels, frames, H, W). Assuming load_av yields BGR frames (as the original
# variable names suggest), cv2.split returns the planes in B, G, R order.
# NOTE: as in the original code, only the first len(img_array) // 3 frames are used.
rs = []
gs = []
bs = []
for i in range(len(img_array) // 3):
b, g, r = cv2.split(img_array[i])
rs.append(r)
gs.append(g)
bs.append(b)
frames = [rs, gs, bs]  # RGB-ordered channel planes: (3, T, H, W) once stacked
#frames = [Image.fromarray(img.astype('uint8'), 'RGB') for img in img_array]
#frames, (offset_h, offset_w, crop_h, crop_w) = self.random_crop(frames)
# read frame annotations into Sequence
seq = SmthSequence()
for framenum in range(0,len(img_array)):
cats, bbs = self.anno.get_vf_bbx(vidnum, framenum+1)
# add detections to Sequence
for i in range(0,len(cats)):
seq.add(framenum, cats[i], bbs[i])
# compute object relations per frame
relations = []
for framenum in range(0,len(img_array)):
fv = frame_relations(seq, 0, 1, framenum)
relations.append(fv)
relations = np.asarray(relations)
# TODO bb category embedding per frame
# i3D features per frame
#clip = torch.from_numpy(np.asarray([[img_array[0],img_array[1],img_array[2]]]))
clip = torch.from_numpy(np.asarray([frames]))
#clip = img_array
print(clip.shape)
clip = clip.float()
glo, vid = self.net.i3D(clip)
videos_features = self.net.conv(vid)
print(glo.shape)
print(vid.shape)
print(videos_features.shape)
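# NOTE: process_video builds one relation vector per frame from the annotations and
# an i3D feature volume for the clip, but currently only prints their shapes and
# does not return them.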
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--annotations',
dest='path_to_annotations',
default='../annotations_ground/',
help='folder to load annotations from')
parser.add_argument(
'--video',
dest='path_to_video',
default='.',
help='video to load')
# begin import
parser.add_argument('--img_feature_dim', default=256, type=int, metavar='N',
help='intermediate feature dimension for image-based features')
parser.add_argument('--coord_feature_dim', default=128, type=int, metavar='N',
help='intermediate feature dimension for coord-based features')
parser.add_argument('--size', default=224, type=int, metavar='N',
help='primary image input size')
parser.add_argument('--batch_size', '-b', default=72, type=int,
metavar='N', help='mini-batch size (default: 72)')
parser.add_argument('--num_classes', default=50, type=int,
help='num of class in the model')
parser.add_argument('--num_boxes', default=4, type=int,
help='num of boxes for each image')
parser.add_argument('--num_frames', default=36, type=int,
help='num of frames for the model')
parser.add_argument('--fine_tune', help='path with ckpt to restore')
parser.add_argument('--restore_i3d')
parser.add_argument('--restore_custom')
# end import
args = parser.parse_args()
compfv = FrameFV(args.path_to_annotations, args)
fv = compfv.process_video(args.path_to_video, verbose=True)
print("fin")
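# Example invocation (illustrative; the script name and paths are placeholders):
#     python extract_frame_features.py --annotations ../annotations_ground/ --video 1234.webm
# The video filename must be the numeric video id, since process_video parses it with int().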