diff --git a/collate_fv.py b/collate_fv.py
index e8103d2aa74c8c202aca136292ad147ce83ec810..33944a03cd24ac68dba5f4583413ea0bca9b70f7 100644
--- a/collate_fv.py
+++ b/collate_fv.py
@@ -10,10 +10,11 @@ import json
 import cv2
 import numpy as np
 
-from pathlib import Path
+#from pathlib import Path
 #filepath = Path.cwd()
 #sys.path.append(filepath)
 from compute_fv import ComputeFV
+from kinetics_feats import I3dFV
 
 if __name__ == "__main__":
         parser = argparse.ArgumentParser()
@@ -48,6 +49,7 @@ if __name__ == "__main__":
                 dest='path_to_annotations',
                 default='../annotations_ground/',
                 help='folder to load annotations from')
+        
         args = parser.parse_args()
         
         labs = json.load(open(args.labels_file,'r'))
@@ -58,6 +60,7 @@ if __name__ == "__main__":
         
         # load something-else annotated videos
         compfv = ComputeFV(args.path_to_annotations)
+        i3dfv = I3dFV(args.path_to_annotations)
         
         #ids = [] # video id's in order of being processed
         #classes = [] # class of each video == ordinal of label
@@ -71,13 +74,14 @@ if __name__ == "__main__":
                 for filename in glob.glob(folder + "/*.webm"):
                         # only process a random % of the videos (if not positive class example)
                         if int(k) != args.action_id:
-                            if random.random() > 0.01:
+                            if random.random() > 0.02:
                                 continue
                         
                         print("processing file: " + filename)
                         vidnum = int(os.path.splitext(os.path.basename(filename))[0])
 
                         fv = compfv.process_video(filename, os.path.join(args.path_to_phase_models, 'a'+str(args.action_id)+'.joblib'))
+                        fv0 = i3dfv.process_video(filename)
                         
                         #if type(fv) is not np.ndarray:
                         #       continue
@@ -85,7 +89,7 @@ if __name__ == "__main__":
                         if int(k) == args.action_id:
                                 fv[-1] = 1
                         
-                        feats.append(fv)
+                        feats.append(np.concatenate([fv0.flatten(), fv]))
                         #ids.append(vidnum)
         
         # list of np.ndarray to 2d ndarray
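For context, a rough sketch of what the collated matrix looks like downstream of this change, assuming I3dFV.process_video returns an ndarray and, as in the hunk above, the last element of fv carries the binary label (all shapes below are illustrative, not taken from the repo):

    import numpy as np

    fv0 = np.zeros((1, 1024))            # assumed I3D clip descriptor
    fv = np.zeros(513)                   # assumed phase-model FV; fv[-1] is the 0/1 label
    row = np.concatenate([fv0.flatten(), fv])

    feats = np.vstack([row, row])        # one row per processed video
    X, y = feats[:, :-1], feats[:, -1]   # features vs. binary action label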
diff --git a/collect_action_features.sh b/collect_action_features.sh
index f31f518ab8831b2006872d52052564f81ad41c4f..2f87d7c94ffce9573923b1e1271583e544b8f744 100755
--- a/collect_action_features.sh
+++ b/collect_action_features.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-ACTIONS=( 131 141 )
+ACTIONS=( 1 12 13 16 25 33 34 35 38 48 51 52 54 55 56 58 61 63 64 66 69 71 75 76 78 80 81 82 88 89 95 96 97 102 111 118 127 128 130 131 133 136 138 141 147 152 155 159 160 163 )
 
 for i in ${ACTIONS[@]};
 do
diff --git a/i3D/i3dpt.py b/i3D/i3dpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..99d7487d5d8d558cd3d9c0ba6a82c483cb6759a8
--- /dev/null
+++ b/i3D/i3dpt.py
@@ -0,0 +1,455 @@
+import math
+import os
+import torch
+import numpy as np
+
+
+def get_padding_shape(filter_shape, stride, mod=0):
+    """Fetch a tuple describing the input padding shape.
+
+    NOTE: To replicate TF "SAME"-style padding, the padding shape has to be
+    determined at runtime, to handle the case where the input dimension is not
+    divisible by the stride.
+    See https://stackoverflow.com/a/49842071 for an explanation of the TF "SAME" padding logic.
+    """
+    def _pad_top_bottom(filter_dim, stride_val, mod):
+        if mod:
+            pad_along = max(filter_dim - mod, 0)
+        else:
+            pad_along = max(filter_dim - stride_val, 0)
+        pad_top = pad_along // 2
+        pad_bottom = pad_along - pad_top
+        return pad_top, pad_bottom
+
+    padding_shape = []
+    for idx, (filter_dim, stride_val) in enumerate(zip(filter_shape, stride)):
+        depth_mod = (idx == 0) and mod
+        pad_top, pad_bottom = _pad_top_bottom(filter_dim, stride_val, depth_mod)
+        padding_shape.append(pad_top)
+        padding_shape.append(pad_bottom)
+
+    depth_top = padding_shape.pop(0)
+    depth_bottom = padding_shape.pop(0)
+    padding_shape.append(depth_top)
+    padding_shape.append(depth_bottom)
+    return tuple(padding_shape)
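A worked example of the dynamic "SAME" padding above, computed from the function as written (the 3x3x3 kernel with stride 2 is only illustrative):

    # even input depth (depth % stride == 0): 0/1 pads in every dimension
    get_padding_shape((3, 3, 3), (2, 2, 2), mod=0)   # -> (0, 1, 0, 1, 0, 1)
    # odd input depth (depth % stride == 1): the temporal pads grow so the
    # output length stays ceil(input / stride), as TF "SAME" would produce
    get_padding_shape((3, 3, 3), (2, 2, 2), mod=1)   # -> (0, 1, 0, 1, 1, 1)
    # the depth pads are moved to the end of the tuple, matching the argument
    # order torch.nn.ConstantPad3d expects (depth/front-back last)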
+
+
+def simplify_padding(padding_shapes):
+    all_same = True
+    padding_init = padding_shapes[0]
+    for pad in padding_shapes[1:]:
+        if pad != padding_init:
+            all_same = False
+    return all_same, padding_init
+
+
+class Unit3Dpy(torch.nn.Module):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=(1, 1, 1),
+                 stride=(1, 1, 1),
+                 activation='relu',
+                 padding='SAME',
+                 use_bias=False,
+                 use_bn=True):
+        super(Unit3Dpy, self).__init__()
+
+        self.padding = padding
+        self.activation = activation
+        self.use_bn = use_bn
+        self.stride = stride
+        if padding == 'SAME':
+            padding_shape = get_padding_shape(kernel_size, stride)
+            simplify_pad, pad_size = simplify_padding(padding_shape)
+            self.simplify_pad = simplify_pad
+            if stride[0] > 1:
+                padding_shapes = [get_padding_shape(kernel_size, stride, mod) for
+                                  mod in range(stride[0])]
+            else:
+                padding_shapes = [padding_shape]
+        elif padding == 'VALID':
+            padding_shape = 0
+        else:
+            raise ValueError(
+                'padding should be in [VALID|SAME] but got {}'.format(padding))
+
+        if padding == 'SAME':
+            if not simplify_pad:
+                self.pads = [torch.nn.ConstantPad3d(x, 0) for x in padding_shapes]
+                self.conv3d = torch.nn.Conv3d(
+                    in_channels,
+                    out_channels,
+                    kernel_size,
+                    stride=stride,
+                    bias=use_bias)
+            else:
+                self.conv3d = torch.nn.Conv3d(
+                    in_channels,
+                    out_channels,
+                    kernel_size,
+                    stride=stride,
+                    padding=pad_size,
+                    bias=use_bias)
+        elif padding == 'VALID':
+            self.conv3d = torch.nn.Conv3d(
+                in_channels,
+                out_channels,
+                kernel_size,
+                padding=padding_shape,
+                stride=stride,
+                bias=use_bias)
+        else:
+            raise ValueError(
+                'padding should be in [VALID|SAME] but got {}'.format(padding))
+
+        if self.use_bn:
+            # This is not strictly the correct mapping between the epsilons in Keras
+            # and PyTorch (which define the batch-norm forward pass slightly
+            # differently), but it seems to be good enough. The PyTorch formula
+            # is described here:
+            # https://pytorch.org/docs/stable/_modules/torch/nn/modules/batchnorm.html
+            tf_style_eps = 1E-3
+            self.batch3d = torch.nn.BatchNorm3d(out_channels, eps=tf_style_eps)
+
+        if activation == 'relu':
+            self.activation = torch.nn.functional.relu
+
+    def forward(self, inp):
+        if self.padding == 'SAME' and self.simplify_pad is False:
+            # Determine the padding to be applied by examining the input shape
+            pad_idx = inp.shape[2] % self.stride[0]
+            pad_op = self.pads[pad_idx]
+            inp = pad_op(inp)
+        out = self.conv3d(inp)
+        if self.use_bn:
+            out = self.batch3d(out)
+        if self.activation is not None:
+            out = torch.nn.functional.relu(out)
+        return out
+
+
+class MaxPool3dTFPadding(torch.nn.Module):
+    def __init__(self, kernel_size, stride=None, padding='SAME'):
+        super(MaxPool3dTFPadding, self).__init__()
+        if padding == 'SAME':
+            padding_shape = get_padding_shape(kernel_size, stride)
+            self.padding_shape = padding_shape
+            self.stride = stride
+            if stride[0] > 1:
+                padding_shapes = [get_padding_shape(kernel_size, stride, mod) for
+                                  mod in range(stride[0])]
+            else:
+                padding_shapes = [padding_shape]
+            self.pads = [torch.nn.ConstantPad3d(x, 0) for x in padding_shapes]
+        self.pool = torch.nn.MaxPool3d(kernel_size, stride, ceil_mode=True)
+
+    def forward(self, inp):
+        pad_idx = inp.shape[2] % self.stride[0]
+        pad_op = self.pads[pad_idx]
+        inp = pad_op(inp)
+        out = self.pool(inp)
+        return out
+
+
+class Mixed(torch.nn.Module):
+    def __init__(self, in_channels, out_channels):
+        super(Mixed, self).__init__()
+        # Branch 0
+        self.branch_0 = Unit3Dpy(
+            in_channels, out_channels[0], kernel_size=(1, 1, 1))
+
+        # Branch 1
+        branch_1_conv1 = Unit3Dpy(
+            in_channels, out_channels[1], kernel_size=(1, 1, 1))
+        branch_1_conv2 = Unit3Dpy(
+            out_channels[1], out_channels[2], kernel_size=(3, 3, 3))
+        self.branch_1 = torch.nn.Sequential(branch_1_conv1, branch_1_conv2)
+
+        # Branch 2
+        branch_2_conv1 = Unit3Dpy(
+            in_channels, out_channels[3], kernel_size=(1, 1, 1))
+        branch_2_conv2 = Unit3Dpy(
+            out_channels[3], out_channels[4], kernel_size=(3, 3, 3))
+        self.branch_2 = torch.nn.Sequential(branch_2_conv1, branch_2_conv2)
+
+        # Branch 3
+        branch_3_pool = MaxPool3dTFPadding(
+            kernel_size=(3, 3, 3), stride=(1, 1, 1), padding='SAME')
+        branch_3_conv2 = Unit3Dpy(
+            in_channels, out_channels[5], kernel_size=(1, 1, 1))
+        self.branch_3 = torch.nn.Sequential(branch_3_pool, branch_3_conv2)
+
+    def forward(self, inp):
+        out_0 = self.branch_0(inp)
+        out_1 = self.branch_1(inp)
+        out_2 = self.branch_2(inp)
+        out_3 = self.branch_3(inp)
+        out = torch.cat((out_0, out_1, out_2, out_3), 1)
+        return out
+
+
+class I3D(torch.nn.Module):
+    def __init__(self,
+                 num_classes,
+                 modality='rgb',
+                 dropout_prob=0,
+                 name='inception'):
+        super(I3D, self).__init__()
+
+        self.name = name
+        self.num_classes = num_classes
+        if modality == 'rgb':
+            in_channels = 3
+        elif modality == 'flow':
+            in_channels = 2
+        else:
+            raise ValueError(
+                '{} not among known modalities [rgb|flow]'.format(modality))
+        self.modality = modality
+
+        conv3d_1a_7x7 = Unit3Dpy(
+            out_channels=64,
+            in_channels=in_channels,
+            kernel_size=(7, 7, 7),
+            stride=(2, 2, 2),
+            padding='SAME')
+        # 1st conv-pool
+        self.conv3d_1a_7x7 = conv3d_1a_7x7
+        self.maxPool3d_2a_3x3 = MaxPool3dTFPadding(
+            kernel_size=(1, 3, 3), stride=(1, 2, 2), padding='SAME')
+        # two stacked convs: 2b (1x1) then 2c (3x3)
+        conv3d_2b_1x1 = Unit3Dpy(
+            out_channels=64,
+            in_channels=64,
+            kernel_size=(1, 1, 1),
+            padding='SAME')
+        self.conv3d_2b_1x1 = conv3d_2b_1x1
+        conv3d_2c_3x3 = Unit3Dpy(
+            out_channels=192,
+            in_channels=64,
+            kernel_size=(3, 3, 3),
+            padding='SAME')
+        self.conv3d_2c_3x3 = conv3d_2c_3x3
+        self.maxPool3d_3a_3x3 = MaxPool3dTFPadding(
+            kernel_size=(1, 3, 3), stride=(1, 2, 2), padding='SAME')
+
+        # Mixed_3b
+        self.mixed_3b = Mixed(192, [64, 96, 128, 16, 32, 32])
+        self.mixed_3c = Mixed(256, [128, 128, 192, 32, 96, 64])
+
+        self.maxPool3d_4a_3x3 = MaxPool3dTFPadding(
+            kernel_size=(3, 3, 3), stride=(2, 2, 2), padding='SAME')
+
+        # Mixed 4
+        self.mixed_4b = Mixed(480, [192, 96, 208, 16, 48, 64])
+        self.mixed_4c = Mixed(512, [160, 112, 224, 24, 64, 64])
+        self.mixed_4d = Mixed(512, [128, 128, 256, 24, 64, 64])
+        self.mixed_4e = Mixed(512, [112, 144, 288, 32, 64, 64])
+        self.mixed_4f = Mixed(528, [256, 160, 320, 32, 128, 128])
+
+        self.maxPool3d_5a_2x2 = MaxPool3dTFPadding(
+            kernel_size=(2, 2, 2), stride=(2, 2, 2), padding='SAME')
+
+        # Mixed 5
+        self.mixed_5b = Mixed(832, [256, 160, 320, 32, 128, 128])
+        self.mixed_5c = Mixed(832, [384, 192, 384, 48, 128, 128])
+
+        self.avg_pool = torch.nn.AvgPool3d((2, 7, 7), (1, 1, 1))
+        self.dropout = torch.nn.Dropout(dropout_prob)
+        self.conv3d_0c_1x1 = Unit3Dpy(
+            in_channels=1024,
+            out_channels=self.num_classes,
+            kernel_size=(1, 1, 1),
+            activation=None,
+            use_bias=True,
+            use_bn=False)
+        self.softmax = torch.nn.Softmax(1)
+
+    def forward(self, inp):
+        # Stem: initial convolutions and pooling
+        out = self.conv3d_1a_7x7(inp)
+        out = self.maxPool3d_2a_3x3(out)
+        out = self.conv3d_2b_1x1(out)
+        out = self.conv3d_2c_3x3(out)
+        out = self.maxPool3d_3a_3x3(out)
+        out = self.mixed_3b(out)
+        out = self.mixed_3c(out)
+        out = self.maxPool3d_4a_3x3(out)
+        out = self.mixed_4b(out)
+        out = self.mixed_4c(out)
+        out = self.mixed_4d(out)
+        out = self.mixed_4e(out)
+        out = self.mixed_4f(out)
+        out = self.maxPool3d_5a_2x2(out)
+        out = self.mixed_5b(out)
+        out = self.mixed_5c(out)
+        out = self.avg_pool(out)
+        out = self.dropout(out)
+        out = self.conv3d_0c_1x1(out)
+        out = out.squeeze(3)
+        out = out.squeeze(3)
+        out = out.mean(2)
+        out_logits = out
+        out = self.softmax(out_logits)
+        return out, out_logits
+
+    def load_tf_weights(self, sess):
+        state_dict = {}
+        if self.modality == 'rgb':
+            prefix = 'RGB/inception_i3d'
+        elif self.modality == 'flow':
+            prefix = 'Flow/inception_i3d'
+        load_conv3d(state_dict, 'conv3d_1a_7x7', sess,
+                    os.path.join(prefix, 'Conv3d_1a_7x7'))
+        load_conv3d(state_dict, 'conv3d_2b_1x1', sess,
+                    os.path.join(prefix, 'Conv3d_2b_1x1'))
+        load_conv3d(state_dict, 'conv3d_2c_3x3', sess,
+                    os.path.join(prefix, 'Conv3d_2c_3x3'))
+
+        load_mixed(state_dict, 'mixed_3b', sess,
+                   os.path.join(prefix, 'Mixed_3b'))
+        load_mixed(state_dict, 'mixed_3c', sess,
+                   os.path.join(prefix, 'Mixed_3c'))
+        load_mixed(state_dict, 'mixed_4b', sess,
+                   os.path.join(prefix, 'Mixed_4b'))
+        load_mixed(state_dict, 'mixed_4c', sess,
+                   os.path.join(prefix, 'Mixed_4c'))
+        load_mixed(state_dict, 'mixed_4d', sess,
+                   os.path.join(prefix, 'Mixed_4d'))
+        load_mixed(state_dict, 'mixed_4e', sess,
+                   os.path.join(prefix, 'Mixed_4e'))
+        # From here on, the max error vs. the TF reference reaches ~0.1
+        load_mixed(state_dict, 'mixed_4f', sess,
+                   os.path.join(prefix, 'Mixed_4f'))
+
+        load_mixed(
+            state_dict,
+            'mixed_5b',
+            sess,
+            os.path.join(prefix, 'Mixed_5b'),
+            fix_typo=True)
+        load_mixed(state_dict, 'mixed_5c', sess,
+                   os.path.join(prefix, 'Mixed_5c'))
+        load_conv3d(
+            state_dict,
+            'conv3d_0c_1x1',
+            sess,
+            os.path.join(prefix, 'Logits', 'Conv3d_0c_1x1'),
+            bias=True,
+            bn=False)
+        self.load_state_dict(state_dict)
+
+
+def get_conv_params(sess, name, bias=False):
+    # Get conv weights
+    conv_weights_tensor = sess.graph.get_tensor_by_name(
+        os.path.join(name, 'w:0'))
+    if bias:
+        conv_bias_tensor = sess.graph.get_tensor_by_name(
+            os.path.join(name, 'b:0'))
+        conv_bias = sess.run(conv_bias_tensor)
+    conv_weights = sess.run(conv_weights_tensor)
+    conv_shape = conv_weights.shape
+
+    kernel_shape = conv_shape[0:3]
+    in_channels = conv_shape[3]
+    out_channels = conv_shape[4]
+
+    conv_op = sess.graph.get_operation_by_name(
+        os.path.join(name, 'convolution'))
+    padding_name = conv_op.get_attr('padding')
+    padding = _get_padding(padding_name, kernel_shape)
+    all_strides = conv_op.get_attr('strides')
+    strides = all_strides[1:4]
+    conv_params = [
+        conv_weights, kernel_shape, in_channels, out_channels, strides, padding
+    ]
+    if bias:
+        conv_params.append(conv_bias)
+    return conv_params
+
+
+def get_bn_params(sess, name):
+    moving_mean_tensor = sess.graph.get_tensor_by_name(
+        os.path.join(name, 'moving_mean:0'))
+    moving_var_tensor = sess.graph.get_tensor_by_name(
+        os.path.join(name, 'moving_variance:0'))
+    beta_tensor = sess.graph.get_tensor_by_name(os.path.join(name, 'beta:0'))
+    moving_mean = sess.run(moving_mean_tensor)
+    moving_var = sess.run(moving_var_tensor)
+    beta = sess.run(beta_tensor)
+    return moving_mean, moving_var, beta
+
+
+def _get_padding(padding_name, conv_shape):
+    padding_name = padding_name.decode("utf-8")
+    if padding_name == "VALID":
+        return [0, 0]
+    elif padding_name == "SAME":
+        # return [math.ceil(int(conv_shape[0])/2), math.ceil(int(conv_shape[1])/2)]
+        return [
+            math.floor(int(conv_shape[0]) / 2),
+            math.floor(int(conv_shape[1]) / 2),
+            math.floor(int(conv_shape[2]) / 2)
+        ]
+    else:
+        raise ValueError('Invalid padding name ' + padding_name)
+
+
+def load_conv3d(state_dict, name_pt, sess, name_tf, bias=False, bn=True):
+    # Transfer convolution params
+    conv_name_tf = os.path.join(name_tf, 'conv_3d')
+    conv_params = get_conv_params(sess, conv_name_tf, bias=bias)
+    if bias:
+        conv_weights, kernel_shape, in_channels, out_channels, strides, padding, conv_bias = conv_params
+    else:
+        conv_weights, kernel_shape, in_channels, out_channels, strides, padding = conv_params
+
+    conv_weights_rs = np.transpose(
+        conv_weights, (4, 3, 0, 1,
+                       2))  # to pt format (out_c, in_c, depth, height, width)
+    state_dict[name_pt + '.conv3d.weight'] = torch.from_numpy(conv_weights_rs)
+    if bias:
+        state_dict[name_pt + '.conv3d.bias'] = torch.from_numpy(conv_bias)
+
+    # Transfer batch norm params
+    if bn:
+        conv_tf_name = os.path.join(name_tf, 'batch_norm')
+        moving_mean, moving_var, beta = get_bn_params(sess, conv_tf_name)
+
+        out_planes = conv_weights_rs.shape[0]
+        state_dict[name_pt + '.batch3d.weight'] = torch.ones(out_planes)
+        state_dict[name_pt +
+                   '.batch3d.bias'] = torch.from_numpy(beta.squeeze())
+        state_dict[name_pt
+                   + '.batch3d.running_mean'] = torch.from_numpy(moving_mean.squeeze())
+        state_dict[name_pt
+                   + '.batch3d.running_var'] = torch.from_numpy(moving_var.squeeze())
+
+
+def load_mixed(state_dict, name_pt, sess, name_tf, fix_typo=False):
+    # Branch 0
+    load_conv3d(state_dict, name_pt + '.branch_0', sess,
+                os.path.join(name_tf, 'Branch_0/Conv3d_0a_1x1'))
+
+    # Branch 1
+    load_conv3d(state_dict, name_pt + '.branch_1.0', sess,
+                os.path.join(name_tf, 'Branch_1/Conv3d_0a_1x1'))
+    load_conv3d(state_dict, name_pt + '.branch_1.1', sess,
+                os.path.join(name_tf, 'Branch_1/Conv3d_0b_3x3'))
+
+    # Branch 2
+    load_conv3d(state_dict, name_pt + '.branch_2.0', sess,
+                os.path.join(name_tf, 'Branch_2/Conv3d_0a_1x1'))
+    if fix_typo:
+        load_conv3d(state_dict, name_pt + '.branch_2.1', sess,
+                    os.path.join(name_tf, 'Branch_2/Conv3d_0a_3x3'))
+    else:
+        load_conv3d(state_dict, name_pt + '.branch_2.1', sess,
+                    os.path.join(name_tf, 'Branch_2/Conv3d_0b_3x3'))
+
+    # Branch 3
+    load_conv3d(state_dict, name_pt + '.branch_3.1', sess,
+                os.path.join(name_tf, 'Branch_3/Conv3d_0b_1x1'))
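A minimal smoke test for the new module as defined above (randomly initialised weights, no TF checkpoint loaded; the 400-class head and the 16-frame 224x224 clip are assumptions for illustration):

    import torch
    from i3D.i3dpt import I3D

    model = I3D(num_classes=400, modality='rgb').eval()
    clip = torch.randn(1, 3, 16, 224, 224)       # (batch, channels, frames, H, W)
    with torch.no_grad():
        probs, logits = model(clip)              # softmax scores and raw logits
    print(probs.shape)                           # torch.Size([1, 400])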
diff --git a/i3D/model.py b/i3D/model.py
deleted file mode 100644
index 55cb83dfbccb3c112337f89ae7270e1e8b0da3d0..0000000000000000000000000000000000000000
--- a/i3D/model.py
+++ /dev/null
@@ -1,140 +0,0 @@
-import torch
-import torch.nn as nn
-from i3D.resnet3d_xl import Net
-import torch.nn.functional as F
-'''
-Video Classification Model library.
-'''
-
-class TrainingScheduleError(Exception):
-    pass
-
-class VideoModel(nn.Module):
-    def __init__(self,
-                 num_classes,
-                 num_boxes,
-                 num_videos=16,
-                 restore_dict=None,
-                 freeze_weights=None,
-                 device=None,
-                 loss_type='softmax'):
-        super(VideoModel, self).__init__()
-        self.device = device
-        self.num_frames = num_videos
-        self.num_classes = num_classes
-        # Network loads kinetic pre-trained weights in initialization
-        self.i3D = Net(num_classes, extract_features=True, loss_type=loss_type)
-
-
-        try:
-            # Restore weights
-            if restore_dict:
-                self.restore(restore_dict)
-            # Freeze weights
-            if freeze_weights:
-                self.freeze_weights(freeze_weights)
-            else:
-                print(" > No weights are freezed")
-        except Exception as e:
-            print(" > Exception {}".format(e))
-
-    def restore(self, restore=None):
-        # Load pre-trained I3D + Graph weights for fine-tune (replace the last FC)
-        restore_finetuned = restore.get("restore_finetuned", None)
-        if restore_finetuned:
-            self._restore_fintuned(restore_finetuned)
-            print(" > Restored I3D + Graph weights")
-            return
-
-        # Load pre-trained I3D weights
-        restore_i3d = restore.get("restore_i3d", None)
-        if restore_i3d:
-            self._restore_i3d(restore_i3d)
-            print(" > Restored only I3D weights")
-            return
-
-        # Load pre-trained I3D + Graph weights without replacing anything
-        restore_predict = restore.get("restore_predict", None)
-        if restore_predict:
-            self._restore_predict(restore_predict)
-            print(" > Restored the model with strict weights")
-            return
-
-    def _restore_predict(self, path):
-        if path is None:
-            raise TrainingScheduleError('You should pre-train the video model on your training data first')
-
-        weights = torch.load(path, map_location=self.device)['state_dict']
-        new_weights = {}
-        for k, v in weights.items():
-            new_weights[k.replace('module.', '')] = v
-
-        self.load_state_dict(new_weights, strict=True)
-        print(" > Weights {} loaded".format(path))
-
-    def _restore_i3d(self, path):
-        if path is None:
-            raise TrainingScheduleError('You should pre-train the video model on your training data first')
-       
-        weights = torch.load(path, map_location=self.device)['state_dict']
-        new_weights = {}
-        for k, v in weights.items():
-            if not k.startswith('module.fc') and not k.startswith('module.i3D.classifier'):
-                new_weights[k.replace('module.', '')] = v
-        self.load_state_dict(new_weights, strict=False)
-
-    def _restore_fintuned(self, path):
-        if path is None:
-            raise TrainingScheduleError('You should pre-train the video model on your training data first')
-
-        weights = torch.load(path, map_location=self.device)['state_dict']
-        new_weights = {}
-        for k, v in weights.items():
-            # Don't load classifiers (different classes 88 vs 86)
-            if not k.startswith('module.fc'):
-                if not k.startswith('module.i3D.classifier'):
-                    new_weights[k.replace('module.', '')] = v
-
-        self.load_state_dict(new_weights, strict=False)
-        print(" > Weights {} loaded".format(path))
-
-    def freeze_weights(self, module):
-        if module == 'i3d':
-            print(" > Freeze I3D module")
-            for param in self.i3D.parameters():
-                param.requires_grad = False
-        elif module == 'fine_tuned':
-            print(" > Freeze Graph + I3D module, only last FC is training")
-            # Fixed the entire params without the last FC
-            for name, param in self.i3D.named_parameters():
-                if not name.startswith('classifier'):
-                    param.requires_grad = False
-            for param in self.graph_embedding.parameters():
-                param.requires_grad = False
-            for param in self.conv.parameters():
-                param.requires_grad = False
-
-        else:
-            raise NotImplementedError('Unrecognized option, you can freeze either graph module or I3D module')
-        pass
-
-    def _get_i3d_features(self, videos, output_video_features=False):
-        # org_features - [V x 2048 x T / 2 x 14 x 14]
-        _, org_features = self.i3D(videos)
-        # Reduce dimension video_features - [V x 512 x T / 2 x 14 x 14]
-        videos_features = self.conv(org_features)
-        bs, d, t, h, w = videos_features.size()
-        # Get global features
-        videos_features_rs = videos_features.permute(0, 2, 1, 3, 4)  # [V x T / 2 x 512 x h x w]
-        videos_features_rs = videos_features_rs.reshape(-1, d, h, w)  # [V * T / 2 x 512 x h x w]
-        global_features = self.avgpool(videos_features_rs)  # [V * T / 2 x 512 x 1 x 1]
-        global_features = self.dropout(global_features)
-        global_features = global_features.reshape(bs, t, d)  # [V x T / 2 x 512]
-        if output_video_features:
-            return global_features, videos_features
-        else:
-            return global_features
-
-    def flatten(self, x):
-        return [item for sublist in x for item in sublist]
-
diff --git a/i3D/model_lib.py b/i3D/model_lib.py
deleted file mode 100644
index 54027926a3e2029b7b41952ade9ec85850eb1f57..0000000000000000000000000000000000000000
--- a/i3D/model_lib.py
+++ /dev/null
@@ -1,1050 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from i3D.resnet3d_xl import Net
-from i3D.nonlocal_helper import Nonlocal
-
-
-class VideoModelCoord(nn.Module):
-    def __init__(self, opt):
-        super(VideoModelCoord, self).__init__()
-        self.nr_boxes = opt.num_boxes
-        self.nr_actions = opt.num_classes
-        self.nr_frames = opt.num_frames // 2
-        self.coord_feature_dim = opt.coord_feature_dim
-
-        self.coord_to_feature = nn.Sequential(
-            nn.Linear(4, self.coord_feature_dim//2, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim//2),
-            nn.ReLU(inplace=True),
-            nn.Linear(self.coord_feature_dim//2, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU()
-        )
-
-        self.spatial_node_fusion = nn.Sequential(
-            nn.Linear(self.coord_feature_dim*2, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU(inplace=True),
-            nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU()
-        )
-
-        self.box_feature_fusion = nn.Sequential(
-            nn.Linear(self.nr_frames*self.coord_feature_dim, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU(inplace=True),
-            nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU()
-        )
-
-        self.classifier = nn.Sequential(
-            nn.Linear(self.coord_feature_dim, self.coord_feature_dim),
-            # nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU(inplace=True),
-            nn.Linear(self.coord_feature_dim, 512), #self.coord_feature_dim),
-            nn.ReLU(inplace=True),
-            nn.Linear(512, self.nr_actions)
-        )
-
-        if opt.fine_tune:
-            self.fine_tune(opt.fine_tune)
-
-    def fine_tune(self, restore_path, parameters_to_train=['classifier']):
-        weights = torch.load(restore_path)['state_dict']
-        new_weights = {}
-        #import pdb
-        for k, v in weights.items():
-            if not 'classifier.4' in k:
-                new_weights[k.replace('module.', '')] = v
-        #pdb.set_trace()
-        self.load_state_dict(new_weights, strict=False)
-        print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
-
-        frozen_weights = 0
-        for name, param in self.named_parameters():
-            if not 'classifier.4' in name:
-
-                param.requires_grad = False
-                frozen_weights += 1
-
-            else:
-                print('Training : {}'.format(name))
-        print('Number of frozen weights {}'.format(frozen_weights))
-        assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
-                                    'Check the naming convention of the parameters'
-
-    def forward(self, global_img_input, box_categories, box_input, video_label, is_inference=False):
-        # local_img_tensor is (b, nr_frames, nr_boxes, 3, h, w)
-        # global_img_tensor is (b, nr_frames, 3, h, w)
-        # box_input is (b, nr_frames, nr_boxes, 4)
-
-        b, _, _, _h, _w = global_img_input.size()
-        # global_imgs = global_img_input.view(b*self.nr_frames, 3, _h, _w)
-        # local_imgs = local_img_input.view(b*self.nr_frames*self.nr_boxes, 3, _h, _w)
-
-        box_input = box_input.transpose(2, 1).contiguous()
-        box_input = box_input.view(b*self.nr_boxes*self.nr_frames, 4)
-
-        bf = self.coord_to_feature(box_input)
-        bf = bf.view(b, self.nr_boxes, self.nr_frames, self.coord_feature_dim)
-
-        # spatial message passing (graph)
-        spatial_message = bf.sum(dim=1, keepdim=True)  # (b, 1, self.nr_frames, coord_feature_dim)
-        # message passed should substract itself, and normalize to it as a single feature
-        spatial_message = (spatial_message - bf) / (self.nr_boxes - 1)  # message passed should substract itself
-        bf_and_message = torch.cat([bf, spatial_message], dim=3)  # (b, nr_boxes, nr_frames, 2*coord_feature_dim)
-
-        # (b*nr_boxes*nr_frames, coord_feature_dim)
-        bf_spatial = self.spatial_node_fusion(bf_and_message.view(b*self.nr_boxes*self.nr_frames, -1))
-        bf_spatial = bf_spatial.view(b, self.nr_boxes, self.nr_frames, self.coord_feature_dim)
-
-        bf_temporal_input = bf_spatial.view(b, self.nr_boxes, self.nr_frames*self.coord_feature_dim)
-
-        box_features = self.box_feature_fusion(bf_temporal_input.view(b*self.nr_boxes, -1))  # (b*nr_boxes, coord_feature_dim)
-        box_features = torch.mean(box_features.view(b, self.nr_boxes, -1), dim=1)  # (b, coord_feature_dim)
-        # video_features = torch.cat([global_features, local_features, box_features], dim=1)
-        video_features = box_features
-
-        cls_output = self.classifier(video_features)  # (b, num_classes)
-        return cls_output
-
-class VideoModelCoordLatent(nn.Module):
-    def __init__(self, opt):
-        super(VideoModelCoordLatent, self).__init__()
-        self.nr_boxes = opt.num_boxes
-        self.nr_actions = opt.num_classes
-        self.nr_frames = opt.num_frames // 2
-        self.img_feature_dim = opt.img_feature_dim
-        self.coord_feature_dim = opt.coord_feature_dim
-
-        self.category_embed_layer = nn.Embedding(3, opt.coord_feature_dim // 2, padding_idx=0, scale_grad_by_freq=True)
-
-        self.coord_to_feature = nn.Sequential(
-            nn.Linear(4, self.coord_feature_dim//2, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim//2),
-            nn.ReLU(inplace=True),
-            nn.Linear(self.coord_feature_dim//2, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU()
-        )
-
-        self.coord_category_fusion = nn.Sequential(
-            nn.Linear(self.coord_feature_dim+self.coord_feature_dim//2, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU(inplace=True),
-        )
-
-        self.spatial_node_fusion = nn.Sequential(
-            nn.Linear(self.coord_feature_dim*2, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU(inplace=True),
-            nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU()
-        )
-
-        self.box_feature_fusion = nn.Sequential(
-            nn.Linear(self.nr_frames*self.coord_feature_dim, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU(inplace=True),
-            nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU()
-        )
-
-        self.classifier = nn.Sequential(
-            nn.Linear(self.coord_feature_dim, self.coord_feature_dim),
-            # nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU(inplace=True),
-            nn.Linear(self.coord_feature_dim, 512), #self.coord_feature_dim),
-            nn.ReLU(inplace=True),
-            nn.Linear(512, self.nr_actions)
-        )
-
-        if opt.fine_tune:
-            self.fine_tune(opt.fine_tune)
-
-    def fine_tune(self, restore_path, parameters_to_train=['classifier']):
-        weights = torch.load(restore_path)['state_dict']
-        new_weights = {}
-        for k, v in weights.items():
-            if not 'classifier.4' in k:
-                new_weights[k.replace('module.', '')] = v
-        self.load_state_dict(new_weights, strict=False)
-        print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
-
-        frozen_weights = 0
-        for name, param in self.named_parameters():
-            if not 'classifier.4' in name:
-                param.requires_grad = False
-                frozen_weights += 1
-            else:
-                print('Training : {}'.format(name))
-        print('Number of frozen weights {}'.format(frozen_weights))
-        assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
-                                    'Check the naming convention of the parameters'
-
-    def forward(self, global_img_input, box_categories, box_input, video_label, is_inference=False):
-        # local_img_tensor is (b, nr_frames, nr_boxes, 3, h, w)
-        # global_img_tensor is (b, nr_frames, 3, h, w)
-        # box_input is (b, nr_frames, nr_boxes, 4)
-
-        b, _, _, _h, _w = global_img_input.size()
-
-        box_input = box_input.transpose(2, 1).contiguous()
-        box_input = box_input.view(b*self.nr_boxes*self.nr_frames, 4)
-
-        box_categories = box_categories.long()
-        box_categories = box_categories.transpose(2, 1).contiguous()
-        box_categories = box_categories.view(b*self.nr_boxes*self.nr_frames)
-        box_category_embeddings = self.category_embed_layer(box_categories)  # (b*nr_b*nr_f, coord_feature_dim//2)
-
-        bf = self.coord_to_feature(box_input)
-        bf = torch.cat([bf, box_category_embeddings], dim=1)  # (b*nr_b*nr_f, coord_feature_dim + coord_feature_dim//2)
-        bf = self.coord_category_fusion(bf)  # (b*nr_b*nr_f, coord_feature_dim)
-        bf = bf.view(b, self.nr_boxes, self.nr_frames, self.coord_feature_dim)
-
-        # spatial message passing (graph)
-        spatial_message = bf.sum(dim=1, keepdim=True)  # (b, 1, self.nr_frames, coord_feature_dim)
-        # message passed should substract itself, and normalize to it as a single feature
-        spatial_message = (spatial_message - bf) / (self.nr_boxes - 1)  # message passed should substract itself
-        bf_and_message = torch.cat([bf, spatial_message], dim=3)  # (b, nr_boxes, nr_frames, 2*coord_feature_dim)
-
-        # (b*nr_boxes*nr_frames, coord_feature_dim)
-        bf_spatial = self.spatial_node_fusion(bf_and_message.view(b*self.nr_boxes*self.nr_frames, -1))
-        bf_spatial = bf_spatial.view(b, self.nr_boxes, self.nr_frames, self.coord_feature_dim)
-
-        bf_temporal_input = bf_spatial.view(b, self.nr_boxes, self.nr_frames*self.coord_feature_dim)
-
-        box_features = self.box_feature_fusion(bf_temporal_input.view(b*self.nr_boxes, -1))  # (b*nr_boxes, coord_feature_dim)
-        box_features = torch.mean(box_features.view(b, self.nr_boxes, -1), dim=1)  # (b, coord_feature_dim)
-        # video_features = torch.cat([global_features, local_features, box_features], dim=1)
-        video_features = box_features
-
-        cls_output = self.classifier(video_features)  # (b, num_classes)
-        return cls_output
-
-class VideoModelCoordLatentNL(nn.Module):
-    def __init__(self, opt):
-        super(VideoModelCoordLatentNL, self).__init__()
-        self.nr_boxes = opt.num_boxes
-        self.nr_actions = opt.num_classes
-        self.nr_frames = opt.num_frames // 2
-        self.img_feature_dim = opt.img_feature_dim
-        self.coord_feature_dim = opt.coord_feature_dim
-
-        self.category_embed_layer = nn.Embedding(3, opt.coord_feature_dim // 2, padding_idx=0, scale_grad_by_freq=True)
-
-        self.coord_to_feature = nn.Sequential(
-            nn.Linear(4, self.coord_feature_dim // 2, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim // 2),
-            nn.ReLU(inplace=True),
-            nn.Linear(self.coord_feature_dim // 2, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU()
-        )
-
-        self.coord_category_fusion = nn.Sequential(
-            nn.Linear(self.coord_feature_dim + self.coord_feature_dim // 2, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU(inplace=True),
-        )
-
-        self.spatial_node_fusion = nn.Sequential(
-            nn.Linear(self.coord_feature_dim * 2, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU(inplace=True),
-            nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU()
-        )
-
-        self.nr_nonlocal_layers = 3
-        self.nonlocal_fusion = []
-        for i in range(self.nr_nonlocal_layers):
-            self.nonlocal_fusion.append(nn.Sequential(
-                Nonlocal(dim=self.coord_feature_dim, dim_inner=self.coord_feature_dim // 2),
-                nn.Conv1d(self.coord_feature_dim, self.coord_feature_dim, kernel_size=1, stride=1, padding=0,
-                          bias=False),
-                nn.BatchNorm1d(self.coord_feature_dim),
-                nn.ReLU()
-            ))
-        self.nonlocal_fusion = nn.ModuleList(self.nonlocal_fusion)
-
-        self.box_feature_fusion = nn.Sequential(
-            nn.Linear(self.nr_frames * self.coord_feature_dim, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU(inplace=True),
-            nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU()
-        )
-
-        self.classifier = nn.Sequential(
-            nn.Linear(self.coord_feature_dim, self.coord_feature_dim),
-            # nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU(inplace=True),
-            nn.Linear(self.coord_feature_dim, 512),  # self.coord_feature_dim),
-            nn.ReLU(inplace=True),
-            nn.Linear(512, self.nr_actions)
-        )
-
-        if opt.fine_tune:
-            self.fine_tune(opt.fine_tune)
-
-    def train(self, mode=True):  # overriding default train function
-        super(VideoModelCoordLatentNL, self).train(mode)
-        for m in self.modules():  # or self.modules(), if freezing all bn layers
-            if isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d):
-                m.eval()
-                # shutdown update in frozen mode
-                m.weight.requires_grad = False
-                m.bias.requires_grad = False
-
-    def fine_tune(self, restore_path, parameters_to_train=['classifier']):
-        weights = torch.load(restore_path)['state_dict']
-        new_weights = {}
-        # import pdb
-        for k, v in weights.items():
-            if not 'classifier.4' in k:
-                new_weights[k.replace('module.', '')] = v
-        # pdb.set_trace()
-        self.load_state_dict(new_weights, strict=False)
-        print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
-
-        frozen_weights = 0
-        for name, param in self.named_parameters():
-            if not 'classifier.4' in name:
-
-                param.requires_grad = False
-                frozen_weights += 1
-
-            else:
-                print('Training : {}'.format(name))
-        print('Number of frozen weights {}'.format(frozen_weights))
-        assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
-                                    'Check the naming convention of the parameters'
-
-    def forward(self, global_img_input, box_categories, box_input, video_label, is_inference=False):
-        # local_img_tensor is (b, nr_frames, nr_boxes, 3, h, w)
-        # global_img_tensor is (b, nr_frames, 3, h, w)
-        # box_input is (b, nr_frames, nr_boxes, 4)
-
-        b, _, _, _h, _w = global_img_input.size()
-
-        box_input = box_input.transpose(2, 1).contiguous()
-        box_input = box_input.view(b * self.nr_boxes * self.nr_frames, 4)
-
-        box_categories = box_categories.long()
-        box_categories = box_categories.transpose(2, 1).contiguous()
-        box_categories = box_categories.view(b * self.nr_boxes * self.nr_frames)
-        box_category_embeddings = self.category_embed_layer(box_categories)  # (b*nr_b*nr_f, coord_feature_dim//2)
-
-        bf = self.coord_to_feature(box_input)
-        bf = torch.cat([bf, box_category_embeddings], dim=1)  # (b*nr_b*nr_f, coord_feature_dim + coord_feature_dim//2)
-        bf = self.coord_category_fusion(bf)  # (b*nr_b*nr_f, coord_feature_dim)
-        bf = bf.view(b, self.nr_boxes, self.nr_frames, self.coord_feature_dim)
-
-        # spatial message passing (graph)
-        spatial_message = bf.sum(dim=1, keepdim=True)  # (b, 1, self.nr_frames, coord_feature_dim)
-        # message passed should substract itself, and normalize to it as a single feature
-
-        spatial_message = (spatial_message - bf) / (self.nr_boxes - 1)  # message passed should substract itself
-        bf_and_message = torch.cat([bf, spatial_message], dim=3)  # (b, nr_boxes, nr_frames, 2*coord_feature_dim)
-
-        # (b*nr_boxes*nr_frames, coord_feature_dim)
-        bf_spatial = self.spatial_node_fusion(bf_and_message.view(b * self.nr_boxes * self.nr_frames, -1))
-        bf_spatial = bf_spatial.view(b, self.nr_boxes, self.nr_frames, self.coord_feature_dim)
-
-        bf_temporal_input = bf_spatial.view(b, self.nr_boxes, self.nr_frames * self.coord_feature_dim)
-
-        bf_nonlocal = self.box_feature_fusion(
-            bf_temporal_input.view(b * self.nr_boxes, -1))  # (b*nr_boxes, coord_feature_dim)
-        bf_nonlocal = bf_nonlocal.view(b, self.nr_boxes, self.coord_feature_dim).permute(0, 2,
-                                                                                         1).contiguous()  # (N, C, NB)
-        for i in range(self.nr_nonlocal_layers):
-            bf_nonlocal = self.nonlocal_fusion[i](bf_nonlocal)
-
-        box_features = torch.mean(bf_nonlocal, dim=2)  # (b, coord_feature_dim)
-
-        # video_features = torch.cat([global_features, local_features, box_features], dim=1)
-        video_features = box_features
-
-        cls_output = self.classifier(video_features)  # (b, num_classes)
-        return cls_output
-
-class VideoModelGlobalCoordLatent(nn.Module):
-    """
-    This model contains only global pooling without any graph.
-    """
-
-    def __init__(self, opt,
-                 ):
-        super(VideoModelGlobalCoordLatent, self).__init__()
-
-        self.nr_boxes = opt.num_boxes
-        self.nr_actions = opt.num_classes
-        self.nr_frames = opt.num_frames
-        self.img_feature_dim = opt.img_feature_dim
-        self.coord_feature_dim = opt.coord_feature_dim
-        self.i3D = Net(self.nr_actions, extract_features=True, loss_type='softmax')
-        self.dropout = nn.Dropout(0.3)
-        self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1))
-        self.conv = nn.Conv3d(2048, 512, kernel_size=(1, 1, 1), stride=1)
-
-        self.category_embed_layer = nn.Embedding(3, opt.coord_feature_dim // 2, padding_idx=0, scale_grad_by_freq=True)
-
-        self.c_coord_category_fusion = nn.Sequential(
-            nn.Linear(self.coord_feature_dim+self.coord_feature_dim//2, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU(inplace=True),
-        )
-
-        self.c_coord_to_feature = nn.Sequential(
-            nn.Linear(4, self.coord_feature_dim // 2, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim // 2),
-            nn.ReLU(inplace=True),
-            nn.Linear(self.coord_feature_dim // 2, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU()
-        )
-
-        self.c_spatial_node_fusion = nn.Sequential(
-            nn.Linear(self.coord_feature_dim * 2, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU(inplace=True),
-            nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU()
-        )
-
-        self.c_box_feature_fusion = nn.Sequential(
-            nn.Linear((self.nr_frames // 2) * self.coord_feature_dim, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU(inplace=True),
-            nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU()
-        )
-
-        self.classifier = nn.Sequential(
-            nn.Linear(self.coord_feature_dim + 2*self.img_feature_dim, self.coord_feature_dim),
-            nn.ReLU(inplace=True),
-            nn.Linear(self.coord_feature_dim, 512),
-            nn.ReLU(inplace=True),
-            nn.Linear(512, self.nr_actions)
-        )
-        if opt.fine_tune:
-            self.fine_tune(opt.fine_tune)
-        if opt.restore_i3d:
-            self.restore_i3d(opt.restore_i3d)
-        if opt.restore_custom:
-            self.restore_custom(opt.restore_custom)
-
-    def train(self, mode=True):  # overriding default train function
-        super(VideoModelGlobalCoordLatent, self).train(mode)
-        for m in self.modules():  # or self.modules(), if freezing all bn layers
-            if isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d):
-                m.eval()
-                # shutdown update in frozen mode
-                m.weight.requires_grad = False
-                m.bias.requires_grad = False
-
-    def restore_custom(self, restore_path):
-        print("restoring path {}".format(restore_path))
-        weights = torch.load(restore_path)
-
-        ks = list(weights.keys())
-        print('\n\n BEFORE', weights[ks[0]][0,0,0])
-        new_weights = {}
-        # import pdb
-        for k, v in weights.items():
-            new_weights[k.replace('module.', '')] = v
-        self.load_state_dict(new_weights, strict=False)
-        print('\n\n AFTER', self.state_dict()[ks[0]][0,0, 0])
-        print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
-
-        frozen_weights = 0
-        for name, param in self.named_parameters():
-            if not name.startswith('classifier') :
-                param.requires_grad = False
-                frozen_weights += 1
-            else:
-                print('Training : {}'.format(name))
-        print('Number of frozen weights {}'.format(frozen_weights))
-        assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
-                                    'Check the naming convention of the parameters'
-
-
-    def restore_i3d(self, restore_path, parameters_to_train=['classifier']):
-        weights = torch.load(restore_path)['state_dict']
-        new_weights = {}
-        # import pdb
-        for k, v in weights.items():
-            if 'i3D' in k :
-                new_weights[k.replace('module.', '')] = v
-        # pdb.set_trace()
-        self.load_state_dict(new_weights, strict=False)
-        print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
-
-        for m in self.i3D.modules():
-            if isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d):
-                m.eval()
-                # shutdown update in frozen mode
-                m.weight.requires_grad = False
-                m.bias.requires_grad = False
-
-        frozen_weights = 0
-        for name, param in self.named_parameters():
-            if 'i3D' in name:
-                param.requires_grad = False
-                frozen_weights += 1
-            else:
-                print('Training : {}'.format(name))
-        print('Number of frozen weights {}'.format(frozen_weights))
-        assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
-                                    'Check the naming convention of the parameters'
-
-    def fine_tune(self, restore_path, parameters_to_train=['classifier']):
-        weights = torch.load(restore_path)['state_dict']
-        new_weights = {}
-        # import pdb
-        for k, v in weights.items():
-            if not 'classifier.4' in k and 'i3D.classifier':
-                new_weights[k.replace('module.', '')] = v
-        # pdb.set_trace()
-        self.load_state_dict(new_weights, strict=False)
-        print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
-
-        frozen_weights = 0
-        for name, param in self.named_parameters():
-            if not 'classifier.4' in name:
-                param.requires_grad = False
-                frozen_weights += 1
-            else:
-                print('Training : {}'.format(name))
-        print('Number of frozen weights {}'.format(frozen_weights))
-        assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
-                                    'Check the naming convention of the parameters'
-
-    def forward(self, global_img_input, box_categories, box_input, video_label, is_inference=False):
-
-        """
-        V: num of videos
-        T: num of frames
-        P: num of proposals
-        :param videos: [V x 3 x T x 224 x 224]
-        :param proposals_t: [V x T] List of BoxList (size of num_boxes each)
-        :return:
-        """
-
-        # org_features - [V x 2048 x T / 2 x 14 x 14]
-        bs, _, _, _, _ = global_img_input.shape
-        y_i3d, org_features = self.i3D(global_img_input)
-        # Reduce dimension video_features - [V x 512 x T / 2 x 14 x 14]
-        videos_features = self.conv(org_features)
-        b = bs
-
-        box_input = box_input.transpose(2, 1).contiguous()
-        box_input = box_input.view(b * self.nr_boxes * (self.nr_frames//2), 4)
-
-        box_categories = box_categories.long()
-        box_categories = box_categories.transpose(2, 1).contiguous()
-        box_categories = box_categories.view(b * self.nr_boxes * (self.nr_frames // 2))
-        box_category_embeddings = self.category_embed_layer(box_categories)  # (b*nr_b*nr_f, coord_feature_dim//2)
-
-        bf = self.c_coord_to_feature(box_input)
-        bf = torch.cat([bf, box_category_embeddings], dim=1)  # (b*nr_b*nr_f, coord_feature_dim + coord_feature_dim//2)
-        bf = self.c_coord_category_fusion(bf)  # (b*nr_b*nr_f, coord_feature_dim)
-
-        bf = bf.view(b, self.nr_boxes, self.nr_frames // 2, self.coord_feature_dim)
-
-        # spatial message passing (graph)
-        spatial_message = bf.sum(dim=1, keepdim=True)  # (b, 1, self.nr_frames, coord_feature_dim)
-        # message passed should substract itself, and normalize to it as a single feature
-        spatial_message = (spatial_message - bf) / (self.nr_boxes - 1)  # message passed should substract itself
-
-        bf_message_gf = torch.cat([bf, spatial_message], dim=3)  # (b, nr_boxes, nr_frames, 2*coord_feature_dim)
-
-        # (b*nr_boxes*nr_frames, coord_feature_dim)
-        bf_spatial = self.c_spatial_node_fusion(bf_message_gf.view(b * self.nr_boxes * (self.nr_frames // 2), -1))
-        bf_spatial = bf_spatial.view(b, self.nr_boxes, self.nr_frames // 2, self.coord_feature_dim)
-
-        bf_temporal_input = bf_spatial.view(b, self.nr_boxes, (self.nr_frames // 2) * self.coord_feature_dim)
-
-        box_features = self.c_box_feature_fusion(
-            bf_temporal_input.view(b * self.nr_boxes, -1))  # (b*nr_boxes, img_feature_dim)
-        coord_ft = torch.mean(box_features.view(b, self.nr_boxes, -1), dim=1)  # (b, coord_feature_dim)
-        # video_features = torch.cat([global_features, local_features, box_features], dim=1)
-        # _gf = self.global_new_fc(_gf)
-        _gf = videos_features.mean(-1).mean(-1).view(b, (self.nr_frames//2), 2*self.img_feature_dim)
-        _gf = _gf.mean(1)
-        video_features = torch.cat([_gf.view(b, -1), coord_ft], dim=-1)
-
-        cls_output = self.classifier(video_features)  # (b, num_classes)
-        return cls_output
-
-class VideoModelGlobalCoordLatentNL(nn.Module):
-    """
-    This model contains only global pooling without any graph.
-    """
-
-    def __init__(self, base_net, opt,
-                 ):
-        super(VideoModelGlobalCoordLatentNL, self).__init__()
-
-        self.nr_boxes = opt.num_boxes
-        self.nr_actions = opt.num_classes
-        self.nr_frames = opt.num_frames
-        self.img_feature_dim = opt.img_feature_dim
-        self.coord_feature_dim = opt.coord_feature_dim
-        self.i3D = Net(self.nr_actions, extract_features=True, loss_type='softmax')
-        self.dropout = nn.Dropout(0.3)
-        self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1))
-        self.conv = nn.Conv3d(2048, 512, kernel_size=(1, 1, 1), stride=1)
-
-
-        self.category_embed_layer = nn.Embedding(3, opt.coord_feature_dim // 2, padding_idx=0, scale_grad_by_freq=True)
-
-        self.c_coord_category_fusion = nn.Sequential(
-            nn.Linear(self.coord_feature_dim+self.coord_feature_dim//2, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU(inplace=True),
-        )
-
-        self.c_coord_to_feature = nn.Sequential(
-            nn.Linear(4, self.coord_feature_dim // 2, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim // 2),
-            nn.ReLU(inplace=True),
-            nn.Linear(self.coord_feature_dim // 2, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU()
-        )
-
-        self.c_spatial_node_fusion = nn.Sequential(
-            nn.Linear(self.coord_feature_dim * 2, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU(inplace=True),
-            nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU()
-        )
-
-        self.nr_nonlocal_layers = 3
-        self.c_nonlocal_fusion = []
-        for i in range(self.nr_nonlocal_layers):
-            self.c_nonlocal_fusion.append(nn.Sequential(
-                    Nonlocal(dim=self.coord_feature_dim, dim_inner=self.coord_feature_dim // 2),
-                    nn.Conv1d(self.coord_feature_dim, self.coord_feature_dim, kernel_size=1, stride=1, padding=0, bias=False),
-                    nn.BatchNorm1d(self.coord_feature_dim),
-                    nn.ReLU()
-            ))
-        self.c_nonlocal_fusion = nn.ModuleList(self.c_nonlocal_fusion)
-
-        self.c_box_feature_fusion = nn.Sequential(
-            nn.Linear((self.nr_frames // 2) * self.coord_feature_dim, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU(inplace=True),
-            nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU()
-        )
-
-        self.classifier = nn.Sequential(
-            nn.Linear(self.coord_feature_dim + 2*self.img_feature_dim, self.coord_feature_dim),
-            nn.ReLU(inplace=True),
-            nn.Linear(self.coord_feature_dim, 512),
-            nn.ReLU(inplace=True),
-            nn.Linear(512, self.nr_actions)
-        )
-        if opt.fine_tune:
-            self.fine_tune(opt.fine_tune)
-        if opt.restore_i3d:
-            self.restore_i3d(opt.restore_i3d)
-
-        if opt.restore_custom:
-            self.restore_custom(opt.restore_custom)
-
-    def restore_custom(self, restore_path):
-        print("restoring path {}".format(restore_path))
-        weights = torch.load(restore_path)
-        ks = list(weights.keys())
-        print('\n\n BEFORE', weights[ks[0]][0,0,0])
-        new_weights = {}
-        # import pdb
-        for k, v in weights.items():
-            new_weights[k.replace('module.', '')] = v
-        self.load_state_dict(new_weights, strict=False)
-        print('\n\n AFTER', self.state_dict()[ks[0]][0,0, 0])
-        print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
-
-        frozen_weights = 0
-        for name, param in self.named_parameters():
-            if not name.startswith('classifier') :
-                param.requires_grad = False
-                frozen_weights += 1
-            else:
-                print('Training : {}'.format(name))
-        print('Number of frozen weights {}'.format(frozen_weights))
-        assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
-                                    'Check the naming convention of the parameters'
-
-
-
-    def restore_i3d(self, restore_path, parameters_to_train=['classifier']):
-        weights = torch.load(restore_path)['state_dict']
-        new_weights = {}
-        # import pdb
-        for k, v in weights.items():
-            if 'i3D' in k  or k.startswith('conv.'):
-                new_weights[k.replace('module.', '')] = v
-        # pdb.set_trace()
-        self.load_state_dict(new_weights, strict=False)
-        print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
-
-        for m in self.i3D.modules():
-            if isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d):
-                m.eval()
-                # shutdown update in frozen mode
-                m.weight.requires_grad = False
-                m.bias.requires_grad = False
-
-        frozen_weights = 0
-        for name, param in self.named_parameters():
-            if 'i3D' in name or k.startswith('conv.') :
-                param.requires_grad = False
-                frozen_weights += 1
-            else:
-                print('Training : {}'.format(name))
-        print('Number of frozen weights {}'.format(frozen_weights))
-        assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
-                                    'Check the naming convention of the parameters'
-
-    def train(self, mode=True):  # overriding default train function
-        super(VideoModelGlobalCoordLatentNL, self).train(mode)
-        for m in self.i3D.modules():  # or self.modules(), if freezing all bn layers
-            if isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d):
-                m.eval()
-                # shutdown update in frozen mode
-                m.weight.requires_grad = False
-                m.bias.requires_grad = False
-
-    def fine_tune(self, restore_path, parameters_to_train=['classifier']):
-        weights = torch.load(restore_path)['state_dict']
-        new_weights = {}
-        import pdb
-        for k, v in weights.items():
-            if not 'classifier.4' in k and 'i3D.classifier' not in k:
-                new_weights[k.replace('module.', '')] = v
-        pdb.set_trace()
-        self.load_state_dict(new_weights, strict=False)
-        print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
-
-        frozen_weights = 0
-        for name, param in self.named_parameters():
-            if not 'classifier.4' in name:
-                param.requires_grad = False
-                frozen_weights += 1
-            else:
-                print('Training : {}'.format(name))
-        print('Number of frozen weights {}'.format(frozen_weights))
-        assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
-                                    'Check the naming convention of the parameters'
-
-    def forward(self, global_img_input, box_categories, box_input, video_label, is_inference=False):
-
-        """
-        V: num of videos
-        T: num of frames
-        P: num of proposals
-        :param videos: [V x 3 x T x 224 x 224]
-        :param proposals_t: [V x T] List of BoxList (size of num_boxes each)
-        :return:
-        """
-
-        # org_features - [V x 2048 x T / 2 x 14 x 14]
-        bs, _, _, _, _ = global_img_input.shape
-        y_i3d, org_features = self.i3D(global_img_input)
-        # Reduce dimension video_features - [V x 512 x T / 2 x 14 x 14]
-        videos_features = self.conv(org_features)
-        b = bs
-
-        box_input = box_input.transpose(2, 1).contiguous()
-        box_input = box_input.view(b * self.nr_boxes * (self.nr_frames//2), 4)
-
-        box_categories = box_categories.long()
-        box_categories = box_categories.transpose(2, 1).contiguous()
-        box_categories = box_categories.view(b * self.nr_boxes * (self.nr_frames // 2))
-        box_category_embeddings = self.category_embed_layer(box_categories)  # (b*nr_b*nr_f, coord_feature_dim//2)
-
-        bf = self.c_coord_to_feature(box_input)
-        bf = torch.cat([bf, box_category_embeddings], dim=1)  # (b*nr_b*nr_f, coord_feature_dim + coord_feature_dim//2)
-        bf = self.c_coord_category_fusion(bf)  # (b*nr_b*nr_f, coord_feature_dim)
-
-        bf = bf.view(b, self.nr_boxes, self.nr_frames // 2, self.coord_feature_dim)
-
-        # spatial message passing (graph)
-        spatial_message = bf.sum(dim=1, keepdim=True)  # (b, 1, self.nr_frames, coord_feature_dim)
-        # message passed should substract itself, and normalize to it as a single feature
-        spatial_message = (spatial_message - bf) / (self.nr_boxes - 1)  # message passed should substract itself
-
-        bf_message_gf = torch.cat([bf, spatial_message], dim=3)  # (b, nr_boxes, nr_frames, 2*coord_feature_dim)
-
-        # (b*nr_boxes*nr_frames, coord_feature_dim)
-        bf_spatial = self.c_spatial_node_fusion(bf_message_gf.view(b * self.nr_boxes * (self.nr_frames // 2), -1))
-        bf_spatial = bf_spatial.view(b, self.nr_boxes, self.nr_frames // 2, self.coord_feature_dim)
-
-        bf_temporal_input = bf_spatial.view(b, self.nr_boxes, (self.nr_frames // 2) * self.coord_feature_dim)
-
-        bf_nonlocal = self.c_box_feature_fusion(
-            bf_temporal_input.view(b * self.nr_boxes, -1))  # (b*nr_boxes, img_feature_dim)
-
-        bf_nonlocal = bf_nonlocal.view(b, self.nr_boxes, self.coord_feature_dim).permute(0, 2, 1).contiguous()  # (N, C, NB)
-        for i in range(self.nr_nonlocal_layers):
-            bf_nonlocal = self.c_nonlocal_fusion[i](bf_nonlocal)
-
-        coord_ft = torch.mean(bf_nonlocal, dim=2)  # (b, coord_feature_dim)
-
-        # video_features = torch.cat([global_features, local_features, box_features], dim=1)
-        _gf = videos_features.mean(-1).mean(-1).view(b, (self.nr_frames//2), 2*self.img_feature_dim)
-        _gf = _gf.mean(1)
-        video_features = torch.cat([_gf.view(b, -1), coord_ft], dim=-1)
-
-        cls_output = self.classifier(video_features)  # (b, num_classes)
-        return cls_output
-
-class VideoGlobalModel(nn.Module):
-    """
-    This model contains only global pooling without any graph.
-    """
-
-    def __init__(self, opt,
-                 ):
-        super(VideoGlobalModel, self).__init__()
-
-        self.nr_boxes = opt.num_boxes
-        self.nr_actions = opt.num_classes
-        self.nr_frames = opt.num_frames
-        self.img_feature_dim = opt.img_feature_dim
-        self.coord_feature_dim = opt.coord_feature_dim
-        self.i3D = Net(self.nr_actions, extract_features=True, loss_type='softmax')
-        self.dropout = nn.Dropout(0.3)
-        self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1))
-        self.conv = nn.Conv3d(2048, 512, kernel_size=(1, 1, 1), stride=1)
-        self.fc = nn.Linear(512, self.nr_actions)
-        self.crit = nn.CrossEntropyLoss()
-
-        if opt.fine_tune:
-            self.fine_tune(opt.fine_tune)
-
-    def fine_tune(self, restore_path, parameters_to_train=['classifier']):
-        weights = torch.load(restore_path)['state_dict']
-        new_weights = {}
-        for k, v in weights.items():
-            if not 'fc' in k and not 'classifier' in k:
-                new_weights[k.replace('module.', '')] = v
-        self.load_state_dict(new_weights, strict=False)
-        print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
-
-        frozen_weights = 0
-        for name, param in self.named_parameters():
-            if not 'fc' in name:
-                param.requires_grad = False
-                frozen_weights += 1
-            else:
-                print('Training : {}'.format(name))
-        print('Number of frozen weights {}'.format(frozen_weights))
-        assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
-                                    'Check the naming convention of the parameters'
-
-    def forward(self, global_img_input, local_img_input, box_input, video_label, is_inference=False):
-        """
-        V: num of videos
-        T: num of frames
-        P: num of proposals
-        :param videos: [V x 3 x T x 224 x 224]
-        :param proposals_t: [V x T] List of BoxList (size of num_boxes each)
-        :return:
-        """
-
-        # org_features - [V x 2048 x T / 2 x 14 x 14]
-        y_i3d, org_features = self.i3D(global_img_input)
-        # Reduce dimension video_features - [V x 512 x T / 2 x 14 x 14]
-        videos_features = self.conv(org_features)
-
-        # Get global features - [V x 512]
-        global_features = self.avgpool(videos_features).squeeze()
-        global_features = self.dropout(global_features)
-
-        cls_output = self.fc(global_features)
-        return cls_output
-
-class VideoModelGlobalCoord(nn.Module):
-    """
-    This model contains only global pooling without any graph.
-    """
-
-    def __init__(self, opt):
-        super(VideoModelGlobalCoord, self).__init__()
-
-        self.nr_boxes = opt.num_boxes
-        self.nr_actions = opt.num_classes
-        self.nr_frames = opt.num_frames
-        self.img_feature_dim = opt.img_feature_dim
-        self.coord_feature_dim = opt.coord_feature_dim
-        self.i3D = Net(self.nr_actions, extract_features=True, loss_type='softmax')
-        self.dropout = nn.Dropout(0.3)
-        self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1))
-        self.conv = nn.Conv3d(2048, 256, kernel_size=(1, 1, 1), stride=1)
-
-
-        self.global_new_fc = nn.Sequential(
-            nn.Linear(256, self.img_feature_dim, bias=False),
-            nn.BatchNorm1d(self.img_feature_dim),
-            nn.ReLU(inplace=True)
-        )
-
-
-        self.c_coord_to_feature = nn.Sequential(
-            nn.Linear(4, self.coord_feature_dim // 2, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim // 2),
-            nn.ReLU(inplace=True),
-            nn.Linear(self.coord_feature_dim // 2, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU()
-        )
-
-        self.c_spatial_node_fusion = nn.Sequential(
-            nn.Linear(self.coord_feature_dim * 2, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU(inplace=True),
-            nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU()
-        )
-
-        self.c_box_feature_fusion = nn.Sequential(
-            nn.Linear((self.nr_frames // 2) * self.coord_feature_dim, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU(inplace=True),
-            nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
-            nn.BatchNorm1d(self.coord_feature_dim),
-            nn.ReLU()
-        )
-
-        self.classifier = nn.Sequential(
-            nn.Linear(self.coord_feature_dim + self.img_feature_dim, self.coord_feature_dim),
-            nn.ReLU(inplace=True),
-            nn.Linear(self.coord_feature_dim, 512),
-            nn.ReLU(inplace=True),
-            nn.Linear(512, self.nr_actions)
-        )
-        if opt.fine_tune:
-            self.fine_tune(opt.fine_tune)
-        if opt.restore_i3d:
-            self.restore_i3d(opt.restore_i3d)
-
-    def train(self, mode=True):  # overriding default train function
-        super(VideoModelGlobalCoord, self).train(mode)
-        for m in self.i3D.modules():  # or self.modules(), if freezing all bn layers
-            if isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d):
-                m.eval()
-                # shutdown update in frozen mode
-                m.weight.requires_grad = False
-                m.bias.requires_grad = False
-
-    def restore_i3d(self, restore_path, parameters_to_train=['classifier']):
-        weights = torch.load(restore_path)['state_dict']
-        new_weights = {}
-        # import pdb
-        for k, v in weights.items():
-            if 'i3D' in k :
-                new_weights[k.replace('module.', '')] = v
-        # pdb.set_trace()
-        self.load_state_dict(new_weights, strict=False)
-        print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
-
-        frozen_weights = 0
-        for name, param in self.named_parameters():
-            if 'i3D' in name:
-                param.requires_grad = False
-                frozen_weights += 1
-            else:
-                print('Training : {}'.format(name))
-        print('Number of frozen weights {}'.format(frozen_weights))
-        assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
-                                    'Check the naming convention of the parameters'
-
-    def fine_tune(self, restore_path, parameters_to_train=['classifier']):
-        weights = torch.load(restore_path)['state_dict']
-        new_weights = {}
-        # import pdb
-        for k, v in weights.items():
-            if not 'classifier.4' in k and 'i3D.classifier':
-                new_weights[k.replace('module.', '')] = v
-        # pdb.set_trace()
-        self.load_state_dict(new_weights, strict=False)
-        print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
-
-        frozen_weights = 0
-        for name, param in self.named_parameters():
-            if not 'classifier.4' in name:
-                param.requires_grad = False
-                frozen_weights += 1
-            else:
-                print('Training : {}'.format(name))
-        print('Number of frozen weights {}'.format(frozen_weights))
-        assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
-                                    'Check the naming convention of the parameters'
-
-    def forward(self, global_img_input, box_categories, box_input, video_label, is_inference=False):
-
-        """
-        V: num of videos
-        T: num of frames
-        P: num of proposals
-        :param videos: [V x 3 x T x 224 x 224]
-        :param proposals_t: [V x T] List of BoxList (size of num_boxes each)
-        :return:
-        """
-
-        # org_features - [V x 2048 x T / 2 x 14 x 14]
-        bs, _, _, _, _ = global_img_input.shape
-        y_i3d, org_features = self.i3D(global_img_input)
-        # Reduce dimension video_features - [V x 512 x T / 2 x 14 x 14]
-        videos_features = self.conv(org_features)
-        b = bs
-
-        box_input = box_input.transpose(2, 1).contiguous()
-        box_input = box_input.view(b * self.nr_boxes * (self.nr_frames//2), 4)
-
-        bf = self.c_coord_to_feature(box_input)
-        bf = bf.view(b, self.nr_boxes, self.nr_frames // 2, self.coord_feature_dim)
-
-        # spatial message passing (graph)
-        spatial_message = bf.sum(dim=1, keepdim=True)  # (b, 1, self.nr_frames, coord_feature_dim)
-        # message passed should substract itself, and normalize to it as a single feature
-        spatial_message = (spatial_message - bf) / (self.nr_boxes - 1)  # message passed should substract itself
-
-        bf_message_gf = torch.cat([bf, spatial_message], dim=3)  # (b, nr_boxes, nr_frames, 2*coord_feature_dim)
-
-        # (b*nr_boxes*nr_frames, coord_feature_dim)
-        bf_spatial = self.c_spatial_node_fusion(bf_message_gf.view(b * self.nr_boxes * (self.nr_frames // 2), -1))
-        bf_spatial = bf_spatial.view(b, self.nr_boxes, self.nr_frames // 2, self.coord_feature_dim)
-
-        bf_temporal_input = bf_spatial.view(b, self.nr_boxes, (self.nr_frames // 2) * self.coord_feature_dim)
-
-        box_features = self.c_box_feature_fusion(
-            bf_temporal_input.view(b * self.nr_boxes, -1))  # (b*nr_boxes, img_feature_dim)
-        coord_ft = torch.mean(box_features.view(b, self.nr_boxes, -1), dim=1)  # (b, coord_feature_dim)
-        # video_features = torch.cat([global_features, local_features, box_features], dim=1)
-        _gf = videos_features.mean(-1).mean(-1).view(b*(self.nr_frames//2), self.img_feature_dim)
-        _gf = self.global_new_fc(_gf)
-        _gf = _gf.view(b, self.nr_frames // 2, self.img_feature_dim).mean(1)
-        video_features = torch.cat([_gf.view(b, -1), coord_ft], dim=-1)
-
-        cls_output = self.classifier(video_features)  # (b, num_classes)
-        return cls_output
diff --git a/i3D/resnet3d_xl.py b/i3D/resnet3d_xl.py
deleted file mode 100644
index b4d1695507c7a9f2b232bd886ee87c5489ff899d..0000000000000000000000000000000000000000
--- a/i3D/resnet3d_xl.py
+++ /dev/null
@@ -1,456 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.autograd import Variable
-import math
-import numpy as np
-
-from functools import partial
-
-__all__ = [
-    'ResNet', 'resnet10', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
-    'resnet152', 'resnet200',
-]
-
-
-def conv3x3x3(in_planes, out_planes, stride=1):
-    # 3x3x3 convolution with padding
-    return nn.Conv3d(
-        in_planes,
-        out_planes,
-        kernel_size=3,
-        stride=stride,
-        padding=1,
-        bias=False)
-
-
-def downsample_basic_block(x, planes, stride):
-    out = F.avg_pool3d(x, kernel_size=1, stride=stride)
-    zero_pads = torch.Tensor(
-        out.size(0), planes - out.size(1), out.size(2), out.size(3),
-        out.size(4)).zero_()
-    if isinstance(out.data, torch.cuda.FloatTensor):
-        zero_pads = zero_pads.cuda()
-
-    out = Variable(torch.cat([out.data, zero_pads], dim=1))
-
-    return out
-
-
-class BasicBlock(nn.Module):
-    expansion = 1
-
-    def __init__(self, inplanes, planes, stride=1, downsample=None):
-        super(BasicBlock, self).__init__()
-        self.conv1 = conv3x3x3(inplanes, planes, stride)
-        self.bn1 = nn.BatchNorm3d(planes)
-        self.relu = nn.ReLU(inplace=True)
-        self.conv2 = conv3x3x3(planes, planes)
-        self.bn2 = nn.BatchNorm3d(planes)
-        self.downsample = downsample
-        self.stride = stride
-
-    def forward(self, x):
-        residual = x
-
-        out = self.conv1(x)
-        out = self.bn1(out)
-        out = self.relu(out)
-
-        out = self.conv2(out)
-        out = self.bn2(out)
-
-        if self.downsample is not None:
-            residual = self.downsample(x)
-
-        out += residual
-        out = self.relu(out)
-
-        return out
-
-
-class Bottleneck(nn.Module):
-    conv_op = None
-    offset_groups = 1
-
-    def __init__(self, dim_in, dim_out, stride, dim_inner, group=1, use_temp_conv=1, temp_stride=1, dcn=False,
-                 shortcut_type='B'):
-        super(Bottleneck, self).__init__()
-        # 1 x 1 layer
-        self.with_dcn = dcn
-        self.conv1 = self.Conv3dBN(dim_in, dim_inner, (1 + use_temp_conv * 2, 1, 1), (temp_stride, 1, 1),
-                                   (use_temp_conv, 0, 0))
-        self.relu = nn.ReLU(inplace=True)
-        # 3 x 3 layer
-        self.conv2 = self.Conv3dBN(dim_inner, dim_inner, (1, 3, 3), (1, stride, stride), (0, 1, 1))
-        # 1 x 1 layer
-        self.conv3 = self.Conv3dBN(dim_inner, dim_out, (1, 1, 1), (1, 1, 1), (0, 0, 0))
-
-        self.shortcut_type = shortcut_type
-        self.dim_in = dim_in
-        self.dim_out = dim_out
-        self.temp_stride = temp_stride
-        self.stride = stride
-        # nn.Conv3d(dim_in, dim_out, (1,1,1),(temp_stride,stride,stride),(0,0,0))
-        if self.shortcut_type == 'B':
-            if self.dim_in == self.dim_out and self.temp_stride == 1 and self.stride == 1:  # or (self.dim_in == self.dim_out and self.dim_in == 64 and self.stride ==1):
-
-                pass
-            else:
-                # pass
-                self.shortcut = self.Conv3dBN(dim_in, dim_out, (1, 1, 1), (temp_stride, stride, stride), (0, 0, 0))
-
-        # nn.Conv3d(dim_in,dim_inner,kernel_size=(1+use_temp_conv*2,1,1),stride = (temp_stride,1,1),padding = )
-
-    def forward(self, x):
-        residual = x
-        out = self.conv1(x)
-        out = self.relu(out)
-        out = self.conv2(out)
-        out = self.relu(out)
-        out = self.conv3(out)
-        if self.dim_in == self.dim_out and self.temp_stride == 1 and self.stride == 1:
-            pass
-        else:
-            residual = self.shortcut(residual)
-        out += residual
-        out = self.relu(out)
-        return out
-
-    def Conv3dBN(self, dim_in, dim_out, kernels, strides, pads, group=1):
-        if self.with_dcn and kernels[0] > 1:
-            # use deformable conv
-            return nn.Sequential(
-                self.conv_op(dim_in, dim_out, kernel_size=kernels, stride=strides, padding=pads, bias=False,
-                             offset_groups=self.offset_groups),
-                nn.BatchNorm3d(dim_out)
-            )
-        else:
-            return nn.Sequential(
-                nn.Conv3d(dim_in, dim_out, kernel_size=kernels, stride=strides, padding=pads, bias=False),
-                nn.BatchNorm3d(dim_out)
-            )
-
-
-class ResNet(nn.Module):
-
-    def __init__(self,
-                 block,
-                 layers,
-                 use_temp_convs_set,
-                 temp_strides_set,
-                 sample_size,
-                 sample_duration,
-                 shortcut_type='B',
-                 num_classes=400,
-                 stage_with_dcn=(False, False, False, False),
-                 extract_features=False,
-                 loss_type='softmax'):
-        super(ResNet, self).__init__()
-        self.extract_features = extract_features
-        self.stage_with_dcn = stage_with_dcn
-        self.group = 1
-        self.width_per_group = 64
-        self.dim_inner = self.group * self.width_per_group
-        # self.shortcut_type = shortcut_type
-        self.conv1 = nn.Conv3d(
-            3,
-            64,
-            kernel_size=(1 + use_temp_convs_set[0][0] * 2, 7, 7),
-            stride=(temp_strides_set[0][0], 2, 2),
-            padding=(use_temp_convs_set[0][0], 3, 3),
-            bias=False)
-        self.bn1 = nn.BatchNorm3d(64)
-        self.relu = nn.ReLU(inplace=True)
-        self.maxpool1 = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 0, 0))
-        with_dcn = True if self.stage_with_dcn[0] else False
-        self.layer1 = self._make_layer(block, 64, 256, shortcut_type, stride=1, num_blocks=layers[0],
-                                       dim_inner=self.dim_inner, group=self.group, use_temp_convs=use_temp_convs_set[1],
-                                       temp_strides=temp_strides_set[1], dcn=with_dcn)
-        self.maxpool2 = nn.MaxPool3d(kernel_size=(2, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
-        with_dcn = True if self.stage_with_dcn[1] else False
-        self.layer2 = self._make_layer(block, 256, 512, shortcut_type, stride=2, num_blocks=layers[1],
-                                       dim_inner=self.dim_inner * 2, group=self.group,
-                                       use_temp_convs=use_temp_convs_set[2], temp_strides=temp_strides_set[2],
-                                       dcn=with_dcn)
-        with_dcn = True if self.stage_with_dcn[2] else False
-        self.layer3 = self._make_layer(block, 512, 1024, shortcut_type, stride=2, num_blocks=layers[2],
-                                       dim_inner=self.dim_inner * 4, group=self.group,
-                                       use_temp_convs=use_temp_convs_set[3], temp_strides=temp_strides_set[3],
-                                       dcn=with_dcn)
-        with_dcn = True if self.stage_with_dcn[3] else False
-        self.layer4 = self._make_layer(block, 1024, 2048, shortcut_type, stride=1, num_blocks=layers[3],
-                                       dim_inner=self.dim_inner * 8, group=self.group,
-                                       use_temp_convs=use_temp_convs_set[4], temp_strides=temp_strides_set[4],
-                                       dcn=with_dcn)
-        last_duration = int(math.ceil(sample_duration / 2))  # int(math.ceil(sample_duration / 8))
-        last_size = int(math.ceil(sample_size / 16))
-        # self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) #nn.AdaptiveAvgPool3d((1, 1, 1)) #
-        self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1))
-        self.dropout = torch.nn.Dropout(p=0.5)
-        self.classifier = nn.Linear(2048, num_classes)
-
-        for m in self.modules():
-            # if isinstance(m, nn.Conv3d):
-            #     m.weight = nn.init.kaiming_normal_(m.weight, mode='fan_out')
-            # elif isinstance(m,nn.Linear):
-            #    m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out')
-            # elif 
-            if isinstance(m, nn.BatchNorm3d):
-                m.weight.data.fill_(1)
-                m.bias.data.zero_()
-
-    def _make_layer(self, block, dim_in, dim_out, shortcut_type, stride, num_blocks, dim_inner=None, group=None,
-                    use_temp_convs=None, temp_strides=None, dcn=False):
-        if use_temp_convs is None:
-            use_temp_convs = np.zeros(num_blocks).astype(int)
-        if temp_strides is None:
-            temp_strides = np.ones(num_blocks).astype(int)
-        if len(use_temp_convs) < num_blocks:
-            for _ in range(num_blocks - len(use_temp_convs)):
-                use_temp_convs.append(0)
-                temp_strides.append(1)
-        layers = []
-        for idx in range(num_blocks):
-            block_stride = 2 if (idx == 0 and stride == 2) else 1
-
-            layers.append(
-                block(dim_in, dim_out, block_stride, dim_inner, group, use_temp_convs[idx], temp_strides[idx], dcn))
-            dim_in = dim_out
-        return nn.Sequential(*layers)
-
-    def forward_single(self, x):
-        x = self.conv1(x)
-
-        x = self.bn1(x)
-        x = self.relu(x)
-        x = self.maxpool1(x)
-
-        x = self.layer1(x)
-        x = self.maxpool2(x)
-        x = self.layer2(x)
-
-        x = self.layer3(x)
-        features = self.layer4(x)
-
-        x = self.avgpool(features)
-
-        x = x.view(x.size(0), -1)
-        x = self.dropout(x)
-
-        y = self.classifier(x)
-        if self.extract_features:
-            return y, features
-        else:
-            return y
-
-    def forward_multi(self, x):
-        clip_preds = []
-        # import ipdb;ipdb.set_trace()
-        for clip_idx in range(x.shape[1]):  # B, 10, 3, 3, 32, 224, 224
-            spatial_crops = []
-            for crop_idx in range(x.shape[2]):
-                clip = x[:, clip_idx, crop_idx]
-                clip = self.forward_single(clip)
-                spatial_crops.append(clip)
-            spatial_crops = torch.stack(spatial_crops, 1).mean(1)  # (B, 400)
-            clip_preds.append(spatial_crops)
-        clip_preds = torch.stack(clip_preds, 1).mean(1)  # (B, 400)
-        return clip_preds
-
-    def forward(self, x):
-
-        # 5D tensor == single clip
-        if x.dim() == 5:
-            pred = self.forward_single(x)
-
-        # 7D tensor == 3 crops/10 clips
-        elif x.dim() == 7:
-            pred = self.forward_multi(x)
-
-        # loss_dict = {}
-        # if 'label' in batch:
-        #     loss = F.cross_entropy(pred, batch['label'], reduction='none')
-        #     loss_dict = {'clf': loss}
-
-        return pred
-
-
-def get_fine_tuning_parameters(model, ft_begin_index):
-    if ft_begin_index == 0:
-        return model.parameters()
-
-    ft_module_names = []
-    for i in range(ft_begin_index, 5):
-        ft_module_names.append('layer{}'.format(i))
-    ft_module_names.append('fc')
-    # import ipdb;ipdb.set_trace()
-    parameters = []
-    for k, v in model.named_parameters():
-        for ft_module in ft_module_names:
-            if ft_module in k:
-                parameters.append({'params': v})
-                break
-        else:
-            parameters.append({'params': v, 'lr': 0.0})
-
-    return parameters
-
-
-def obtain_arc(arc_type):
-    # c2d, ResNet50
-    if arc_type == 1:
-        use_temp_convs_1 = [0]
-        temp_strides_1 = [2]
-        use_temp_convs_2 = [0, 0, 0]
-        temp_strides_2 = [1, 1, 1]
-        use_temp_convs_3 = [0, 0, 0, 0]
-        temp_strides_3 = [1, 1, 1, 1]
-        use_temp_convs_4 = [0, ] * 6
-        temp_strides_4 = [1, ] * 6
-        use_temp_convs_5 = [0, 0, 0]
-        temp_strides_5 = [1, 1, 1]
-
-    # i3d, ResNet50
-    if arc_type == 2:
-        use_temp_convs_1 = [2]
-        temp_strides_1 = [1]
-        use_temp_convs_2 = [1, 1, 1]
-        temp_strides_2 = [1, 1, 1]
-        use_temp_convs_3 = [1, 0, 1, 0]
-        temp_strides_3 = [1, 1, 1, 1]
-        use_temp_convs_4 = [1, 0, 1, 0, 1, 0]
-        temp_strides_4 = [1, 1, 1, 1, 1, 1]
-        use_temp_convs_5 = [0, 1, 0]
-        temp_strides_5 = [1, 1, 1]
-
-    # c2d, ResNet101
-    if arc_type == 3:
-        use_temp_convs_1 = [0]
-        temp_strides_1 = [2]
-        use_temp_convs_2 = [0, 0, 0]
-        temp_strides_2 = [1, 1, 1]
-        use_temp_convs_3 = [0, 0, 0, 0]
-        temp_strides_3 = [1, 1, 1, 1]
-        use_temp_convs_4 = [0, ] * 23
-        temp_strides_4 = [1, ] * 23
-        use_temp_convs_5 = [0, 0, 0]
-        temp_strides_5 = [1, 1, 1]
-
-    # i3d, ResNet101
-    if arc_type == 4:
-        use_temp_convs_1 = [2]
-        temp_strides_1 = [2]
-        use_temp_convs_2 = [1, 1, 1]
-        temp_strides_2 = [1, 1, 1]
-        use_temp_convs_3 = [1, 0, 1, 0]
-        temp_strides_3 = [1, 1, 1, 1]
-        use_temp_convs_4 = []
-        for i in range(23):
-            if i % 2 == 0:
-                use_temp_convs_4.append(1)
-            else:
-                use_temp_convs_4.append(0)
-
-        temp_strides_4 = [1, ] * 23
-        use_temp_convs_5 = [0, 1, 0]
-        temp_strides_5 = [1, 1, 1]
-
-    use_temp_convs_set = [use_temp_convs_1, use_temp_convs_2, use_temp_convs_3, use_temp_convs_4, use_temp_convs_5]
-    temp_strides_set = [temp_strides_1, temp_strides_2, temp_strides_3, temp_strides_4, temp_strides_5]
-
-    return use_temp_convs_set, temp_strides_set
-
-
-def resnet10(**kwargs):
-    """Constructs a ResNet-18 model.
-    """
-    use_temp_convs_set = []
-    temp_strides_set = []
-    model = ResNet(BasicBlock, [1, 1, 1, 1], use_temp_convs_set, temp_strides_set, **kwargs)
-    return model
-
-
-def resnet18(**kwargs):
-    """Constructs a ResNet-18 model.
-    """
-    use_temp_convs_set = []
-    temp_strides_set = []
-    model = ResNet(BasicBlock, [2, 2, 2, 2], use_temp_convs_set, temp_strides_set, **kwargs)
-    return model
-
-
-def resnet34(**kwargs):
-    """Constructs a ResNet-34 model.
-    """
-    use_temp_convs_set = []
-    temp_strides_set = []
-    model = ResNet(BasicBlock, [3, 4, 6, 3], use_temp_convs_set, temp_strides_set, **kwargs)
-    return model
-
-
-def resnet50(extract_features, **kwargs):
-    """Constructs a ResNet-50 model.
-    """
-    use_temp_convs_set, temp_strides_set = obtain_arc(2)
-    model = ResNet(Bottleneck, [3, 4, 6, 3], use_temp_convs_set, temp_strides_set,
-                   extract_features=extract_features, **kwargs)
-    return model
-
-
-def resnet101(**kwargs):
-    """Constructs a ResNet-101 model.
-    """
-    use_temp_convs_set, temp_strides_set = obtain_arc(4)
-    model = ResNet(Bottleneck, [3, 4, 23, 3], use_temp_convs_set, temp_strides_set, **kwargs)
-    return model
-
-
-def resnet152(**kwargs):
-    """Constructs a ResNet-101 model.
-    """
-    use_temp_convs_set = []
-    temp_strides_set = []
-    model = ResNet(Bottleneck, [3, 8, 36, 3], use_temp_convs_set, temp_strides_set, **kwargs)
-    return model
-
-
-def resnet200(**kwargs):
-    """Constructs a ResNet-101 model.
-    """
-    use_temp_convs_set = []
-    temp_strides_set = []
-    model = ResNet(Bottleneck, [3, 24, 36, 3], use_temp_convs_set, temp_strides_set, **kwargs)
-    return model
-
-
-def Net(num_classes, extract_features=False, loss_type='softmax',
-        weights=None, freeze_all_but_cls=False):
-    net = globals()['resnet' + str(50)](
-        num_classes=num_classes,
-        sample_size=50,
-        sample_duration=32,
-        extract_features=extract_features,
-        loss_type=loss_type,
-    )
-
-    if weights is not None:
-        kinetics_weights = torch.load(weights)['state_dict']
-        print("Found weights in {}.".format(weights))
-        cls_name = 'fc'
-    else:
-        kinetics_weights = torch.load('i3D/kinetics-res50.pth')
-        cls_name = 'fc'
-        print('\n Restoring Kintetics \n')
-
-    new_weights = {}
-    for k, v in kinetics_weights.items():
-        if not k.startswith('module.' + cls_name):
-            new_weights[k.replace('module.', '')] = v
-    net.load_state_dict(new_weights, strict=False)
-
-    if freeze_all_but_cls:
-        for name, par in net.named_parameters():
-            if not name.startswith('classifier'):
-                par.requires_grad = False
-    return net
diff --git a/kinetics_feats.py b/kinetics_feats.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3632bc8e5a58e74f83e8f6f1437dc759001fc4d
--- /dev/null
+++ b/kinetics_feats.py
@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+import os
+import sys
+import glob
+import datetime
+import argparse
+import random
+
+import numpy as np
+
+#from pathlib import Path
+#filepath = Path.cwd()
+#sys.path.append(filepath)
+from video_loaders import load_av
+from se_bb_from_np import annot_np
+from SmthSequence import SmthSequence
+from SmthFrameRelations import frame_relations
+
+from PIL import Image
+import matplotlib.pyplot as plt
+import cv2
+import i3D.gtransforms as gtransforms
+
+import torch
+from i3D.i3dpt import I3D
+
+rgb_pt_checkpoint = 'i3D/model_rgb.pth'
+
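+# Wraps a pretrained RGB I3D network (400 Kinetics classes) and returns its
+# clip-level logits as a per-video feature vector.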
+class I3dFV:
+        def __init__(self,path):
+                self.anno = annot_np(path)
+                self.net = I3D(num_classes=400, modality='rgb')
+                self.net.eval()
+                self.net.load_state_dict(torch.load(rgb_pt_checkpoint))
+                self.net.cuda()
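+                # the network is used purely for inference here; it is never trained or fine-tuned in this script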
+                self.pre_resize_shape = (256, 340)
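+                # frames are resized to 256x340 and then center-cropped to 224x224 (scales=[1], center_crop_only=True)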
+                self.random_crop = gtransforms.GroupMultiScaleCrop(output_size=224,
+                                                                   scales=[1],
+                                                                   max_distort=0,
+                                                                   center_crop_only=True)
+        
+        def process_video(self,finput,verbose=False):
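+                """Run a whole clip through the I3D network and return its logits as a numpy array."""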
+                # get video id from the filename (currently unused below)
+                vidnum = int(os.path.splitext(os.path.basename(finput))[0])
+                
+                # load video to ndarray list
+                img_array = load_av(finput)
+                
+                # convert BGR to RGB
+                frames = [cv2.cvtColor(img, cv2.COLOR_BGR2RGB) for img in img_array]
+                # convert ndarray to array of PIL Images for resize and cropping
+                frames = [Image.fromarray(img.astype('uint8'), 'RGB') for img in frames]
+                # resize
+                frames = [img.resize((self.pre_resize_shape[1], self.pre_resize_shape[0]), Image.BILINEAR) for img in frames]
+                # crop
+                frames, (offset_h, offset_w, crop_h, crop_w) = self.random_crop(frames)
+                # convert back from PIL Images to ndarrays before stacking
+                frames = [np.array(img) for img in frames]
+                # separate channels into R,G,B frame sequences
+                #rs = []
+                #gs = []
+                #bs = []
+                #for i in range(len(frames)):
+                #    R, G, B = cv2.split(frames[i])
+                #    rs.append(R)
+                #    gs.append(G)
+                #    bs.append(B)
+                #frames = np.asarray([[rs, gs, bs]])
+                
+                # stack into a batch and move channels first: (1, T, H, W, C) -> (1, C, T, H, W)
+                frames = np.asarray([frames]).transpose(0, 4, 1, 2, 3)
+                if verbose:
+                        print(frames.shape)
+                
+                # torch.autograd.Variable is deprecated; run the forward pass without building a graph
+                with torch.no_grad():
+                        sample_var = torch.from_numpy(frames).float().cuda()
+                        _, logits = self.net(sample_var)
+                
+                return logits.cpu().numpy()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--annotations',
+        dest='path_to_annotations',
+        default='../annotations_ground/',
+        help='folder to load annotations from')
+    parser.add_argument(
+        '--video',
+        dest='path_to_video',
+        default='.',
+        help='video to load')
+    
+    args = parser.parse_args()
+    
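+    # quick standalone check: extract features for a single video and print them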
+    i3dfv = I3dFV(args.path_to_annotations)
+    fv = i3dfv.process_video(args.path_to_video, verbose=True)
+    
+    print(fv)
+    
+    print("fin")
diff --git a/regen_frame_fv.py b/regen_frame_fv.py
index 2e16af7ec85c90d93ea807b8632c4fafc378d23e..69637165e36776a47dcacc0219fc00389067623b 100644
--- a/regen_frame_fv.py
+++ b/regen_frame_fv.py
@@ -17,6 +17,7 @@ from SmthSequence import SmthSequence
 from SmthFrameRelations import frame_relations
 
 from PIL import Image
+import matplotlib.pyplot as plt
 import cv2
 import torch
 from i3D.model import VideoModel
@@ -61,33 +62,61 @@ class FrameFV:
                     bs.append(B)
                 frames = [rs, gs, bs]
                 
+                # debug: dump the restored classifier weights to verify the checkpoint loaded
+                #print(self.net.i3D.classifier.weight.data)
+                print(self.net.classifier[4].weight.data)
+                
                 # read frame annotations into Sequence
-                seq = SmthSequence()
-                for framenum in range(0,len(img_array)):
-                    cats, bbs = self.anno.get_vf_bbx(vidnum, framenum+1)
-                    # add detections to Sequence
-                    for i in range(0,len(cats)):
-                        seq.add(framenum, cats[i], bbs[i])
+                #seq = SmthSequence()
+                #for framenum in range(0,len(img_array)):
+                #    cats, bbs = self.anno.get_vf_bbx(vidnum, framenum+1)
+                #    # add detections to Sequence
+                #    for i in range(0,len(cats)):
+                #        seq.add(framenum, cats[i], bbs[i])
                 
                 # compute object relations per frame
-                relations = []
-                for framenum in range(0,len(img_array)):
-                    fv = frame_relations(seq, 0, 1, framenum)
-                    relations.append(fv)
-                relations  = np.asarray(relations)
+                #relations = []
+                #for framenum in range(0,len(img_array)):
+                #    fv = frame_relations(seq, 0, 1, framenum)
+                #    relations.append(fv)
+                #relations  = np.asarray(relations)
                 
                 # i3D features per frame
                 clip = torch.from_numpy(np.asarray([frames]))
-                print(clip.shape)
+                #print(clip.shape)
                 clip = clip.float()
                 glo, vid = self.net.i3D(clip)
                 
+                #return glo.detach().numpy()
+                
                 videos_features = self.net.conv(vid)
                 
-                print(glo.shape)
-                print(vid.shape)
+                #print(glo.shape)
+                #print(vid.shape)
+                #print(videos_features.shape)
+                
+                #plt.plot(np.linspace(0,400,num=400), glo.detach().numpy()[0])
+                #plt.show()
                 
-                print(videos_features.shape)
+                pre = vid.detach().numpy()               # raw i3D feature maps
+                post = videos_features.detach().numpy()  # reduced 512-channel feature maps
+                
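+                # tile the 512 reduced feature maps into one image (one row per frame, one patch per channel)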
+                rows = []
+                for f in range(len(img_array)//2):
+                    row = []
+                    for i in range(512):
+                        patch = post[0,i,f]
+                        row.append(patch)
+                    row = np.hstack(row)
+                    rows.append(row)
+                pic = np.vstack(rows)
+                
+                print(pic.shape)
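+                # display the tiled feature maps until Esc (key code 27) is pressed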
+                while True:
+                    cv2.imshow('frame', pic)
+                    k = cv2.waitKey(33)
+                    if k == 27:
+                        break
+
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
@@ -109,13 +138,11 @@ if __name__ == '__main__':
                         help='intermediate feature dimension for coord-based features')
     parser.add_argument('--size', default=224, type=int, metavar='N',
                         help='primary image input size')
-    parser.add_argument('--batch_size', '-b', default=72, type=int,
-                        metavar='N', help='mini-batch size (default: 72)')
     parser.add_argument('--num_classes', default=174, type=int,
                         help='num of class in the model')
     parser.add_argument('--num_boxes', default=4, type=int,
                         help='num of boxes for each image')
-    parser.add_argument('--num_frames', default=36, type=int,
+    parser.add_argument('--num_frames', default=16, type=int,
                         help='num of frames for the model')
     parser.add_argument('--fine_tune', help='path with ckpt to restore')
     parser.add_argument('--restore_i3d')
diff --git a/train_actions.sh b/train_actions.sh
index 5791b976fa4c5ed597ff14bb83dc4cab90f48932..303261b3cd8ce253d16fcec2c497a608d7fc56cb 100755
--- a/train_actions.sh
+++ b/train_actions.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-ACTIONS=( 131 141 )
+ACTIONS=( 1 12 13 16 25 33 34 35 38 48 51 52 54 55 56 58 61 63 64 66 69 71 75 76 78 80 81 82 88 89 95 96 97 102 111 118 127 128 130 131 133 136 138 141 147 152 155 159 160 163 )
 
 for i in ${ACTIONS[@]};
 do