Commit c79d1aee authored by Chrol-Cannon, Joseph Dr (Computer Science)

integrate i3D features into action model

parent 6d15e6c4
@@ -10,10 +10,11 @@ import json
import cv2
import numpy as np
from pathlib import Path
#from pathlib import Path
#filepath = Path.cwd()
#sys.path.append(filepath)
from compute_fv import ComputeFV
+from kinetics_feats import I3dFV
if __name__ == "__main__":
parser = argparse.ArgumentParser()
@@ -48,6 +49,7 @@ if __name__ == "__main__":
dest='path_to_annotations',
default='../annotations_ground/',
help='folder to load annotations from')
args = parser.parse_args()
labs = json.load(open(args.labels_file,'r'))
@@ -58,6 +60,7 @@ if __name__ == "__main__":
# load something-else annotated videos
compfv = ComputeFV(args.path_to_annotations)
+    i3dfv = I3dFV(args.path_to_annotations)
#ids = [] # video id's in order of being processed
#classes = [] # class of each video == ordinal of label
@@ -71,13 +74,14 @@ if __name__ == "__main__":
for filename in glob.glob(folder + "/*.webm"):
# only process a random % of the videos (if not positive class example)
if int(k) != args.action_id:
-            if random.random() > 0.01:
+            if random.random() > 0.02:
continue
print("processing file: " + filename)
vidnum = int(os.path.splitext(os.path.basename(filename))[0])
fv = compfv.process_video(filename, os.path.join(args.path_to_phase_models, 'a'+str(args.action_id)+'.joblib'))
+        fv0 = i3dfv.process_video(filename)
#if type(fv) is not np.ndarray:
# continue
@@ -85,7 +89,7 @@ if __name__ == "__main__":
if int(k) == args.action_id:
fv[-1] = 1
-        feats.append(fv)
+        feats.append(np.concatenate([fv0.flatten(), fv]))
#ids.append(vidnum)
# list of np.ndarray to 2d ndarray
......
#!/bin/bash
-ACTIONS=( 131 141 )
+ACTIONS=( 1 12 13 16 25 33 34 35 38 48 51 52 54 55 56 58 61 63 64 66 69 71 75 76 78 80 81 82 88 89 95 96 97 102 111 118 127 128 130 131 133 136 138 141 147 152 155 159 160 163 )
for i in ${ACTIONS[@]};
do
......
import math
import os
import torch
import numpy as np
def get_padding_shape(filter_shape, stride, mod=0):
"""Fetch a tuple describing the input padding shape.
NOTES: To replicate "TF SAME" style padding, the padding shape needs to be
determined at runtime to handle cases when the input dimension is not divisible
by the stride.
See https://stackoverflow.com/a/49842071 for explanation of TF SAME padding logic
"""
def _pad_top_bottom(filter_dim, stride_val, mod):
if mod:
pad_along = max(filter_dim - mod, 0)
else:
pad_along = max(filter_dim - stride_val, 0)
pad_top = pad_along // 2
pad_bottom = pad_along - pad_top
return pad_top, pad_bottom
padding_shape = []
for idx, (filter_dim, stride_val) in enumerate(zip(filter_shape, stride)):
depth_mod = (idx == 0) and mod
pad_top, pad_bottom = _pad_top_bottom(filter_dim, stride_val, depth_mod)
padding_shape.append(pad_top)
padding_shape.append(pad_bottom)
depth_top = padding_shape.pop(0)
depth_bottom = padding_shape.pop(0)
padding_shape.append(depth_top)
padding_shape.append(depth_bottom)
return tuple(padding_shape)
def simplify_padding(padding_shapes):
all_same = True
padding_init = padding_shapes[0]
for pad in padding_shapes[1:]:
if pad != padding_init:
all_same = False
return all_same, padding_init
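def _demo_same_padding():
    # Minimal sketch: the I3D stem conv uses a 7x7x7 kernel at stride (2, 2, 2).
    # With no runtime remainder (mod=0) each dimension needs 5 pad cells, split
    # asymmetrically as (2, 3); the depth pair is rotated to the end so the
    # tuple matches torch.nn.ConstantPad3d's (W, H, D) argument order.
    assert get_padding_shape((7, 7, 7), (2, 2, 2)) == (2, 3, 2, 3, 2, 3)
    # A 3x3x3 kernel at stride 1 pads symmetrically, so simplify_padding lets
    # Unit3Dpy fold the padding into Conv3d's own padding argument.
    assert simplify_padding(get_padding_shape((3, 3, 3), (1, 1, 1))) == (True, 1)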
class Unit3Dpy(torch.nn.Module):
def __init__(self,
in_channels,
out_channels,
kernel_size=(1, 1, 1),
stride=(1, 1, 1),
activation='relu',
padding='SAME',
use_bias=False,
use_bn=True):
super(Unit3Dpy, self).__init__()
self.padding = padding
self.activation = activation
self.use_bn = use_bn
self.stride = stride
if padding == 'SAME':
padding_shape = get_padding_shape(kernel_size, stride)
simplify_pad, pad_size = simplify_padding(padding_shape)
self.simplify_pad = simplify_pad
if stride[0] > 1:
padding_shapes = [get_padding_shape(kernel_size, stride, mod) for
mod in range(stride[0])]
else:
padding_shapes = [padding_shape]
elif padding == 'VALID':
padding_shape = 0
else:
raise ValueError(
'padding should be in [VALID|SAME] but got {}'.format(padding))
if padding == 'SAME':
if not simplify_pad:
self.pads = [torch.nn.ConstantPad3d(x, 0) for x in padding_shapes]
self.conv3d = torch.nn.Conv3d(
in_channels,
out_channels,
kernel_size,
stride=stride,
bias=use_bias)
else:
self.conv3d = torch.nn.Conv3d(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=pad_size,
bias=use_bias)
elif padding == 'VALID':
self.conv3d = torch.nn.Conv3d(
in_channels,
out_channels,
kernel_size,
padding=padding_shape,
stride=stride,
bias=use_bias)
else:
raise ValueError(
'padding should be in [VALID|SAME] but got {}'.format(padding))
if self.use_bn:
# This is not strictly the correct map between epsilons in keras and
# pytorch (which have slightly different definitions of the batch norm
# forward pass), but it seems to be good enough. The PyTorch formula
# is described here:
# https://pytorch.org/docs/stable/_modules/torch/nn/modules/batchnorm.html
tf_style_eps = 1E-3
self.batch3d = torch.nn.BatchNorm3d(out_channels, eps=tf_style_eps)
if activation == 'relu':
self.activation = torch.nn.functional.relu
def forward(self, inp):
if self.padding == 'SAME' and self.simplify_pad is False:
# Determine the padding to be applied by examining the input shape
pad_idx = inp.shape[2] % self.stride[0]
pad_op = self.pads[pad_idx]
inp = pad_op(inp)
out = self.conv3d(inp)
if self.use_bn:
out = self.batch3d(out)
if self.activation is not None:
out = torch.nn.functional.relu(out)
return out
class MaxPool3dTFPadding(torch.nn.Module):
def __init__(self, kernel_size, stride=None, padding='SAME'):
super(MaxPool3dTFPadding, self).__init__()
if padding == 'SAME':
padding_shape = get_padding_shape(kernel_size, stride)
self.padding_shape = padding_shape
self.stride = stride
if stride[0] > 1:
padding_shapes = [get_padding_shape(kernel_size, stride, mod) for
mod in range(stride[0])]
else:
padding_shapes = [padding_shape]
self.pads = [torch.nn.ConstantPad3d(x, 0) for x in padding_shapes]
self.pool = torch.nn.MaxPool3d(kernel_size, stride, ceil_mode=True)
def forward(self, inp):
pad_idx = inp.shape[2] % self.stride[0]
pad_op = self.pads[pad_idx]
inp = pad_op(inp)
out = self.pool(inp)
return out
class Mixed(torch.nn.Module):
def __init__(self, in_channels, out_channels):
super(Mixed, self).__init__()
# Branch 0
self.branch_0 = Unit3Dpy(
in_channels, out_channels[0], kernel_size=(1, 1, 1))
# Branch 1
branch_1_conv1 = Unit3Dpy(
in_channels, out_channels[1], kernel_size=(1, 1, 1))
branch_1_conv2 = Unit3Dpy(
out_channels[1], out_channels[2], kernel_size=(3, 3, 3))
self.branch_1 = torch.nn.Sequential(branch_1_conv1, branch_1_conv2)
# Branch 2
branch_2_conv1 = Unit3Dpy(
in_channels, out_channels[3], kernel_size=(1, 1, 1))
branch_2_conv2 = Unit3Dpy(
out_channels[3], out_channels[4], kernel_size=(3, 3, 3))
self.branch_2 = torch.nn.Sequential(branch_2_conv1, branch_2_conv2)
# Branch3
branch_3_pool = MaxPool3dTFPadding(
kernel_size=(3, 3, 3), stride=(1, 1, 1), padding='SAME')
branch_3_conv2 = Unit3Dpy(
in_channels, out_channels[5], kernel_size=(1, 1, 1))
self.branch_3 = torch.nn.Sequential(branch_3_pool, branch_3_conv2)
def forward(self, inp):
out_0 = self.branch_0(inp)
out_1 = self.branch_1(inp)
out_2 = self.branch_2(inp)
out_3 = self.branch_3(inp)
out = torch.cat((out_0, out_1, out_2, out_3), 1)
return out
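def _demo_mixed_channels():
    # Minimal sketch (shapes assumed): Mixed concatenates its four branches on
    # the channel axis, so the output width is out_channels[0] +
    # out_channels[2] + out_channels[4] + out_channels[5]. For the first
    # Inception block below that is 64 + 128 + 32 + 32 = 256, which is why
    # mixed_3c is constructed with 256 input channels.
    block = Mixed(192, [64, 96, 128, 16, 32, 32]).eval()
    with torch.no_grad():
        out = block(torch.zeros(1, 192, 8, 28, 28))
    assert out.shape == (1, 256, 8, 28, 28)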
class I3D(torch.nn.Module):
def __init__(self,
num_classes,
modality='rgb',
dropout_prob=0,
name='inception'):
super(I3D, self).__init__()
self.name = name
self.num_classes = num_classes
if modality == 'rgb':
in_channels = 3
elif modality == 'flow':
in_channels = 2
else:
raise ValueError(
'{} not among known modalities [rgb|flow]'.format(modality))
self.modality = modality
conv3d_1a_7x7 = Unit3Dpy(
out_channels=64,
in_channels=in_channels,
kernel_size=(7, 7, 7),
stride=(2, 2, 2),
padding='SAME')
# 1st conv-pool
self.conv3d_1a_7x7 = conv3d_1a_7x7
self.maxPool3d_2a_3x3 = MaxPool3dTFPadding(
kernel_size=(1, 3, 3), stride=(1, 2, 2), padding='SAME')
# conv conv
conv3d_2b_1x1 = Unit3Dpy(
out_channels=64,
in_channels=64,
kernel_size=(1, 1, 1),
padding='SAME')
self.conv3d_2b_1x1 = conv3d_2b_1x1
conv3d_2c_3x3 = Unit3Dpy(
out_channels=192,
in_channels=64,
kernel_size=(3, 3, 3),
padding='SAME')
self.conv3d_2c_3x3 = conv3d_2c_3x3
self.maxPool3d_3a_3x3 = MaxPool3dTFPadding(
kernel_size=(1, 3, 3), stride=(1, 2, 2), padding='SAME')
# Mixed_3b
self.mixed_3b = Mixed(192, [64, 96, 128, 16, 32, 32])
self.mixed_3c = Mixed(256, [128, 128, 192, 32, 96, 64])
self.maxPool3d_4a_3x3 = MaxPool3dTFPadding(
kernel_size=(3, 3, 3), stride=(2, 2, 2), padding='SAME')
# Mixed 4
self.mixed_4b = Mixed(480, [192, 96, 208, 16, 48, 64])
self.mixed_4c = Mixed(512, [160, 112, 224, 24, 64, 64])
self.mixed_4d = Mixed(512, [128, 128, 256, 24, 64, 64])
self.mixed_4e = Mixed(512, [112, 144, 288, 32, 64, 64])
self.mixed_4f = Mixed(528, [256, 160, 320, 32, 128, 128])
self.maxPool3d_5a_2x2 = MaxPool3dTFPadding(
kernel_size=(2, 2, 2), stride=(2, 2, 2), padding='SAME')
# Mixed 5
self.mixed_5b = Mixed(832, [256, 160, 320, 32, 128, 128])
self.mixed_5c = Mixed(832, [384, 192, 384, 48, 128, 128])
self.avg_pool = torch.nn.AvgPool3d((2, 7, 7), (1, 1, 1))
self.dropout = torch.nn.Dropout(dropout_prob)
self.conv3d_0c_1x1 = Unit3Dpy(
in_channels=1024,
out_channels=self.num_classes,
kernel_size=(1, 1, 1),
activation=None,
use_bias=True,
use_bn=False)
self.softmax = torch.nn.Softmax(1)
def forward(self, inp):
# Preprocessing
out = self.conv3d_1a_7x7(inp)
out = self.maxPool3d_2a_3x3(out)
out = self.conv3d_2b_1x1(out)
out = self.conv3d_2c_3x3(out)
out = self.maxPool3d_3a_3x3(out)
out = self.mixed_3b(out)
out = self.mixed_3c(out)
out = self.maxPool3d_4a_3x3(out)
out = self.mixed_4b(out)
out = self.mixed_4c(out)
out = self.mixed_4d(out)
out = self.mixed_4e(out)
out = self.mixed_4f(out)
out = self.maxPool3d_5a_2x2(out)
out = self.mixed_5b(out)
out = self.mixed_5c(out)
out = self.avg_pool(out)
out = self.dropout(out)
out = self.conv3d_0c_1x1(out)
out = out.squeeze(3)
out = out.squeeze(3)
out = out.mean(2)
out_logits = out
out = self.softmax(out_logits)
return out, out_logits
def load_tf_weights(self, sess):
state_dict = {}
if self.modality == 'rgb':
prefix = 'RGB/inception_i3d'
elif self.modality == 'flow':
prefix = 'Flow/inception_i3d'
load_conv3d(state_dict, 'conv3d_1a_7x7', sess,
os.path.join(prefix, 'Conv3d_1a_7x7'))
load_conv3d(state_dict, 'conv3d_2b_1x1', sess,
os.path.join(prefix, 'Conv3d_2b_1x1'))
load_conv3d(state_dict, 'conv3d_2c_3x3', sess,
os.path.join(prefix, 'Conv3d_2c_3x3'))
load_mixed(state_dict, 'mixed_3b', sess,
os.path.join(prefix, 'Mixed_3b'))
load_mixed(state_dict, 'mixed_3c', sess,
os.path.join(prefix, 'Mixed_3c'))
load_mixed(state_dict, 'mixed_4b', sess,
os.path.join(prefix, 'Mixed_4b'))
load_mixed(state_dict, 'mixed_4c', sess,
os.path.join(prefix, 'Mixed_4c'))
load_mixed(state_dict, 'mixed_4d', sess,
os.path.join(prefix, 'Mixed_4d'))
load_mixed(state_dict, 'mixed_4e', sess,
os.path.join(prefix, 'Mixed_4e'))
        # From mixed_4f onward the max error vs. the TF weights grows to about 0.1
load_mixed(state_dict, 'mixed_4f', sess,
os.path.join(prefix, 'Mixed_4f'))
load_mixed(
state_dict,
'mixed_5b',
sess,
os.path.join(prefix, 'Mixed_5b'),
fix_typo=True)
load_mixed(state_dict, 'mixed_5c', sess,
os.path.join(prefix, 'Mixed_5c'))
load_conv3d(
state_dict,
'conv3d_0c_1x1',
sess,
os.path.join(prefix, 'Logits', 'Conv3d_0c_1x1'),
bias=True,
bn=False)
self.load_state_dict(state_dict)
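def _demo_i3d_forward():
    # Minimal sketch (clip size assumed): a 16-frame 224x224 RGB clip yields
    # one (softmax, logits) pair over the 400 Kinetics classes; the clip just
    # has to be long enough to survive the temporal stride-2 stages and the
    # final (2, 7, 7) average pool.
    net = I3D(num_classes=400, modality='rgb').eval()
    with torch.no_grad():
        probs, logits = net(torch.zeros(1, 3, 16, 224, 224))
    assert probs.shape == logits.shape == (1, 400)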
def get_conv_params(sess, name, bias=False):
# Get conv weights
conv_weights_tensor = sess.graph.get_tensor_by_name(
os.path.join(name, 'w:0'))
if bias:
conv_bias_tensor = sess.graph.get_tensor_by_name(
os.path.join(name, 'b:0'))
conv_bias = sess.run(conv_bias_tensor)
conv_weights = sess.run(conv_weights_tensor)
conv_shape = conv_weights.shape
kernel_shape = conv_shape[0:3]
in_channels = conv_shape[3]
out_channels = conv_shape[4]
conv_op = sess.graph.get_operation_by_name(
os.path.join(name, 'convolution'))
padding_name = conv_op.get_attr('padding')
padding = _get_padding(padding_name, kernel_shape)
all_strides = conv_op.get_attr('strides')
strides = all_strides[1:4]
conv_params = [
conv_weights, kernel_shape, in_channels, out_channels, strides, padding
]
if bias:
conv_params.append(conv_bias)
return conv_params
def get_bn_params(sess, name):
moving_mean_tensor = sess.graph.get_tensor_by_name(
os.path.join(name, 'moving_mean:0'))
moving_var_tensor = sess.graph.get_tensor_by_name(
os.path.join(name, 'moving_variance:0'))
beta_tensor = sess.graph.get_tensor_by_name(os.path.join(name, 'beta:0'))
moving_mean = sess.run(moving_mean_tensor)
moving_var = sess.run(moving_var_tensor)
beta = sess.run(beta_tensor)
return moving_mean, moving_var, beta
def _get_padding(padding_name, conv_shape):
padding_name = padding_name.decode("utf-8")
if padding_name == "VALID":
return [0, 0]
elif padding_name == "SAME":
# return [math.ceil(int(conv_shape[0])/2), math.ceil(int(conv_shape[1])/2)]
return [
math.floor(int(conv_shape[0]) / 2),
math.floor(int(conv_shape[1]) / 2),
math.floor(int(conv_shape[2]) / 2)
]
else:
raise ValueError('Invalid padding name ' + padding_name)
def load_conv3d(state_dict, name_pt, sess, name_tf, bias=False, bn=True):
# Transfer convolution params
conv_name_tf = os.path.join(name_tf, 'conv_3d')
conv_params = get_conv_params(sess, conv_name_tf, bias=bias)
if bias:
conv_weights, kernel_shape, in_channels, out_channels, strides, padding, conv_bias = conv_params
else:
conv_weights, kernel_shape, in_channels, out_channels, strides, padding = conv_params
conv_weights_rs = np.transpose(
conv_weights, (4, 3, 0, 1,
2)) # to pt format (out_c, in_c, depth, height, width)
state_dict[name_pt + '.conv3d.weight'] = torch.from_numpy(conv_weights_rs)
if bias:
state_dict[name_pt + '.conv3d.bias'] = torch.from_numpy(conv_bias)
# Transfer batch norm params
if bn:
conv_tf_name = os.path.join(name_tf, 'batch_norm')
moving_mean, moving_var, beta = get_bn_params(sess, conv_tf_name)
out_planes = conv_weights_rs.shape[0]
state_dict[name_pt + '.batch3d.weight'] = torch.ones(out_planes)
state_dict[name_pt +
'.batch3d.bias'] = torch.from_numpy(beta.squeeze())
state_dict[name_pt
+ '.batch3d.running_mean'] = torch.from_numpy(moving_mean.squeeze())
state_dict[name_pt
+ '.batch3d.running_var'] = torch.from_numpy(moving_var.squeeze())
def load_mixed(state_dict, name_pt, sess, name_tf, fix_typo=False):
# Branch 0
load_conv3d(state_dict, name_pt + '.branch_0', sess,
os.path.join(name_tf, 'Branch_0/Conv3d_0a_1x1'))
    # Branch 1
load_conv3d(state_dict, name_pt + '.branch_1.0', sess,
os.path.join(name_tf, 'Branch_1/Conv3d_0a_1x1'))
load_conv3d(state_dict, name_pt + '.branch_1.1', sess,
os.path.join(name_tf, 'Branch_1/Conv3d_0b_3x3'))
# Branch 2
load_conv3d(state_dict, name_pt + '.branch_2.0', sess,
os.path.join(name_tf, 'Branch_2/Conv3d_0a_1x1'))
if fix_typo:
load_conv3d(state_dict, name_pt + '.branch_2.1', sess,
os.path.join(name_tf, 'Branch_2/Conv3d_0a_3x3'))
else:
load_conv3d(state_dict, name_pt + '.branch_2.1', sess,
os.path.join(name_tf, 'Branch_2/Conv3d_0b_3x3'))
# Branch 3
load_conv3d(state_dict, name_pt + '.branch_3.1', sess,
os.path.join(name_tf, 'Branch_3/Conv3d_0b_1x1'))
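# Note: the fix_typo flag above works around the published checkpoint, in which
# Mixed_5b stores its second Branch_2 conv under the name Conv3d_0a_3x3 instead
# of the Conv3d_0b_3x3 used by every other Mixed block.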
import torch
import torch.nn as nn
from i3D.resnet3d_xl import Net
import torch.nn.functional as F
'''
Video Classification Model library.
'''
class TrainingScheduleError(Exception):
pass
class VideoModel(nn.Module):
def __init__(self,
num_classes,
num_boxes,
num_videos=16,
restore_dict=None,
freeze_weights=None,
device=None,
loss_type='softmax'):
super(VideoModel, self).__init__()
self.device = device
self.num_frames = num_videos
self.num_classes = num_classes
        # The network loads Kinetics pre-trained weights at initialization
self.i3D = Net(num_classes, extract_features=True, loss_type=loss_type)
try:
# Restore weights
if restore_dict:
self.restore(restore_dict)
# Freeze weights
if freeze_weights:
self.freeze_weights(freeze_weights)
else:
print(" > No weights are freezed")
except Exception as e:
print(" > Exception {}".format(e))
def restore(self, restore=None):
        # Load pre-trained I3D + Graph weights for fine-tuning (replaces the last FC)
restore_finetuned = restore.get("restore_finetuned", None)
if restore_finetuned:
            self._restore_finetuned(restore_finetuned)
print(" > Restored I3D + Graph weights")
return
# Load pre-trained I3D weights
restore_i3d = restore.get("restore_i3d", None)
if restore_i3d:
self._restore_i3d(restore_i3d)
print(" > Restored only I3D weights")
return
# Load pre-trained I3D + Graph weights without replacing anything
restore_predict = restore.get("restore_predict", None)
if restore_predict:
self._restore_predict(restore_predict)
print(" > Restored the model with strict weights")
return
def _restore_predict(self, path):
if path is None:
raise TrainingScheduleError('You should pre-train the video model on your training data first')
weights = torch.load(path, map_location=self.device)['state_dict']
new_weights = {}
for k, v in weights.items():
new_weights[k.replace('module.', '')] = v
self.load_state_dict(new_weights, strict=True)
print(" > Weights {} loaded".format(path))
def _restore_i3d(self, path):
if path is None:
raise TrainingScheduleError('You should pre-train the video model on your training data first')
weights = torch.load(path, map_location=self.device)['state_dict']
new_weights = {}
for k, v in weights.items():
if not k.startswith('module.fc') and not k.startswith('module.i3D.classifier'):
new_weights[k.replace('module.', '')] = v
self.load_state_dict(new_weights, strict=False)
    def _restore_finetuned(self, path):
if path is None:
raise TrainingScheduleError('You should pre-train the video model on your training data first')
weights = torch.load(path, map_location=self.device)['state_dict']
new_weights = {}
for k, v in weights.items():
# Don't load classifiers (different classes 88 vs 86)
if not k.startswith('module.fc'):
if not k.startswith('module.i3D.classifier'):
new_weights[k.replace('module.', '')] = v
self.load_state_dict(new_weights, strict=False)
print(" > Weights {} loaded".format(path))
def freeze_weights(self, module):
if module == 'i3d':
print(" > Freeze I3D module")
for param in self.i3D.parameters():
param.requires_grad = False
elif module == 'fine_tuned':
print(" > Freeze Graph + I3D module, only last FC is training")
            # Freeze all parameters except the last FC
for name, param in self.i3D.named_parameters():
if not name.startswith('classifier'):
param.requires_grad = False
for param in self.graph_embedding.parameters():
param.requires_grad = False
for param in self.conv.parameters():
param.requires_grad = False
else:
raise NotImplementedError('Unrecognized option, you can freeze either graph module or I3D module')
def _get_i3d_features(self, videos, output_video_features=False):
# org_features - [V x 2048 x T / 2 x 14 x 14]
_, org_features = self.i3D(videos)
# Reduce dimension video_features - [V x 512 x T / 2 x 14 x 14]
videos_features = self.conv(org_features)
bs, d, t, h, w = videos_features.size()
# Get global features
videos_features_rs = videos_features.permute(0, 2, 1, 3, 4) # [V x T / 2 x 512 x h x w]
videos_features_rs = videos_features_rs.reshape(-1, d, h, w) # [V * T / 2 x 512 x h x w]
global_features = self.avgpool(videos_features_rs) # [V * T / 2 x 512 x 1 x 1]
global_features = self.dropout(global_features)
global_features = global_features.reshape(bs, t, d) # [V x T / 2 x 512]
if output_video_features:
return global_features, videos_features
else:
return global_features
def flatten(self, x):
return [item for sublist in x for item in sublist]
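# Minimal usage sketch (checkpoint paths here are hypothetical; Net() itself
# expects i3D/kinetics-res50.pth when no explicit weights are passed):
#
#   model = VideoModel(num_classes=174, num_boxes=4, num_videos=16,
#                      restore_dict={'restore_i3d': 'ckpt/i3d_smth.pth'},
#                      freeze_weights='i3d')
#
# restore() tries the three checkpoint kinds in order (fine-tuned I3D + Graph,
# I3D only, then a strict whole-model restore), and freeze_weights('i3d') stops
# gradients through the backbone so only the later layers train.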
This diff is collapsed.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import math
import numpy as np
from functools import partial
__all__ = [
'ResNet', 'resnet10', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
'resnet152', 'resnet200',
]
def conv3x3x3(in_planes, out_planes, stride=1):
# 3x3x3 convolution with padding
return nn.Conv3d(
in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=1,
bias=False)
def downsample_basic_block(x, planes, stride):
out = F.avg_pool3d(x, kernel_size=1, stride=stride)
zero_pads = torch.Tensor(
out.size(0), planes - out.size(1), out.size(2), out.size(3),
out.size(4)).zero_()
if isinstance(out.data, torch.cuda.FloatTensor):
zero_pads = zero_pads.cuda()
out = Variable(torch.cat([out.data, zero_pads], dim=1))
return out
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm3d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3x3(planes, planes)
self.bn2 = nn.BatchNorm3d(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
conv_op = None
offset_groups = 1
def __init__(self, dim_in, dim_out, stride, dim_inner, group=1, use_temp_conv=1, temp_stride=1, dcn=False,
shortcut_type='B'):
super(Bottleneck, self).__init__()
# 1 x 1 layer
self.with_dcn = dcn
self.conv1 = self.Conv3dBN(dim_in, dim_inner, (1 + use_temp_conv * 2, 1, 1), (temp_stride, 1, 1),
(use_temp_conv, 0, 0))
self.relu = nn.ReLU(inplace=True)
# 3 x 3 layer
self.conv2 = self.Conv3dBN(dim_inner, dim_inner, (1, 3, 3), (1, stride, stride), (0, 1, 1))
# 1 x 1 layer
self.conv3 = self.Conv3dBN(dim_inner, dim_out, (1, 1, 1), (1, 1, 1), (0, 0, 0))
self.shortcut_type = shortcut_type
self.dim_in = dim_in
self.dim_out = dim_out
self.temp_stride = temp_stride
self.stride = stride
# nn.Conv3d(dim_in, dim_out, (1,1,1),(temp_stride,stride,stride),(0,0,0))
if self.shortcut_type == 'B':
if self.dim_in == self.dim_out and self.temp_stride == 1 and self.stride == 1: # or (self.dim_in == self.dim_out and self.dim_in == 64 and self.stride ==1):
pass
else:
# pass
self.shortcut = self.Conv3dBN(dim_in, dim_out, (1, 1, 1), (temp_stride, stride, stride), (0, 0, 0))
# nn.Conv3d(dim_in,dim_inner,kernel_size=(1+use_temp_conv*2,1,1),stride = (temp_stride,1,1),padding = )
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.relu(out)
out = self.conv2(out)
out = self.relu(out)
out = self.conv3(out)
if self.dim_in == self.dim_out and self.temp_stride == 1 and self.stride == 1:
pass
else:
residual = self.shortcut(residual)
out += residual
out = self.relu(out)
return out
def Conv3dBN(self, dim_in, dim_out, kernels, strides, pads, group=1):
if self.with_dcn and kernels[0] > 1:
# use deformable conv
return nn.Sequential(
self.conv_op(dim_in, dim_out, kernel_size=kernels, stride=strides, padding=pads, bias=False,
offset_groups=self.offset_groups),
nn.BatchNorm3d(dim_out)
)
else:
return nn.Sequential(
nn.Conv3d(dim_in, dim_out, kernel_size=kernels, stride=strides, padding=pads, bias=False),
nn.BatchNorm3d(dim_out)
)
class ResNet(nn.Module):
def __init__(self,
block,
layers,
use_temp_convs_set,
temp_strides_set,
sample_size,
sample_duration,
shortcut_type='B',
num_classes=400,
stage_with_dcn=(False, False, False, False),
extract_features=False,
loss_type='softmax'):
super(ResNet, self).__init__()
self.extract_features = extract_features
self.stage_with_dcn = stage_with_dcn
self.group = 1
self.width_per_group = 64
self.dim_inner = self.group * self.width_per_group
# self.shortcut_type = shortcut_type
self.conv1 = nn.Conv3d(
3,
64,
kernel_size=(1 + use_temp_convs_set[0][0] * 2, 7, 7),
stride=(temp_strides_set[0][0], 2, 2),
padding=(use_temp_convs_set[0][0], 3, 3),
bias=False)
self.bn1 = nn.BatchNorm3d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool1 = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 0, 0))
with_dcn = True if self.stage_with_dcn[0] else False
self.layer1 = self._make_layer(block, 64, 256, shortcut_type, stride=1, num_blocks=layers[0],
dim_inner=self.dim_inner, group=self.group, use_temp_convs=use_temp_convs_set[1],
temp_strides=temp_strides_set[1], dcn=with_dcn)
self.maxpool2 = nn.MaxPool3d(kernel_size=(2, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
with_dcn = True if self.stage_with_dcn[1] else False
self.layer2 = self._make_layer(block, 256, 512, shortcut_type, stride=2, num_blocks=layers[1],
dim_inner=self.dim_inner * 2, group=self.group,
use_temp_convs=use_temp_convs_set[2], temp_strides=temp_strides_set[2],
dcn=with_dcn)
with_dcn = True if self.stage_with_dcn[2] else False
self.layer3 = self._make_layer(block, 512, 1024, shortcut_type, stride=2, num_blocks=layers[2],
dim_inner=self.dim_inner * 4, group=self.group,
use_temp_convs=use_temp_convs_set[3], temp_strides=temp_strides_set[3],
dcn=with_dcn)
with_dcn = True if self.stage_with_dcn[3] else False
self.layer4 = self._make_layer(block, 1024, 2048, shortcut_type, stride=1, num_blocks=layers[3],
dim_inner=self.dim_inner * 8, group=self.group,
use_temp_convs=use_temp_convs_set[4], temp_strides=temp_strides_set[4],
dcn=with_dcn)
last_duration = int(math.ceil(sample_duration / 2)) # int(math.ceil(sample_duration / 8))
last_size = int(math.ceil(sample_size / 16))
# self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) #nn.AdaptiveAvgPool3d((1, 1, 1)) #
self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1))
self.dropout = torch.nn.Dropout(p=0.5)
self.classifier = nn.Linear(2048, num_classes)
for m in self.modules():
# if isinstance(m, nn.Conv3d):
# m.weight = nn.init.kaiming_normal_(m.weight, mode='fan_out')
# elif isinstance(m,nn.Linear):
# m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out')
# elif
if isinstance(m, nn.BatchNorm3d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, block, dim_in, dim_out, shortcut_type, stride, num_blocks, dim_inner=None, group=None,
use_temp_convs=None, temp_strides=None, dcn=False):
if use_temp_convs is None:
use_temp_convs = np.zeros(num_blocks).astype(int)
if temp_strides is None:
temp_strides = np.ones(num_blocks).astype(int)
if len(use_temp_convs) < num_blocks:
for _ in range(num_blocks - len(use_temp_convs)):
use_temp_convs.append(0)
temp_strides.append(1)
layers = []
for idx in range(num_blocks):
block_stride = 2 if (idx == 0 and stride == 2) else 1
layers.append(
block(dim_in, dim_out, block_stride, dim_inner, group, use_temp_convs[idx], temp_strides[idx], dcn))
dim_in = dim_out
return nn.Sequential(*layers)
def forward_single(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool1(x)
x = self.layer1(x)
x = self.maxpool2(x)
x = self.layer2(x)
x = self.layer3(x)
features = self.layer4(x)
x = self.avgpool(features)
x = x.view(x.size(0), -1)
x = self.dropout(x)
y = self.classifier(x)
if self.extract_features:
return y, features
else:
return y
def forward_multi(self, x):
clip_preds = []
# import ipdb;ipdb.set_trace()
for clip_idx in range(x.shape[1]): # B, 10, 3, 3, 32, 224, 224
spatial_crops = []
for crop_idx in range(x.shape[2]):
clip = x[:, clip_idx, crop_idx]
clip = self.forward_single(clip)
spatial_crops.append(clip)
spatial_crops = torch.stack(spatial_crops, 1).mean(1) # (B, 400)
clip_preds.append(spatial_crops)
clip_preds = torch.stack(clip_preds, 1).mean(1) # (B, 400)
return clip_preds
def forward(self, x):
# 5D tensor == single clip
if x.dim() == 5:
pred = self.forward_single(x)
# 7D tensor == 3 crops/10 clips
elif x.dim() == 7:
pred = self.forward_multi(x)
# loss_dict = {}
# if 'label' in batch:
# loss = F.cross_entropy(pred, batch['label'], reduction='none')
# loss_dict = {'clf': loss}
return pred
def get_fine_tuning_parameters(model, ft_begin_index):
if ft_begin_index == 0:
return model.parameters()
ft_module_names = []
for i in range(ft_begin_index, 5):
ft_module_names.append('layer{}'.format(i))
ft_module_names.append('fc')
# import ipdb;ipdb.set_trace()
parameters = []
for k, v in model.named_parameters():
for ft_module in ft_module_names:
if ft_module in k:
parameters.append({'params': v})
break
else:
parameters.append({'params': v, 'lr': 0.0})
return parameters
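# Usage sketch: with ft_begin_index=4 only parameters whose names contain
# 'layer4' (or 'fc') keep their optimizer defaults; everything else is pinned
# to lr=0.0. Note that this model's head is named 'classifier', not 'fc', so
# the head itself would stay frozen unless the name list is adjusted.
#
#   optimizer = torch.optim.SGD(get_fine_tuning_parameters(model, 4), lr=1e-3)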
def obtain_arc(arc_type):
# c2d, ResNet50
if arc_type == 1:
use_temp_convs_1 = [0]
temp_strides_1 = [2]
use_temp_convs_2 = [0, 0, 0]
temp_strides_2 = [1, 1, 1]
use_temp_convs_3 = [0, 0, 0, 0]
temp_strides_3 = [1, 1, 1, 1]
use_temp_convs_4 = [0, ] * 6
temp_strides_4 = [1, ] * 6
use_temp_convs_5 = [0, 0, 0]
temp_strides_5 = [1, 1, 1]
# i3d, ResNet50
if arc_type == 2:
use_temp_convs_1 = [2]
temp_strides_1 = [1]
use_temp_convs_2 = [1, 1, 1]
temp_strides_2 = [1, 1, 1]
use_temp_convs_3 = [1, 0, 1, 0]
temp_strides_3 = [1, 1, 1, 1]
use_temp_convs_4 = [1, 0, 1, 0, 1, 0]
temp_strides_4 = [1, 1, 1, 1, 1, 1]
use_temp_convs_5 = [0, 1, 0]
temp_strides_5 = [1, 1, 1]
# c2d, ResNet101
if arc_type == 3:
use_temp_convs_1 = [0]
temp_strides_1 = [2]
use_temp_convs_2 = [0, 0, 0]
temp_strides_2 = [1, 1, 1]
use_temp_convs_3 = [0, 0, 0, 0]
temp_strides_3 = [1, 1, 1, 1]
use_temp_convs_4 = [0, ] * 23
temp_strides_4 = [1, ] * 23
use_temp_convs_5 = [0, 0, 0]
temp_strides_5 = [1, 1, 1]
# i3d, ResNet101
if arc_type == 4:
use_temp_convs_1 = [2]
temp_strides_1 = [2]
use_temp_convs_2 = [1, 1, 1]
temp_strides_2 = [1, 1, 1]
use_temp_convs_3 = [1, 0, 1, 0]
temp_strides_3 = [1, 1, 1, 1]
use_temp_convs_4 = []
for i in range(23):
if i % 2 == 0:
use_temp_convs_4.append(1)
else:
use_temp_convs_4.append(0)
temp_strides_4 = [1, ] * 23
use_temp_convs_5 = [0, 1, 0]
temp_strides_5 = [1, 1, 1]
use_temp_convs_set = [use_temp_convs_1, use_temp_convs_2, use_temp_convs_3, use_temp_convs_4, use_temp_convs_5]
temp_strides_set = [temp_strides_1, temp_strides_2, temp_strides_3, temp_strides_4, temp_strides_5]
return use_temp_convs_set, temp_strides_set
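def _demo_arc_type_2():
    # Minimal sketch: arc_type=2 is the I3D ResNet-50 recipe used by Net()
    # below. Each inner list says, per bottleneck in a stage, whether the first
    # 1x1 conv is inflated to 3x1x1 in time (1) or kept purely spatial (0);
    # res5, for example, only inflates its middle block.
    use_temp_convs_set, temp_strides_set = obtain_arc(2)
    assert use_temp_convs_set[4] == [0, 1, 0]
    assert all(s == 1 for stage in temp_strides_set for s in stage)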
def resnet10(**kwargs):
"""Constructs a ResNet-18 model.
"""
use_temp_convs_set = []
temp_strides_set = []
model = ResNet(BasicBlock, [1, 1, 1, 1], use_temp_convs_set, temp_strides_set, **kwargs)
return model
def resnet18(**kwargs):
"""Constructs a ResNet-18 model.
"""
use_temp_convs_set = []
temp_strides_set = []
model = ResNet(BasicBlock, [2, 2, 2, 2], use_temp_convs_set, temp_strides_set, **kwargs)
return model
def resnet34(**kwargs):
"""Constructs a ResNet-34 model.
"""
use_temp_convs_set = []
temp_strides_set = []
model = ResNet(BasicBlock, [3, 4, 6, 3], use_temp_convs_set, temp_strides_set, **kwargs)
return model
def resnet50(extract_features, **kwargs):
"""Constructs a ResNet-50 model.
"""
use_temp_convs_set, temp_strides_set = obtain_arc(2)
model = ResNet(Bottleneck, [3, 4, 6, 3], use_temp_convs_set, temp_strides_set,
extract_features=extract_features, **kwargs)
return model
def resnet101(**kwargs):
"""Constructs a ResNet-101 model.
"""
use_temp_convs_set, temp_strides_set = obtain_arc(4)
model = ResNet(Bottleneck, [3, 4, 23, 3], use_temp_convs_set, temp_strides_set, **kwargs)
return model
def resnet152(**kwargs):
"""Constructs a ResNet-101 model.
"""
use_temp_convs_set = []
temp_strides_set = []
model = ResNet(Bottleneck, [3, 8, 36, 3], use_temp_convs_set, temp_strides_set, **kwargs)
return model
def resnet200(**kwargs):
"""Constructs a ResNet-101 model.
"""
use_temp_convs_set = []
temp_strides_set = []
model = ResNet(Bottleneck, [3, 24, 36, 3], use_temp_convs_set, temp_strides_set, **kwargs)
return model
def Net(num_classes, extract_features=False, loss_type='softmax',
weights=None, freeze_all_but_cls=False):
net = globals()['resnet' + str(50)](
num_classes=num_classes,
sample_size=50,
sample_duration=32,
extract_features=extract_features,
loss_type=loss_type,
)
if weights is not None:
kinetics_weights = torch.load(weights)['state_dict']
print("Found weights in {}.".format(weights))
cls_name = 'fc'
else:
kinetics_weights = torch.load('i3D/kinetics-res50.pth')
cls_name = 'fc'
        print('\n Restoring Kinetics \n')
new_weights = {}
for k, v in kinetics_weights.items():
if not k.startswith('module.' + cls_name):
new_weights[k.replace('module.', '')] = v
net.load_state_dict(new_weights, strict=False)
if freeze_all_but_cls:
for name, par in net.named_parameters():
if not name.startswith('classifier'):
par.requires_grad = False
return net
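def _demo_backbone_features():
    # Minimal sketch (assumes i3D/kinetics-res50.pth is present): with
    # extract_features=True the backbone returns logits plus the res5 feature
    # map, which for a 32-frame 224x224 clip comes out as
    # [V x 2048 x T/2 x 14 x 14], the shape the VideoModel comments assume.
    net = Net(num_classes=174, extract_features=True).eval()
    with torch.no_grad():
        logits, features = net(torch.zeros(1, 3, 32, 224, 224))
    assert logits.shape == (1, 174)
    assert features.shape == (1, 2048, 16, 14, 14)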
# -*- coding: utf-8 -*-
import os
import sys
import glob
import datetime
import argparse
import random
import numpy as np
#from pathlib import Path
#filepath = Path.cwd()
#sys.path.append(filepath)
from video_loaders import load_av
from se_bb_from_np import annot_np
from SmthSequence import SmthSequence
from SmthFrameRelations import frame_relations
from PIL import Image
import matplotlib.pyplot as plt
import cv2
import i3D.gtransforms as gtransforms
import torch
from i3D.i3dpt import I3D
rgb_pt_checkpoint = 'i3D/model_rgb.pth'
class I3dFV:
def __init__(self,path):
self.anno = annot_np(path)
self.net = I3D(num_classes=400, modality='rgb')
self.net.eval()
self.net.load_state_dict(torch.load(rgb_pt_checkpoint))
self.net.cuda()
self.pre_resize_shape = (256, 340)
self.random_crop = gtransforms.GroupMultiScaleCrop(output_size=224,
scales=[1],
max_distort=0,
center_crop_only=True)
def process_video(self,finput,verbose=False):
# get video id
vidnum = int(os.path.splitext(os.path.basename(finput))[0])
# load video to ndarray list
img_array = load_av(finput)
# convert BGR to RGB
frames = [cv2.cvtColor(img, cv2.COLOR_BGR2RGB) for img in img_array]
# convert ndarray to array of PIL Images for resize and cropping
frames = [Image.fromarray(img.astype('uint8'), 'RGB') for img in frames]
# resize
frames = [img.resize((self.pre_resize_shape[1], self.pre_resize_shape[0]), Image.BILINEAR) for img in frames]
# crop
frames, (offset_h, offset_w, crop_h, crop_w) = self.random_crop(frames)
# convert back from PIL to ndarray for cv2 channel separation
frames = [np.array(img) for img in frames]
# separate channels into R,G,B frame sequences
#rs = []
#gs = []
#bs = []
#for i in range(len(frames)):
# R, G, B = cv2.split(frames[i])
# rs.append(R)
# gs.append(G)
# bs.append(B)
#frames = np.asarray([[rs, gs, bs]])
        frames = np.asarray([frames]).transpose(0, 4, 1, 2, 3) # (N, T, H, W, C) -> (N, C, T, H, W); replaces the channel splitting above
#print(frames.shape)
sample_var = torch.autograd.Variable(torch.from_numpy(frames).cuda()).float()
_, logits = self.net(sample_var)
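        # One logit vector over the 400 Kinetics classes for the whole clip,
        # returned as a (1, 400) ndarray that callers flatten and concatenate.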
return logits.cpu().detach().numpy()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--annotations',
dest='path_to_annotations',
default='../annotations_ground/',
help='folder to load annotations from')
parser.add_argument(
'--video',
dest='path_to_video',
default='.',
help='video to load')
args = parser.parse_args()
i3dfv = I3dFV(args.path_to_annotations)
fv = i3dfv.process_video(args.path_to_video, verbose=True)
print(fv)
print("fin")
@@ -17,6 +17,7 @@ from SmthSequence import SmthSequence
from SmthFrameRelations import frame_relations
from PIL import Image
import matplotlib.pyplot as plt
+import cv2
import torch
from i3D.model import VideoModel
@@ -61,33 +62,61 @@ class FrameFV:
bs.append(B)
frames = [rs, gs, bs]
#print(self.net.i3D.classifier.weight.data)
+        print(self.net.classifier[4].weight.data)
        # read frame annotations into Sequence
-        seq = SmthSequence()
-        for framenum in range(0,len(img_array)):
-            cats, bbs = self.anno.get_vf_bbx(vidnum, framenum+1)
-            # add detections to Sequence
-            for i in range(0,len(cats)):
-                seq.add(framenum, cats[i], bbs[i])
+        #seq = SmthSequence()
+        #for framenum in range(0,len(img_array)):
+        #    cats, bbs = self.anno.get_vf_bbx(vidnum, framenum+1)
+        #    # add detections to Sequence
+        #    for i in range(0,len(cats)):
+        #        seq.add(framenum, cats[i], bbs[i])
# compute object relations per frame
-        relations = []
-        for framenum in range(0,len(img_array)):
-            fv = frame_relations(seq, 0, 1, framenum)
-            relations.append(fv)
-        relations = np.asarray(relations)
+        #relations = []
+        #for framenum in range(0,len(img_array)):
+        #    fv = frame_relations(seq, 0, 1, framenum)
+        #    relations.append(fv)
+        #relations = np.asarray(relations)
# i3D features per frame
clip = torch.from_numpy(np.asarray([frames]))
-        print(clip.shape)
+        #print(clip.shape)
clip = clip.float()
glo, vid = self.net.i3D(clip)
#return glo.detach().numpy()
videos_features = self.net.conv(vid)
-        print(glo.shape)
-        print(vid.shape)
+        #print(glo.shape)
+        #print(vid.shape)
#print(videos_features.shape)
#plt.plot(np.linspace(0,400,num=400), glo.detach().numpy()[0])
#plt.show()
+        print(videos_features.shape)
+        pre = vid.detach().numpy().view()
+        post = videos_features.detach().numpy().view()
+        rows = []
+        for f in range(len(img_array)//2):
+            row = []
+            for i in range(512):
+                patch = post[0,i,f]
+                row.append(patch)
+            row = np.hstack(row)
+            rows.append(row)
+        pic = np.vstack(rows)
+        print(pic.shape)
+        while(1):
+            cv2.imshow('frame', pic)
+            k = cv2.waitKey(33)
+            if k == 27:
+                break
if __name__ == '__main__':
parser = argparse.ArgumentParser()
@@ -109,13 +138,11 @@ if __name__ == '__main__':
help='intermediate feature dimension for coord-based features')
parser.add_argument('--size', default=224, type=int, metavar='N',
help='primary image input size')
-    parser.add_argument('--batch_size', '-b', default=72, type=int,
-                        metavar='N', help='mini-batch size (default: 72)')
parser.add_argument('--num_classes', default=174, type=int,
help='num of class in the model')
parser.add_argument('--num_boxes', default=4, type=int,
help='num of boxes for each image')
-    parser.add_argument('--num_frames', default=36, type=int,
+    parser.add_argument('--num_frames', default=16, type=int,
help='num of frames for the model')
parser.add_argument('--fine_tune', help='path with ckpt to restore')
parser.add_argument('--restore_i3d')
......
#!/bin/bash
-ACTIONS=( 131 141 )
+ACTIONS=( 1 12 13 16 25 33 34 35 38 48 51 52 54 55 56 58 61 63 64 66 69 71 75 76 78 80 81 82 88 89 95 96 97 102 111 118 127 128 130 131 133 136 138 141 147 152 155 159 160 163 )
for i in ${ACTIONS[@]};
do
......