diff --git a/collate_fv.py b/collate_fv.py
index e8103d2aa74c8c202aca136292ad147ce83ec810..33944a03cd24ac68dba5f4583413ea0bca9b70f7 100644
--- a/collate_fv.py
+++ b/collate_fv.py
@@ -10,10 +10,11 @@
 import json
 import cv2
 import numpy as np
-from pathlib import Path
+#from pathlib import Path
 #filepath = Path.cwd()
 #sys.path.append(filepath)
 from compute_fv import ComputeFV
+from kinetics_feats import I3dFV
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -48,6 +49,7 @@ if __name__ == "__main__":
         dest='path_to_annotations',
         default='../annotations_ground/',
         help='folder to load annotations from')
+
     args = parser.parse_args()
 
     labs = json.load(open(args.labels_file,'r'))
@@ -58,6 +60,7 @@ if __name__ == "__main__":
 
     # load something-else annotated videos
     compfv = ComputeFV(args.path_to_annotations)
+    i3dfv = I3dFV(args.path_to_annotations)
 
     #ids = [] # video id's in order of being processed
     #classes = [] # class of each video == ordinal of label
@@ -71,13 +74,14 @@
         for filename in glob.glob(folder + "/*.webm"):
             # only process a random % of the videos (if not positive class example)
             if int(k) != args.action_id:
-                if random.random() > 0.01:
+                if random.random() > 0.02:
                     continue
 
             print("processing file: " + filename)
             vidnum = int(os.path.splitext(os.path.basename(filename))[0])
 
             fv = compfv.process_video(filename, os.path.join(args.path_to_phase_models, 'a'+str(args.action_id)+'.joblib'))
+            fv0 = i3dfv.process_video(filename)
 
             #if type(fv) is not np.ndarray:
             #    continue
@@ -85,7 +89,7 @@
             if int(k) == args.action_id:
                 fv[-1] = 1
 
-            feats.append(fv)
+            feats.append(np.concatenate([fv0.flatten(), fv]))
             #ids.append(vidnum)
 
     # list of np.ndarray to 2d ndarray
diff --git a/collect_action_features.sh b/collect_action_features.sh
index f31f518ab8831b2006872d52052564f81ad41c4f..2f87d7c94ffce9573923b1e1271583e544b8f744 100755
--- a/collect_action_features.sh
+++ b/collect_action_features.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-ACTIONS=( 131 141 )
+ACTIONS=( 1 12 13 16 25 33 34 35 38 48 51 52 54 55 56 58 61 63 64 66 69 71 75 76 78 80 81 82 88 89 95 96 97 102 111 118 127 128 130 131 133 136 138 141 147 152 155 159 160 163 )
 
 for i in ${ACTIONS[@]}; do
 
diff --git a/i3D/i3dpt.py b/i3D/i3dpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..99d7487d5d8d558cd3d9c0ba6a82c483cb6759a8
--- /dev/null
+++ b/i3D/i3dpt.py
@@ -0,0 +1,455 @@
+import math
+import os
+import torch
+import numpy as np
+
+
+def get_padding_shape(filter_shape, stride, mod=0):
+    """Fetch a tuple describing the input padding shape.
+
+    NOTES: To replicate "TF SAME" style padding, the padding shape needs to be
+        determined at runtime to handle cases when the input dimension is not divisible
+        by the stride.
+ See https://stackoverflow.com/a/49842071 for explanation of TF SAME padding logic + """ + def _pad_top_bottom(filter_dim, stride_val, mod): + if mod: + pad_along = max(filter_dim - mod, 0) + else: + pad_along = max(filter_dim - stride_val, 0) + pad_top = pad_along // 2 + pad_bottom = pad_along - pad_top + return pad_top, pad_bottom + + padding_shape = [] + for idx, (filter_dim, stride_val) in enumerate(zip(filter_shape, stride)): + depth_mod = (idx == 0) and mod + pad_top, pad_bottom = _pad_top_bottom(filter_dim, stride_val, depth_mod) + padding_shape.append(pad_top) + padding_shape.append(pad_bottom) + + depth_top = padding_shape.pop(0) + depth_bottom = padding_shape.pop(0) + padding_shape.append(depth_top) + padding_shape.append(depth_bottom) + return tuple(padding_shape) + + +def simplify_padding(padding_shapes): + all_same = True + padding_init = padding_shapes[0] + for pad in padding_shapes[1:]: + if pad != padding_init: + all_same = False + return all_same, padding_init + + +class Unit3Dpy(torch.nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size=(1, 1, 1), + stride=(1, 1, 1), + activation='relu', + padding='SAME', + use_bias=False, + use_bn=True): + super(Unit3Dpy, self).__init__() + + self.padding = padding + self.activation = activation + self.use_bn = use_bn + self.stride = stride + if padding == 'SAME': + padding_shape = get_padding_shape(kernel_size, stride) + simplify_pad, pad_size = simplify_padding(padding_shape) + self.simplify_pad = simplify_pad + if stride[0] > 1: + padding_shapes = [get_padding_shape(kernel_size, stride, mod) for + mod in range(stride[0])] + else: + padding_shapes = [padding_shape] + elif padding == 'VALID': + padding_shape = 0 + else: + raise ValueError( + 'padding should be in [VALID|SAME] but got {}'.format(padding)) + + if padding == 'SAME': + if not simplify_pad: + self.pads = [torch.nn.ConstantPad3d(x, 0) for x in padding_shapes] + self.conv3d = torch.nn.Conv3d( + in_channels, + out_channels, + kernel_size, + stride=stride, + bias=use_bias) + else: + self.conv3d = torch.nn.Conv3d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=pad_size, + bias=use_bias) + elif padding == 'VALID': + self.conv3d = torch.nn.Conv3d( + in_channels, + out_channels, + kernel_size, + padding=padding_shape, + stride=stride, + bias=use_bias) + else: + raise ValueError( + 'padding should be in [VALID|SAME] but got {}'.format(padding)) + + if self.use_bn: + # This is not strictly the correct map between epsilons in keras and + # pytorch (which have slightly different definitions of the batch norm + # forward pass), but it seems to be good enough. 
The PyTorch formula + # is described here: + # https://pytorch.org/docs/stable/_modules/torch/nn/modules/batchnorm.html + tf_style_eps = 1E-3 + self.batch3d = torch.nn.BatchNorm3d(out_channels, eps=tf_style_eps) + + if activation == 'relu': + self.activation = torch.nn.functional.relu + + def forward(self, inp): + if self.padding == 'SAME' and self.simplify_pad is False: + # Determine the padding to be applied by examining the input shape + pad_idx = inp.shape[2] % self.stride[0] + pad_op = self.pads[pad_idx] + inp = pad_op(inp) + out = self.conv3d(inp) + if self.use_bn: + out = self.batch3d(out) + if self.activation is not None: + out = torch.nn.functional.relu(out) + return out + + +class MaxPool3dTFPadding(torch.nn.Module): + def __init__(self, kernel_size, stride=None, padding='SAME'): + super(MaxPool3dTFPadding, self).__init__() + if padding == 'SAME': + padding_shape = get_padding_shape(kernel_size, stride) + self.padding_shape = padding_shape + self.stride = stride + if stride[0] > 1: + padding_shapes = [get_padding_shape(kernel_size, stride, mod) for + mod in range(stride[0])] + else: + padding_shapes = [padding_shape] + self.pads = [torch.nn.ConstantPad3d(x, 0) for x in padding_shapes] + self.pool = torch.nn.MaxPool3d(kernel_size, stride, ceil_mode=True) + + def forward(self, inp): + pad_idx = inp.shape[2] % self.stride[0] + pad_op = self.pads[pad_idx] + inp = pad_op(inp) + out = self.pool(inp) + return out + + +class Mixed(torch.nn.Module): + def __init__(self, in_channels, out_channels): + super(Mixed, self).__init__() + # Branch 0 + self.branch_0 = Unit3Dpy( + in_channels, out_channels[0], kernel_size=(1, 1, 1)) + + # Branch 1 + branch_1_conv1 = Unit3Dpy( + in_channels, out_channels[1], kernel_size=(1, 1, 1)) + branch_1_conv2 = Unit3Dpy( + out_channels[1], out_channels[2], kernel_size=(3, 3, 3)) + self.branch_1 = torch.nn.Sequential(branch_1_conv1, branch_1_conv2) + + # Branch 2 + branch_2_conv1 = Unit3Dpy( + in_channels, out_channels[3], kernel_size=(1, 1, 1)) + branch_2_conv2 = Unit3Dpy( + out_channels[3], out_channels[4], kernel_size=(3, 3, 3)) + self.branch_2 = torch.nn.Sequential(branch_2_conv1, branch_2_conv2) + + # Branch3 + branch_3_pool = MaxPool3dTFPadding( + kernel_size=(3, 3, 3), stride=(1, 1, 1), padding='SAME') + branch_3_conv2 = Unit3Dpy( + in_channels, out_channels[5], kernel_size=(1, 1, 1)) + self.branch_3 = torch.nn.Sequential(branch_3_pool, branch_3_conv2) + + def forward(self, inp): + out_0 = self.branch_0(inp) + out_1 = self.branch_1(inp) + out_2 = self.branch_2(inp) + out_3 = self.branch_3(inp) + out = torch.cat((out_0, out_1, out_2, out_3), 1) + return out + + +class I3D(torch.nn.Module): + def __init__(self, + num_classes, + modality='rgb', + dropout_prob=0, + name='inception'): + super(I3D, self).__init__() + + self.name = name + self.num_classes = num_classes + if modality == 'rgb': + in_channels = 3 + elif modality == 'flow': + in_channels = 2 + else: + raise ValueError( + '{} not among known modalities [rgb|flow]'.format(modality)) + self.modality = modality + + conv3d_1a_7x7 = Unit3Dpy( + out_channels=64, + in_channels=in_channels, + kernel_size=(7, 7, 7), + stride=(2, 2, 2), + padding='SAME') + # 1st conv-pool + self.conv3d_1a_7x7 = conv3d_1a_7x7 + self.maxPool3d_2a_3x3 = MaxPool3dTFPadding( + kernel_size=(1, 3, 3), stride=(1, 2, 2), padding='SAME') + # conv conv + conv3d_2b_1x1 = Unit3Dpy( + out_channels=64, + in_channels=64, + kernel_size=(1, 1, 1), + padding='SAME') + self.conv3d_2b_1x1 = conv3d_2b_1x1 + conv3d_2c_3x3 = Unit3Dpy( + 
out_channels=192, + in_channels=64, + kernel_size=(3, 3, 3), + padding='SAME') + self.conv3d_2c_3x3 = conv3d_2c_3x3 + self.maxPool3d_3a_3x3 = MaxPool3dTFPadding( + kernel_size=(1, 3, 3), stride=(1, 2, 2), padding='SAME') + + # Mixed_3b + self.mixed_3b = Mixed(192, [64, 96, 128, 16, 32, 32]) + self.mixed_3c = Mixed(256, [128, 128, 192, 32, 96, 64]) + + self.maxPool3d_4a_3x3 = MaxPool3dTFPadding( + kernel_size=(3, 3, 3), stride=(2, 2, 2), padding='SAME') + + # Mixed 4 + self.mixed_4b = Mixed(480, [192, 96, 208, 16, 48, 64]) + self.mixed_4c = Mixed(512, [160, 112, 224, 24, 64, 64]) + self.mixed_4d = Mixed(512, [128, 128, 256, 24, 64, 64]) + self.mixed_4e = Mixed(512, [112, 144, 288, 32, 64, 64]) + self.mixed_4f = Mixed(528, [256, 160, 320, 32, 128, 128]) + + self.maxPool3d_5a_2x2 = MaxPool3dTFPadding( + kernel_size=(2, 2, 2), stride=(2, 2, 2), padding='SAME') + + # Mixed 5 + self.mixed_5b = Mixed(832, [256, 160, 320, 32, 128, 128]) + self.mixed_5c = Mixed(832, [384, 192, 384, 48, 128, 128]) + + self.avg_pool = torch.nn.AvgPool3d((2, 7, 7), (1, 1, 1)) + self.dropout = torch.nn.Dropout(dropout_prob) + self.conv3d_0c_1x1 = Unit3Dpy( + in_channels=1024, + out_channels=self.num_classes, + kernel_size=(1, 1, 1), + activation=None, + use_bias=True, + use_bn=False) + self.softmax = torch.nn.Softmax(1) + + def forward(self, inp): + # Preprocessing + out = self.conv3d_1a_7x7(inp) + out = self.maxPool3d_2a_3x3(out) + out = self.conv3d_2b_1x1(out) + out = self.conv3d_2c_3x3(out) + out = self.maxPool3d_3a_3x3(out) + out = self.mixed_3b(out) + out = self.mixed_3c(out) + out = self.maxPool3d_4a_3x3(out) + out = self.mixed_4b(out) + out = self.mixed_4c(out) + out = self.mixed_4d(out) + out = self.mixed_4e(out) + out = self.mixed_4f(out) + out = self.maxPool3d_5a_2x2(out) + out = self.mixed_5b(out) + out = self.mixed_5c(out) + out = self.avg_pool(out) + out = self.dropout(out) + out = self.conv3d_0c_1x1(out) + out = out.squeeze(3) + out = out.squeeze(3) + out = out.mean(2) + out_logits = out + out = self.softmax(out_logits) + return out, out_logits + + def load_tf_weights(self, sess): + state_dict = {} + if self.modality == 'rgb': + prefix = 'RGB/inception_i3d' + elif self.modality == 'flow': + prefix = 'Flow/inception_i3d' + load_conv3d(state_dict, 'conv3d_1a_7x7', sess, + os.path.join(prefix, 'Conv3d_1a_7x7')) + load_conv3d(state_dict, 'conv3d_2b_1x1', sess, + os.path.join(prefix, 'Conv3d_2b_1x1')) + load_conv3d(state_dict, 'conv3d_2c_3x3', sess, + os.path.join(prefix, 'Conv3d_2c_3x3')) + + load_mixed(state_dict, 'mixed_3b', sess, + os.path.join(prefix, 'Mixed_3b')) + load_mixed(state_dict, 'mixed_3c', sess, + os.path.join(prefix, 'Mixed_3c')) + load_mixed(state_dict, 'mixed_4b', sess, + os.path.join(prefix, 'Mixed_4b')) + load_mixed(state_dict, 'mixed_4c', sess, + os.path.join(prefix, 'Mixed_4c')) + load_mixed(state_dict, 'mixed_4d', sess, + os.path.join(prefix, 'Mixed_4d')) + load_mixed(state_dict, 'mixed_4e', sess, + os.path.join(prefix, 'Mixed_4e')) + # Here goest to 0.1 max error with tf + load_mixed(state_dict, 'mixed_4f', sess, + os.path.join(prefix, 'Mixed_4f')) + + load_mixed( + state_dict, + 'mixed_5b', + sess, + os.path.join(prefix, 'Mixed_5b'), + fix_typo=True) + load_mixed(state_dict, 'mixed_5c', sess, + os.path.join(prefix, 'Mixed_5c')) + load_conv3d( + state_dict, + 'conv3d_0c_1x1', + sess, + os.path.join(prefix, 'Logits', 'Conv3d_0c_1x1'), + bias=True, + bn=False) + self.load_state_dict(state_dict) + + +def get_conv_params(sess, name, bias=False): + # Get conv weights + conv_weights_tensor = 
sess.graph.get_tensor_by_name( + os.path.join(name, 'w:0')) + if bias: + conv_bias_tensor = sess.graph.get_tensor_by_name( + os.path.join(name, 'b:0')) + conv_bias = sess.run(conv_bias_tensor) + conv_weights = sess.run(conv_weights_tensor) + conv_shape = conv_weights.shape + + kernel_shape = conv_shape[0:3] + in_channels = conv_shape[3] + out_channels = conv_shape[4] + + conv_op = sess.graph.get_operation_by_name( + os.path.join(name, 'convolution')) + padding_name = conv_op.get_attr('padding') + padding = _get_padding(padding_name, kernel_shape) + all_strides = conv_op.get_attr('strides') + strides = all_strides[1:4] + conv_params = [ + conv_weights, kernel_shape, in_channels, out_channels, strides, padding + ] + if bias: + conv_params.append(conv_bias) + return conv_params + + +def get_bn_params(sess, name): + moving_mean_tensor = sess.graph.get_tensor_by_name( + os.path.join(name, 'moving_mean:0')) + moving_var_tensor = sess.graph.get_tensor_by_name( + os.path.join(name, 'moving_variance:0')) + beta_tensor = sess.graph.get_tensor_by_name(os.path.join(name, 'beta:0')) + moving_mean = sess.run(moving_mean_tensor) + moving_var = sess.run(moving_var_tensor) + beta = sess.run(beta_tensor) + return moving_mean, moving_var, beta + + +def _get_padding(padding_name, conv_shape): + padding_name = padding_name.decode("utf-8") + if padding_name == "VALID": + return [0, 0] + elif padding_name == "SAME": + # return [math.ceil(int(conv_shape[0])/2), math.ceil(int(conv_shape[1])/2)] + return [ + math.floor(int(conv_shape[0]) / 2), + math.floor(int(conv_shape[1]) / 2), + math.floor(int(conv_shape[2]) / 2) + ] + else: + raise ValueError('Invalid padding name ' + padding_name) + + +def load_conv3d(state_dict, name_pt, sess, name_tf, bias=False, bn=True): + # Transfer convolution params + conv_name_tf = os.path.join(name_tf, 'conv_3d') + conv_params = get_conv_params(sess, conv_name_tf, bias=bias) + if bias: + conv_weights, kernel_shape, in_channels, out_channels, strides, padding, conv_bias = conv_params + else: + conv_weights, kernel_shape, in_channels, out_channels, strides, padding = conv_params + + conv_weights_rs = np.transpose( + conv_weights, (4, 3, 0, 1, + 2)) # to pt format (out_c, in_c, depth, height, width) + state_dict[name_pt + '.conv3d.weight'] = torch.from_numpy(conv_weights_rs) + if bias: + state_dict[name_pt + '.conv3d.bias'] = torch.from_numpy(conv_bias) + + # Transfer batch norm params + if bn: + conv_tf_name = os.path.join(name_tf, 'batch_norm') + moving_mean, moving_var, beta = get_bn_params(sess, conv_tf_name) + + out_planes = conv_weights_rs.shape[0] + state_dict[name_pt + '.batch3d.weight'] = torch.ones(out_planes) + state_dict[name_pt + + '.batch3d.bias'] = torch.from_numpy(beta.squeeze()) + state_dict[name_pt + + '.batch3d.running_mean'] = torch.from_numpy(moving_mean.squeeze()) + state_dict[name_pt + + '.batch3d.running_var'] = torch.from_numpy(moving_var.squeeze()) + + +def load_mixed(state_dict, name_pt, sess, name_tf, fix_typo=False): + # Branch 0 + load_conv3d(state_dict, name_pt + '.branch_0', sess, + os.path.join(name_tf, 'Branch_0/Conv3d_0a_1x1')) + + # Branch .1 + load_conv3d(state_dict, name_pt + '.branch_1.0', sess, + os.path.join(name_tf, 'Branch_1/Conv3d_0a_1x1')) + load_conv3d(state_dict, name_pt + '.branch_1.1', sess, + os.path.join(name_tf, 'Branch_1/Conv3d_0b_3x3')) + + # Branch 2 + load_conv3d(state_dict, name_pt + '.branch_2.0', sess, + os.path.join(name_tf, 'Branch_2/Conv3d_0a_1x1')) + if fix_typo: + load_conv3d(state_dict, name_pt + '.branch_2.1', sess, + 
os.path.join(name_tf, 'Branch_2/Conv3d_0a_3x3')) + else: + load_conv3d(state_dict, name_pt + '.branch_2.1', sess, + os.path.join(name_tf, 'Branch_2/Conv3d_0b_3x3')) + + # Branch 3 + load_conv3d(state_dict, name_pt + '.branch_3.1', sess, + os.path.join(name_tf, 'Branch_3/Conv3d_0b_1x1')) diff --git a/i3D/model.py b/i3D/model.py deleted file mode 100644 index 55cb83dfbccb3c112337f89ae7270e1e8b0da3d0..0000000000000000000000000000000000000000 --- a/i3D/model.py +++ /dev/null @@ -1,140 +0,0 @@ -import torch -import torch.nn as nn -from i3D.resnet3d_xl import Net -import torch.nn.functional as F -''' -Video Classification Model library. -''' - -class TrainingScheduleError(Exception): - pass - -class VideoModel(nn.Module): - def __init__(self, - num_classes, - num_boxes, - num_videos=16, - restore_dict=None, - freeze_weights=None, - device=None, - loss_type='softmax'): - super(VideoModel, self).__init__() - self.device = device - self.num_frames = num_videos - self.num_classes = num_classes - # Network loads kinetic pre-trained weights in initialization - self.i3D = Net(num_classes, extract_features=True, loss_type=loss_type) - - - try: - # Restore weights - if restore_dict: - self.restore(restore_dict) - # Freeze weights - if freeze_weights: - self.freeze_weights(freeze_weights) - else: - print(" > No weights are freezed") - except Exception as e: - print(" > Exception {}".format(e)) - - def restore(self, restore=None): - # Load pre-trained I3D + Graph weights for fine-tune (replace the last FC) - restore_finetuned = restore.get("restore_finetuned", None) - if restore_finetuned: - self._restore_fintuned(restore_finetuned) - print(" > Restored I3D + Graph weights") - return - - # Load pre-trained I3D weights - restore_i3d = restore.get("restore_i3d", None) - if restore_i3d: - self._restore_i3d(restore_i3d) - print(" > Restored only I3D weights") - return - - # Load pre-trained I3D + Graph weights without replacing anything - restore_predict = restore.get("restore_predict", None) - if restore_predict: - self._restore_predict(restore_predict) - print(" > Restored the model with strict weights") - return - - def _restore_predict(self, path): - if path is None: - raise TrainingScheduleError('You should pre-train the video model on your training data first') - - weights = torch.load(path, map_location=self.device)['state_dict'] - new_weights = {} - for k, v in weights.items(): - new_weights[k.replace('module.', '')] = v - - self.load_state_dict(new_weights, strict=True) - print(" > Weights {} loaded".format(path)) - - def _restore_i3d(self, path): - if path is None: - raise TrainingScheduleError('You should pre-train the video model on your training data first') - - weights = torch.load(path, map_location=self.device)['state_dict'] - new_weights = {} - for k, v in weights.items(): - if not k.startswith('module.fc') and not k.startswith('module.i3D.classifier'): - new_weights[k.replace('module.', '')] = v - self.load_state_dict(new_weights, strict=False) - - def _restore_fintuned(self, path): - if path is None: - raise TrainingScheduleError('You should pre-train the video model on your training data first') - - weights = torch.load(path, map_location=self.device)['state_dict'] - new_weights = {} - for k, v in weights.items(): - # Don't load classifiers (different classes 88 vs 86) - if not k.startswith('module.fc'): - if not k.startswith('module.i3D.classifier'): - new_weights[k.replace('module.', '')] = v - - self.load_state_dict(new_weights, strict=False) - print(" > Weights {} loaded".format(path)) 
- - def freeze_weights(self, module): - if module == 'i3d': - print(" > Freeze I3D module") - for param in self.i3D.parameters(): - param.requires_grad = False - elif module == 'fine_tuned': - print(" > Freeze Graph + I3D module, only last FC is training") - # Fixed the entire params without the last FC - for name, param in self.i3D.named_parameters(): - if not name.startswith('classifier'): - param.requires_grad = False - for param in self.graph_embedding.parameters(): - param.requires_grad = False - for param in self.conv.parameters(): - param.requires_grad = False - - else: - raise NotImplementedError('Unrecognized option, you can freeze either graph module or I3D module') - pass - - def _get_i3d_features(self, videos, output_video_features=False): - # org_features - [V x 2048 x T / 2 x 14 x 14] - _, org_features = self.i3D(videos) - # Reduce dimension video_features - [V x 512 x T / 2 x 14 x 14] - videos_features = self.conv(org_features) - bs, d, t, h, w = videos_features.size() - # Get global features - videos_features_rs = videos_features.permute(0, 2, 1, 3, 4) # [V x T / 2 x 512 x h x w] - videos_features_rs = videos_features_rs.reshape(-1, d, h, w) # [V * T / 2 x 512 x h x w] - global_features = self.avgpool(videos_features_rs) # [V * T / 2 x 512 x 1 x 1] - global_features = self.dropout(global_features) - global_features = global_features.reshape(bs, t, d) # [V x T / 2 x 512] - if output_video_features: - return global_features, videos_features - else: - return global_features - - def flatten(self, x): - return [item for sublist in x for item in sublist] - diff --git a/i3D/model_lib.py b/i3D/model_lib.py deleted file mode 100644 index 54027926a3e2029b7b41952ade9ec85850eb1f57..0000000000000000000000000000000000000000 --- a/i3D/model_lib.py +++ /dev/null @@ -1,1050 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -from i3D.resnet3d_xl import Net -from i3D.nonlocal_helper import Nonlocal - - -class VideoModelCoord(nn.Module): - def __init__(self, opt): - super(VideoModelCoord, self).__init__() - self.nr_boxes = opt.num_boxes - self.nr_actions = opt.num_classes - self.nr_frames = opt.num_frames // 2 - self.coord_feature_dim = opt.coord_feature_dim - - self.coord_to_feature = nn.Sequential( - nn.Linear(4, self.coord_feature_dim//2, bias=False), - nn.BatchNorm1d(self.coord_feature_dim//2), - nn.ReLU(inplace=True), - nn.Linear(self.coord_feature_dim//2, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU() - ) - - self.spatial_node_fusion = nn.Sequential( - nn.Linear(self.coord_feature_dim*2, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU(inplace=True), - nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU() - ) - - self.box_feature_fusion = nn.Sequential( - nn.Linear(self.nr_frames*self.coord_feature_dim, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU(inplace=True), - nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU() - ) - - self.classifier = nn.Sequential( - nn.Linear(self.coord_feature_dim, self.coord_feature_dim), - # nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU(inplace=True), - nn.Linear(self.coord_feature_dim, 512), #self.coord_feature_dim), - nn.ReLU(inplace=True), - nn.Linear(512, self.nr_actions) - ) - - if opt.fine_tune: - self.fine_tune(opt.fine_tune) - - def fine_tune(self, 
restore_path, parameters_to_train=['classifier']): - weights = torch.load(restore_path)['state_dict'] - new_weights = {} - #import pdb - for k, v in weights.items(): - if not 'classifier.4' in k: - new_weights[k.replace('module.', '')] = v - #pdb.set_trace() - self.load_state_dict(new_weights, strict=False) - print('Num of weights in restore dict {}'.format(len(new_weights.keys()))) - - frozen_weights = 0 - for name, param in self.named_parameters(): - if not 'classifier.4' in name: - - param.requires_grad = False - frozen_weights += 1 - - else: - print('Training : {}'.format(name)) - print('Number of frozen weights {}'.format(frozen_weights)) - assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \ - 'Check the naming convention of the parameters' - - def forward(self, global_img_input, box_categories, box_input, video_label, is_inference=False): - # local_img_tensor is (b, nr_frames, nr_boxes, 3, h, w) - # global_img_tensor is (b, nr_frames, 3, h, w) - # box_input is (b, nr_frames, nr_boxes, 4) - - b, _, _, _h, _w = global_img_input.size() - # global_imgs = global_img_input.view(b*self.nr_frames, 3, _h, _w) - # local_imgs = local_img_input.view(b*self.nr_frames*self.nr_boxes, 3, _h, _w) - - box_input = box_input.transpose(2, 1).contiguous() - box_input = box_input.view(b*self.nr_boxes*self.nr_frames, 4) - - bf = self.coord_to_feature(box_input) - bf = bf.view(b, self.nr_boxes, self.nr_frames, self.coord_feature_dim) - - # spatial message passing (graph) - spatial_message = bf.sum(dim=1, keepdim=True) # (b, 1, self.nr_frames, coord_feature_dim) - # message passed should substract itself, and normalize to it as a single feature - spatial_message = (spatial_message - bf) / (self.nr_boxes - 1) # message passed should substract itself - bf_and_message = torch.cat([bf, spatial_message], dim=3) # (b, nr_boxes, nr_frames, 2*coord_feature_dim) - - # (b*nr_boxes*nr_frames, coord_feature_dim) - bf_spatial = self.spatial_node_fusion(bf_and_message.view(b*self.nr_boxes*self.nr_frames, -1)) - bf_spatial = bf_spatial.view(b, self.nr_boxes, self.nr_frames, self.coord_feature_dim) - - bf_temporal_input = bf_spatial.view(b, self.nr_boxes, self.nr_frames*self.coord_feature_dim) - - box_features = self.box_feature_fusion(bf_temporal_input.view(b*self.nr_boxes, -1)) # (b*nr_boxes, coord_feature_dim) - box_features = torch.mean(box_features.view(b, self.nr_boxes, -1), dim=1) # (b, coord_feature_dim) - # video_features = torch.cat([global_features, local_features, box_features], dim=1) - video_features = box_features - - cls_output = self.classifier(video_features) # (b, num_classes) - return cls_output - -class VideoModelCoordLatent(nn.Module): - def __init__(self, opt): - super(VideoModelCoordLatent, self).__init__() - self.nr_boxes = opt.num_boxes - self.nr_actions = opt.num_classes - self.nr_frames = opt.num_frames // 2 - self.img_feature_dim = opt.img_feature_dim - self.coord_feature_dim = opt.coord_feature_dim - - self.category_embed_layer = nn.Embedding(3, opt.coord_feature_dim // 2, padding_idx=0, scale_grad_by_freq=True) - - self.coord_to_feature = nn.Sequential( - nn.Linear(4, self.coord_feature_dim//2, bias=False), - nn.BatchNorm1d(self.coord_feature_dim//2), - nn.ReLU(inplace=True), - nn.Linear(self.coord_feature_dim//2, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU() - ) - - self.coord_category_fusion = nn.Sequential( - nn.Linear(self.coord_feature_dim+self.coord_feature_dim//2, self.coord_feature_dim, bias=False), - 
nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU(inplace=True), - ) - - self.spatial_node_fusion = nn.Sequential( - nn.Linear(self.coord_feature_dim*2, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU(inplace=True), - nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU() - ) - - self.box_feature_fusion = nn.Sequential( - nn.Linear(self.nr_frames*self.coord_feature_dim, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU(inplace=True), - nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU() - ) - - self.classifier = nn.Sequential( - nn.Linear(self.coord_feature_dim, self.coord_feature_dim), - # nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU(inplace=True), - nn.Linear(self.coord_feature_dim, 512), #self.coord_feature_dim), - nn.ReLU(inplace=True), - nn.Linear(512, self.nr_actions) - ) - - if opt.fine_tune: - self.fine_tune(opt.fine_tune) - - def fine_tune(self, restore_path, parameters_to_train=['classifier']): - weights = torch.load(restore_path)['state_dict'] - new_weights = {} - for k, v in weights.items(): - if not 'classifier.4' in k: - new_weights[k.replace('module.', '')] = v - self.load_state_dict(new_weights, strict=False) - print('Num of weights in restore dict {}'.format(len(new_weights.keys()))) - - frozen_weights = 0 - for name, param in self.named_parameters(): - if not 'classifier.4' in name: - param.requires_grad = False - frozen_weights += 1 - else: - print('Training : {}'.format(name)) - print('Number of frozen weights {}'.format(frozen_weights)) - assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! 
' \ - 'Check the naming convention of the parameters' - - def forward(self, global_img_input, box_categories, box_input, video_label, is_inference=False): - # local_img_tensor is (b, nr_frames, nr_boxes, 3, h, w) - # global_img_tensor is (b, nr_frames, 3, h, w) - # box_input is (b, nr_frames, nr_boxes, 4) - - b, _, _, _h, _w = global_img_input.size() - - box_input = box_input.transpose(2, 1).contiguous() - box_input = box_input.view(b*self.nr_boxes*self.nr_frames, 4) - - box_categories = box_categories.long() - box_categories = box_categories.transpose(2, 1).contiguous() - box_categories = box_categories.view(b*self.nr_boxes*self.nr_frames) - box_category_embeddings = self.category_embed_layer(box_categories) # (b*nr_b*nr_f, coord_feature_dim//2) - - bf = self.coord_to_feature(box_input) - bf = torch.cat([bf, box_category_embeddings], dim=1) # (b*nr_b*nr_f, coord_feature_dim + coord_feature_dim//2) - bf = self.coord_category_fusion(bf) # (b*nr_b*nr_f, coord_feature_dim) - bf = bf.view(b, self.nr_boxes, self.nr_frames, self.coord_feature_dim) - - # spatial message passing (graph) - spatial_message = bf.sum(dim=1, keepdim=True) # (b, 1, self.nr_frames, coord_feature_dim) - # message passed should substract itself, and normalize to it as a single feature - spatial_message = (spatial_message - bf) / (self.nr_boxes - 1) # message passed should substract itself - bf_and_message = torch.cat([bf, spatial_message], dim=3) # (b, nr_boxes, nr_frames, 2*coord_feature_dim) - - # (b*nr_boxes*nr_frames, coord_feature_dim) - bf_spatial = self.spatial_node_fusion(bf_and_message.view(b*self.nr_boxes*self.nr_frames, -1)) - bf_spatial = bf_spatial.view(b, self.nr_boxes, self.nr_frames, self.coord_feature_dim) - - bf_temporal_input = bf_spatial.view(b, self.nr_boxes, self.nr_frames*self.coord_feature_dim) - - box_features = self.box_feature_fusion(bf_temporal_input.view(b*self.nr_boxes, -1)) # (b*nr_boxes, coord_feature_dim) - box_features = torch.mean(box_features.view(b, self.nr_boxes, -1), dim=1) # (b, coord_feature_dim) - # video_features = torch.cat([global_features, local_features, box_features], dim=1) - video_features = box_features - - cls_output = self.classifier(video_features) # (b, num_classes) - return cls_output - -class VideoModelCoordLatentNL(nn.Module): - def __init__(self, opt): - super(VideoModelCoordLatentNL, self).__init__() - self.nr_boxes = opt.num_boxes - self.nr_actions = opt.num_classes - self.nr_frames = opt.num_frames // 2 - self.img_feature_dim = opt.img_feature_dim - self.coord_feature_dim = opt.coord_feature_dim - - self.category_embed_layer = nn.Embedding(3, opt.coord_feature_dim // 2, padding_idx=0, scale_grad_by_freq=True) - - self.coord_to_feature = nn.Sequential( - nn.Linear(4, self.coord_feature_dim // 2, bias=False), - nn.BatchNorm1d(self.coord_feature_dim // 2), - nn.ReLU(inplace=True), - nn.Linear(self.coord_feature_dim // 2, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU() - ) - - self.coord_category_fusion = nn.Sequential( - nn.Linear(self.coord_feature_dim + self.coord_feature_dim // 2, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU(inplace=True), - ) - - self.spatial_node_fusion = nn.Sequential( - nn.Linear(self.coord_feature_dim * 2, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU(inplace=True), - nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU() - ) - - 
self.nr_nonlocal_layers = 3 - self.nonlocal_fusion = [] - for i in range(self.nr_nonlocal_layers): - self.nonlocal_fusion.append(nn.Sequential( - Nonlocal(dim=self.coord_feature_dim, dim_inner=self.coord_feature_dim // 2), - nn.Conv1d(self.coord_feature_dim, self.coord_feature_dim, kernel_size=1, stride=1, padding=0, - bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU() - )) - self.nonlocal_fusion = nn.ModuleList(self.nonlocal_fusion) - - self.box_feature_fusion = nn.Sequential( - nn.Linear(self.nr_frames * self.coord_feature_dim, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU(inplace=True), - nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU() - ) - - self.classifier = nn.Sequential( - nn.Linear(self.coord_feature_dim, self.coord_feature_dim), - # nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU(inplace=True), - nn.Linear(self.coord_feature_dim, 512), # self.coord_feature_dim), - nn.ReLU(inplace=True), - nn.Linear(512, self.nr_actions) - ) - - if opt.fine_tune: - self.fine_tune(opt.fine_tune) - - def train(self, mode=True): # overriding default train function - super(VideoModelCoordLatentNL, self).train(mode) - for m in self.modules(): # or self.modules(), if freezing all bn layers - if isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d): - m.eval() - # shutdown update in frozen mode - m.weight.requires_grad = False - m.bias.requires_grad = False - - def fine_tune(self, restore_path, parameters_to_train=['classifier']): - weights = torch.load(restore_path)['state_dict'] - new_weights = {} - # import pdb - for k, v in weights.items(): - if not 'classifier.4' in k: - new_weights[k.replace('module.', '')] = v - # pdb.set_trace() - self.load_state_dict(new_weights, strict=False) - print('Num of weights in restore dict {}'.format(len(new_weights.keys()))) - - frozen_weights = 0 - for name, param in self.named_parameters(): - if not 'classifier.4' in name: - - param.requires_grad = False - frozen_weights += 1 - - else: - print('Training : {}'.format(name)) - print('Number of frozen weights {}'.format(frozen_weights)) - assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! 
' \ - 'Check the naming convention of the parameters' - - def forward(self, global_img_input, box_categories, box_input, video_label, is_inference=False): - # local_img_tensor is (b, nr_frames, nr_boxes, 3, h, w) - # global_img_tensor is (b, nr_frames, 3, h, w) - # box_input is (b, nr_frames, nr_boxes, 4) - - b, _, _, _h, _w = global_img_input.size() - - box_input = box_input.transpose(2, 1).contiguous() - box_input = box_input.view(b * self.nr_boxes * self.nr_frames, 4) - - box_categories = box_categories.long() - box_categories = box_categories.transpose(2, 1).contiguous() - box_categories = box_categories.view(b * self.nr_boxes * self.nr_frames) - box_category_embeddings = self.category_embed_layer(box_categories) # (b*nr_b*nr_f, coord_feature_dim//2) - - bf = self.coord_to_feature(box_input) - bf = torch.cat([bf, box_category_embeddings], dim=1) # (b*nr_b*nr_f, coord_feature_dim + coord_feature_dim//2) - bf = self.coord_category_fusion(bf) # (b*nr_b*nr_f, coord_feature_dim) - bf = bf.view(b, self.nr_boxes, self.nr_frames, self.coord_feature_dim) - - # spatial message passing (graph) - spatial_message = bf.sum(dim=1, keepdim=True) # (b, 1, self.nr_frames, coord_feature_dim) - # message passed should substract itself, and normalize to it as a single feature - - spatial_message = (spatial_message - bf) / (self.nr_boxes - 1) # message passed should substract itself - bf_and_message = torch.cat([bf, spatial_message], dim=3) # (b, nr_boxes, nr_frames, 2*coord_feature_dim) - - # (b*nr_boxes*nr_frames, coord_feature_dim) - bf_spatial = self.spatial_node_fusion(bf_and_message.view(b * self.nr_boxes * self.nr_frames, -1)) - bf_spatial = bf_spatial.view(b, self.nr_boxes, self.nr_frames, self.coord_feature_dim) - - bf_temporal_input = bf_spatial.view(b, self.nr_boxes, self.nr_frames * self.coord_feature_dim) - - bf_nonlocal = self.box_feature_fusion( - bf_temporal_input.view(b * self.nr_boxes, -1)) # (b*nr_boxes, coord_feature_dim) - bf_nonlocal = bf_nonlocal.view(b, self.nr_boxes, self.coord_feature_dim).permute(0, 2, - 1).contiguous() # (N, C, NB) - for i in range(self.nr_nonlocal_layers): - bf_nonlocal = self.nonlocal_fusion[i](bf_nonlocal) - - box_features = torch.mean(bf_nonlocal, dim=2) # (b, coord_feature_dim) - - # video_features = torch.cat([global_features, local_features, box_features], dim=1) - video_features = box_features - - cls_output = self.classifier(video_features) # (b, num_classes) - return cls_output - -class VideoModelGlobalCoordLatent(nn.Module): - """ - This model contains only global pooling without any graph. 
- """ - - def __init__(self, opt, - ): - super(VideoModelGlobalCoordLatent, self).__init__() - - self.nr_boxes = opt.num_boxes - self.nr_actions = opt.num_classes - self.nr_frames = opt.num_frames - self.img_feature_dim = opt.img_feature_dim - self.coord_feature_dim = opt.coord_feature_dim - self.i3D = Net(self.nr_actions, extract_features=True, loss_type='softmax') - self.dropout = nn.Dropout(0.3) - self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1)) - self.conv = nn.Conv3d(2048, 512, kernel_size=(1, 1, 1), stride=1) - - self.category_embed_layer = nn.Embedding(3, opt.coord_feature_dim // 2, padding_idx=0, scale_grad_by_freq=True) - - self.c_coord_category_fusion = nn.Sequential( - nn.Linear(self.coord_feature_dim+self.coord_feature_dim//2, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU(inplace=True), - ) - - self.c_coord_to_feature = nn.Sequential( - nn.Linear(4, self.coord_feature_dim // 2, bias=False), - nn.BatchNorm1d(self.coord_feature_dim // 2), - nn.ReLU(inplace=True), - nn.Linear(self.coord_feature_dim // 2, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU() - ) - - self.c_spatial_node_fusion = nn.Sequential( - nn.Linear(self.coord_feature_dim * 2, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU(inplace=True), - nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU() - ) - - self.c_box_feature_fusion = nn.Sequential( - nn.Linear((self.nr_frames // 2) * self.coord_feature_dim, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU(inplace=True), - nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU() - ) - - self.classifier = nn.Sequential( - nn.Linear(self.coord_feature_dim + 2*self.img_feature_dim, self.coord_feature_dim), - nn.ReLU(inplace=True), - nn.Linear(self.coord_feature_dim, 512), - nn.ReLU(inplace=True), - nn.Linear(512, self.nr_actions) - ) - if opt.fine_tune: - self.fine_tune(opt.fine_tune) - if opt.restore_i3d: - self.restore_i3d(opt.restore_i3d) - if opt.restore_custom: - self.restore_custom(opt.restore_custom) - - def train(self, mode=True): # overriding default train function - super(VideoModelGlobalCoordLatent, self).train(mode) - for m in self.modules(): # or self.modules(), if freezing all bn layers - if isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d): - m.eval() - # shutdown update in frozen mode - m.weight.requires_grad = False - m.bias.requires_grad = False - - def restore_custom(self, restore_path): - print("restoring path {}".format(restore_path)) - weights = torch.load(restore_path) - - ks = list(weights.keys()) - print('\n\n BEFORE', weights[ks[0]][0,0,0]) - new_weights = {} - # import pdb - for k, v in weights.items(): - new_weights[k.replace('module.', '')] = v - self.load_state_dict(new_weights, strict=False) - print('\n\n AFTER', self.state_dict()[ks[0]][0,0, 0]) - print('Num of weights in restore dict {}'.format(len(new_weights.keys()))) - - frozen_weights = 0 - for name, param in self.named_parameters(): - if not name.startswith('classifier') : - param.requires_grad = False - frozen_weights += 1 - else: - print('Training : {}'.format(name)) - print('Number of frozen weights {}'.format(frozen_weights)) - assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! 
' \ - 'Check the naming convention of the parameters' - - - def restore_i3d(self, restore_path, parameters_to_train=['classifier']): - weights = torch.load(restore_path)['state_dict'] - new_weights = {} - # import pdb - for k, v in weights.items(): - if 'i3D' in k : - new_weights[k.replace('module.', '')] = v - # pdb.set_trace() - self.load_state_dict(new_weights, strict=False) - print('Num of weights in restore dict {}'.format(len(new_weights.keys()))) - - for m in self.i3D.modules(): - if isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d): - m.eval() - # shutdown update in frozen mode - m.weight.requires_grad = False - m.bias.requires_grad = False - - frozen_weights = 0 - for name, param in self.named_parameters(): - if 'i3D' in name: - param.requires_grad = False - frozen_weights += 1 - else: - print('Training : {}'.format(name)) - print('Number of frozen weights {}'.format(frozen_weights)) - assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \ - 'Check the naming convention of the parameters' - - def fine_tune(self, restore_path, parameters_to_train=['classifier']): - weights = torch.load(restore_path)['state_dict'] - new_weights = {} - # import pdb - for k, v in weights.items(): - if not 'classifier.4' in k and 'i3D.classifier': - new_weights[k.replace('module.', '')] = v - # pdb.set_trace() - self.load_state_dict(new_weights, strict=False) - print('Num of weights in restore dict {}'.format(len(new_weights.keys()))) - - frozen_weights = 0 - for name, param in self.named_parameters(): - if not 'classifier.4' in name: - param.requires_grad = False - frozen_weights += 1 - else: - print('Training : {}'.format(name)) - print('Number of frozen weights {}'.format(frozen_weights)) - assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! 
' \ - 'Check the naming convention of the parameters' - - def forward(self, global_img_input, box_categories, box_input, video_label, is_inference=False): - - """ - V: num of videos - T: num of frames - P: num of proposals - :param videos: [V x 3 x T x 224 x 224] - :param proposals_t: [V x T] List of BoxList (size of num_boxes each) - :return: - """ - - # org_features - [V x 2048 x T / 2 x 14 x 14] - bs, _, _, _, _ = global_img_input.shape - y_i3d, org_features = self.i3D(global_img_input) - # Reduce dimension video_features - [V x 512 x T / 2 x 14 x 14] - videos_features = self.conv(org_features) - b = bs - - box_input = box_input.transpose(2, 1).contiguous() - box_input = box_input.view(b * self.nr_boxes * (self.nr_frames//2), 4) - - box_categories = box_categories.long() - box_categories = box_categories.transpose(2, 1).contiguous() - box_categories = box_categories.view(b * self.nr_boxes * (self.nr_frames // 2)) - box_category_embeddings = self.category_embed_layer(box_categories) # (b*nr_b*nr_f, coord_feature_dim//2) - - bf = self.c_coord_to_feature(box_input) - bf = torch.cat([bf, box_category_embeddings], dim=1) # (b*nr_b*nr_f, coord_feature_dim + coord_feature_dim//2) - bf = self.c_coord_category_fusion(bf) # (b*nr_b*nr_f, coord_feature_dim) - - bf = bf.view(b, self.nr_boxes, self.nr_frames // 2, self.coord_feature_dim) - - # spatial message passing (graph) - spatial_message = bf.sum(dim=1, keepdim=True) # (b, 1, self.nr_frames, coord_feature_dim) - # message passed should substract itself, and normalize to it as a single feature - spatial_message = (spatial_message - bf) / (self.nr_boxes - 1) # message passed should substract itself - - bf_message_gf = torch.cat([bf, spatial_message], dim=3) # (b, nr_boxes, nr_frames, 2*coord_feature_dim) - - # (b*nr_boxes*nr_frames, coord_feature_dim) - bf_spatial = self.c_spatial_node_fusion(bf_message_gf.view(b * self.nr_boxes * (self.nr_frames // 2), -1)) - bf_spatial = bf_spatial.view(b, self.nr_boxes, self.nr_frames // 2, self.coord_feature_dim) - - bf_temporal_input = bf_spatial.view(b, self.nr_boxes, (self.nr_frames // 2) * self.coord_feature_dim) - - box_features = self.c_box_feature_fusion( - bf_temporal_input.view(b * self.nr_boxes, -1)) # (b*nr_boxes, img_feature_dim) - coord_ft = torch.mean(box_features.view(b, self.nr_boxes, -1), dim=1) # (b, coord_feature_dim) - # video_features = torch.cat([global_features, local_features, box_features], dim=1) - # _gf = self.global_new_fc(_gf) - _gf = videos_features.mean(-1).mean(-1).view(b, (self.nr_frames//2), 2*self.img_feature_dim) - _gf = _gf.mean(1) - video_features = torch.cat([_gf.view(b, -1), coord_ft], dim=-1) - - cls_output = self.classifier(video_features) # (b, num_classes) - return cls_output - -class VideoModelGlobalCoordLatentNL(nn.Module): - """ - This model contains only global pooling without any graph. 
- """ - - def __init__(self, base_net, opt, - ): - super(VideoModelGlobalCoordLatentNL, self).__init__() - - self.nr_boxes = opt.num_boxes - self.nr_actions = opt.num_classes - self.nr_frames = opt.num_frames - self.img_feature_dim = opt.img_feature_dim - self.coord_feature_dim = opt.coord_feature_dim - self.i3D = Net(self.nr_actions, extract_features=True, loss_type='softmax') - self.dropout = nn.Dropout(0.3) - self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1)) - self.conv = nn.Conv3d(2048, 512, kernel_size=(1, 1, 1), stride=1) - - - self.category_embed_layer = nn.Embedding(3, opt.coord_feature_dim // 2, padding_idx=0, scale_grad_by_freq=True) - - self.c_coord_category_fusion = nn.Sequential( - nn.Linear(self.coord_feature_dim+self.coord_feature_dim//2, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU(inplace=True), - ) - - self.c_coord_to_feature = nn.Sequential( - nn.Linear(4, self.coord_feature_dim // 2, bias=False), - nn.BatchNorm1d(self.coord_feature_dim // 2), - nn.ReLU(inplace=True), - nn.Linear(self.coord_feature_dim // 2, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU() - ) - - self.c_spatial_node_fusion = nn.Sequential( - nn.Linear(self.coord_feature_dim * 2, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU(inplace=True), - nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU() - ) - - self.nr_nonlocal_layers = 3 - self.c_nonlocal_fusion = [] - for i in range(self.nr_nonlocal_layers): - self.c_nonlocal_fusion.append(nn.Sequential( - Nonlocal(dim=self.coord_feature_dim, dim_inner=self.coord_feature_dim // 2), - nn.Conv1d(self.coord_feature_dim, self.coord_feature_dim, kernel_size=1, stride=1, padding=0, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU() - )) - self.c_nonlocal_fusion = nn.ModuleList(self.c_nonlocal_fusion) - - self.c_box_feature_fusion = nn.Sequential( - nn.Linear((self.nr_frames // 2) * self.coord_feature_dim, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU(inplace=True), - nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU() - ) - - self.classifier = nn.Sequential( - nn.Linear(self.coord_feature_dim + 2*self.img_feature_dim, self.coord_feature_dim), - nn.ReLU(inplace=True), - nn.Linear(self.coord_feature_dim, 512), - nn.ReLU(inplace=True), - nn.Linear(512, self.nr_actions) - ) - if opt.fine_tune: - self.fine_tune(opt.fine_tune) - if opt.restore_i3d: - self.restore_i3d(opt.restore_i3d) - - if opt.restore_custom: - self.restore_custom(opt.restore_custom) - - def restore_custom(self, restore_path): - print("restoring path {}".format(restore_path)) - weights = torch.load(restore_path) - ks = list(weights.keys()) - print('\n\n BEFORE', weights[ks[0]][0,0,0]) - new_weights = {} - # import pdb - for k, v in weights.items(): - new_weights[k.replace('module.', '')] = v - self.load_state_dict(new_weights, strict=False) - print('\n\n AFTER', self.state_dict()[ks[0]][0,0, 0]) - print('Num of weights in restore dict {}'.format(len(new_weights.keys()))) - - frozen_weights = 0 - for name, param in self.named_parameters(): - if not name.startswith('classifier') : - param.requires_grad = False - frozen_weights += 1 - else: - print('Training : {}'.format(name)) - print('Number of frozen weights {}'.format(frozen_weights)) - assert frozen_weights != 0, 'You are trying 
to fine tune, but no weights are frozen!!! ' \ - 'Check the naming convention of the parameters' - - - - def restore_i3d(self, restore_path, parameters_to_train=['classifier']): - weights = torch.load(restore_path)['state_dict'] - new_weights = {} - # import pdb - for k, v in weights.items(): - if 'i3D' in k or k.startswith('conv.'): - new_weights[k.replace('module.', '')] = v - # pdb.set_trace() - self.load_state_dict(new_weights, strict=False) - print('Num of weights in restore dict {}'.format(len(new_weights.keys()))) - - for m in self.i3D.modules(): - if isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d): - m.eval() - # shutdown update in frozen mode - m.weight.requires_grad = False - m.bias.requires_grad = False - - frozen_weights = 0 - for name, param in self.named_parameters(): - if 'i3D' in name or k.startswith('conv.') : - param.requires_grad = False - frozen_weights += 1 - else: - print('Training : {}'.format(name)) - print('Number of frozen weights {}'.format(frozen_weights)) - assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \ - 'Check the naming convention of the parameters' - - def train(self, mode=True): # overriding default train function - super(VideoModelGlobalCoordLatentNL, self).train(mode) - for m in self.i3D.modules(): # or self.modules(), if freezing all bn layers - if isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d): - m.eval() - # shutdown update in frozen mode - m.weight.requires_grad = False - m.bias.requires_grad = False - - def fine_tune(self, restore_path, parameters_to_train=['classifier']): - weights = torch.load(restore_path)['state_dict'] - new_weights = {} - import pdb - for k, v in weights.items(): - if not 'classifier.4' in k and 'i3D.classifier' not in k: - new_weights[k.replace('module.', '')] = v - pdb.set_trace() - self.load_state_dict(new_weights, strict=False) - print('Num of weights in restore dict {}'.format(len(new_weights.keys()))) - - frozen_weights = 0 - for name, param in self.named_parameters(): - if not 'classifier.4' in name: - param.requires_grad = False - frozen_weights += 1 - else: - print('Training : {}'.format(name)) - print('Number of frozen weights {}'.format(frozen_weights)) - assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! 
' \ - 'Check the naming convention of the parameters' - - def forward(self, global_img_input, box_categories, box_input, video_label, is_inference=False): - - """ - V: num of videos - T: num of frames - P: num of proposals - :param videos: [V x 3 x T x 224 x 224] - :param proposals_t: [V x T] List of BoxList (size of num_boxes each) - :return: - """ - - # org_features - [V x 2048 x T / 2 x 14 x 14] - bs, _, _, _, _ = global_img_input.shape - y_i3d, org_features = self.i3D(global_img_input) - # Reduce dimension video_features - [V x 512 x T / 2 x 14 x 14] - videos_features = self.conv(org_features) - b = bs - - box_input = box_input.transpose(2, 1).contiguous() - box_input = box_input.view(b * self.nr_boxes * (self.nr_frames//2), 4) - - box_categories = box_categories.long() - box_categories = box_categories.transpose(2, 1).contiguous() - box_categories = box_categories.view(b * self.nr_boxes * (self.nr_frames // 2)) - box_category_embeddings = self.category_embed_layer(box_categories) # (b*nr_b*nr_f, coord_feature_dim//2) - - bf = self.c_coord_to_feature(box_input) - bf = torch.cat([bf, box_category_embeddings], dim=1) # (b*nr_b*nr_f, coord_feature_dim + coord_feature_dim//2) - bf = self.c_coord_category_fusion(bf) # (b*nr_b*nr_f, coord_feature_dim) - - bf = bf.view(b, self.nr_boxes, self.nr_frames // 2, self.coord_feature_dim) - - # spatial message passing (graph) - spatial_message = bf.sum(dim=1, keepdim=True) # (b, 1, self.nr_frames, coord_feature_dim) - # message passed should substract itself, and normalize to it as a single feature - spatial_message = (spatial_message - bf) / (self.nr_boxes - 1) # message passed should substract itself - - bf_message_gf = torch.cat([bf, spatial_message], dim=3) # (b, nr_boxes, nr_frames, 2*coord_feature_dim) - - # (b*nr_boxes*nr_frames, coord_feature_dim) - bf_spatial = self.c_spatial_node_fusion(bf_message_gf.view(b * self.nr_boxes * (self.nr_frames // 2), -1)) - bf_spatial = bf_spatial.view(b, self.nr_boxes, self.nr_frames // 2, self.coord_feature_dim) - - bf_temporal_input = bf_spatial.view(b, self.nr_boxes, (self.nr_frames // 2) * self.coord_feature_dim) - - bf_nonlocal = self.c_box_feature_fusion( - bf_temporal_input.view(b * self.nr_boxes, -1)) # (b*nr_boxes, img_feature_dim) - - bf_nonlocal = bf_nonlocal.view(b, self.nr_boxes, self.coord_feature_dim).permute(0, 2, 1).contiguous() # (N, C, NB) - for i in range(self.nr_nonlocal_layers): - bf_nonlocal = self.c_nonlocal_fusion[i](bf_nonlocal) - - coord_ft = torch.mean(bf_nonlocal, dim=2) # (b, coord_feature_dim) - - # video_features = torch.cat([global_features, local_features, box_features], dim=1) - _gf = videos_features.mean(-1).mean(-1).view(b, (self.nr_frames//2), 2*self.img_feature_dim) - _gf = _gf.mean(1) - video_features = torch.cat([_gf.view(b, -1), coord_ft], dim=-1) - - cls_output = self.classifier(video_features) # (b, num_classes) - return cls_output - -class VideoGlobalModel(nn.Module): - """ - This model contains only global pooling without any graph. 
- """ - - def __init__(self, opt, - ): - super(VideoGlobalModel, self).__init__() - - self.nr_boxes = opt.num_boxes - self.nr_actions = opt.num_classes - self.nr_frames = opt.num_frames - self.img_feature_dim = opt.img_feature_dim - self.coord_feature_dim = opt.coord_feature_dim - self.i3D = Net(self.nr_actions, extract_features=True, loss_type='softmax') - self.dropout = nn.Dropout(0.3) - self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1)) - self.conv = nn.Conv3d(2048, 512, kernel_size=(1, 1, 1), stride=1) - self.fc = nn.Linear(512, self.nr_actions) - self.crit = nn.CrossEntropyLoss() - - if opt.fine_tune: - self.fine_tune(opt.fine_tune) - - def fine_tune(self, restore_path, parameters_to_train=['classifier']): - weights = torch.load(restore_path)['state_dict'] - new_weights = {} - for k, v in weights.items(): - if not 'fc' in k and not 'classifier' in k: - new_weights[k.replace('module.', '')] = v - self.load_state_dict(new_weights, strict=False) - print('Num of weights in restore dict {}'.format(len(new_weights.keys()))) - - frozen_weights = 0 - for name, param in self.named_parameters(): - if not 'fc' in name: - param.requires_grad = False - frozen_weights += 1 - else: - print('Training : {}'.format(name)) - print('Number of frozen weights {}'.format(frozen_weights)) - assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \ - 'Check the naming convention of the parameters' - - def forward(self, global_img_input, local_img_input, box_input, video_label, is_inference=False): - """ - V: num of videos - T: num of frames - P: num of proposals - :param videos: [V x 3 x T x 224 x 224] - :param proposals_t: [V x T] List of BoxList (size of num_boxes each) - :return: - """ - - # org_features - [V x 2048 x T / 2 x 14 x 14] - y_i3d, org_features = self.i3D(global_img_input) - # Reduce dimension video_features - [V x 512 x T / 2 x 14 x 14] - videos_features = self.conv(org_features) - - # Get global features - [V x 512] - global_features = self.avgpool(videos_features).squeeze() - global_features = self.dropout(global_features) - - cls_output = self.fc(global_features) - return cls_output - -class VideoModelGlobalCoord(nn.Module): - """ - This model contains only global pooling without any graph. 
- """ - - def __init__(self, opt): - super(VideoModelGlobalCoord, self).__init__() - - self.nr_boxes = opt.num_boxes - self.nr_actions = opt.num_classes - self.nr_frames = opt.num_frames - self.img_feature_dim = opt.img_feature_dim - self.coord_feature_dim = opt.coord_feature_dim - self.i3D = Net(self.nr_actions, extract_features=True, loss_type='softmax') - self.dropout = nn.Dropout(0.3) - self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1)) - self.conv = nn.Conv3d(2048, 256, kernel_size=(1, 1, 1), stride=1) - - - self.global_new_fc = nn.Sequential( - nn.Linear(256, self.img_feature_dim, bias=False), - nn.BatchNorm1d(self.img_feature_dim), - nn.ReLU(inplace=True) - ) - - - self.c_coord_to_feature = nn.Sequential( - nn.Linear(4, self.coord_feature_dim // 2, bias=False), - nn.BatchNorm1d(self.coord_feature_dim // 2), - nn.ReLU(inplace=True), - nn.Linear(self.coord_feature_dim // 2, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU() - ) - - self.c_spatial_node_fusion = nn.Sequential( - nn.Linear(self.coord_feature_dim * 2, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU(inplace=True), - nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU() - ) - - self.c_box_feature_fusion = nn.Sequential( - nn.Linear((self.nr_frames // 2) * self.coord_feature_dim, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU(inplace=True), - nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False), - nn.BatchNorm1d(self.coord_feature_dim), - nn.ReLU() - ) - - self.classifier = nn.Sequential( - nn.Linear(self.coord_feature_dim + self.img_feature_dim, self.coord_feature_dim), - nn.ReLU(inplace=True), - nn.Linear(self.coord_feature_dim, 512), - nn.ReLU(inplace=True), - nn.Linear(512, self.nr_actions) - ) - if opt.fine_tune: - self.fine_tune(opt.fine_tune) - if opt.restore_i3d: - self.restore_i3d(opt.restore_i3d) - - def train(self, mode=True): # overriding default train function - super(VideoModelGlobalCoord, self).train(mode) - for m in self.i3D.modules(): # or self.modules(), if freezing all bn layers - if isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d): - m.eval() - # shutdown update in frozen mode - m.weight.requires_grad = False - m.bias.requires_grad = False - - def restore_i3d(self, restore_path, parameters_to_train=['classifier']): - weights = torch.load(restore_path)['state_dict'] - new_weights = {} - # import pdb - for k, v in weights.items(): - if 'i3D' in k : - new_weights[k.replace('module.', '')] = v - # pdb.set_trace() - self.load_state_dict(new_weights, strict=False) - print('Num of weights in restore dict {}'.format(len(new_weights.keys()))) - - frozen_weights = 0 - for name, param in self.named_parameters(): - if 'i3D' in name: - param.requires_grad = False - frozen_weights += 1 - else: - print('Training : {}'.format(name)) - print('Number of frozen weights {}'.format(frozen_weights)) - assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! 
' \ - 'Check the naming convention of the parameters' - - def fine_tune(self, restore_path, parameters_to_train=['classifier']): - weights = torch.load(restore_path)['state_dict'] - new_weights = {} - # import pdb - for k, v in weights.items(): - if not 'classifier.4' in k and 'i3D.classifier': - new_weights[k.replace('module.', '')] = v - # pdb.set_trace() - self.load_state_dict(new_weights, strict=False) - print('Num of weights in restore dict {}'.format(len(new_weights.keys()))) - - frozen_weights = 0 - for name, param in self.named_parameters(): - if not 'classifier.4' in name: - param.requires_grad = False - frozen_weights += 1 - else: - print('Training : {}'.format(name)) - print('Number of frozen weights {}'.format(frozen_weights)) - assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \ - 'Check the naming convention of the parameters' - - def forward(self, global_img_input, box_categories, box_input, video_label, is_inference=False): - - """ - V: num of videos - T: num of frames - P: num of proposals - :param videos: [V x 3 x T x 224 x 224] - :param proposals_t: [V x T] List of BoxList (size of num_boxes each) - :return: - """ - - # org_features - [V x 2048 x T / 2 x 14 x 14] - bs, _, _, _, _ = global_img_input.shape - y_i3d, org_features = self.i3D(global_img_input) - # Reduce dimension video_features - [V x 512 x T / 2 x 14 x 14] - videos_features = self.conv(org_features) - b = bs - - box_input = box_input.transpose(2, 1).contiguous() - box_input = box_input.view(b * self.nr_boxes * (self.nr_frames//2), 4) - - bf = self.c_coord_to_feature(box_input) - bf = bf.view(b, self.nr_boxes, self.nr_frames // 2, self.coord_feature_dim) - - # spatial message passing (graph) - spatial_message = bf.sum(dim=1, keepdim=True) # (b, 1, self.nr_frames, coord_feature_dim) - # message passed should substract itself, and normalize to it as a single feature - spatial_message = (spatial_message - bf) / (self.nr_boxes - 1) # message passed should substract itself - - bf_message_gf = torch.cat([bf, spatial_message], dim=3) # (b, nr_boxes, nr_frames, 2*coord_feature_dim) - - # (b*nr_boxes*nr_frames, coord_feature_dim) - bf_spatial = self.c_spatial_node_fusion(bf_message_gf.view(b * self.nr_boxes * (self.nr_frames // 2), -1)) - bf_spatial = bf_spatial.view(b, self.nr_boxes, self.nr_frames // 2, self.coord_feature_dim) - - bf_temporal_input = bf_spatial.view(b, self.nr_boxes, (self.nr_frames // 2) * self.coord_feature_dim) - - box_features = self.c_box_feature_fusion( - bf_temporal_input.view(b * self.nr_boxes, -1)) # (b*nr_boxes, img_feature_dim) - coord_ft = torch.mean(box_features.view(b, self.nr_boxes, -1), dim=1) # (b, coord_feature_dim) - # video_features = torch.cat([global_features, local_features, box_features], dim=1) - _gf = videos_features.mean(-1).mean(-1).view(b*(self.nr_frames//2), self.img_feature_dim) - _gf = self.global_new_fc(_gf) - _gf = _gf.view(b, self.nr_frames // 2, self.img_feature_dim).mean(1) - video_features = torch.cat([_gf.view(b, -1), coord_ft], dim=-1) - - cls_output = self.classifier(video_features) # (b, num_classes) - return cls_output diff --git a/i3D/resnet3d_xl.py b/i3D/resnet3d_xl.py deleted file mode 100644 index b4d1695507c7a9f2b232bd886ee87c5489ff899d..0000000000000000000000000000000000000000 --- a/i3D/resnet3d_xl.py +++ /dev/null @@ -1,456 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.autograd import Variable -import math -import numpy as np - -from functools import partial - -__all__ 
= [ - 'ResNet', 'resnet10', 'resnet18', 'resnet34', 'resnet50', 'resnet101', - 'resnet152', 'resnet200', -] - - -def conv3x3x3(in_planes, out_planes, stride=1): - # 3x3x3 convolution with padding - return nn.Conv3d( - in_planes, - out_planes, - kernel_size=3, - stride=stride, - padding=1, - bias=False) - - -def downsample_basic_block(x, planes, stride): - out = F.avg_pool3d(x, kernel_size=1, stride=stride) - zero_pads = torch.Tensor( - out.size(0), planes - out.size(1), out.size(2), out.size(3), - out.size(4)).zero_() - if isinstance(out.data, torch.cuda.FloatTensor): - zero_pads = zero_pads.cuda() - - out = Variable(torch.cat([out.data, zero_pads], dim=1)) - - return out - - -class BasicBlock(nn.Module): - expansion = 1 - - def __init__(self, inplanes, planes, stride=1, downsample=None): - super(BasicBlock, self).__init__() - self.conv1 = conv3x3x3(inplanes, planes, stride) - self.bn1 = nn.BatchNorm3d(planes) - self.relu = nn.ReLU(inplace=True) - self.conv2 = conv3x3x3(planes, planes) - self.bn2 = nn.BatchNorm3d(planes) - self.downsample = downsample - self.stride = stride - - def forward(self, x): - residual = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - - if self.downsample is not None: - residual = self.downsample(x) - - out += residual - out = self.relu(out) - - return out - - -class Bottleneck(nn.Module): - conv_op = None - offset_groups = 1 - - def __init__(self, dim_in, dim_out, stride, dim_inner, group=1, use_temp_conv=1, temp_stride=1, dcn=False, - shortcut_type='B'): - super(Bottleneck, self).__init__() - # 1 x 1 layer - self.with_dcn = dcn - self.conv1 = self.Conv3dBN(dim_in, dim_inner, (1 + use_temp_conv * 2, 1, 1), (temp_stride, 1, 1), - (use_temp_conv, 0, 0)) - self.relu = nn.ReLU(inplace=True) - # 3 x 3 layer - self.conv2 = self.Conv3dBN(dim_inner, dim_inner, (1, 3, 3), (1, stride, stride), (0, 1, 1)) - # 1 x 1 layer - self.conv3 = self.Conv3dBN(dim_inner, dim_out, (1, 1, 1), (1, 1, 1), (0, 0, 0)) - - self.shortcut_type = shortcut_type - self.dim_in = dim_in - self.dim_out = dim_out - self.temp_stride = temp_stride - self.stride = stride - # nn.Conv3d(dim_in, dim_out, (1,1,1),(temp_stride,stride,stride),(0,0,0)) - if self.shortcut_type == 'B': - if self.dim_in == self.dim_out and self.temp_stride == 1 and self.stride == 1: # or (self.dim_in == self.dim_out and self.dim_in == 64 and self.stride ==1): - - pass - else: - # pass - self.shortcut = self.Conv3dBN(dim_in, dim_out, (1, 1, 1), (temp_stride, stride, stride), (0, 0, 0)) - - # nn.Conv3d(dim_in,dim_inner,kernel_size=(1+use_temp_conv*2,1,1),stride = (temp_stride,1,1),padding = ) - - def forward(self, x): - residual = x - out = self.conv1(x) - out = self.relu(out) - out = self.conv2(out) - out = self.relu(out) - out = self.conv3(out) - if self.dim_in == self.dim_out and self.temp_stride == 1 and self.stride == 1: - pass - else: - residual = self.shortcut(residual) - out += residual - out = self.relu(out) - return out - - def Conv3dBN(self, dim_in, dim_out, kernels, strides, pads, group=1): - if self.with_dcn and kernels[0] > 1: - # use deformable conv - return nn.Sequential( - self.conv_op(dim_in, dim_out, kernel_size=kernels, stride=strides, padding=pads, bias=False, - offset_groups=self.offset_groups), - nn.BatchNorm3d(dim_out) - ) - else: - return nn.Sequential( - nn.Conv3d(dim_in, dim_out, kernel_size=kernels, stride=strides, padding=pads, bias=False), - nn.BatchNorm3d(dim_out) - ) - - -class ResNet(nn.Module): - - def __init__(self, - block, - 
layers, - use_temp_convs_set, - temp_strides_set, - sample_size, - sample_duration, - shortcut_type='B', - num_classes=400, - stage_with_dcn=(False, False, False, False), - extract_features=False, - loss_type='softmax'): - super(ResNet, self).__init__() - self.extract_features = extract_features - self.stage_with_dcn = stage_with_dcn - self.group = 1 - self.width_per_group = 64 - self.dim_inner = self.group * self.width_per_group - # self.shortcut_type = shortcut_type - self.conv1 = nn.Conv3d( - 3, - 64, - kernel_size=(1 + use_temp_convs_set[0][0] * 2, 7, 7), - stride=(temp_strides_set[0][0], 2, 2), - padding=(use_temp_convs_set[0][0], 3, 3), - bias=False) - self.bn1 = nn.BatchNorm3d(64) - self.relu = nn.ReLU(inplace=True) - self.maxpool1 = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 0, 0)) - with_dcn = True if self.stage_with_dcn[0] else False - self.layer1 = self._make_layer(block, 64, 256, shortcut_type, stride=1, num_blocks=layers[0], - dim_inner=self.dim_inner, group=self.group, use_temp_convs=use_temp_convs_set[1], - temp_strides=temp_strides_set[1], dcn=with_dcn) - self.maxpool2 = nn.MaxPool3d(kernel_size=(2, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0)) - with_dcn = True if self.stage_with_dcn[1] else False - self.layer2 = self._make_layer(block, 256, 512, shortcut_type, stride=2, num_blocks=layers[1], - dim_inner=self.dim_inner * 2, group=self.group, - use_temp_convs=use_temp_convs_set[2], temp_strides=temp_strides_set[2], - dcn=with_dcn) - with_dcn = True if self.stage_with_dcn[2] else False - self.layer3 = self._make_layer(block, 512, 1024, shortcut_type, stride=2, num_blocks=layers[2], - dim_inner=self.dim_inner * 4, group=self.group, - use_temp_convs=use_temp_convs_set[3], temp_strides=temp_strides_set[3], - dcn=with_dcn) - with_dcn = True if self.stage_with_dcn[3] else False - self.layer4 = self._make_layer(block, 1024, 2048, shortcut_type, stride=1, num_blocks=layers[3], - dim_inner=self.dim_inner * 8, group=self.group, - use_temp_convs=use_temp_convs_set[4], temp_strides=temp_strides_set[4], - dcn=with_dcn) - last_duration = int(math.ceil(sample_duration / 2)) # int(math.ceil(sample_duration / 8)) - last_size = int(math.ceil(sample_size / 16)) - # self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) #nn.AdaptiveAvgPool3d((1, 1, 1)) # - self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1)) - self.dropout = torch.nn.Dropout(p=0.5) - self.classifier = nn.Linear(2048, num_classes) - - for m in self.modules(): - # if isinstance(m, nn.Conv3d): - # m.weight = nn.init.kaiming_normal_(m.weight, mode='fan_out') - # elif isinstance(m,nn.Linear): - # m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out') - # elif - if isinstance(m, nn.BatchNorm3d): - m.weight.data.fill_(1) - m.bias.data.zero_() - - def _make_layer(self, block, dim_in, dim_out, shortcut_type, stride, num_blocks, dim_inner=None, group=None, - use_temp_convs=None, temp_strides=None, dcn=False): - if use_temp_convs is None: - use_temp_convs = np.zeros(num_blocks).astype(int) - if temp_strides is None: - temp_strides = np.ones(num_blocks).astype(int) - if len(use_temp_convs) < num_blocks: - for _ in range(num_blocks - len(use_temp_convs)): - use_temp_convs.append(0) - temp_strides.append(1) - layers = [] - for idx in range(num_blocks): - block_stride = 2 if (idx == 0 and stride == 2) else 1 - - layers.append( - block(dim_in, dim_out, block_stride, dim_inner, group, use_temp_convs[idx], temp_strides[idx], dcn)) - dim_in = dim_out - return nn.Sequential(*layers) - - def 
forward_single(self, x): - x = self.conv1(x) - - x = self.bn1(x) - x = self.relu(x) - x = self.maxpool1(x) - - x = self.layer1(x) - x = self.maxpool2(x) - x = self.layer2(x) - - x = self.layer3(x) - features = self.layer4(x) - - x = self.avgpool(features) - - x = x.view(x.size(0), -1) - x = self.dropout(x) - - y = self.classifier(x) - if self.extract_features: - return y, features - else: - return y - - def forward_multi(self, x): - clip_preds = [] - # import ipdb;ipdb.set_trace() - for clip_idx in range(x.shape[1]): # B, 10, 3, 3, 32, 224, 224 - spatial_crops = [] - for crop_idx in range(x.shape[2]): - clip = x[:, clip_idx, crop_idx] - clip = self.forward_single(clip) - spatial_crops.append(clip) - spatial_crops = torch.stack(spatial_crops, 1).mean(1) # (B, 400) - clip_preds.append(spatial_crops) - clip_preds = torch.stack(clip_preds, 1).mean(1) # (B, 400) - return clip_preds - - def forward(self, x): - - # 5D tensor == single clip - if x.dim() == 5: - pred = self.forward_single(x) - - # 7D tensor == 3 crops/10 clips - elif x.dim() == 7: - pred = self.forward_multi(x) - - # loss_dict = {} - # if 'label' in batch: - # loss = F.cross_entropy(pred, batch['label'], reduction='none') - # loss_dict = {'clf': loss} - - return pred - - -def get_fine_tuning_parameters(model, ft_begin_index): - if ft_begin_index == 0: - return model.parameters() - - ft_module_names = [] - for i in range(ft_begin_index, 5): - ft_module_names.append('layer{}'.format(i)) - ft_module_names.append('fc') - # import ipdb;ipdb.set_trace() - parameters = [] - for k, v in model.named_parameters(): - for ft_module in ft_module_names: - if ft_module in k: - parameters.append({'params': v}) - break - else: - parameters.append({'params': v, 'lr': 0.0}) - - return parameters - - -def obtain_arc(arc_type): - # c2d, ResNet50 - if arc_type == 1: - use_temp_convs_1 = [0] - temp_strides_1 = [2] - use_temp_convs_2 = [0, 0, 0] - temp_strides_2 = [1, 1, 1] - use_temp_convs_3 = [0, 0, 0, 0] - temp_strides_3 = [1, 1, 1, 1] - use_temp_convs_4 = [0, ] * 6 - temp_strides_4 = [1, ] * 6 - use_temp_convs_5 = [0, 0, 0] - temp_strides_5 = [1, 1, 1] - - # i3d, ResNet50 - if arc_type == 2: - use_temp_convs_1 = [2] - temp_strides_1 = [1] - use_temp_convs_2 = [1, 1, 1] - temp_strides_2 = [1, 1, 1] - use_temp_convs_3 = [1, 0, 1, 0] - temp_strides_3 = [1, 1, 1, 1] - use_temp_convs_4 = [1, 0, 1, 0, 1, 0] - temp_strides_4 = [1, 1, 1, 1, 1, 1] - use_temp_convs_5 = [0, 1, 0] - temp_strides_5 = [1, 1, 1] - - # c2d, ResNet101 - if arc_type == 3: - use_temp_convs_1 = [0] - temp_strides_1 = [2] - use_temp_convs_2 = [0, 0, 0] - temp_strides_2 = [1, 1, 1] - use_temp_convs_3 = [0, 0, 0, 0] - temp_strides_3 = [1, 1, 1, 1] - use_temp_convs_4 = [0, ] * 23 - temp_strides_4 = [1, ] * 23 - use_temp_convs_5 = [0, 0, 0] - temp_strides_5 = [1, 1, 1] - - # i3d, ResNet101 - if arc_type == 4: - use_temp_convs_1 = [2] - temp_strides_1 = [2] - use_temp_convs_2 = [1, 1, 1] - temp_strides_2 = [1, 1, 1] - use_temp_convs_3 = [1, 0, 1, 0] - temp_strides_3 = [1, 1, 1, 1] - use_temp_convs_4 = [] - for i in range(23): - if i % 2 == 0: - use_temp_convs_4.append(1) - else: - use_temp_convs_4.append(0) - - temp_strides_4 = [1, ] * 23 - use_temp_convs_5 = [0, 1, 0] - temp_strides_5 = [1, 1, 1] - - use_temp_convs_set = [use_temp_convs_1, use_temp_convs_2, use_temp_convs_3, use_temp_convs_4, use_temp_convs_5] - temp_strides_set = [temp_strides_1, temp_strides_2, temp_strides_3, temp_strides_4, temp_strides_5] - - return use_temp_convs_set, temp_strides_set - - -def resnet10(**kwargs): - 
"""Constructs a ResNet-18 model. - """ - use_temp_convs_set = [] - temp_strides_set = [] - model = ResNet(BasicBlock, [1, 1, 1, 1], use_temp_convs_set, temp_strides_set, **kwargs) - return model - - -def resnet18(**kwargs): - """Constructs a ResNet-18 model. - """ - use_temp_convs_set = [] - temp_strides_set = [] - model = ResNet(BasicBlock, [2, 2, 2, 2], use_temp_convs_set, temp_strides_set, **kwargs) - return model - - -def resnet34(**kwargs): - """Constructs a ResNet-34 model. - """ - use_temp_convs_set = [] - temp_strides_set = [] - model = ResNet(BasicBlock, [3, 4, 6, 3], use_temp_convs_set, temp_strides_set, **kwargs) - return model - - -def resnet50(extract_features, **kwargs): - """Constructs a ResNet-50 model. - """ - use_temp_convs_set, temp_strides_set = obtain_arc(2) - model = ResNet(Bottleneck, [3, 4, 6, 3], use_temp_convs_set, temp_strides_set, - extract_features=extract_features, **kwargs) - return model - - -def resnet101(**kwargs): - """Constructs a ResNet-101 model. - """ - use_temp_convs_set, temp_strides_set = obtain_arc(4) - model = ResNet(Bottleneck, [3, 4, 23, 3], use_temp_convs_set, temp_strides_set, **kwargs) - return model - - -def resnet152(**kwargs): - """Constructs a ResNet-101 model. - """ - use_temp_convs_set = [] - temp_strides_set = [] - model = ResNet(Bottleneck, [3, 8, 36, 3], use_temp_convs_set, temp_strides_set, **kwargs) - return model - - -def resnet200(**kwargs): - """Constructs a ResNet-101 model. - """ - use_temp_convs_set = [] - temp_strides_set = [] - model = ResNet(Bottleneck, [3, 24, 36, 3], use_temp_convs_set, temp_strides_set, **kwargs) - return model - - -def Net(num_classes, extract_features=False, loss_type='softmax', - weights=None, freeze_all_but_cls=False): - net = globals()['resnet' + str(50)]( - num_classes=num_classes, - sample_size=50, - sample_duration=32, - extract_features=extract_features, - loss_type=loss_type, - ) - - if weights is not None: - kinetics_weights = torch.load(weights)['state_dict'] - print("Found weights in {}.".format(weights)) - cls_name = 'fc' - else: - kinetics_weights = torch.load('i3D/kinetics-res50.pth') - cls_name = 'fc' - print('\n Restoring Kintetics \n') - - new_weights = {} - for k, v in kinetics_weights.items(): - if not k.startswith('module.' 
+ cls_name):
-            new_weights[k.replace('module.', '')] = v
-    net.load_state_dict(new_weights, strict=False)
-
-    if freeze_all_but_cls:
-        for name, par in net.named_parameters():
-            if not name.startswith('classifier'):
-                par.requires_grad = False
-    return net
diff --git a/kinetics_feats.py b/kinetics_feats.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3632bc8e5a58e74f83e8f6f1437dc759001fc4d
--- /dev/null
+++ b/kinetics_feats.py
@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+import os
+import sys
+import glob
+import datetime
+import argparse
+import random
+
+import numpy as np
+
+#from pathlib import Path
+#filepath = Path.cwd()
+#sys.path.append(filepath)
+from video_loaders import load_av
+from se_bb_from_np import annot_np
+from SmthSequence import SmthSequence
+from SmthFrameRelations import frame_relations
+
+from PIL import Image
+import matplotlib.pyplot as plt
+import cv2
+import i3D.gtransforms as gtransforms
+
+import torch
+from i3D.i3dpt import I3D
+
+rgb_pt_checkpoint = 'i3D/model_rgb.pth'
+
+class I3dFV:
+    def __init__(self, path):
+        self.anno = annot_np(path)
+        self.net = I3D(num_classes=400, modality='rgb')
+        self.net.eval()
+        self.net.load_state_dict(torch.load(rgb_pt_checkpoint))
+        self.net.cuda()
+        self.pre_resize_shape = (256, 340)
+        self.random_crop = gtransforms.GroupMultiScaleCrop(output_size=224,
+                                                           scales=[1],
+                                                           max_distort=0,
+                                                           center_crop_only=True)
+
+    def process_video(self, finput, verbose=False):
+        # get video id
+        vidnum = int(os.path.splitext(os.path.basename(finput))[0])
+
+        # load video to ndarray list
+        img_array = load_av(finput)
+
+        # convert BGR to RGB
+        frames = [cv2.cvtColor(img, cv2.COLOR_BGR2RGB) for img in img_array]
+        # convert ndarray to array of PIL Images for resize and cropping
+        frames = [Image.fromarray(img.astype('uint8'), 'RGB') for img in frames]
+        # resize
+        frames = [img.resize((self.pre_resize_shape[1], self.pre_resize_shape[0]), Image.BILINEAR) for img in frames]
+        # crop
+        frames, (offset_h, offset_w, crop_h, crop_w) = self.random_crop(frames)
+        # convert back from PIL to ndarray for cv2 channel separation
+        frames = [np.array(img) for img in frames]
+        # separate channels into R,G,B frame sequences
+        #rs = []
+        #gs = []
+        #bs = []
+        #for i in range(len(frames)):
+        #    R, G, B = cv2.split(frames[i])
+        #    rs.append(R)
+        #    gs.append(G)
+        #    bs.append(B)
+        #frames = np.asarray([[rs, gs, bs]])
+
+        frames = np.asarray([frames]).transpose(0, 4, 1, 2, 3) # alternative to channel splitting above?
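+        # np.asarray([frames]) stacks the 224x224 crops into shape (1, T, H, W, C);
+        # the transpose reorders this to (1, C, T, H, W), the channels-first clip
+        # layout the Conv3d-based I3D network consumes, so the commented-out
+        # per-channel cv2.split separation above is not needed.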
+        #print(frames.shape)
+
+        sample_var = torch.autograd.Variable(torch.from_numpy(frames).cuda()).float()
+        _, logits = self.net(sample_var)
+
+        return logits.cpu().detach().numpy()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--annotations',
+        dest='path_to_annotations',
+        default='../annotations_ground/',
+        help='folder to load annotations from')
+    parser.add_argument(
+        '--video',
+        dest='path_to_video',
+        default='.',
+        help='video to load')
+
+    args = parser.parse_args()
+
+    i3dfv = I3dFV(args.path_to_annotations)
+    fv = i3dfv.process_video(args.path_to_video, verbose=True)
+
+    print(fv)
+
+    print("fin")
diff --git a/regen_frame_fv.py b/regen_frame_fv.py
index 2e16af7ec85c90d93ea807b8632c4fafc378d23e..69637165e36776a47dcacc0219fc00389067623b 100644
--- a/regen_frame_fv.py
+++ b/regen_frame_fv.py
@@ -17,6 +17,7 @@ from SmthSequence import SmthSequence
 from SmthFrameRelations import frame_relations
 
 from PIL import Image
+import matplotlib.pyplot as plt
 import cv2
 import torch
 from i3D.model import VideoModel
@@ -61,33 +62,61 @@
             bs.append(B)
         frames = [rs, gs, bs]
 
+        #print(self.net.i3D.classifier.weight.data)
+        print(self.net.classifier[4].weight.data)
+
         # read frame annotations into Sequence
-        seq = SmthSequence()
-        for framenum in range(0,len(img_array)):
-            cats, bbs = self.anno.get_vf_bbx(vidnum, framenum+1)
-            # add detections to Sequence
-            for i in range(0,len(cats)):
-                seq.add(framenum, cats[i], bbs[i])
+        #seq = SmthSequence()
+        #for framenum in range(0,len(img_array)):
+        #    cats, bbs = self.anno.get_vf_bbx(vidnum, framenum+1)
+        #    # add detections to Sequence
+        #    for i in range(0,len(cats)):
+        #        seq.add(framenum, cats[i], bbs[i])
 
         # compute object relations per frame
-        relations = []
-        for framenum in range(0,len(img_array)):
-            fv = frame_relations(seq, 0, 1, framenum)
-            relations.append(fv)
-        relations = np.asarray(relations)
+        #relations = []
+        #for framenum in range(0,len(img_array)):
+        #    fv = frame_relations(seq, 0, 1, framenum)
+        #    relations.append(fv)
+        #relations = np.asarray(relations)
 
         # i3D features per frame
         clip = torch.from_numpy(np.asarray([frames]))
-        print(clip.shape)
+        #print(clip.shape)
         clip = clip.float()
         glo, vid = self.net.i3D(clip)
+        #return glo.detach().numpy()
+
         videos_features = self.net.conv(vid)
-        print(glo.shape)
-        print(vid.shape)
+        #print(glo.shape)
+        #print(vid.shape)
+        #print(videos_features.shape)
+
+        #plt.plot(np.linspace(0,400,num=400), glo.detach().numpy()[0])
+        #plt.show()
 
-        print(videos_features.shape)
+        pre = vid.detach().numpy().view()
+        post = videos_features.detach().numpy().view()
+
+        rows = []
+        for f in range(len(img_array)//2):
+            row = []
+            for i in range(512):
+                patch = post[0,i,f]
+                row.append(patch)
+            row = np.hstack(row)
+            rows.append(row)
+        pic = np.vstack(rows)
+
+        print(pic.shape)
+        while(1):
+            cv2.imshow('frame', pic)
+            k = cv2.waitKey(33)
+            if k == 27:
+                break
+
 
 if __name__ == '__main__':
@@ -109,13 +138,11 @@
                         help='intermediate feature dimension for coord-based features')
     parser.add_argument('--size', default=224, type=int, metavar='N',
                         help='primary image input size')
-    parser.add_argument('--batch_size', '-b', default=72, type=int,
-                        metavar='N', help='mini-batch size (default: 72)')
     parser.add_argument('--num_classes', default=174, type=int,
                         help='num of class in the model')
     parser.add_argument('--num_boxes', default=4, type=int,
                         help='num of boxes for each image')
-    parser.add_argument('--num_frames', default=36, type=int,
+    parser.add_argument('--num_frames', default=16, type=int,
                         help='num of frames for the model')
     parser.add_argument('--fine_tune', help='path with ckpt to restore')
     parser.add_argument('--restore_i3d')
diff --git a/train_actions.sh b/train_actions.sh
index 5791b976fa4c5ed597ff14bb83dc4cab90f48932..303261b3cd8ce253d16fcec2c497a608d7fc56cb 100755
--- a/train_actions.sh
+++ b/train_actions.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-ACTIONS=( 131 141 )
+ACTIONS=( 1 12 13 16 25 33 34 35 38 48 51 52 54 55 56 58 61 63 64 66 69 71 75 76 78 80 81 82 88 89 95 96 97 102 111 118 127 128 130 131 133 136 138 141 147 152 155 159 160 163 )
 
 for i in ${ACTIONS[@]}; do