Commit c79d1aee authored by Chrol-Cannon, Joseph Dr (Computer Science)

integrate i3D features into action model

parent 6d15e6c4
@@ -10,10 +10,11 @@ import json
import cv2
import numpy as np
from pathlib import Path
#from pathlib import Path
#filepath = Path.cwd()
#sys.path.append(filepath)
from compute_fv import ComputeFV
+from kinetics_feats import I3dFV
if __name__ == "__main__":
parser = argparse.ArgumentParser()
@@ -48,6 +49,7 @@ if __name__ == "__main__":
dest='path_to_annotations',
default='../annotations_ground/',
help='folder to load annotations from')
args = parser.parse_args()
labs = json.load(open(args.labels_file,'r'))
@@ -58,6 +60,7 @@ if __name__ == "__main__":
# load something-else annotated videos
compfv = ComputeFV(args.path_to_annotations)
+    i3dfv = I3dFV(args.path_to_annotations)
#ids = [] # video id's in order of being processed
#classes = [] # class of each video == ordinal of label
@@ -71,13 +74,14 @@ if __name__ == "__main__":
for filename in glob.glob(folder + "/*.webm"):
# only process a random % of the videos (if not positive class example)
if int(k) != args.action_id:
-            if random.random() > 0.01:
+            if random.random() > 0.02:
continue
print("processing file: " + filename)
vidnum = int(os.path.splitext(os.path.basename(filename))[0])
fv = compfv.process_video(filename, os.path.join(args.path_to_phase_models, 'a'+str(args.action_id)+'.joblib'))
+        fv0 = i3dfv.process_video(filename)
#if type(fv) is not np.ndarray:
# continue
@@ -85,7 +89,7 @@ if __name__ == "__main__":
if int(k) == args.action_id:
fv[-1] = 1
-        feats.append(fv)
+        feats.append(np.concatenate([fv0.flatten(), fv]))
#ids.append(vidnum)
# list of np.ndarray to 2d ndarray
......
#!/bin/bash
-ACTIONS=( 131 141 )
+ACTIONS=( 1 12 13 16 25 33 34 35 38 48 51 52 54 55 56 58 61 63 64 66 69 71 75 76 78 80 81 82 88 89 95 96 97 102 111 118 127 128 130 131 133 136 138 141 147 152 155 159 160 163 )
for i in ${ACTIONS[@]};
do
......
import math
import os
import torch
import numpy as np
def get_padding_shape(filter_shape, stride, mod=0):
"""Fetch a tuple describing the input padding shape.
NOTES: To replicate "TF SAME" style padding, the padding shape needs to be
determined at runtime to handle cases when the input dimension is not divisible
by the stride.
See https://stackoverflow.com/a/49842071 for explanation of TF SAME padding logic
"""
def _pad_top_bottom(filter_dim, stride_val, mod):
if mod:
pad_along = max(filter_dim - mod, 0)
else:
pad_along = max(filter_dim - stride_val, 0)
pad_top = pad_along // 2
pad_bottom = pad_along - pad_top
return pad_top, pad_bottom
padding_shape = []
for idx, (filter_dim, stride_val) in enumerate(zip(filter_shape, stride)):
depth_mod = (idx == 0) and mod
pad_top, pad_bottom = _pad_top_bottom(filter_dim, stride_val, depth_mod)
padding_shape.append(pad_top)
padding_shape.append(pad_bottom)
depth_top = padding_shape.pop(0)
depth_bottom = padding_shape.pop(0)
padding_shape.append(depth_top)
padding_shape.append(depth_bottom)
return tuple(padding_shape)
def simplify_padding(padding_shapes):
all_same = True
padding_init = padding_shapes[0]
for pad in padding_shapes[1:]:
if pad != padding_init:
all_same = False
return all_same, padding_init
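def _demo_same_padding():
    # Minimal sketch: the I3D stem conv uses a 7x7x7 kernel at stride (2, 2, 2).
    # With no runtime remainder (mod=0) each dimension needs 5 pad cells, split
    # asymmetrically as (2, 3); the depth pair is rotated to the end so the
    # tuple matches torch.nn.ConstantPad3d's (W, H, D) argument order.
    assert get_padding_shape((7, 7, 7), (2, 2, 2)) == (2, 3, 2, 3, 2, 3)
    # A 3x3x3 kernel at stride 1 pads symmetrically, so simplify_padding lets
    # Unit3Dpy fold the padding into Conv3d's own padding argument.
    assert simplify_padding(get_padding_shape((3, 3, 3), (1, 1, 1))) == (True, 1)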
class Unit3Dpy(torch.nn.Module):
def __init__(self,
in_channels,
out_channels,
kernel_size=(1, 1, 1),
stride=(1, 1, 1),
activation='relu',
padding='SAME',
use_bias=False,
use_bn=True):
super(Unit3Dpy, self).__init__()
self.padding = padding
self.activation = activation
self.use_bn = use_bn
self.stride = stride
if padding == 'SAME':
padding_shape = get_padding_shape(kernel_size, stride)
simplify_pad, pad_size = simplify_padding(padding_shape)
self.simplify_pad = simplify_pad
if stride[0] > 1:
padding_shapes = [get_padding_shape(kernel_size, stride, mod) for
mod in range(stride[0])]
else:
padding_shapes = [padding_shape]
elif padding == 'VALID':
padding_shape = 0
else:
raise ValueError(
'padding should be in [VALID|SAME] but got {}'.format(padding))
if padding == 'SAME':
if not simplify_pad:
self.pads = [torch.nn.ConstantPad3d(x, 0) for x in padding_shapes]
self.conv3d = torch.nn.Conv3d(
in_channels,
out_channels,
kernel_size,
stride=stride,
bias=use_bias)
else:
self.conv3d = torch.nn.Conv3d(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=pad_size,
bias=use_bias)
elif padding == 'VALID':
self.conv3d = torch.nn.Conv3d(
in_channels,
out_channels,
kernel_size,
padding=padding_shape,
stride=stride,
bias=use_bias)
else:
raise ValueError(
'padding should be in [VALID|SAME] but got {}'.format(padding))
if self.use_bn:
# This is not strictly the correct map between epsilons in keras and
# pytorch (which have slightly different definitions of the batch norm
# forward pass), but it seems to be good enough. The PyTorch formula
# is described here:
# https://pytorch.org/docs/stable/_modules/torch/nn/modules/batchnorm.html
tf_style_eps = 1E-3
self.batch3d = torch.nn.BatchNorm3d(out_channels, eps=tf_style_eps)
if activation == 'relu':
self.activation = torch.nn.functional.relu
def forward(self, inp):
if self.padding == 'SAME' and self.simplify_pad is False:
# Determine the padding to be applied by examining the input shape
pad_idx = inp.shape[2] % self.stride[0]
pad_op = self.pads[pad_idx]
inp = pad_op(inp)
out = self.conv3d(inp)
if self.use_bn:
out = self.batch3d(out)
if self.activation is not None:
out = torch.nn.functional.relu(out)
return out
class MaxPool3dTFPadding(torch.nn.Module):
def __init__(self, kernel_size, stride=None, padding='SAME'):
super(MaxPool3dTFPadding, self).__init__()
if padding == 'SAME':
padding_shape = get_padding_shape(kernel_size, stride)
self.padding_shape = padding_shape
self.stride = stride
if stride[0] > 1:
padding_shapes = [get_padding_shape(kernel_size, stride, mod) for
mod in range(stride[0])]
else:
padding_shapes = [padding_shape]
self.pads = [torch.nn.ConstantPad3d(x, 0) for x in padding_shapes]
self.pool = torch.nn.MaxPool3d(kernel_size, stride, ceil_mode=True)
def forward(self, inp):
pad_idx = inp.shape[2] % self.stride[0]
pad_op = self.pads[pad_idx]
inp = pad_op(inp)
out = self.pool(inp)
return out
class Mixed(torch.nn.Module):
def __init__(self, in_channels, out_channels):
super(Mixed, self).__init__()
# Branch 0
self.branch_0 = Unit3Dpy(
in_channels, out_channels[0], kernel_size=(1, 1, 1))
# Branch 1
branch_1_conv1 = Unit3Dpy(
in_channels, out_channels[1], kernel_size=(1, 1, 1))
branch_1_conv2 = Unit3Dpy(
out_channels[1], out_channels[2], kernel_size=(3, 3, 3))
self.branch_1 = torch.nn.Sequential(branch_1_conv1, branch_1_conv2)
# Branch 2
branch_2_conv1 = Unit3Dpy(
in_channels, out_channels[3], kernel_size=(1, 1, 1))
branch_2_conv2 = Unit3Dpy(
out_channels[3], out_channels[4], kernel_size=(3, 3, 3))
self.branch_2 = torch.nn.Sequential(branch_2_conv1, branch_2_conv2)
# Branch3
branch_3_pool = MaxPool3dTFPadding(
kernel_size=(3, 3, 3), stride=(1, 1, 1), padding='SAME')
branch_3_conv2 = Unit3Dpy(
in_channels, out_channels[5], kernel_size=(1, 1, 1))
self.branch_3 = torch.nn.Sequential(branch_3_pool, branch_3_conv2)
def forward(self, inp):
out_0 = self.branch_0(inp)
out_1 = self.branch_1(inp)
out_2 = self.branch_2(inp)
out_3 = self.branch_3(inp)
out = torch.cat((out_0, out_1, out_2, out_3), 1)
return out
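def _demo_mixed_channels():
    # Minimal sketch (shapes assumed): Mixed concatenates its four branches on
    # the channel axis, so the output width is out_channels[0] +
    # out_channels[2] + out_channels[4] + out_channels[5]. For the first
    # Inception block below that is 64 + 128 + 32 + 32 = 256, which is why
    # mixed_3c is constructed with 256 input channels.
    block = Mixed(192, [64, 96, 128, 16, 32, 32]).eval()
    with torch.no_grad():
        out = block(torch.zeros(1, 192, 8, 28, 28))
    assert out.shape == (1, 256, 8, 28, 28)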
class I3D(torch.nn.Module):
def __init__(self,
num_classes,
modality='rgb',
dropout_prob=0,
name='inception'):
super(I3D, self).__init__()
self.name = name
self.num_classes = num_classes
if modality == 'rgb':
in_channels = 3
elif modality == 'flow':
in_channels = 2
else:
raise ValueError(
'{} not among known modalities [rgb|flow]'.format(modality))
self.modality = modality
conv3d_1a_7x7 = Unit3Dpy(
out_channels=64,
in_channels=in_channels,
kernel_size=(7, 7, 7),
stride=(2, 2, 2),
padding='SAME')
# 1st conv-pool
self.conv3d_1a_7x7 = conv3d_1a_7x7
self.maxPool3d_2a_3x3 = MaxPool3dTFPadding(
kernel_size=(1, 3, 3), stride=(1, 2, 2), padding='SAME')
# conv conv
conv3d_2b_1x1 = Unit3Dpy(
out_channels=64,
in_channels=64,
kernel_size=(1, 1, 1),
padding='SAME')
self.conv3d_2b_1x1 = conv3d_2b_1x1
conv3d_2c_3x3 = Unit3Dpy(
out_channels=192,
in_channels=64,
kernel_size=(3, 3, 3),
padding='SAME')
self.conv3d_2c_3x3 = conv3d_2c_3x3
self.maxPool3d_3a_3x3 = MaxPool3dTFPadding(
kernel_size=(1, 3, 3), stride=(1, 2, 2), padding='SAME')
# Mixed_3b
self.mixed_3b = Mixed(192, [64, 96, 128, 16, 32, 32])
self.mixed_3c = Mixed(256, [128, 128, 192, 32, 96, 64])
self.maxPool3d_4a_3x3 = MaxPool3dTFPadding(
kernel_size=(3, 3, 3), stride=(2, 2, 2), padding='SAME')
# Mixed 4
self.mixed_4b = Mixed(480, [192, 96, 208, 16, 48, 64])
self.mixed_4c = Mixed(512, [160, 112, 224, 24, 64, 64])
self.mixed_4d = Mixed(512, [128, 128, 256, 24, 64, 64])
self.mixed_4e = Mixed(512, [112, 144, 288, 32, 64, 64])
self.mixed_4f = Mixed(528, [256, 160, 320, 32, 128, 128])
self.maxPool3d_5a_2x2 = MaxPool3dTFPadding(
kernel_size=(2, 2, 2), stride=(2, 2, 2), padding='SAME')
# Mixed 5
self.mixed_5b = Mixed(832, [256, 160, 320, 32, 128, 128])
self.mixed_5c = Mixed(832, [384, 192, 384, 48, 128, 128])
self.avg_pool = torch.nn.AvgPool3d((2, 7, 7), (1, 1, 1))
self.dropout = torch.nn.Dropout(dropout_prob)
self.conv3d_0c_1x1 = Unit3Dpy(
in_channels=1024,
out_channels=self.num_classes,
kernel_size=(1, 1, 1),
activation=None,
use_bias=True,
use_bn=False)
self.softmax = torch.nn.Softmax(1)
def forward(self, inp):
# Preprocessing
out = self.conv3d_1a_7x7(inp)
out = self.maxPool3d_2a_3x3(out)
out = self.conv3d_2b_1x1(out)
out = self.conv3d_2c_3x3(out)
out = self.maxPool3d_3a_3x3(out)
out = self.mixed_3b(out)
out = self.mixed_3c(out)
out = self.maxPool3d_4a_3x3(out)
out = self.mixed_4b(out)
out = self.mixed_4c(out)
out = self.mixed_4d(out)
out = self.mixed_4e(out)
out = self.mixed_4f(out)
out = self.maxPool3d_5a_2x2(out)
out = self.mixed_5b(out)
out = self.mixed_5c(out)
out = self.avg_pool(out)
out = self.dropout(out)
out = self.conv3d_0c_1x1(out)
out = out.squeeze(3)
out = out.squeeze(3)
out = out.mean(2)
out_logits = out
out = self.softmax(out_logits)
return out, out_logits
def load_tf_weights(self, sess):
state_dict = {}
if self.modality == 'rgb':
prefix = 'RGB/inception_i3d'
elif self.modality == 'flow':
prefix = 'Flow/inception_i3d'
load_conv3d(state_dict, 'conv3d_1a_7x7', sess,
os.path.join(prefix, 'Conv3d_1a_7x7'))
load_conv3d(state_dict, 'conv3d_2b_1x1', sess,
os.path.join(prefix, 'Conv3d_2b_1x1'))
load_conv3d(state_dict, 'conv3d_2c_3x3', sess,
os.path.join(prefix, 'Conv3d_2c_3x3'))
load_mixed(state_dict, 'mixed_3b', sess,
os.path.join(prefix, 'Mixed_3b'))
load_mixed(state_dict, 'mixed_3c', sess,
os.path.join(prefix, 'Mixed_3c'))
load_mixed(state_dict, 'mixed_4b', sess,
os.path.join(prefix, 'Mixed_4b'))
load_mixed(state_dict, 'mixed_4c', sess,
os.path.join(prefix, 'Mixed_4c'))
load_mixed(state_dict, 'mixed_4d', sess,
os.path.join(prefix, 'Mixed_4d'))
load_mixed(state_dict, 'mixed_4e', sess,
os.path.join(prefix, 'Mixed_4e'))
        # From mixed_4f onward the max error vs. the TF weights grows to about 0.1
load_mixed(state_dict, 'mixed_4f', sess,
os.path.join(prefix, 'Mixed_4f'))
load_mixed(
state_dict,
'mixed_5b',
sess,
os.path.join(prefix, 'Mixed_5b'),
fix_typo=True)
load_mixed(state_dict, 'mixed_5c', sess,
os.path.join(prefix, 'Mixed_5c'))
load_conv3d(
state_dict,
'conv3d_0c_1x1',
sess,
os.path.join(prefix, 'Logits', 'Conv3d_0c_1x1'),
bias=True,
bn=False)
self.load_state_dict(state_dict)
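def _demo_i3d_forward():
    # Minimal sketch (clip size assumed): a 16-frame 224x224 RGB clip yields
    # one (softmax, logits) pair over the 400 Kinetics classes; the clip just
    # has to be long enough to survive the temporal stride-2 stages and the
    # final (2, 7, 7) average pool.
    net = I3D(num_classes=400, modality='rgb').eval()
    with torch.no_grad():
        probs, logits = net(torch.zeros(1, 3, 16, 224, 224))
    assert probs.shape == logits.shape == (1, 400)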
def get_conv_params(sess, name, bias=False):
# Get conv weights
conv_weights_tensor = sess.graph.get_tensor_by_name(
os.path.join(name, 'w:0'))
if bias:
conv_bias_tensor = sess.graph.get_tensor_by_name(
os.path.join(name, 'b:0'))
conv_bias = sess.run(conv_bias_tensor)
conv_weights = sess.run(conv_weights_tensor)
conv_shape = conv_weights.shape
kernel_shape = conv_shape[0:3]
in_channels = conv_shape[3]
out_channels = conv_shape[4]
conv_op = sess.graph.get_operation_by_name(
os.path.join(name, 'convolution'))
padding_name = conv_op.get_attr('padding')
padding = _get_padding(padding_name, kernel_shape)
all_strides = conv_op.get_attr('strides')
strides = all_strides[1:4]
conv_params = [
conv_weights, kernel_shape, in_channels, out_channels, strides, padding
]
if bias:
conv_params.append(conv_bias)
return conv_params
def get_bn_params(sess, name):
moving_mean_tensor = sess.graph.get_tensor_by_name(
os.path.join(name, 'moving_mean:0'))
moving_var_tensor = sess.graph.get_tensor_by_name(
os.path.join(name, 'moving_variance:0'))
beta_tensor = sess.graph.get_tensor_by_name(os.path.join(name, 'beta:0'))
moving_mean = sess.run(moving_mean_tensor)
moving_var = sess.run(moving_var_tensor)
beta = sess.run(beta_tensor)
return moving_mean, moving_var, beta
def _get_padding(padding_name, conv_shape):
padding_name = padding_name.decode("utf-8")
if padding_name == "VALID":
return [0, 0]
elif padding_name == "SAME":
# return [math.ceil(int(conv_shape[0])/2), math.ceil(int(conv_shape[1])/2)]
return [
math.floor(int(conv_shape[0]) / 2),
math.floor(int(conv_shape[1]) / 2),
math.floor(int(conv_shape[2]) / 2)
]
else:
raise ValueError('Invalid padding name ' + padding_name)
def load_conv3d(state_dict, name_pt, sess, name_tf, bias=False, bn=True):
# Transfer convolution params
conv_name_tf = os.path.join(name_tf, 'conv_3d')
conv_params = get_conv_params(sess, conv_name_tf, bias=bias)
if bias:
conv_weights, kernel_shape, in_channels, out_channels, strides, padding, conv_bias = conv_params
else:
conv_weights, kernel_shape, in_channels, out_channels, strides, padding = conv_params
conv_weights_rs = np.transpose(
conv_weights, (4, 3, 0, 1,
2)) # to pt format (out_c, in_c, depth, height, width)
state_dict[name_pt + '.conv3d.weight'] = torch.from_numpy(conv_weights_rs)
if bias:
state_dict[name_pt + '.conv3d.bias'] = torch.from_numpy(conv_bias)
# Transfer batch norm params
if bn:
conv_tf_name = os.path.join(name_tf, 'batch_norm')
moving_mean, moving_var, beta = get_bn_params(sess, conv_tf_name)
out_planes = conv_weights_rs.shape[0]
state_dict[name_pt + '.batch3d.weight'] = torch.ones(out_planes)
state_dict[name_pt +
'.batch3d.bias'] = torch.from_numpy(beta.squeeze())
state_dict[name_pt
+ '.batch3d.running_mean'] = torch.from_numpy(moving_mean.squeeze())
state_dict[name_pt
+ '.batch3d.running_var'] = torch.from_numpy(moving_var.squeeze())
def load_mixed(state_dict, name_pt, sess, name_tf, fix_typo=False):
# Branch 0
load_conv3d(state_dict, name_pt + '.branch_0', sess,
os.path.join(name_tf, 'Branch_0/Conv3d_0a_1x1'))
    # Branch 1
load_conv3d(state_dict, name_pt + '.branch_1.0', sess,
os.path.join(name_tf, 'Branch_1/Conv3d_0a_1x1'))
load_conv3d(state_dict, name_pt + '.branch_1.1', sess,
os.path.join(name_tf, 'Branch_1/Conv3d_0b_3x3'))
# Branch 2
load_conv3d(state_dict, name_pt + '.branch_2.0', sess,
os.path.join(name_tf, 'Branch_2/Conv3d_0a_1x1'))
if fix_typo:
load_conv3d(state_dict, name_pt + '.branch_2.1', sess,
os.path.join(name_tf, 'Branch_2/Conv3d_0a_3x3'))
else:
load_conv3d(state_dict, name_pt + '.branch_2.1', sess,
os.path.join(name_tf, 'Branch_2/Conv3d_0b_3x3'))
# Branch 3
load_conv3d(state_dict, name_pt + '.branch_3.1', sess,
os.path.join(name_tf, 'Branch_3/Conv3d_0b_1x1'))
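# Note: the fix_typo flag above works around the published checkpoint, in which
# Mixed_5b stores its second Branch_2 conv under the name Conv3d_0a_3x3 instead
# of the Conv3d_0b_3x3 used by every other Mixed block.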
import torch
import torch.nn as nn
from i3D.resnet3d_xl import Net
import torch.nn.functional as F
'''
Video Classification Model library.
'''
class TrainingScheduleError(Exception):
pass
class VideoModel(nn.Module):
def __init__(self,
num_classes,
num_boxes,
num_videos=16,
restore_dict=None,
freeze_weights=None,
device=None,
loss_type='softmax'):
super(VideoModel, self).__init__()
self.device = device
self.num_frames = num_videos
self.num_classes = num_classes
        # The network loads Kinetics pre-trained weights at initialization
self.i3D = Net(num_classes, extract_features=True, loss_type=loss_type)
try:
# Restore weights
if restore_dict:
self.restore(restore_dict)
# Freeze weights
if freeze_weights:
self.freeze_weights(freeze_weights)
else:
print(" > No weights are freezed")
except Exception as e:
print(" > Exception {}".format(e))
def restore(self, restore=None):
        # Load pre-trained I3D + Graph weights for fine-tuning (replaces the last FC)
restore_finetuned = restore.get("restore_finetuned", None)
if restore_finetuned:
            self._restore_finetuned(restore_finetuned)
print(" > Restored I3D + Graph weights")
return
# Load pre-trained I3D weights
restore_i3d = restore.get("restore_i3d", None)
if restore_i3d:
self._restore_i3d(restore_i3d)
print(" > Restored only I3D weights")
return
# Load pre-trained I3D + Graph weights without replacing anything
restore_predict = restore.get("restore_predict", None)
if restore_predict:
self._restore_predict(restore_predict)
print(" > Restored the model with strict weights")
return
def _restore_predict(self, path):
if path is None:
raise TrainingScheduleError('You should pre-train the video model on your training data first')
weights = torch.load(path, map_location=self.device)['state_dict']
new_weights = {}
for k, v in weights.items():
new_weights[k.replace('module.', '')] = v
self.load_state_dict(new_weights, strict=True)
print(" > Weights {} loaded".format(path))
def _restore_i3d(self, path):
if path is None:
raise TrainingScheduleError('You should pre-train the video model on your training data first')
weights = torch.load(path, map_location=self.device)['state_dict']
new_weights = {}
for k, v in weights.items():
if not k.startswith('module.fc') and not k.startswith('module.i3D.classifier'):
new_weights[k.replace('module.', '')] = v
self.load_state_dict(new_weights, strict=False)
    def _restore_finetuned(self, path):
if path is None:
raise TrainingScheduleError('You should pre-train the video model on your training data first')
weights = torch.load(path, map_location=self.device)['state_dict']
new_weights = {}
for k, v in weights.items():
# Don't load classifiers (different classes 88 vs 86)
if not k.startswith('module.fc'):
if not k.startswith('module.i3D.classifier'):
new_weights[k.replace('module.', '')] = v
self.load_state_dict(new_weights, strict=False)
print(" > Weights {} loaded".format(path))
def freeze_weights(self, module):
if module == 'i3d':
print(" > Freeze I3D module")
for param in self.i3D.parameters():
param.requires_grad = False
elif module == 'fine_tuned':
print(" > Freeze Graph + I3D module, only last FC is training")
            # Freeze all parameters except the last FC
for name, param in self.i3D.named_parameters():
if not name.startswith('classifier'):
param.requires_grad = False
for param in self.graph_embedding.parameters():
param.requires_grad = False
for param in self.conv.parameters():
param.requires_grad = False
else:
raise NotImplementedError('Unrecognized option, you can freeze either graph module or I3D module')
def _get_i3d_features(self, videos, output_video_features=False):
# org_features - [V x 2048 x T / 2 x 14 x 14]
_, org_features = self.i3D(videos)
# Reduce dimension video_features - [V x 512 x T / 2 x 14 x 14]
videos_features = self.conv(org_features)
bs, d, t, h, w = videos_features.size()
# Get global features
videos_features_rs = videos_features.permute(0, 2, 1, 3, 4) # [V x T / 2 x 512 x h x w]
videos_features_rs = videos_features_rs.reshape(-1, d, h, w) # [V * T / 2 x 512 x h x w]
global_features = self.avgpool(videos_features_rs) # [V * T / 2 x 512 x 1 x 1]
global_features = self.dropout(global_features)
global_features = global_features.reshape(bs, t, d) # [V x T / 2 x 512]
if output_video_features:
return global_features, videos_features
else:
return global_features
def flatten(self, x):
return [item for sublist in x for item in sublist]
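# Minimal usage sketch (checkpoint paths here are hypothetical; Net() itself
# expects i3D/kinetics-res50.pth when no explicit weights are passed):
#
#   model = VideoModel(num_classes=174, num_boxes=4, num_videos=16,
#                      restore_dict={'restore_i3d': 'ckpt/i3d_smth.pth'},
#                      freeze_weights='i3d')
#
# restore() tries the three checkpoint kinds in order (fine-tuned I3D + Graph,
# I3D only, then a strict whole-model restore), and freeze_weights('i3d') stops
# gradients through the backbone so only the later layers train.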
This diff is collapsed.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import math
import numpy as np
from functools import partial
__all__ = [
'ResNet', 'resnet10', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
'resnet152', 'resnet200',
]
def conv3x3x3(in_planes, out_planes, stride=1):
# 3x3x3 convolution with padding
return nn.Conv3d(
in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=1,
bias=False)
def downsample_basic_block(x, planes, stride):
out = F.avg_pool3d(x, kernel_size=1, stride=stride)
zero_pads = torch.Tensor(
out.size(0), planes - out.size(1), out.size(2), out.size(3),
out.size(4)).zero_()
if isinstance(out.data, torch.cuda.FloatTensor):
zero_pads = zero_pads.cuda()
out = Variable(torch.cat([out.data, zero_pads], dim=1))
return out
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm3d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3x3(planes, planes)
self.bn2 = nn.BatchNorm3d(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
conv_op = None
offset_groups = 1
def __init__(self, dim_in, dim_out, stride, dim_inner, group=1, use_temp_conv=1, temp_stride=1, dcn=False,
shortcut_type='B'):
super(Bottleneck, self).__init__()
# 1 x 1 layer
self.with_dcn = dcn
self.conv1 = self.Conv3dBN(dim_in, dim_inner, (1 + use_temp_conv * 2, 1, 1), (temp_stride, 1, 1),
(use_temp_conv, 0, 0))
self.relu = nn.ReLU(inplace=True)
# 3 x 3 layer
self.conv2 = self.Conv3dBN(dim_inner, dim_inner, (1, 3, 3), (1, stride, stride), (0, 1, 1))
# 1 x 1 layer
self.conv3 = self.Conv3dBN(dim_inner, dim_out, (1, 1, 1), (1, 1, 1), (0, 0, 0))
self.shortcut_type = shortcut_type
self.dim_in = dim_in
self.dim_out = dim_out
self.temp_stride = temp_stride
self.stride = stride
# nn.Conv3d(dim_in, dim_out, (1,1,1),(temp_stride,stride,stride),(0,0,0))
if self.shortcut_type == 'B':
if self.dim_in == self.dim_out and self.temp_stride == 1 and self.stride == 1: # or (self.dim_in == self.dim_out and self.dim_in == 64 and self.stride ==1):
pass
else:
# pass
self.shortcut = self.Conv3dBN(dim_in, dim_out, (1, 1, 1), (temp_stride, stride, stride), (0, 0, 0))
# nn.Conv3d(dim_in,dim_inner,kernel_size=(1+use_temp_conv*2,1,1),stride = (temp_stride,1,1),padding = )
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.relu(out)
out = self.conv2(out)
out = self.relu(out)
out = self.conv3(out)
if self.dim_in == self.dim_out and self.temp_stride == 1 and self.stride == 1:
pass
else:
residual = self.shortcut(residual)
out += residual
out = self.relu(out)
return out
def Conv3dBN(self, dim_in, dim_out, kernels, strides, pads, group=1):
if self.with_dcn and kernels[0] > 1:
# use deformable conv
return nn.Sequential(
self.conv_op(dim_in, dim_out, kernel_size=kernels, stride=strides, padding=pads, bias=False,
offset_groups=self.offset_groups),
nn.BatchNorm3d(dim_out)
)
else:
return nn.Sequential(
nn.Conv3d(dim_in, dim_out, kernel_size=kernels, stride=strides, padding=pads, bias=False),
nn.BatchNorm3d(dim_out)
)
class ResNet(nn.Module):
def __init__(self,
block,
layers,
use_temp_convs_set,
temp_strides_set,
sample_size,
sample_duration,
shortcut_type='B',
num_classes=400,
stage_with_dcn=(False, False, False, False),
extract_features=False,
loss_type='softmax'):
super(ResNet, self).__init__()
self.extract_features = extract_features
self.stage_with_dcn = stage_with_dcn
self.group = 1
self.width_per_group = 64
self.dim_inner = self.group * self.width_per_group
# self.shortcut_type = shortcut_type
self.conv1 = nn.Conv3d(
3,
64,
kernel_size=(1 + use_temp_convs_set[0][0] * 2, 7, 7),
stride=(temp_strides_set[0][0], 2, 2),
padding=(use_temp_convs_set[0][0], 3, 3),
bias=False)
self.bn1 = nn.BatchNorm3d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool1 = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 0, 0))
with_dcn = True if self.stage_with_dcn[0] else False
self.layer1 = self._make_layer(block, 64, 256, shortcut_type, stride=1, num_blocks=layers[0],
dim_inner=self.dim_inner, group=self.group, use_temp_convs=use_temp_convs_set[1],
temp_strides=temp_strides_set[1], dcn=with_dcn)
self.maxpool2 = nn.MaxPool3d(kernel_size=(2, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
with_dcn = True if self.stage_with_dcn[1] else False
self.layer2 = self._make_layer(block, 256, 512, shortcut_type, stride=2, num_blocks=layers[1],
dim_inner=self.dim_inner * 2, group=self.group,
use_temp_convs=use_temp_convs_set[2], temp_strides=temp_strides_set[2],
dcn=with_dcn)
with_dcn = True if self.stage_with_dcn[2] else False
self.layer3 = self._make_layer(block, 512, 1024, shortcut_type, stride=2, num_blocks=layers[2],
dim_inner=self.dim_inner * 4, group=self.group,
use_temp_convs=use_temp_convs_set[3], temp_strides=temp_strides_set[3],
dcn=with_dcn)
with_dcn = True if self.stage_with_dcn[3] else False
self.layer4 = self._make_layer(block, 1024, 2048, shortcut_type, stride=1, num_blocks=layers[3],
dim_inner=self.dim_inner * 8, group=self.group,
use_temp_convs=use_temp_convs_set[4], temp_strides=temp_strides_set[4],
dcn=with_dcn)
last_duration = int(math.ceil(sample_duration / 2)) # int(math.ceil(sample_duration / 8))
last_size = int(math.ceil(sample_size / 16))
# self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) #nn.AdaptiveAvgPool3d((1, 1, 1)) #
self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1))
self.dropout = torch.nn.Dropout(p=0.5)
self.classifier = nn.Linear(2048, num_classes)
for m in self.modules():
# if isinstance(m, nn.Conv3d):
# m.weight = nn.init.kaiming_normal_(m.weight, mode='fan_out')
# elif isinstance(m,nn.Linear):
# m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out')
# elif
if isinstance(m, nn.BatchNorm3d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, block, dim_in, dim_out, shortcut_type, stride, num_blocks, dim_inner=None, group=None,
use_temp_convs=None, temp_strides=None, dcn=False):
if use_temp_convs is None:
use_temp_convs = np.zeros(num_blocks).astype(int)
if temp_strides is None:
temp_strides = np.ones(num_blocks).astype(int)
if len(use_temp_convs) < num_blocks:
for _ in range(num_blocks - len(use_temp_convs)):
use_temp_convs.append(0)
temp_strides.append(1)
layers = []
for idx in range(num_blocks):
block_stride = 2 if (idx == 0 and stride == 2) else 1
layers.append(
block(dim_in, dim_out, block_stride, dim_inner, group, use_temp_convs[idx], temp_strides[idx], dcn))
dim_in = dim_out
return nn.Sequential(*layers)
def forward_single(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool1(x)
x = self.layer1(x)
x = self.maxpool2(x)
x = self.layer2(x)
x = self.layer3(x)
features = self.layer4(x)
x = self.avgpool(features)
x = x.view(x.size(0), -1)
x = self.dropout(x)
y = self.classifier(x)
if self.extract_features:
return y, features
else:
return y
def forward_multi(self, x):
clip_preds = []
# import ipdb;ipdb.set_trace()
for clip_idx in range(x.shape[1]): # B, 10, 3, 3, 32, 224, 224
spatial_crops = []
for crop_idx in range(x.shape[2]):
clip = x[:, clip_idx, crop_idx]
clip = self.forward_single(clip)
spatial_crops.append(clip)
spatial_crops = torch.stack(spatial_crops, 1).mean(1) # (B, 400)
clip_preds.append(spatial_crops)
clip_preds = torch.stack(clip_preds, 1).mean(1) # (B, 400)
return clip_preds
def forward(self, x):
# 5D tensor == single clip
if x.dim() == 5:
pred = self.forward_single(x)
# 7D tensor == 3 crops/10 clips
elif x.dim() == 7:
pred = self.forward_multi(x)
# loss_dict = {}
# if 'label' in batch:
# loss = F.cross_entropy(pred, batch['label'], reduction='none')
# loss_dict = {'clf': loss}
return pred
def get_fine_tuning_parameters(model, ft_begin_index):
if ft_begin_index == 0:
return model.parameters()
ft_module_names = []
for i in range(ft_begin_index, 5):
ft_module_names.append('layer{}'.format(i))
ft_module_names.append('fc')
# import ipdb;ipdb.set_trace()
parameters = []
for k, v in model.named_parameters():
for ft_module in ft_module_names:
if ft_module in k:
parameters.append({'params': v})
break
else:
parameters.append({'params': v, 'lr': 0.0})
return parameters
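# Usage sketch: with ft_begin_index=4 only parameters whose names contain
# 'layer4' (or 'fc') keep their optimizer defaults; everything else is pinned
# to lr=0.0. Note that this model's head is named 'classifier', not 'fc', so
# the head itself would stay frozen unless the name list is adjusted.
#
#   optimizer = torch.optim.SGD(get_fine_tuning_parameters(model, 4), lr=1e-3)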
def obtain_arc(arc_type):
# c2d, ResNet50
if arc_type == 1:
use_temp_convs_1 = [0]
temp_strides_1 = [2]
use_temp_convs_2 = [0, 0, 0]
temp_strides_2 = [1, 1, 1]
use_temp_convs_3 = [0, 0, 0, 0]
temp_strides_3 = [1, 1, 1, 1]
use_temp_convs_4 = [0, ] * 6
temp_strides_4 = [1, ] * 6
use_temp_convs_5 = [0, 0, 0]
temp_strides_5 = [1, 1, 1]
# i3d, ResNet50
if arc_type == 2:
use_temp_convs_1 = [2]
temp_strides_1 = [1]
use_temp_convs_2 = [1, 1, 1]
temp_strides_2 = [1, 1, 1]
use_temp_convs_3 = [1, 0, 1, 0]
temp_strides_3 = [1, 1, 1, 1]
use_temp_convs_4 = [1, 0, 1, 0, 1, 0]
temp_strides_4 = [1, 1, 1, 1, 1, 1]
use_temp_convs_5 = [0, 1, 0]
temp_strides_5 = [1, 1, 1]
# c2d, ResNet101
if arc_type == 3:
use_temp_convs_1 = [0]
temp_strides_1 = [2]
use_temp_convs_2 = [0, 0, 0]
temp_strides_2 = [1, 1, 1]
use_temp_convs_3 = [0, 0, 0, 0]
temp_strides_3 = [1, 1, 1, 1]
use_temp_convs_4 = [0, ] * 23
temp_strides_4 = [1, ] * 23
use_temp_convs_5 = [0, 0, 0]
temp_strides_5 = [1, 1, 1]
# i3d, ResNet101
if arc_type == 4:
use_temp_convs_1 = [2]
temp_strides_1 = [2]
use_temp_convs_2 = [1, 1, 1]
temp_strides_2 = [1, 1, 1]
use_temp_convs_3 = [1, 0, 1, 0]
temp_strides_3 = [1, 1, 1, 1]
use_temp_convs_4 = []
for i in range(23):
if i % 2 == 0:
use_temp_convs_4.append(1)
else:
use_temp_convs_4.append(0)
temp_strides_4 = [1, ] * 23
use_temp_convs_5 = [0, 1, 0]
temp_strides_5 = [1, 1, 1]
use_temp_convs_set = [use_temp_convs_1, use_temp_convs_2, use_temp_convs_3, use_temp_convs_4, use_temp_convs_5]
temp_strides_set = [temp_strides_1, temp_strides_2, temp_strides_3, temp_strides_4, temp_strides_5]
return use_temp_convs_set, temp_strides_set
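def _demo_arc_type_2():
    # Minimal sketch: arc_type=2 is the I3D ResNet-50 recipe used by Net()
    # below. Each inner list says, per bottleneck in a stage, whether the first
    # 1x1 conv is inflated to 3x1x1 in time (1) or kept purely spatial (0);
    # res5, for example, only inflates its middle block.
    use_temp_convs_set, temp_strides_set = obtain_arc(2)
    assert use_temp_convs_set[4] == [0, 1, 0]
    assert all(s == 1 for stage in temp_strides_set for s in stage)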
def resnet10(**kwargs):
"""Constructs a ResNet-18 model.
"""
use_temp_convs_set = []
temp_strides_set = []
model = ResNet(BasicBlock, [1, 1, 1, 1], use_temp_convs_set, temp_strides_set, **kwargs)
return model
def resnet18(**kwargs):
"""Constructs a ResNet-18 model.
"""
use_temp_convs_set = []
temp_strides_set = []
model = ResNet(BasicBlock, [2, 2, 2, 2], use_temp_convs_set, temp_strides_set, **kwargs)
return model
def resnet34(**kwargs):
"""Constructs a ResNet-34 model.
"""
use_temp_convs_set = []
temp_strides_set = []
model = ResNet(BasicBlock, [3, 4, 6, 3], use_temp_convs_set, temp_strides_set, **kwargs)
return model
def resnet50(extract_features, **kwargs):
"""Constructs a ResNet-50 model.
"""
use_temp_convs_set, temp_strides_set = obtain_arc(2)
model = ResNet(Bottleneck, [3, 4, 6, 3], use_temp_convs_set, temp_strides_set,
extract_features=extract_features, **kwargs)
return model
def resnet101(**kwargs):
"""Constructs a ResNet-101 model.
"""
use_temp_convs_set, temp_strides_set = obtain_arc(4)
model = ResNet(Bottleneck, [3, 4, 23, 3], use_temp_convs_set, temp_strides_set, **kwargs)
return model
def resnet152(**kwargs):
"""Constructs a ResNet-101 model.
"""
use_temp_convs_set = []
temp_strides_set = []
model = ResNet(Bottleneck, [3, 8, 36, 3], use_temp_convs_set, temp_strides_set, **kwargs)
return model
def resnet200(**kwargs):
"""Constructs a ResNet-101 model.
"""
use_temp_convs_set = []
temp_strides_set = []
model = ResNet(Bottleneck, [3, 24, 36, 3], use_temp_convs_set, temp_strides_set, **kwargs)
return model
def Net(num_classes, extract_features=False, loss_type='softmax',
weights=None, freeze_all_but_cls=False):
net = globals()['resnet' + str(50)](
num_classes=num_classes,
sample_size=50,
sample_duration=32,
extract_features=extract_features,
loss_type=loss_type,
)
if weights is not None:
kinetics_weights = torch.load(weights)['state_dict']
print("Found weights in {}.".format(weights))
cls_name = 'fc'
else:
kinetics_weights = torch.load('i3D/kinetics-res50.pth')
cls_name = 'fc'
        print('\n Restoring Kinetics \n')
new_weights = {}
for k, v in kinetics_weights.items():
if not k.startswith('module.' + cls_name):
new_weights[k.replace('module.', '')] = v
net.load_state_dict(new_weights, strict=False)
if freeze_all_but_cls:
for name, par in net.named_parameters():
if not name.startswith('classifier'):
par.requires_grad = False
return net
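def _demo_backbone_features():
    # Minimal sketch (assumes i3D/kinetics-res50.pth is present): with
    # extract_features=True the backbone returns logits plus the res5 feature
    # map, which for a 32-frame 224x224 clip comes out as
    # [V x 2048 x T/2 x 14 x 14], the shape the VideoModel comments assume.
    net = Net(num_classes=174, extract_features=True).eval()
    with torch.no_grad():
        logits, features = net(torch.zeros(1, 3, 32, 224, 224))
    assert logits.shape == (1, 174)
    assert features.shape == (1, 2048, 16, 14, 14)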
# -*- coding: utf-8 -*-
import os
import sys
import glob
import datetime
import argparse
import random
import numpy as np
#from pathlib import Path
#filepath = Path.cwd()
#sys.path.append(filepath)
from video_loaders import load_av
from se_bb_from_np import annot_np
from SmthSequence import SmthSequence
from SmthFrameRelations import frame_relations
from PIL import Image
import matplotlib.pyplot as plt
import cv2
import i3D.gtransforms as gtransforms
import torch
from i3D.i3dpt import I3D
rgb_pt_checkpoint = 'i3D/model_rgb.pth'
class I3dFV:
def __init__(self,path):
self.anno = annot_np(path)
self.net = I3D(num_classes=400, modality='rgb')
self.net.eval()
self.net.load_state_dict(torch.load(rgb_pt_checkpoint))
self.net.cuda()
self.pre_resize_shape = (256, 340)
self.random_crop = gtransforms.GroupMultiScaleCrop(output_size=224,
scales=[1],
max_distort=0,
center_crop_only=True)
def process_video(self,finput,verbose=False):
# get video id
vidnum = int(os.path.splitext(os.path.basename(finput))[0])
# load video to ndarray list
img_array = load_av(finput)
# convert BGR to RGB
frames = [cv2.cvtColor(img, cv2.COLOR_BGR2RGB) for img in img_array]
# convert ndarray to array of PIL Images for resize and cropping
frames = [Image.fromarray(img.astype('uint8'), 'RGB') for img in frames]
# resize
frames = [img.resize((self.pre_resize_shape[1], self.pre_resize_shape[0]), Image.BILINEAR) for img in frames]
# crop
frames, (offset_h, offset_w, crop_h, crop_w) = self.random_crop(frames)
# convert back from PIL to ndarray for cv2 channel separation
frames = [np.array(img) for img in frames]
# separate channels into R,G,B frame sequences
#rs = []
#gs = []
#bs = []
#for i in range(len(frames)):
# R, G, B = cv2.split(frames[i])
# rs.append(R)
# gs.append(G)
# bs.append(B)
#frames = np.asarray([[rs, gs, bs]])
        frames = np.asarray([frames]).transpose(0, 4, 1, 2, 3) # (N, T, H, W, C) -> (N, C, T, H, W); replaces the channel splitting above
#print(frames.shape)
sample_var = torch.autograd.Variable(torch.from_numpy(frames).cuda()).float()
_, logits = self.net(sample_var)
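        # One logit vector over the 400 Kinetics classes for the whole clip,
        # returned as a (1, 400) ndarray that callers flatten and concatenate.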
return logits.cpu().detach().numpy()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--annotations',
dest='path_to_annotations',
default='../annotations_ground/',
help='folder to load annotations from')
parser.add_argument(
'--video',
dest='path_to_video',
default='.',
help='video to load')
args = parser.parse_args()
i3dfv = I3dFV(args.path_to_annotations)
fv = i3dfv.process_video(args.path_to_video, verbose=True)
print(fv)
print("fin")
@@ -17,6 +17,7 @@ from SmthSequence import SmthSequence
from SmthFrameRelations import frame_relations
from PIL import Image
import matplotlib.pyplot as plt
+import cv2
import torch
from i3D.model import VideoModel
@@ -61,33 +62,61 @@ class FrameFV:
bs.append(B)
frames = [rs, gs, bs]
#print(self.net.i3D.classifier.weight.data)
+        print(self.net.classifier[4].weight.data)
        # read frame annotations into Sequence
-        seq = SmthSequence()
-        for framenum in range(0,len(img_array)):
-            cats, bbs = self.anno.get_vf_bbx(vidnum, framenum+1)
-            # add detections to Sequence
-            for i in range(0,len(cats)):
-                seq.add(framenum, cats[i], bbs[i])
+        #seq = SmthSequence()
+        #for framenum in range(0,len(img_array)):
+        #    cats, bbs = self.anno.get_vf_bbx(vidnum, framenum+1)
+        #    # add detections to Sequence
+        #    for i in range(0,len(cats)):
+        #        seq.add(framenum, cats[i], bbs[i])
# compute object relations per frame
-        relations = []
-        for framenum in range(0,len(img_array)):
-            fv = frame_relations(seq, 0, 1, framenum)
-            relations.append(fv)
-        relations = np.asarray(relations)
+        #relations = []
+        #for framenum in range(0,len(img_array)):
+        #    fv = frame_relations(seq, 0, 1, framenum)
+        #    relations.append(fv)
+        #relations = np.asarray(relations)
# i3D features per frame
clip = torch.from_numpy(np.asarray([frames]))
-        print(clip.shape)
+        #print(clip.shape)
clip = clip.float()
glo, vid = self.net.i3D(clip)
#return glo.detach().numpy()
videos_features = self.net.conv(vid)
-        print(glo.shape)
-        print(vid.shape)
+        #print(glo.shape)
+        #print(vid.shape)
#print(videos_features.shape)
#plt.plot(np.linspace(0,400,num=400), glo.detach().numpy()[0])
#plt.show()
+        print(videos_features.shape)
+        pre = vid.detach().numpy().view()
+        post = videos_features.detach().numpy().view()
+        rows = []
+        for f in range(len(img_array)//2):
+            row = []
+            for i in range(512):
+                patch = post[0,i,f]
+                row.append(patch)
+            row = np.hstack(row)
+            rows.append(row)
+        pic = np.vstack(rows)
+        print(pic.shape)
+        while(1):
+            cv2.imshow('frame', pic)
+            k = cv2.waitKey(33)
+            if k == 27:
+                break
if __name__ == '__main__':
parser = argparse.ArgumentParser()
@@ -109,13 +138,11 @@ if __name__ == '__main__':
help='intermediate feature dimension for coord-based features')
parser.add_argument('--size', default=224, type=int, metavar='N',
help='primary image input size')
-    parser.add_argument('--batch_size', '-b', default=72, type=int,
-                        metavar='N', help='mini-batch size (default: 72)')
parser.add_argument('--num_classes', default=174, type=int,
help='num of class in the model')
parser.add_argument('--num_boxes', default=4, type=int,
help='num of boxes for each image')
-    parser.add_argument('--num_frames', default=36, type=int,
+    parser.add_argument('--num_frames', default=16, type=int,
help='num of frames for the model')
parser.add_argument('--fine_tune', help='path with ckpt to restore')
parser.add_argument('--restore_i3d')
......
#!/bin/bash
-ACTIONS=( 131 141 )
+ACTIONS=( 1 12 13 16 25 33 34 35 38 48 51 52 54 55 56 58 61 63 64 66 69 71 75 76 78 80 81 82 88 89 95 96 97 102 111 118 127 128 130 131 133 136 138 141 147 152 155 159 160 163 )
for i in ${ACTIONS[@]};
do
......