Commit 3ece3c41 by Chrol-Cannon, Joseph Dr (Computer Science)

add initial i3d features

parent 5d9ab943
# Borrowed from: https://github.com/yjxiong/tsn-pytorch/blob/master/transforms.py
import torchvision
import random
from PIL import Image
import numbers
import torch
import torchvision.transforms.functional as F
class GroupResize(object):
def __init__(self, size, interpolation=Image.BILINEAR):
self.worker = torchvision.transforms.Resize(size, interpolation)
def __call__(self, img_group):
return [self.worker(img) for img in img_group]
class GroupRandomCrop(object):
def __init__(self, size):
if isinstance(size, numbers.Number):
self.size = (int(size), int(size))
else:
self.size = size
def __call__(self, img_group):
w, h = img_group[0].size
th, tw = self.size
out_images = list()
x1 = random.randint(0, w - tw)
y1 = random.randint(0, h - th)
for img in img_group:
assert (img.size[0] == w and img.size[1] == h)
if w == tw and h == th:
out_images.append(img)
else:
out_images.append(img.crop((x1, y1, x1 + tw, y1 + th)))
return out_images
class GroupCenterCrop(object):
def __init__(self, size):
self.worker = torchvision.transforms.CenterCrop(size)
def __call__(self, img_group):
return [self.worker(img) for img in img_group]
class GroupRandomHorizontalFlip(object):
def __call__(self, img_group):
if random.random() < 0.5:
img_group = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group]
return img_group
class GroupNormalize(object):
def __init__(self, mean, std):
self.mean = mean
self.std = std
def __call__(self, tensor): # (T, 3, 224, 224)
for b in range(tensor.size(0)):
for t, m, s in zip(tensor[b], self.mean, self.std):
t.sub_(m).div_(s)
return tensor
class LoopPad(object):
def __init__(self, max_len):
self.max_len = max_len
def __call__(self, tensor):
length = tensor.size(0)
if length == self.max_len:
return tensor
# repeat the clip as many times as is necessary
n_pad = self.max_len - length
pad = [tensor] * (n_pad // length)
if n_pad % length > 0:
pad += [tensor[0:n_pad % length]]
tensor = torch.cat([tensor] + pad, 0)
return tensor
# NOTE: Returns [0-255] rather than torchvision's [0-1]
class ToTensor(object):
def __init__(self):
self.worker = lambda x: F.to_tensor(x) * 255
def __call__(self, img_group):
img_group = [self.worker(img) for img in img_group]
return torch.stack(img_group, 0)
class GroupMultiScaleCrop(object):
def __init__(self, output_size, scales=None, max_distort=1, fix_crop=True, more_fix_crop=True,
center_crop_only=False):
self.scales = scales if scales is not None else [1, .875, .75, .66]
self.max_distort = max_distort
self.fix_crop = fix_crop
self.more_fix_crop = more_fix_crop
self.center_crop_only = center_crop_only
        assert (not center_crop_only) or (max_distort == 0 and len(self.scales) == 1), \
            'Center crop should only be performed at test time (single scale, no distortion).'
self.output_size = output_size if not isinstance(output_size, int) else [output_size, output_size]
self.interpolation = Image.BILINEAR
def __call__(self, img_group):
im_size = img_group[0].size
crop_w, crop_h, offset_w, offset_h = self._sample_crop_size(im_size)
crop_img_group = [img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h)) for img in img_group]
ret_img_group = [img.resize((self.output_size[0], self.output_size[1]), self.interpolation)
for img in crop_img_group]
return ret_img_group, (offset_h, offset_w, crop_h, crop_w)
def _sample_crop_size(self, im_size):
image_w, image_h = im_size[0], im_size[1]
# find a crop size
base_size = min(image_w, image_h)
crop_sizes = [int(base_size * x) for x in self.scales]
crop_h = [self.output_size[1] if abs(x - self.output_size[1]) < 3 else x for x in crop_sizes]
crop_w = [self.output_size[0] if abs(x - self.output_size[0]) < 3 else x for x in crop_sizes]
pairs = []
for i, h in enumerate(crop_h):
for j, w in enumerate(crop_w):
if abs(i - j) <= self.max_distort:
pairs.append((w, h))
crop_pair = random.choice(pairs)
if not self.fix_crop:
w_offset = random.randint(0, image_w - crop_pair[0])
h_offset = random.randint(0, image_h - crop_pair[1])
else:
w_offset, h_offset = self._sample_fix_offset(image_w, image_h, crop_pair[0], crop_pair[1])
return crop_pair[0], crop_pair[1], w_offset, h_offset
def _sample_fix_offset(self, image_w, image_h, crop_w, crop_h):
offsets = self.fill_fix_offset(self.center_crop_only, self.more_fix_crop, image_w, image_h, crop_w, crop_h)
return random.choice(offsets)
@staticmethod
def fill_fix_offset(center_crop_only, more_fix_crop, image_w, image_h, crop_w, crop_h):
w_step = (image_w - crop_w) // 4
h_step = (image_h - crop_h) // 4
ret = list()
ret.append((0, 0)) # upper left
ret.append((2 * w_step, 2 * h_step)) # center
if center_crop_only:
return ret
ret.append((4 * w_step, 0)) # upper right
ret.append((0, 4 * h_step)) # lower left
ret.append((4 * w_step, 4 * h_step)) # lower right
if more_fix_crop:
ret.append((0, 2 * h_step)) # center left
ret.append((4 * w_step, 2 * h_step)) # center right
ret.append((2 * w_step, 4 * h_step)) # lower center
ret.append((2 * w_step, 0 * h_step)) # upper center
ret.append((1 * w_step, 1 * h_step)) # upper left quarter
ret.append((3 * w_step, 1 * h_step)) # upper right quarter
ret.append((1 * w_step, 3 * h_step)) # lower left quarter
            ret.append((3 * w_step, 3 * h_step))  # lower right quarter
return ret
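

# --- Hedged usage sketch (not part of the original file): composing the group
# --- transforms above on a short clip of PIL frames.  The frame count, crop
# --- size and normalization statistics below are illustrative assumptions.
if __name__ == '__main__':
    frames = [Image.new('RGB', (320, 240)) for _ in range(8)]  # dummy 8-frame clip
    transform = torchvision.transforms.Compose([
        GroupResize(256),
        GroupCenterCrop(224),
        ToTensor(),                                    # (T, 3, 224, 224), values in [0, 255]
        GroupNormalize(mean=[114.75, 114.75, 114.75],  # assumed per-channel stats on the 0-255 scale
                       std=[57.375, 57.375, 57.375]),
        LoopPad(max_len=16),                           # loop the clip up to 16 frames
    ])
    clip = transform(frames)
    print(clip.shape)  # torch.Size([16, 3, 224, 224])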
import torch
import torch.nn as nn
from i3D.resnet3d_xl import Net
import torch.nn.functional as F
'''
Video Classification Model library.
'''
class TrainingScheduleError(Exception):
pass
class VideoModel(nn.Module):
def __init__(self,
num_classes,
num_boxes,
num_videos=16,
restore_dict=None,
freeze_weights=None,
device=None,
loss_type='softmax'):
super(VideoModel, self).__init__()
self.device = device
self.num_frames = num_videos
self.num_classes = num_classes
# Network loads kinetic pre-trained weights in initialization
self.i3D = Net(num_classes, extract_features=True, loss_type=loss_type)
try:
# Restore weights
if restore_dict:
self.restore(restore_dict)
# Freeze weights
if freeze_weights:
self.freeze_weights(freeze_weights)
else:
print(" > No weights are freezed")
except Exception as e:
print(" > Exception {}".format(e))
def restore(self, restore=None):
# Load pre-trained I3D + Graph weights for fine-tune (replace the last FC)
restore_finetuned = restore.get("restore_finetuned", None)
if restore_finetuned:
            self._restore_finetuned(restore_finetuned)
print(" > Restored I3D + Graph weights")
return
# Load pre-trained I3D weights
restore_i3d = restore.get("restore_i3d", None)
if restore_i3d:
self._restore_i3d(restore_i3d)
print(" > Restored only I3D weights")
return
# Load pre-trained I3D + Graph weights without replacing anything
restore_predict = restore.get("restore_predict", None)
if restore_predict:
self._restore_predict(restore_predict)
print(" > Restored the model with strict weights")
return
def _restore_predict(self, path):
if path is None:
raise TrainingScheduleError('You should pre-train the video model on your training data first')
weights = torch.load(path, map_location=self.device)['state_dict']
new_weights = {}
for k, v in weights.items():
new_weights[k.replace('module.', '')] = v
self.load_state_dict(new_weights, strict=True)
print(" > Weights {} loaded".format(path))
def _restore_i3d(self, path):
if path is None:
raise TrainingScheduleError('You should pre-train the video model on your training data first')
weights = torch.load(path, map_location=self.device)['state_dict']
new_weights = {}
for k, v in weights.items():
if not k.startswith('module.fc') and not k.startswith('module.i3D.classifier'):
new_weights[k.replace('module.', '')] = v
self.load_state_dict(new_weights, strict=False)
    def _restore_finetuned(self, path):
if path is None:
raise TrainingScheduleError('You should pre-train the video model on your training data first')
weights = torch.load(path, map_location=self.device)['state_dict']
new_weights = {}
for k, v in weights.items():
# Don't load classifiers (different classes 88 vs 86)
if not k.startswith('module.fc'):
if not k.startswith('module.i3D.classifier'):
new_weights[k.replace('module.', '')] = v
self.load_state_dict(new_weights, strict=False)
print(" > Weights {} loaded".format(path))
def freeze_weights(self, module):
if module == 'i3d':
print(" > Freeze I3D module")
for param in self.i3D.parameters():
param.requires_grad = False
elif module == 'fine_tuned':
print(" > Freeze Graph + I3D module, only last FC is training")
            # Freeze all parameters except the last FC layer
for name, param in self.i3D.named_parameters():
if not name.startswith('classifier'):
param.requires_grad = False
for param in self.graph_embedding.parameters():
param.requires_grad = False
for param in self.conv.parameters():
param.requires_grad = False
else:
            raise NotImplementedError('Unrecognized option, you can freeze either graph module or I3D module')
def _get_i3d_features(self, videos, output_video_features=False):
# org_features - [V x 2048 x T / 2 x 14 x 14]
_, org_features = self.i3D(videos)
# Reduce dimension video_features - [V x 512 x T / 2 x 14 x 14]
videos_features = self.conv(org_features)
bs, d, t, h, w = videos_features.size()
# Get global features
videos_features_rs = videos_features.permute(0, 2, 1, 3, 4) # [V x T / 2 x 512 x h x w]
videos_features_rs = videos_features_rs.reshape(-1, d, h, w) # [V * T / 2 x 512 x h x w]
global_features = self.avgpool(videos_features_rs) # [V * T / 2 x 512 x 1 x 1]
global_features = self.dropout(global_features)
global_features = global_features.reshape(bs, t, d) # [V x T / 2 x 512]
if output_video_features:
return global_features, videos_features
else:
return global_features
def flatten(self, x):
return [item for sublist in x for item in sublist]
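

# --- Hedged usage sketch (not part of the original file): constructing the I3D
# --- wrapper above.  The class counts and checkpoint path are placeholders; only
# --- the restore_dict keys ('restore_predict', 'restore_i3d', 'restore_finetuned')
# --- and the freeze_weights options ('i3d', 'fine_tuned') come from the code above.
# --- Note that building the model also constructs the Kinetics-pretrained backbone
# --- via i3D.resnet3d_xl.Net, so this assumes that import works in your environment.
if __name__ == '__main__':
    model = VideoModel(
        num_classes=174,                                      # assumed number of action classes
        num_boxes=4,
        num_videos=16,                                        # frames per clip
        restore_dict={'restore_i3d': 'i3d_checkpoint.pth'},   # placeholder path; load errors are caught and printed
        freeze_weights='i3d',
        device='cpu',
    )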
import torch
import torch.nn as nn
import torch.nn.functional as F
from i3D.resnet3d_xl import Net
from i3D.nonlocal_helper import Nonlocal
class VideoModelCoord(nn.Module):
def __init__(self, opt):
super(VideoModelCoord, self).__init__()
self.nr_boxes = opt.num_boxes
self.nr_actions = opt.num_classes
self.nr_frames = opt.num_frames // 2
self.coord_feature_dim = opt.coord_feature_dim
self.coord_to_feature = nn.Sequential(
nn.Linear(4, self.coord_feature_dim//2, bias=False),
nn.BatchNorm1d(self.coord_feature_dim//2),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim//2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.spatial_node_fusion = nn.Sequential(
nn.Linear(self.coord_feature_dim*2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.box_feature_fusion = nn.Sequential(
nn.Linear(self.nr_frames*self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.classifier = nn.Sequential(
nn.Linear(self.coord_feature_dim, self.coord_feature_dim),
# nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, 512), #self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(512, self.nr_actions)
)
if opt.fine_tune:
self.fine_tune(opt.fine_tune)
def fine_tune(self, restore_path, parameters_to_train=['classifier']):
weights = torch.load(restore_path)['state_dict']
new_weights = {}
#import pdb
for k, v in weights.items():
if not 'classifier.4' in k:
new_weights[k.replace('module.', '')] = v
#pdb.set_trace()
self.load_state_dict(new_weights, strict=False)
print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
frozen_weights = 0
for name, param in self.named_parameters():
if not 'classifier.4' in name:
param.requires_grad = False
frozen_weights += 1
else:
print('Training : {}'.format(name))
print('Number of frozen weights {}'.format(frozen_weights))
assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
'Check the naming convention of the parameters'
def forward(self, global_img_input, box_categories, box_input, video_label, is_inference=False):
# local_img_tensor is (b, nr_frames, nr_boxes, 3, h, w)
# global_img_tensor is (b, nr_frames, 3, h, w)
# box_input is (b, nr_frames, nr_boxes, 4)
b, _, _, _h, _w = global_img_input.size()
# global_imgs = global_img_input.view(b*self.nr_frames, 3, _h, _w)
# local_imgs = local_img_input.view(b*self.nr_frames*self.nr_boxes, 3, _h, _w)
box_input = box_input.transpose(2, 1).contiguous()
box_input = box_input.view(b*self.nr_boxes*self.nr_frames, 4)
bf = self.coord_to_feature(box_input)
bf = bf.view(b, self.nr_boxes, self.nr_frames, self.coord_feature_dim)
# spatial message passing (graph)
spatial_message = bf.sum(dim=1, keepdim=True) # (b, 1, self.nr_frames, coord_feature_dim)
        # subtract each node's own feature from the aggregate and normalize by the number of neighbours
        spatial_message = (spatial_message - bf) / (self.nr_boxes - 1)
bf_and_message = torch.cat([bf, spatial_message], dim=3) # (b, nr_boxes, nr_frames, 2*coord_feature_dim)
# (b*nr_boxes*nr_frames, coord_feature_dim)
bf_spatial = self.spatial_node_fusion(bf_and_message.view(b*self.nr_boxes*self.nr_frames, -1))
bf_spatial = bf_spatial.view(b, self.nr_boxes, self.nr_frames, self.coord_feature_dim)
bf_temporal_input = bf_spatial.view(b, self.nr_boxes, self.nr_frames*self.coord_feature_dim)
box_features = self.box_feature_fusion(bf_temporal_input.view(b*self.nr_boxes, -1)) # (b*nr_boxes, coord_feature_dim)
box_features = torch.mean(box_features.view(b, self.nr_boxes, -1), dim=1) # (b, coord_feature_dim)
# video_features = torch.cat([global_features, local_features, box_features], dim=1)
video_features = box_features
cls_output = self.classifier(video_features) # (b, num_classes)
return cls_output
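

# --- Hedged usage sketch (not part of the original file): running the coordinate-only
# --- graph model on random inputs.  The opt fields and tensor shapes are assumptions
# --- inferred from the shape comments in forward(); box coordinates are expected at
# --- half the clip frame rate (num_frames // 2).
if __name__ == '__main__':
    from argparse import Namespace
    opt = Namespace(num_boxes=4, num_classes=174, num_frames=16,
                    coord_feature_dim=256, fine_tune=None)
    model = VideoModelCoord(opt).eval()   # eval() so the BatchNorm layers use their running stats
    b, t, nb = 2, opt.num_frames // 2, opt.num_boxes
    global_img = torch.zeros(b, opt.num_frames, 3, 224, 224)  # only its batch size is used here
    box_categories = torch.zeros(b, t, nb)                    # unused by this coordinate-only variant
    box_input = torch.rand(b, t, nb, 4)                       # normalized box coordinates
    logits = model(global_img, box_categories, box_input, video_label=None)
    print(logits.shape)  # torch.Size([2, 174])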
class VideoModelCoordLatent(nn.Module):
def __init__(self, opt):
super(VideoModelCoordLatent, self).__init__()
self.nr_boxes = opt.num_boxes
self.nr_actions = opt.num_classes
self.nr_frames = opt.num_frames // 2
self.img_feature_dim = opt.img_feature_dim
self.coord_feature_dim = opt.coord_feature_dim
self.category_embed_layer = nn.Embedding(3, opt.coord_feature_dim // 2, padding_idx=0, scale_grad_by_freq=True)
self.coord_to_feature = nn.Sequential(
nn.Linear(4, self.coord_feature_dim//2, bias=False),
nn.BatchNorm1d(self.coord_feature_dim//2),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim//2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.coord_category_fusion = nn.Sequential(
nn.Linear(self.coord_feature_dim+self.coord_feature_dim//2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
)
self.spatial_node_fusion = nn.Sequential(
nn.Linear(self.coord_feature_dim*2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.box_feature_fusion = nn.Sequential(
nn.Linear(self.nr_frames*self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.classifier = nn.Sequential(
nn.Linear(self.coord_feature_dim, self.coord_feature_dim),
# nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, 512), #self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(512, self.nr_actions)
)
if opt.fine_tune:
self.fine_tune(opt.fine_tune)
def fine_tune(self, restore_path, parameters_to_train=['classifier']):
weights = torch.load(restore_path)['state_dict']
new_weights = {}
for k, v in weights.items():
if not 'classifier.4' in k:
new_weights[k.replace('module.', '')] = v
self.load_state_dict(new_weights, strict=False)
print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
frozen_weights = 0
for name, param in self.named_parameters():
if not 'classifier.4' in name:
param.requires_grad = False
frozen_weights += 1
else:
print('Training : {}'.format(name))
print('Number of frozen weights {}'.format(frozen_weights))
assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
'Check the naming convention of the parameters'
def forward(self, global_img_input, box_categories, box_input, video_label, is_inference=False):
# local_img_tensor is (b, nr_frames, nr_boxes, 3, h, w)
# global_img_tensor is (b, nr_frames, 3, h, w)
# box_input is (b, nr_frames, nr_boxes, 4)
b, _, _, _h, _w = global_img_input.size()
box_input = box_input.transpose(2, 1).contiguous()
box_input = box_input.view(b*self.nr_boxes*self.nr_frames, 4)
box_categories = box_categories.long()
box_categories = box_categories.transpose(2, 1).contiguous()
box_categories = box_categories.view(b*self.nr_boxes*self.nr_frames)
box_category_embeddings = self.category_embed_layer(box_categories) # (b*nr_b*nr_f, coord_feature_dim//2)
bf = self.coord_to_feature(box_input)
bf = torch.cat([bf, box_category_embeddings], dim=1) # (b*nr_b*nr_f, coord_feature_dim + coord_feature_dim//2)
bf = self.coord_category_fusion(bf) # (b*nr_b*nr_f, coord_feature_dim)
bf = bf.view(b, self.nr_boxes, self.nr_frames, self.coord_feature_dim)
# spatial message passing (graph)
spatial_message = bf.sum(dim=1, keepdim=True) # (b, 1, self.nr_frames, coord_feature_dim)
        # subtract each node's own feature from the aggregate and normalize by the number of neighbours
        spatial_message = (spatial_message - bf) / (self.nr_boxes - 1)
bf_and_message = torch.cat([bf, spatial_message], dim=3) # (b, nr_boxes, nr_frames, 2*coord_feature_dim)
# (b*nr_boxes*nr_frames, coord_feature_dim)
bf_spatial = self.spatial_node_fusion(bf_and_message.view(b*self.nr_boxes*self.nr_frames, -1))
bf_spatial = bf_spatial.view(b, self.nr_boxes, self.nr_frames, self.coord_feature_dim)
bf_temporal_input = bf_spatial.view(b, self.nr_boxes, self.nr_frames*self.coord_feature_dim)
box_features = self.box_feature_fusion(bf_temporal_input.view(b*self.nr_boxes, -1)) # (b*nr_boxes, coord_feature_dim)
box_features = torch.mean(box_features.view(b, self.nr_boxes, -1), dim=1) # (b, coord_feature_dim)
# video_features = torch.cat([global_features, local_features, box_features], dim=1)
video_features = box_features
cls_output = self.classifier(video_features) # (b, num_classes)
return cls_output
class VideoModelCoordLatentNL(nn.Module):
def __init__(self, opt):
super(VideoModelCoordLatentNL, self).__init__()
self.nr_boxes = opt.num_boxes
self.nr_actions = opt.num_classes
self.nr_frames = opt.num_frames // 2
self.img_feature_dim = opt.img_feature_dim
self.coord_feature_dim = opt.coord_feature_dim
self.category_embed_layer = nn.Embedding(3, opt.coord_feature_dim // 2, padding_idx=0, scale_grad_by_freq=True)
self.coord_to_feature = nn.Sequential(
nn.Linear(4, self.coord_feature_dim // 2, bias=False),
nn.BatchNorm1d(self.coord_feature_dim // 2),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim // 2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.coord_category_fusion = nn.Sequential(
nn.Linear(self.coord_feature_dim + self.coord_feature_dim // 2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
)
self.spatial_node_fusion = nn.Sequential(
nn.Linear(self.coord_feature_dim * 2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.nr_nonlocal_layers = 3
self.nonlocal_fusion = []
for i in range(self.nr_nonlocal_layers):
self.nonlocal_fusion.append(nn.Sequential(
Nonlocal(dim=self.coord_feature_dim, dim_inner=self.coord_feature_dim // 2),
nn.Conv1d(self.coord_feature_dim, self.coord_feature_dim, kernel_size=1, stride=1, padding=0,
bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
))
self.nonlocal_fusion = nn.ModuleList(self.nonlocal_fusion)
self.box_feature_fusion = nn.Sequential(
nn.Linear(self.nr_frames * self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.classifier = nn.Sequential(
nn.Linear(self.coord_feature_dim, self.coord_feature_dim),
# nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, 512), # self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(512, self.nr_actions)
)
if opt.fine_tune:
self.fine_tune(opt.fine_tune)
def train(self, mode=True): # overriding default train function
super(VideoModelCoordLatentNL, self).train(mode)
for m in self.modules(): # or self.modules(), if freezing all bn layers
if isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d):
m.eval()
# shutdown update in frozen mode
m.weight.requires_grad = False
m.bias.requires_grad = False
def fine_tune(self, restore_path, parameters_to_train=['classifier']):
weights = torch.load(restore_path)['state_dict']
new_weights = {}
# import pdb
for k, v in weights.items():
if not 'classifier.4' in k:
new_weights[k.replace('module.', '')] = v
# pdb.set_trace()
self.load_state_dict(new_weights, strict=False)
print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
frozen_weights = 0
for name, param in self.named_parameters():
if not 'classifier.4' in name:
param.requires_grad = False
frozen_weights += 1
else:
print('Training : {}'.format(name))
print('Number of frozen weights {}'.format(frozen_weights))
assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
'Check the naming convention of the parameters'
def forward(self, global_img_input, box_categories, box_input, video_label, is_inference=False):
# local_img_tensor is (b, nr_frames, nr_boxes, 3, h, w)
# global_img_tensor is (b, nr_frames, 3, h, w)
# box_input is (b, nr_frames, nr_boxes, 4)
b, _, _, _h, _w = global_img_input.size()
box_input = box_input.transpose(2, 1).contiguous()
box_input = box_input.view(b * self.nr_boxes * self.nr_frames, 4)
box_categories = box_categories.long()
box_categories = box_categories.transpose(2, 1).contiguous()
box_categories = box_categories.view(b * self.nr_boxes * self.nr_frames)
box_category_embeddings = self.category_embed_layer(box_categories) # (b*nr_b*nr_f, coord_feature_dim//2)
bf = self.coord_to_feature(box_input)
bf = torch.cat([bf, box_category_embeddings], dim=1) # (b*nr_b*nr_f, coord_feature_dim + coord_feature_dim//2)
bf = self.coord_category_fusion(bf) # (b*nr_b*nr_f, coord_feature_dim)
bf = bf.view(b, self.nr_boxes, self.nr_frames, self.coord_feature_dim)
# spatial message passing (graph)
spatial_message = bf.sum(dim=1, keepdim=True) # (b, 1, self.nr_frames, coord_feature_dim)
        # subtract each node's own feature from the aggregate and normalize by the number of neighbours
        spatial_message = (spatial_message - bf) / (self.nr_boxes - 1)
bf_and_message = torch.cat([bf, spatial_message], dim=3) # (b, nr_boxes, nr_frames, 2*coord_feature_dim)
# (b*nr_boxes*nr_frames, coord_feature_dim)
bf_spatial = self.spatial_node_fusion(bf_and_message.view(b * self.nr_boxes * self.nr_frames, -1))
bf_spatial = bf_spatial.view(b, self.nr_boxes, self.nr_frames, self.coord_feature_dim)
bf_temporal_input = bf_spatial.view(b, self.nr_boxes, self.nr_frames * self.coord_feature_dim)
bf_nonlocal = self.box_feature_fusion(
bf_temporal_input.view(b * self.nr_boxes, -1)) # (b*nr_boxes, coord_feature_dim)
bf_nonlocal = bf_nonlocal.view(b, self.nr_boxes, self.coord_feature_dim).permute(0, 2,
1).contiguous() # (N, C, NB)
for i in range(self.nr_nonlocal_layers):
bf_nonlocal = self.nonlocal_fusion[i](bf_nonlocal)
box_features = torch.mean(bf_nonlocal, dim=2) # (b, coord_feature_dim)
# video_features = torch.cat([global_features, local_features, box_features], dim=1)
video_features = box_features
cls_output = self.classifier(video_features) # (b, num_classes)
return cls_output
class VideoModelGlobalCoordLatent(nn.Module):
"""
This model contains only global pooling without any graph.
"""
def __init__(self, opt,
):
super(VideoModelGlobalCoordLatent, self).__init__()
self.nr_boxes = opt.num_boxes
self.nr_actions = opt.num_classes
self.nr_frames = opt.num_frames
self.img_feature_dim = opt.img_feature_dim
self.coord_feature_dim = opt.coord_feature_dim
self.i3D = Net(self.nr_actions, extract_features=True, loss_type='softmax')
self.dropout = nn.Dropout(0.3)
self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1))
self.conv = nn.Conv3d(2048, 512, kernel_size=(1, 1, 1), stride=1)
self.category_embed_layer = nn.Embedding(3, opt.coord_feature_dim // 2, padding_idx=0, scale_grad_by_freq=True)
self.c_coord_category_fusion = nn.Sequential(
nn.Linear(self.coord_feature_dim+self.coord_feature_dim//2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
)
self.c_coord_to_feature = nn.Sequential(
nn.Linear(4, self.coord_feature_dim // 2, bias=False),
nn.BatchNorm1d(self.coord_feature_dim // 2),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim // 2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.c_spatial_node_fusion = nn.Sequential(
nn.Linear(self.coord_feature_dim * 2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.c_box_feature_fusion = nn.Sequential(
nn.Linear((self.nr_frames // 2) * self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.classifier = nn.Sequential(
nn.Linear(self.coord_feature_dim + 2*self.img_feature_dim, self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, 512),
nn.ReLU(inplace=True),
nn.Linear(512, self.nr_actions)
)
if opt.fine_tune:
self.fine_tune(opt.fine_tune)
if opt.restore_i3d:
self.restore_i3d(opt.restore_i3d)
if opt.restore_custom:
self.restore_custom(opt.restore_custom)
def train(self, mode=True): # overriding default train function
super(VideoModelGlobalCoordLatent, self).train(mode)
for m in self.modules(): # or self.modules(), if freezing all bn layers
if isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d):
m.eval()
# shutdown update in frozen mode
m.weight.requires_grad = False
m.bias.requires_grad = False
def restore_custom(self, restore_path):
print("restoring path {}".format(restore_path))
weights = torch.load(restore_path)
ks = list(weights.keys())
print('\n\n BEFORE', weights[ks[0]][0,0,0])
new_weights = {}
# import pdb
for k, v in weights.items():
new_weights[k.replace('module.', '')] = v
self.load_state_dict(new_weights, strict=False)
print('\n\n AFTER', self.state_dict()[ks[0]][0,0, 0])
print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
frozen_weights = 0
for name, param in self.named_parameters():
if not name.startswith('classifier') :
param.requires_grad = False
frozen_weights += 1
else:
print('Training : {}'.format(name))
print('Number of frozen weights {}'.format(frozen_weights))
assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
'Check the naming convention of the parameters'
def restore_i3d(self, restore_path, parameters_to_train=['classifier']):
weights = torch.load(restore_path)['state_dict']
new_weights = {}
# import pdb
for k, v in weights.items():
if 'i3D' in k :
new_weights[k.replace('module.', '')] = v
# pdb.set_trace()
self.load_state_dict(new_weights, strict=False)
print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
for m in self.i3D.modules():
if isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d):
m.eval()
# shutdown update in frozen mode
m.weight.requires_grad = False
m.bias.requires_grad = False
frozen_weights = 0
for name, param in self.named_parameters():
if 'i3D' in name:
param.requires_grad = False
frozen_weights += 1
else:
print('Training : {}'.format(name))
print('Number of frozen weights {}'.format(frozen_weights))
assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
'Check the naming convention of the parameters'
def fine_tune(self, restore_path, parameters_to_train=['classifier']):
weights = torch.load(restore_path)['state_dict']
new_weights = {}
# import pdb
for k, v in weights.items():
            if 'classifier.4' not in k and 'i3D.classifier' not in k:
new_weights[k.replace('module.', '')] = v
# pdb.set_trace()
self.load_state_dict(new_weights, strict=False)
print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
frozen_weights = 0
for name, param in self.named_parameters():
if not 'classifier.4' in name:
param.requires_grad = False
frozen_weights += 1
else:
print('Training : {}'.format(name))
print('Number of frozen weights {}'.format(frozen_weights))
assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
'Check the naming convention of the parameters'
def forward(self, global_img_input, box_categories, box_input, video_label, is_inference=False):
"""
V: num of videos
T: num of frames
P: num of proposals
:param videos: [V x 3 x T x 224 x 224]
:param proposals_t: [V x T] List of BoxList (size of num_boxes each)
:return:
"""
# org_features - [V x 2048 x T / 2 x 14 x 14]
bs, _, _, _, _ = global_img_input.shape
y_i3d, org_features = self.i3D(global_img_input)
# Reduce dimension video_features - [V x 512 x T / 2 x 14 x 14]
videos_features = self.conv(org_features)
b = bs
box_input = box_input.transpose(2, 1).contiguous()
box_input = box_input.view(b * self.nr_boxes * (self.nr_frames//2), 4)
box_categories = box_categories.long()
box_categories = box_categories.transpose(2, 1).contiguous()
box_categories = box_categories.view(b * self.nr_boxes * (self.nr_frames // 2))
box_category_embeddings = self.category_embed_layer(box_categories) # (b*nr_b*nr_f, coord_feature_dim//2)
bf = self.c_coord_to_feature(box_input)
bf = torch.cat([bf, box_category_embeddings], dim=1) # (b*nr_b*nr_f, coord_feature_dim + coord_feature_dim//2)
bf = self.c_coord_category_fusion(bf) # (b*nr_b*nr_f, coord_feature_dim)
bf = bf.view(b, self.nr_boxes, self.nr_frames // 2, self.coord_feature_dim)
# spatial message passing (graph)
spatial_message = bf.sum(dim=1, keepdim=True) # (b, 1, self.nr_frames, coord_feature_dim)
        # subtract each node's own feature from the aggregate and normalize by the number of neighbours
        spatial_message = (spatial_message - bf) / (self.nr_boxes - 1)
bf_message_gf = torch.cat([bf, spatial_message], dim=3) # (b, nr_boxes, nr_frames, 2*coord_feature_dim)
# (b*nr_boxes*nr_frames, coord_feature_dim)
bf_spatial = self.c_spatial_node_fusion(bf_message_gf.view(b * self.nr_boxes * (self.nr_frames // 2), -1))
bf_spatial = bf_spatial.view(b, self.nr_boxes, self.nr_frames // 2, self.coord_feature_dim)
bf_temporal_input = bf_spatial.view(b, self.nr_boxes, (self.nr_frames // 2) * self.coord_feature_dim)
box_features = self.c_box_feature_fusion(
bf_temporal_input.view(b * self.nr_boxes, -1)) # (b*nr_boxes, img_feature_dim)
coord_ft = torch.mean(box_features.view(b, self.nr_boxes, -1), dim=1) # (b, coord_feature_dim)
# video_features = torch.cat([global_features, local_features, box_features], dim=1)
# _gf = self.global_new_fc(_gf)
_gf = videos_features.mean(-1).mean(-1).view(b, (self.nr_frames//2), 2*self.img_feature_dim)
_gf = _gf.mean(1)
video_features = torch.cat([_gf.view(b, -1), coord_ft], dim=-1)
cls_output = self.classifier(video_features) # (b, num_classes)
return cls_output
class VideoModelGlobalCoordLatentNL(nn.Module):
"""
This model contains only global pooling without any graph.
"""
def __init__(self, base_net, opt,
):
super(VideoModelGlobalCoordLatentNL, self).__init__()
self.nr_boxes = opt.num_boxes
self.nr_actions = opt.num_classes
self.nr_frames = opt.num_frames
self.img_feature_dim = opt.img_feature_dim
self.coord_feature_dim = opt.coord_feature_dim
self.i3D = Net(self.nr_actions, extract_features=True, loss_type='softmax')
self.dropout = nn.Dropout(0.3)
self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1))
self.conv = nn.Conv3d(2048, 512, kernel_size=(1, 1, 1), stride=1)
self.category_embed_layer = nn.Embedding(3, opt.coord_feature_dim // 2, padding_idx=0, scale_grad_by_freq=True)
self.c_coord_category_fusion = nn.Sequential(
nn.Linear(self.coord_feature_dim+self.coord_feature_dim//2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
)
self.c_coord_to_feature = nn.Sequential(
nn.Linear(4, self.coord_feature_dim // 2, bias=False),
nn.BatchNorm1d(self.coord_feature_dim // 2),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim // 2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.c_spatial_node_fusion = nn.Sequential(
nn.Linear(self.coord_feature_dim * 2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.nr_nonlocal_layers = 3
self.c_nonlocal_fusion = []
for i in range(self.nr_nonlocal_layers):
self.c_nonlocal_fusion.append(nn.Sequential(
Nonlocal(dim=self.coord_feature_dim, dim_inner=self.coord_feature_dim // 2),
nn.Conv1d(self.coord_feature_dim, self.coord_feature_dim, kernel_size=1, stride=1, padding=0, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
))
self.c_nonlocal_fusion = nn.ModuleList(self.c_nonlocal_fusion)
self.c_box_feature_fusion = nn.Sequential(
nn.Linear((self.nr_frames // 2) * self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.classifier = nn.Sequential(
nn.Linear(self.coord_feature_dim + 2*self.img_feature_dim, self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, 512),
nn.ReLU(inplace=True),
nn.Linear(512, self.nr_actions)
)
if opt.fine_tune:
self.fine_tune(opt.fine_tune)
if opt.restore_i3d:
self.restore_i3d(opt.restore_i3d)
if opt.restore_custom:
self.restore_custom(opt.restore_custom)
def restore_custom(self, restore_path):
print("restoring path {}".format(restore_path))
weights = torch.load(restore_path)
ks = list(weights.keys())
print('\n\n BEFORE', weights[ks[0]][0,0,0])
new_weights = {}
# import pdb
for k, v in weights.items():
new_weights[k.replace('module.', '')] = v
self.load_state_dict(new_weights, strict=False)
print('\n\n AFTER', self.state_dict()[ks[0]][0,0, 0])
print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
frozen_weights = 0
for name, param in self.named_parameters():
if not name.startswith('classifier') :
param.requires_grad = False
frozen_weights += 1
else:
print('Training : {}'.format(name))
print('Number of frozen weights {}'.format(frozen_weights))
assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
'Check the naming convention of the parameters'
def restore_i3d(self, restore_path, parameters_to_train=['classifier']):
weights = torch.load(restore_path)['state_dict']
new_weights = {}
# import pdb
for k, v in weights.items():
if 'i3D' in k or k.startswith('conv.'):
new_weights[k.replace('module.', '')] = v
# pdb.set_trace()
self.load_state_dict(new_weights, strict=False)
print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
for m in self.i3D.modules():
if isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d):
m.eval()
# shutdown update in frozen mode
m.weight.requires_grad = False
m.bias.requires_grad = False
frozen_weights = 0
for name, param in self.named_parameters():
            if 'i3D' in name or name.startswith('conv.'):
param.requires_grad = False
frozen_weights += 1
else:
print('Training : {}'.format(name))
print('Number of frozen weights {}'.format(frozen_weights))
assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
'Check the naming convention of the parameters'
def train(self, mode=True): # overriding default train function
super(VideoModelGlobalCoordLatentNL, self).train(mode)
for m in self.i3D.modules(): # or self.modules(), if freezing all bn layers
if isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d):
m.eval()
# shutdown update in frozen mode
m.weight.requires_grad = False
m.bias.requires_grad = False
def fine_tune(self, restore_path, parameters_to_train=['classifier']):
weights = torch.load(restore_path)['state_dict']
new_weights = {}
        # import pdb
for k, v in weights.items():
if not 'classifier.4' in k and 'i3D.classifier' not in k:
new_weights[k.replace('module.', '')] = v
        # pdb.set_trace()
self.load_state_dict(new_weights, strict=False)
print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
frozen_weights = 0
for name, param in self.named_parameters():
if not 'classifier.4' in name:
param.requires_grad = False
frozen_weights += 1
else:
print('Training : {}'.format(name))
print('Number of frozen weights {}'.format(frozen_weights))
assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
'Check the naming convention of the parameters'
def forward(self, global_img_input, box_categories, box_input, video_label, is_inference=False):
"""
V: num of videos
T: num of frames
P: num of proposals
:param videos: [V x 3 x T x 224 x 224]
:param proposals_t: [V x T] List of BoxList (size of num_boxes each)
:return:
"""
# org_features - [V x 2048 x T / 2 x 14 x 14]
bs, _, _, _, _ = global_img_input.shape
y_i3d, org_features = self.i3D(global_img_input)
# Reduce dimension video_features - [V x 512 x T / 2 x 14 x 14]
videos_features = self.conv(org_features)
b = bs
box_input = box_input.transpose(2, 1).contiguous()
box_input = box_input.view(b * self.nr_boxes * (self.nr_frames//2), 4)
box_categories = box_categories.long()
box_categories = box_categories.transpose(2, 1).contiguous()
box_categories = box_categories.view(b * self.nr_boxes * (self.nr_frames // 2))
box_category_embeddings = self.category_embed_layer(box_categories) # (b*nr_b*nr_f, coord_feature_dim//2)
bf = self.c_coord_to_feature(box_input)
bf = torch.cat([bf, box_category_embeddings], dim=1) # (b*nr_b*nr_f, coord_feature_dim + coord_feature_dim//2)
bf = self.c_coord_category_fusion(bf) # (b*nr_b*nr_f, coord_feature_dim)
bf = bf.view(b, self.nr_boxes, self.nr_frames // 2, self.coord_feature_dim)
# spatial message passing (graph)
spatial_message = bf.sum(dim=1, keepdim=True) # (b, 1, self.nr_frames, coord_feature_dim)
        # subtract each node's own feature from the aggregate and normalize by the number of neighbours
        spatial_message = (spatial_message - bf) / (self.nr_boxes - 1)
bf_message_gf = torch.cat([bf, spatial_message], dim=3) # (b, nr_boxes, nr_frames, 2*coord_feature_dim)
# (b*nr_boxes*nr_frames, coord_feature_dim)
bf_spatial = self.c_spatial_node_fusion(bf_message_gf.view(b * self.nr_boxes * (self.nr_frames // 2), -1))
bf_spatial = bf_spatial.view(b, self.nr_boxes, self.nr_frames // 2, self.coord_feature_dim)
bf_temporal_input = bf_spatial.view(b, self.nr_boxes, (self.nr_frames // 2) * self.coord_feature_dim)
bf_nonlocal = self.c_box_feature_fusion(
bf_temporal_input.view(b * self.nr_boxes, -1)) # (b*nr_boxes, img_feature_dim)
bf_nonlocal = bf_nonlocal.view(b, self.nr_boxes, self.coord_feature_dim).permute(0, 2, 1).contiguous() # (N, C, NB)
for i in range(self.nr_nonlocal_layers):
bf_nonlocal = self.c_nonlocal_fusion[i](bf_nonlocal)
coord_ft = torch.mean(bf_nonlocal, dim=2) # (b, coord_feature_dim)
# video_features = torch.cat([global_features, local_features, box_features], dim=1)
_gf = videos_features.mean(-1).mean(-1).view(b, (self.nr_frames//2), 2*self.img_feature_dim)
_gf = _gf.mean(1)
video_features = torch.cat([_gf.view(b, -1), coord_ft], dim=-1)
cls_output = self.classifier(video_features) # (b, num_classes)
return cls_output
class VideoGlobalModel(nn.Module):
"""
This model contains only global pooling without any graph.
"""
def __init__(self, opt,
):
super(VideoGlobalModel, self).__init__()
self.nr_boxes = opt.num_boxes
self.nr_actions = opt.num_classes
self.nr_frames = opt.num_frames
self.img_feature_dim = opt.img_feature_dim
self.coord_feature_dim = opt.coord_feature_dim
self.i3D = Net(self.nr_actions, extract_features=True, loss_type='softmax')
self.dropout = nn.Dropout(0.3)
self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1))
self.conv = nn.Conv3d(2048, 512, kernel_size=(1, 1, 1), stride=1)
self.fc = nn.Linear(512, self.nr_actions)
self.crit = nn.CrossEntropyLoss()
if opt.fine_tune:
self.fine_tune(opt.fine_tune)
def fine_tune(self, restore_path, parameters_to_train=['classifier']):
weights = torch.load(restore_path)['state_dict']
new_weights = {}
for k, v in weights.items():
if not 'fc' in k and not 'classifier' in k:
new_weights[k.replace('module.', '')] = v
self.load_state_dict(new_weights, strict=False)
print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
frozen_weights = 0
for name, param in self.named_parameters():
if not 'fc' in name:
param.requires_grad = False
frozen_weights += 1
else:
print('Training : {}'.format(name))
print('Number of frozen weights {}'.format(frozen_weights))
assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
'Check the naming convention of the parameters'
def forward(self, global_img_input, local_img_input, box_input, video_label, is_inference=False):
"""
V: num of videos
T: num of frames
P: num of proposals
:param videos: [V x 3 x T x 224 x 224]
:param proposals_t: [V x T] List of BoxList (size of num_boxes each)
:return:
"""
# org_features - [V x 2048 x T / 2 x 14 x 14]
y_i3d, org_features = self.i3D(global_img_input)
# Reduce dimension video_features - [V x 512 x T / 2 x 14 x 14]
videos_features = self.conv(org_features)
# Get global features - [V x 512]
global_features = self.avgpool(videos_features).squeeze()
global_features = self.dropout(global_features)
cls_output = self.fc(global_features)
return cls_output
class VideoModelGlobalCoord(nn.Module):
"""
This model contains only global pooling without any graph.
"""
def __init__(self, opt):
super(VideoModelGlobalCoord, self).__init__()
self.nr_boxes = opt.num_boxes
self.nr_actions = opt.num_classes
self.nr_frames = opt.num_frames
self.img_feature_dim = opt.img_feature_dim
self.coord_feature_dim = opt.coord_feature_dim
self.i3D = Net(self.nr_actions, extract_features=True, loss_type='softmax')
self.dropout = nn.Dropout(0.3)
self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1))
self.conv = nn.Conv3d(2048, 256, kernel_size=(1, 1, 1), stride=1)
self.global_new_fc = nn.Sequential(
nn.Linear(256, self.img_feature_dim, bias=False),
nn.BatchNorm1d(self.img_feature_dim),
nn.ReLU(inplace=True)
)
self.c_coord_to_feature = nn.Sequential(
nn.Linear(4, self.coord_feature_dim // 2, bias=False),
nn.BatchNorm1d(self.coord_feature_dim // 2),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim // 2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.c_spatial_node_fusion = nn.Sequential(
nn.Linear(self.coord_feature_dim * 2, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.c_box_feature_fusion = nn.Sequential(
nn.Linear((self.nr_frames // 2) * self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, self.coord_feature_dim, bias=False),
nn.BatchNorm1d(self.coord_feature_dim),
nn.ReLU()
)
self.classifier = nn.Sequential(
nn.Linear(self.coord_feature_dim + self.img_feature_dim, self.coord_feature_dim),
nn.ReLU(inplace=True),
nn.Linear(self.coord_feature_dim, 512),
nn.ReLU(inplace=True),
nn.Linear(512, self.nr_actions)
)
if opt.fine_tune:
self.fine_tune(opt.fine_tune)
if opt.restore_i3d:
self.restore_i3d(opt.restore_i3d)
def train(self, mode=True): # overriding default train function
super(VideoModelGlobalCoord, self).train(mode)
for m in self.i3D.modules(): # or self.modules(), if freezing all bn layers
if isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d):
m.eval()
# shutdown update in frozen mode
m.weight.requires_grad = False
m.bias.requires_grad = False
def restore_i3d(self, restore_path, parameters_to_train=['classifier']):
weights = torch.load(restore_path)['state_dict']
new_weights = {}
# import pdb
for k, v in weights.items():
if 'i3D' in k :
new_weights[k.replace('module.', '')] = v
# pdb.set_trace()
self.load_state_dict(new_weights, strict=False)
print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
frozen_weights = 0
for name, param in self.named_parameters():
if 'i3D' in name:
param.requires_grad = False
frozen_weights += 1
else:
print('Training : {}'.format(name))
print('Number of frozen weights {}'.format(frozen_weights))
assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
'Check the naming convention of the parameters'
def fine_tune(self, restore_path, parameters_to_train=['classifier']):
weights = torch.load(restore_path)['state_dict']
new_weights = {}
# import pdb
for k, v in weights.items():
            if 'classifier.4' not in k and 'i3D.classifier' not in k:
new_weights[k.replace('module.', '')] = v
# pdb.set_trace()
self.load_state_dict(new_weights, strict=False)
print('Num of weights in restore dict {}'.format(len(new_weights.keys())))
frozen_weights = 0
for name, param in self.named_parameters():
if not 'classifier.4' in name:
param.requires_grad = False
frozen_weights += 1
else:
print('Training : {}'.format(name))
print('Number of frozen weights {}'.format(frozen_weights))
assert frozen_weights != 0, 'You are trying to fine tune, but no weights are frozen!!! ' \
'Check the naming convention of the parameters'
def forward(self, global_img_input, box_categories, box_input, video_label, is_inference=False):
"""
V: num of videos
T: num of frames
P: num of proposals
:param videos: [V x 3 x T x 224 x 224]
:param proposals_t: [V x T] List of BoxList (size of num_boxes each)
:return:
"""
# org_features - [V x 2048 x T / 2 x 14 x 14]
bs, _, _, _, _ = global_img_input.shape
y_i3d, org_features = self.i3D(global_img_input)
# Reduce dimension video_features - [V x 512 x T / 2 x 14 x 14]
videos_features = self.conv(org_features)
b = bs
box_input = box_input.transpose(2, 1).contiguous()
box_input = box_input.view(b * self.nr_boxes * (self.nr_frames//2), 4)
bf = self.c_coord_to_feature(box_input)
bf = bf.view(b, self.nr_boxes, self.nr_frames // 2, self.coord_feature_dim)
# spatial message passing (graph)
spatial_message = bf.sum(dim=1, keepdim=True) # (b, 1, self.nr_frames, coord_feature_dim)
        # subtract each node's own feature from the aggregate and normalize by the number of neighbours
        spatial_message = (spatial_message - bf) / (self.nr_boxes - 1)
bf_message_gf = torch.cat([bf, spatial_message], dim=3) # (b, nr_boxes, nr_frames, 2*coord_feature_dim)
# (b*nr_boxes*nr_frames, coord_feature_dim)
bf_spatial = self.c_spatial_node_fusion(bf_message_gf.view(b * self.nr_boxes * (self.nr_frames // 2), -1))
bf_spatial = bf_spatial.view(b, self.nr_boxes, self.nr_frames // 2, self.coord_feature_dim)
bf_temporal_input = bf_spatial.view(b, self.nr_boxes, (self.nr_frames // 2) * self.coord_feature_dim)
box_features = self.c_box_feature_fusion(
bf_temporal_input.view(b * self.nr_boxes, -1)) # (b*nr_boxes, img_feature_dim)
coord_ft = torch.mean(box_features.view(b, self.nr_boxes, -1), dim=1) # (b, coord_feature_dim)
# video_features = torch.cat([global_features, local_features, box_features], dim=1)
_gf = videos_features.mean(-1).mean(-1).view(b*(self.nr_frames//2), self.img_feature_dim)
_gf = self.global_new_fc(_gf)
_gf = _gf.view(b, self.nr_frames // 2, self.img_feature_dim).mean(1)
video_features = torch.cat([_gf.view(b, -1), coord_ft], dim=-1)
cls_output = self.classifier(video_features) # (b, num_classes)
return cls_output
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
"""Non-local helper"""
import torch
import torch.nn as nn
class Nonlocal(nn.Module):
"""
Builds Non-local Neural Networks as a generic family of building
blocks for capturing long-range dependencies. Non-local Network
computes the response at a position as a weighted sum of the
features at all positions. This building block can be plugged into
many computer vision architectures.
More details in the paper: https://arxiv.org/pdf/1711.07971.pdf
"""
def __init__(
self,
dim,
dim_inner,
pool_size=None,
instantiation="softmax",
norm_type="layernorm",
zero_init_final_conv=True,
zero_init_final_norm=False,
norm_eps=1e-5,
norm_momentum=0.1,
):
"""
Args:
dim (int): number of dimension for the input.
dim_inner (int): number of dimension inside of the Non-local block.
            pool_size (list): the kernel sizes of spatio-temporal pooling,
                given in order as [temporal kernel, spatial kernel, spatial
                kernel]. By default pool_size is None and no pooling is used.
instantiation (string): supports two different instantiation method:
"dot_product": normalizing correlation matrix with L2.
"softmax": normalizing correlation matrix with Softmax.
norm_type (string): support BatchNorm and LayerNorm for
normalization.
"batchnorm": using BatchNorm for normalization.
"layernorm": using LayerNorm for normalization.
"none": not using any normalization.
zero_init_final_conv (bool): If true, zero initializing the final
convolution of the Non-local block.
zero_init_final_norm (bool):
If true, zero initializing the final batch norm of the Non-local
block.
"""
super(Nonlocal, self).__init__()
self.dim = dim
self.dim_inner = dim_inner
self.pool_size = pool_size
self.instantiation = instantiation
self.norm_type = norm_type
self.use_pool = (
False
if pool_size is None
else any((size > 1 for size in pool_size))
)
self.norm_eps = norm_eps
self.norm_momentum = norm_momentum
self._construct_nonlocal(zero_init_final_conv, zero_init_final_norm)
def _construct_nonlocal(self, zero_init_final_conv, zero_init_final_norm):
# Three convolution heads: theta, phi, and g.
self.conv_theta = nn.Conv1d(
self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0
)
self.conv_phi = nn.Conv1d(
self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0
)
self.conv_g = nn.Conv1d(
self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0
)
# Final convolution output.
self.conv_out = nn.Conv1d(
self.dim_inner, self.dim, kernel_size=1, stride=1, padding=0
)
# Zero initializing the final convolution output.
self.conv_out.zero_init = zero_init_final_conv
if self.norm_type == "batchnorm":
self.bn = nn.BatchNorm1d(
self.dim, eps=self.norm_eps, momentum=self.norm_momentum
)
# Zero initializing the final bn.
self.bn.transform_final_bn = zero_init_final_norm
elif self.norm_type == "layernorm":
            # In Caffe2 the LayerNorm op does not contain the scale and bias
# terms described in the paper:
# https://caffe2.ai/docs/operators-catalogue.html#layernorm
# Builds LayerNorm as GroupNorm with one single group.
# Setting Affine to false to align with Caffe2.
self.ln = nn.GroupNorm(1, self.dim, eps=self.norm_eps, affine=False)
elif self.norm_type == "none":
# Does not use any norm.
pass
else:
raise NotImplementedError(
"Norm type {} is not supported".format(self.norm_type)
)
# Optional to add the spatial-temporal pooling.
if self.use_pool:
self.pool = nn.MaxPool1d(
kernel_size=self.pool_size,
stride=self.pool_size,
padding=[0, 0, 0],
)
def forward(self, x):
x_identity = x
N, C, NB = x.size()
theta = self.conv_theta(x)
# Perform temporal-spatial pooling to reduce the computation.
if self.use_pool:
x = self.pool(x)
phi = self.conv_phi(x)
g = self.conv_g(x)
theta = theta.view(N, self.dim_inner, -1)
phi = phi.view(N, self.dim_inner, -1)
g = g.view(N, self.dim_inner, -1)
# (N, C, NB) * (N, C, NB) => (N, NB, NB).
theta_phi = torch.einsum("nct,ncp->ntp", (theta, phi))
# For original Non-local paper, there are two main ways to normalize
# the affinity tensor:
# 1) Softmax normalization (norm on exp).
# 2) dot_product normalization.
if self.instantiation == "softmax":
# Normalizing the affinity tensor theta_phi before softmax.
theta_phi = theta_phi * (self.dim_inner ** -0.5)
theta_phi = nn.functional.softmax(theta_phi, dim=2)
elif self.instantiation == "dot_product":
spatial_temporal_dim = theta_phi.shape[2]
theta_phi = theta_phi / spatial_temporal_dim
else:
raise NotImplementedError(
"Unknown norm type {}".format(self.instantiation)
)
# (N, NB, NB) * (N, C, NB) => (N, C, NB).
theta_phi_g = torch.einsum("ntg,ncg->nct", (theta_phi, g))
# (N, C, NB) => (N, C, NB).
theta_phi_g = theta_phi_g.view(N, self.dim_inner, NB)
p = self.conv_out(theta_phi_g)
if self.norm_type == "batchnorm":
p = self.bn(p)
elif self.norm_type == "layernorm":
p = self.ln(p)
return x_identity + p
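

# --- Hedged usage sketch (not part of the original file): applying the 1-D Non-local
# --- block to a (batch, channels, nodes) tensor, as the graph models above do over
# --- per-box features.  Dimensions are illustrative.
if __name__ == "__main__":
    block = Nonlocal(dim=256, dim_inner=128)  # default "softmax" instantiation with LayerNorm
    x = torch.randn(2, 256, 4)                # (N, C, number of boxes)
    y = block(x)
    print(y.shape)                            # torch.Size([2, 256, 4])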
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import math
import numpy as np
from functools import partial
__all__ = [
'ResNet', 'resnet10', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
'resnet152', 'resnet200',
]
def conv3x3x3(in_planes, out_planes, stride=1):
# 3x3x3 convolution with padding
return nn.Conv3d(
in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=1,
bias=False)
def downsample_basic_block(x, planes, stride):
out = F.avg_pool3d(x, kernel_size=1, stride=stride)
zero_pads = torch.Tensor(
out.size(0), planes - out.size(1), out.size(2), out.size(3),
out.size(4)).zero_()
if isinstance(out.data, torch.cuda.FloatTensor):
zero_pads = zero_pads.cuda()
out = Variable(torch.cat([out.data, zero_pads], dim=1))
return out
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm3d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3x3(planes, planes)
self.bn2 = nn.BatchNorm3d(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
conv_op = None
offset_groups = 1
def __init__(self, dim_in, dim_out, stride, dim_inner, group=1, use_temp_conv=1, temp_stride=1, dcn=False,
shortcut_type='B'):
super(Bottleneck, self).__init__()
# 1 x 1 layer
self.with_dcn = dcn
self.conv1 = self.Conv3dBN(dim_in, dim_inner, (1 + use_temp_conv * 2, 1, 1), (temp_stride, 1, 1),
(use_temp_conv, 0, 0))
self.relu = nn.ReLU(inplace=True)
# 3 x 3 layer
self.conv2 = self.Conv3dBN(dim_inner, dim_inner, (1, 3, 3), (1, stride, stride), (0, 1, 1))
# 1 x 1 layer
self.conv3 = self.Conv3dBN(dim_inner, dim_out, (1, 1, 1), (1, 1, 1), (0, 0, 0))
self.shortcut_type = shortcut_type
self.dim_in = dim_in
self.dim_out = dim_out
self.temp_stride = temp_stride
self.stride = stride
# nn.Conv3d(dim_in, dim_out, (1,1,1),(temp_stride,stride,stride),(0,0,0))
if self.shortcut_type == 'B':
if self.dim_in == self.dim_out and self.temp_stride == 1 and self.stride == 1: # or (self.dim_in == self.dim_out and self.dim_in == 64 and self.stride ==1):
pass
else:
# pass
self.shortcut = self.Conv3dBN(dim_in, dim_out, (1, 1, 1), (temp_stride, stride, stride), (0, 0, 0))
# nn.Conv3d(dim_in,dim_inner,kernel_size=(1+use_temp_conv*2,1,1),stride = (temp_stride,1,1),padding = )
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.relu(out)
out = self.conv2(out)
out = self.relu(out)
out = self.conv3(out)
if self.dim_in == self.dim_out and self.temp_stride == 1 and self.stride == 1:
pass
else:
residual = self.shortcut(residual)
out += residual
out = self.relu(out)
return out
def Conv3dBN(self, dim_in, dim_out, kernels, strides, pads, group=1):
if self.with_dcn and kernels[0] > 1:
# use deformable conv
return nn.Sequential(
self.conv_op(dim_in, dim_out, kernel_size=kernels, stride=strides, padding=pads, bias=False,
offset_groups=self.offset_groups),
nn.BatchNorm3d(dim_out)
)
else:
return nn.Sequential(
nn.Conv3d(dim_in, dim_out, kernel_size=kernels, stride=strides, padding=pads, bias=False),
nn.BatchNorm3d(dim_out)
)
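# Illustrative shape flow through one Bottleneck (example values: dim_in=256,
# dim_out=512, dim_inner=128, stride=2, use_temp_conv=1, temp_stride=1) on an input
# of shape (N, 256, T, 56, 56):
#     conv1, 3x1x1 temporal: (N, 128, T, 56, 56)
#     conv2, 1x3x3 spatial, stride 2: (N, 128, T, 28, 28)
#     conv3, 1x1x1: (N, 512, T, 28, 28), added to a 1x1x1 strided shortcut of the input.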
class ResNet(nn.Module):
def __init__(self,
block,
layers,
use_temp_convs_set,
temp_strides_set,
sample_size,
sample_duration,
shortcut_type='B',
num_classes=400,
stage_with_dcn=(False, False, False, False),
extract_features=False,
loss_type='softmax'):
super(ResNet, self).__init__()
self.extract_features = extract_features
self.stage_with_dcn = stage_with_dcn
self.group = 1
self.width_per_group = 64
self.dim_inner = self.group * self.width_per_group
# self.shortcut_type = shortcut_type
self.conv1 = nn.Conv3d(
3,
64,
kernel_size=(1 + use_temp_convs_set[0][0] * 2, 7, 7),
stride=(temp_strides_set[0][0], 2, 2),
padding=(use_temp_convs_set[0][0], 3, 3),
bias=False)
self.bn1 = nn.BatchNorm3d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool1 = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 0, 0))
with_dcn = bool(self.stage_with_dcn[0])
self.layer1 = self._make_layer(block, 64, 256, shortcut_type, stride=1, num_blocks=layers[0],
dim_inner=self.dim_inner, group=self.group, use_temp_convs=use_temp_convs_set[1],
temp_strides=temp_strides_set[1], dcn=with_dcn)
self.maxpool2 = nn.MaxPool3d(kernel_size=(2, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
with_dcn = bool(self.stage_with_dcn[1])
self.layer2 = self._make_layer(block, 256, 512, shortcut_type, stride=2, num_blocks=layers[1],
dim_inner=self.dim_inner * 2, group=self.group,
use_temp_convs=use_temp_convs_set[2], temp_strides=temp_strides_set[2],
dcn=with_dcn)
with_dcn = bool(self.stage_with_dcn[2])
self.layer3 = self._make_layer(block, 512, 1024, shortcut_type, stride=2, num_blocks=layers[2],
dim_inner=self.dim_inner * 4, group=self.group,
use_temp_convs=use_temp_convs_set[3], temp_strides=temp_strides_set[3],
dcn=with_dcn)
with_dcn = bool(self.stage_with_dcn[3])
self.layer4 = self._make_layer(block, 1024, 2048, shortcut_type, stride=1, num_blocks=layers[3],
dim_inner=self.dim_inner * 8, group=self.group,
use_temp_convs=use_temp_convs_set[4], temp_strides=temp_strides_set[4],
dcn=with_dcn)
last_duration = int(math.ceil(sample_duration / 2)) # int(math.ceil(sample_duration / 8))
last_size = int(math.ceil(sample_size / 16))
# self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) #nn.AdaptiveAvgPool3d((1, 1, 1)) #
self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1))
self.dropout = torch.nn.Dropout(p=0.5)
self.classifier = nn.Linear(2048, num_classes)
for m in self.modules():
# if isinstance(m, nn.Conv3d):
# m.weight = nn.init.kaiming_normal_(m.weight, mode='fan_out')
# elif isinstance(m,nn.Linear):
# m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out')
# elif
if isinstance(m, nn.BatchNorm3d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, block, dim_in, dim_out, shortcut_type, stride, num_blocks, dim_inner=None, group=None,
use_temp_convs=None, temp_strides=None, dcn=False):
if use_temp_convs is None:
use_temp_convs = np.zeros(num_blocks).astype(int)
if temp_strides is None:
temp_strides = np.ones(num_blocks).astype(int)
if len(use_temp_convs) < num_blocks:
for _ in range(num_blocks - len(use_temp_convs)):
use_temp_convs.append(0)
temp_strides.append(1)
layers = []
for idx in range(num_blocks):
block_stride = 2 if (idx == 0 and stride == 2) else 1
layers.append(
block(dim_in, dim_out, block_stride, dim_inner, group, use_temp_convs[idx], temp_strides[idx], dcn))
dim_in = dim_out
return nn.Sequential(*layers)
def forward_single(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool1(x)
x = self.layer1(x)
x = self.maxpool2(x)
x = self.layer2(x)
x = self.layer3(x)
features = self.layer4(x)
x = self.avgpool(features)
x = x.view(x.size(0), -1)
x = self.dropout(x)
y = self.classifier(x)
if self.extract_features:
return y, features
else:
return y
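# Illustrative shape flow for forward_single (i3d ResNet-50 arc, input N x 3 x T x 224 x 224):
# conv1 halves the spatial size, maxpool1 roughly halves it again, layer1 keeps it,
# maxpool2 halves the temporal length, layer2 and layer3 each halve the spatial size,
# and layer4 runs at stride 1, giving roughly (N, 2048, T/2, 14, 14) before the
# adaptive average pool, dropout and linear classifier produce (N, num_classes).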
def forward_multi(self, x):
clip_preds = []
# import ipdb;ipdb.set_trace()
for clip_idx in range(x.shape[1]): # B, 10, 3, 3, 32, 224, 224
spatial_crops = []
for crop_idx in range(x.shape[2]):
clip = x[:, clip_idx, crop_idx]
clip = self.forward_single(clip)
spatial_crops.append(clip)
spatial_crops = torch.stack(spatial_crops, 1).mean(1) # (B, 400)
clip_preds.append(spatial_crops)
clip_preds = torch.stack(clip_preds, 1).mean(1) # (B, 400)
return clip_preds
def forward(self, x):
# 5D tensor == single clip
if x.dim() == 5:
pred = self.forward_single(x)
# 7D tensor == 3 crops/10 clips
elif x.dim() == 7:
pred = self.forward_multi(x)
else:
raise ValueError('expected a 5D or 7D input, got a {}D tensor'.format(x.dim()))
# loss_dict = {}
# if 'label' in batch:
# loss = F.cross_entropy(pred, batch['label'], reduction='none')
# loss_dict = {'clf': loss}
return pred
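# Usage sketch (illustrative; `model` is a ResNet instance built with the helpers
# below): a 5D input is a single clip, a 7D input is the multi-clip / multi-crop
# evaluation layout handled by forward_multi.
#
#     clip = torch.randn(2, 3, 32, 224, 224)          # (B, C, T, H, W)
#     logits = model(clip)                             # (2, num_classes)
#     logits, feats = model(clip)                      # when extract_features=True
#
#     clips = torch.randn(2, 10, 3, 3, 32, 224, 224)   # (B, clips, crops, C, T, H, W)
#     logits = model(clips)                            # averaged over clips and crops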
def get_fine_tuning_parameters(model, ft_begin_index):
if ft_begin_index == 0:
return model.parameters()
ft_module_names = []
for i in range(ft_begin_index, 5):
ft_module_names.append('layer{}'.format(i))
# The classification head of this model is named 'classifier' (see ResNet above), not 'fc'.
ft_module_names.append('classifier')
parameters = []
for k, v in model.named_parameters():
for ft_module in ft_module_names:
if ft_module in k:
parameters.append({'params': v})
break
else:
parameters.append({'params': v, 'lr': 0.0})
return parameters
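# Usage sketch (illustrative): fine-tune layer3/layer4 and the head while keeping
# all earlier parameters at learning rate 0.
#
#     params = get_fine_tuning_parameters(model, ft_begin_index=3)
#     optimizer = torch.optim.SGD(params, lr=1e-3, momentum=0.9)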
def obtain_arc(arc_type):
# c2d, ResNet50
if arc_type == 1:
use_temp_convs_1 = [0]
temp_strides_1 = [2]
use_temp_convs_2 = [0, 0, 0]
temp_strides_2 = [1, 1, 1]
use_temp_convs_3 = [0, 0, 0, 0]
temp_strides_3 = [1, 1, 1, 1]
use_temp_convs_4 = [0, ] * 6
temp_strides_4 = [1, ] * 6
use_temp_convs_5 = [0, 0, 0]
temp_strides_5 = [1, 1, 1]
# i3d, ResNet50
if arc_type == 2:
use_temp_convs_1 = [2]
temp_strides_1 = [1]
use_temp_convs_2 = [1, 1, 1]
temp_strides_2 = [1, 1, 1]
use_temp_convs_3 = [1, 0, 1, 0]
temp_strides_3 = [1, 1, 1, 1]
use_temp_convs_4 = [1, 0, 1, 0, 1, 0]
temp_strides_4 = [1, 1, 1, 1, 1, 1]
use_temp_convs_5 = [0, 1, 0]
temp_strides_5 = [1, 1, 1]
# c2d, ResNet101
if arc_type == 3:
use_temp_convs_1 = [0]
temp_strides_1 = [2]
use_temp_convs_2 = [0, 0, 0]
temp_strides_2 = [1, 1, 1]
use_temp_convs_3 = [0, 0, 0, 0]
temp_strides_3 = [1, 1, 1, 1]
use_temp_convs_4 = [0, ] * 23
temp_strides_4 = [1, ] * 23
use_temp_convs_5 = [0, 0, 0]
temp_strides_5 = [1, 1, 1]
# i3d, ResNet101
if arc_type == 4:
use_temp_convs_1 = [2]
temp_strides_1 = [2]
use_temp_convs_2 = [1, 1, 1]
temp_strides_2 = [1, 1, 1]
use_temp_convs_3 = [1, 0, 1, 0]
temp_strides_3 = [1, 1, 1, 1]
use_temp_convs_4 = []
for i in range(23):
if i % 2 == 0:
use_temp_convs_4.append(1)
else:
use_temp_convs_4.append(0)
temp_strides_4 = [1, ] * 23
use_temp_convs_5 = [0, 1, 0]
temp_strides_5 = [1, 1, 1]
use_temp_convs_set = [use_temp_convs_1, use_temp_convs_2, use_temp_convs_3, use_temp_convs_4, use_temp_convs_5]
temp_strides_set = [temp_strides_1, temp_strides_2, temp_strides_3, temp_strides_4, temp_strides_5]
return use_temp_convs_set, temp_strides_set
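# Example (from the definitions above): obtain_arc(2) is the i3d ResNet-50 setting.
# The stem uses a 5x7x7 convolution (use_temp_convs_1=[2] -> temporal kernel 1+2*2),
# every block of layer1 and alternating blocks of layer2/layer3 use 3x1x1 temporal
# convolutions, only the middle block of layer4 does, and all temporal strides are 1.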
def resnet10(**kwargs):
"""Constructs a ResNet-10 model.
"""
use_temp_convs_set = []
temp_strides_set = []
model = ResNet(BasicBlock, [1, 1, 1, 1], use_temp_convs_set, temp_strides_set, **kwargs)
return model
def resnet18(**kwargs):
"""Constructs a ResNet-18 model.
"""
use_temp_convs_set = []
temp_strides_set = []
model = ResNet(BasicBlock, [2, 2, 2, 2], use_temp_convs_set, temp_strides_set, **kwargs)
return model
def resnet34(**kwargs):
"""Constructs a ResNet-34 model.
"""
use_temp_convs_set = []
temp_strides_set = []
model = ResNet(BasicBlock, [3, 4, 6, 3], use_temp_convs_set, temp_strides_set, **kwargs)
return model
def resnet50(extract_features, **kwargs):
"""Constructs a ResNet-50 model.
"""
use_temp_convs_set, temp_strides_set = obtain_arc(2)
model = ResNet(Bottleneck, [3, 4, 6, 3], use_temp_convs_set, temp_strides_set,
extract_features=extract_features, **kwargs)
return model
def resnet101(**kwargs):
"""Constructs a ResNet-101 model.
"""
use_temp_convs_set, temp_strides_set = obtain_arc(4)
model = ResNet(Bottleneck, [3, 4, 23, 3], use_temp_convs_set, temp_strides_set, **kwargs)
return model
def resnet152(**kwargs):
"""Constructs a ResNet-152 model.
"""
use_temp_convs_set = []
temp_strides_set = []
model = ResNet(Bottleneck, [3, 8, 36, 3], use_temp_convs_set, temp_strides_set, **kwargs)
return model
def resnet200(**kwargs):
"""Constructs a ResNet-200 model.
"""
use_temp_convs_set = []
temp_strides_set = []
model = ResNet(Bottleneck, [3, 24, 36, 3], use_temp_convs_set, temp_strides_set, **kwargs)
return model
def Net(num_classes, extract_features=False, loss_type='softmax',
weights=None, freeze_all_but_cls=False):
net = resnet50(
num_classes=num_classes,
sample_size=50,
sample_duration=32,
extract_features=extract_features,
loss_type=loss_type,
)
if weights is not None:
kinetics_weights = torch.load(weights)['state_dict']
print("Found weights in {}.".format(weights))
cls_name = 'fc'
else:
kinetics_weights = torch.load('i3D/kinetics-res50.pth')
cls_name = 'fc'
print('\n Restoring Kinetics weights \n')
new_weights = {}
for k, v in kinetics_weights.items():
if not k.startswith('module.' + cls_name):
new_weights[k.replace('module.', '')] = v
net.load_state_dict(new_weights, strict=False)
if freeze_all_but_cls:
for name, par in net.named_parameters():
if not name.startswith('classifier'):
par.requires_grad = False
return net
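# Usage sketch (illustrative; num_classes and paths are placeholders):
#
#     model = Net(num_classes=174, extract_features=True)
#     model.eval()
#     with torch.no_grad():
#         logits, feats = model(torch.randn(1, 3, 32, 224, 224))
#
# With weights=None this loads 'i3D/kinetics-res50.pth', so that checkpoint must be
# available; pass weights=<path> to restore a different state_dict instead.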
# -*- coding: utf-8 -*-
import os
import sys
import glob
import datetime
import argparse
import random
import numpy as np
from pathlib import Path
filepath = Path.cwd()
sys.path.append(str(filepath))
from video_loaders import load_av
from se_bb_from_np import annot_np
from SmthSequence import SmthSequence
from SmthFrameRelations import frame_relations
from PIL import Image
import cv2
import torch
from i3D.model import VideoModel
from i3D.model_lib import VideoModelGlobalCoordLatent
import i3D.gtransforms as gtransforms
class FrameFV:
def __init__(self,path,args):
self.anno = annot_np(path)
self.net = VideoModelGlobalCoordLatent(args)
self.pre_resize_shape = (224, 224)
self.random_crop = gtransforms.GroupMultiScaleCrop(output_size=224,
scales=[1],
max_distort=0,
center_crop_only=True)
def process_video(self,finput,verbose=False):
# get video id
vidnum = int(os.path.splitext(os.path.basename(finput))[0])
# load video to ndarray list
img_array = load_av(finput)
print(img_array[0].shape)
#for i in range(len(img_array)):
# img_array[i] = cv2.resize(img_array[i],self.pre_resize_shape)
img_array = [cv2.resize(img, (self.pre_resize_shape[1], self.pre_resize_shape[0])) for img in img_array]
# Split each frame into channel planes so the clip can be laid out as
# (channels, frames, H, W). Assuming load_av yields BGR frames (as the original
# variable names suggest), cv2.split returns the planes in B, G, R order.
# NOTE: as in the original code, only the first len(img_array) // 3 frames are used.
rs = []
gs = []
bs = []
for i in range(len(img_array) // 3):
b, g, r = cv2.split(img_array[i])
rs.append(r)
gs.append(g)
bs.append(b)
frames = [rs, gs, bs]  # RGB-ordered channel planes: (3, T, H, W) once stacked
#frames = [Image.fromarray(img.astype('uint8'), 'RGB') for img in img_array]
#frames, (offset_h, offset_w, crop_h, crop_w) = self.random_crop(frames)
# read frame annotations into Sequence
seq = SmthSequence()
for framenum in range(0,len(img_array)):
cats, bbs = self.anno.get_vf_bbx(vidnum, framenum+1)
# add detections to Sequence
for i in range(0,len(cats)):
seq.add(framenum, cats[i], bbs[i])
# compute object relations per frame
relations = []
for framenum in range(0,len(img_array)):
fv = frame_relations(seq, 0, 1, framenum)
relations.append(fv)
relations = np.asarray(relations)
# TODO bb category embedding per frame
# i3D features per frame
#clip = torch.from_numpy(np.asarray([[img_array[0],img_array[1],img_array[2]]]))
clip = torch.from_numpy(np.asarray([frames]))
#clip = img_array
print(clip.shape)
clip = clip.float()
glo, vid = self.net.i3D(clip)
videos_features = self.net.conv(vid)
print(glo.shape)
print(vid.shape)
print(videos_features.shape)
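# NOTE: process_video builds one relation vector per frame from the annotations and
# an i3D feature volume for the clip, but currently only prints their shapes and
# does not return them.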
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--annotations',
dest='path_to_annotations',
default='../annotations_ground/',
help='folder to load annotations from')
parser.add_argument(
'--video',
dest='path_to_video',
default='.',
help='video to load')
# begin import
parser.add_argument('--img_feature_dim', default=256, type=int, metavar='N',
help='intermediate feature dimension for image-based features')
parser.add_argument('--coord_feature_dim', default=128, type=int, metavar='N',
help='intermediate feature dimension for coord-based features')
parser.add_argument('--size', default=224, type=int, metavar='N',
help='primary image input size')
parser.add_argument('--batch_size', '-b', default=72, type=int,
metavar='N', help='mini-batch size (default: 72)')
parser.add_argument('--num_classes', default=50, type=int,
help='num of class in the model')
parser.add_argument('--num_boxes', default=4, type=int,
help='num of boxes for each image')
parser.add_argument('--num_frames', default=36, type=int,
help='num of frames for the model')
parser.add_argument('--fine_tune', help='path with ckpt to restore')
parser.add_argument('--restore_i3d')
parser.add_argument('--restore_custom')
# end import
args = parser.parse_args()
compfv = FrameFV(args.path_to_annotations, args)
fv = compfv.process_video(args.path_to_video, verbose=True)
print("fin")
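# Example invocation (illustrative; the script name and paths are placeholders):
#     python extract_frame_features.py --annotations ../annotations_ground/ --video 1234.webm
# The video filename must be the numeric video id, since process_video parses it with int().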