Project: TopDownVideo (Guerin, Frank Dr, Comp Sci & Elec Eng)

Commit e8881280, authored 2 years ago by Anichenko, Anastasia (UG - Comp Sci & Elec Eng)

Commit message: added missing baseline code

Parent: ac32e2bb
Changes: 1 changed file, 3D_CNN_baseline.ipynb (new file, mode 0 → 100644, 497 additions, 0 deletions). The notebook content follows, cell by cell.
%% Cell type:code id:048616b7 tags:
```python
# Code for running the 3D CNN, adapted from: https://github.com/latte488/smth-smth-v2
import os
import cv2
import sys
import importlib
import torch
import torchvision
import numpy as np
from torch import nn
import json

# imports for displaying a video in an IPython cell
import io
import base64
from IPython.display import HTML

from data_parser import WebmDataset
from data_loader_av import VideoFolder

from models.multi_column import MultiColumn
from transforms_video import *

from utils import load_json_config, remove_module_from_checkpoint_state_dict
from pprint import pprint

from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from tqdm import tqdm
from matplotlib import pyplot as plt

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.deterministic = True
print(f"Using {'GPU' if str(DEVICE) == 'cuda' else 'CPU'}.")
```
%% Output
Using GPU.
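Note that `torch.backends.cudnn.deterministic = True` only constrains cuDNN kernel selection; it does not seed any random number generator, so the random crops, rotations and shuffling used below can still differ between runs. A minimal sketch of full seeding, assuming a fixed seed of 0 is acceptable (this is not part of the committed notebook):

```python
import random

import numpy as np
import torch

def seed_everything(seed: int = 0) -> None:
    """Seed the Python, NumPy and PyTorch RNGs so augmentation and shuffling repeat across runs."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

seed_everything(0)
```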
%% Cell type:code id:75e80d4f tags:
```python
# helper functions
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def train(model, dataloader, optimizer, criterion, device):
    model.train()
    running_loss = 0.0
    running_correct_preds = 0
    count = 0

    for i, (input, target) in tqdm(enumerate(dataloader), total=len(dataloader)):
        count += 1

        optimizer.zero_grad()

        if config['nclips_train'] > 1:
            input_var = list(input.split(config['clip_size'], 2))
            for idx, inp in enumerate(input_var):
                input_var[idx] = inp.to(device)
        else:
            input_var = [input.to(device)]

        target = target.to(device)

        model.zero_grad()

        # compute output and loss
        output = model(input_var)
        loss = criterion(output, target)
        running_loss += loss.item()

        # compute accuracy
        _, preds = torch.max(output.data, 1)
        running_correct_preds += (preds == target).sum().item()

        # backward pass
        loss.backward()
        optimizer.step()

    # calculate loss and accuracy
    epoch_loss = running_loss / count
    epoch_acc = 100. * (running_correct_preds / len(dataloader.dataset))
    return epoch_loss, epoch_acc

def eval_model(model, dataloader, device):
    model.eval()  # ensure dropout and batch norm run in inference mode
    y_pred = []
    y_true = []

    running_acc = 0
    count = 0  # number of samples seen, used as the accuracy denominator

    with torch.no_grad():
        for i, (input, target) in tqdm(enumerate(dataloader), total=len(dataloader)):
            if config['nclips_train'] > 1:
                input_var = list(input.split(config['clip_size'], 2))
                for idx, inp in enumerate(input_var):
                    input_var[idx] = inp.to(device)
            else:
                input_var = [input.to(device)]

            target = target.to(device)

            output = model(input_var)
            _, preds = torch.max(output, 1)

            count += target.size(0)
            running_acc += (preds == target).sum().item()

            y_pred.extend(preds.to('cpu').tolist())
            y_true.extend(target.to('cpu').tolist())

    acc = (100 * running_acc / count)

    # classification report
    print(classification_report(y_true, y_pred, target_names=['106', '112', '118'], zero_division=0))

    # confusion matrix
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1, 2], normalize='true')
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(include_values=False)
    disp.ax_.get_images()[0].set_clim(0, 1.0)  # fix the colour scale so it does not vary between plots
    plt.show()
```
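Both loops split the input with `input.split(config['clip_size'], 2)`, which assumes the loader returns a `(batch, channels, frames, height, width)` tensor with several clips concatenated along the frame axis, so splitting along dimension 2 yields one tensor per clip for the multi-column model. A self-contained sketch with dummy data (the clip size and spatial size below are illustrative, not values from this commit):

```python
import torch

clip_size = 36                                           # illustrative frames per clip
nclips = 2                                               # two clips stacked along the frame axis
batch = torch.randn(4, 3, clip_size * nclips, 84, 84)    # (B, C, T, H, W)

clips = list(batch.split(clip_size, 2))                  # split along the frame dimension
print([tuple(c.shape) for c in clips])
# [(4, 3, 36, 84, 84), (4, 3, 36, 84, 84)]
```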
%% Cell type:markdown id:3b79d192 tags:
Create annotation files for the subset of 3 classes that the model will be fine-tuned on.
%% Cell type:code id:9b4cdee0 tags:
```python
label_train_path = '/vol/research/TopDownVideo/labels/something-something-v2-train.json'
label_val_path = '/vol/research/TopDownVideo/labels/something-something-v2-validation.json'

action_list = [
    'Putting [something] into [something]',
    'Putting [something] onto [something]',
    'Putting [something] underneath [something]'
]

with open(label_train_path) as json_file:
    train_json = json.load(json_file)

with open(label_val_path) as json_file:
    val_json = json.load(json_file)

train_json_updated = []
for d in train_json:
    if d['template'] in action_list:
        train_json_updated.append(d)
print("Length of train set:" + str(len(train_json_updated)))

val_json_updated = []
for d in val_json:
    if d['template'] in action_list:
        val_json_updated.append(d)
print("Length of validation set:" + str(len(val_json_updated)))

label_train_target = '/vol/research/TopDownVideo/aa03813/LabelsForBaseline/something-something-v2-train3.json'
label_val_target = '/vol/research/TopDownVideo/aa03813/LabelsForBaseline/something-something-v2-val3.json'

with open(label_train_target, "w") as write_file:
    json.dump(train_json_updated, write_file, indent=1)

with open(label_val_target, "w") as write_file:
    json.dump(val_json_updated, write_file, indent=1)
```
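This filtering relies on the Something-Something-v2 annotation format, where every entry carries a `template` field with the object slots left in brackets. An illustrative entry is shown below; the id, label and placeholders are made up, and only the key names and the chosen template strings come from the dataset:

```python
example_entry = {
    "id": "12345",                                        # hypothetical video id
    "label": "putting a pen into a mug",                  # hypothetical concrete caption
    "template": "Putting [something] into [something]",   # one of the three templates kept above
    "placeholders": ["a pen", "a mug"],                   # hypothetical object names
}

# the same filter as the loops above, written as a comprehension over a toy list
subset = [d for d in [example_entry] if d["template"] in [
    "Putting [something] into [something]",
    "Putting [something] onto [something]",
    "Putting [something] underneath [something]",
]]
print(len(subset))  # 1
```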
%% Cell type:code id:07b888a1 tags:
```python
# load config
config = load_json_config('./configs/pretrained/config_model1_for_finetuning.json')

# set up model from checkpoint
column_cnn_def = importlib.import_module("{}".format(config['conv_model']))
model_name = config["model_name"]

print("=> Name of the model -- {}".format(model_name))

# checkpoint path to a trained model
checkpoint_path = os.path.join("../", config["output_dir"], config["model_name"], "model_best.pth.tar")
print("=> Checkpoint path --> {}".format(checkpoint_path))
```
%% Output
=> Name of the model -- model3D_1
=> Checkpoint path --> ../trained_models/pretrained/model3D_1/model_best.pth.tar
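The file `config_model1_for_finetuning.json` is not part of this commit, so only the keys that the notebook reads are known for certain; the values in the sketch below are placeholders, except for `model_name` and `output_dir`, which can be read off the printed checkpoint path:

```python
# Hypothetical contents of ./configs/pretrained/config_model1_for_finetuning.json (values are placeholders).
config = {
    "model_name": "model3D_1",                  # matches the printed checkpoint path
    "output_dir": "trained_models/pretrained",  # matches the printed checkpoint path
    "conv_model": "model3D_1",                  # module name handed to importlib.import_module (assumed)
    "num_classes": 174,                         # assumed: the full Something-Something-v2 label set
    "column_units": 512,
    "input_spatial_size": 84,
    "upscale_factor_train": 1.4,
    "upscale_factor_eval": 1.0,
    "clip_size": 36,
    "nclips_train": 1,
    "nclips_val": 1,
    "step_size_train": 1,
    "step_size_val": 1,
    "batch_size": 16,
    "num_workers": 4,
    "data_folder": "/path/to/20bn-something-something-v2",
    "json_data_train": "/path/to/something-something-v2-train3.json",
    "json_data_val": "/path/to/something-something-v2-val3.json",
    "json_file_labels": "/path/to/something-something-v2-labels.json",
    "augmentation_mappings_json": None,
    "augmentation_types_todo": None,
}
```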
%% Cell type:code id:dd213cfb tags:
```python
# initialize and freeze model
model = MultiColumn(config['num_classes'], column_cnn_def.Model, int(config["column_units"]))

print("Num of trainable parameters before freezing: " + str(count_parameters(model)))
for param in model.parameters():
    param.requires_grad = False
# replace the last layer so its output covers only the 3 classes
model.clf_layers = nn.Linear(512, 3)
print("Num of trainable parameters after freezing: " + str(count_parameters(model)))
model.to(DEVICE)
```
%% Output
Num of trainable parameters before freezing: 23384430
Num of trainable parameters after freezing: 1539
MultiColumn(
(conv_column): Model(
(block1): Sequential(
(0): Conv3d(3, 32, kernel_size=(3, 5, 5), stride=(1, 2, 2), padding=(1, 2, 2))
(1): BatchNorm3d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU(inplace=True)
(3): Dropout3d(p=0.2, inplace=False)
)
(block2): Sequential(
(0): Conv3d(32, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
(1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU(inplace=True)
(3): Conv3d(64, 128, kernel_size=(3, 3, 3), stride=(1, 2, 2), padding=(1, 1, 1))
(4): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(5): ReLU(inplace=True)
(6): Dropout3d(p=0.2, inplace=False)
)
(block3): Sequential(
(0): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
(1): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU(inplace=True)
(3): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
(4): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(5): ReLU(inplace=True)
(6): Conv3d(128, 256, kernel_size=(3, 3, 3), stride=(1, 2, 2), padding=(1, 1, 1))
(7): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(8): ReLU(inplace=True)
(9): Dropout3d(p=0.2, inplace=False)
)
(block4): Sequential(
(0): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
(1): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU(inplace=True)
(3): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
(4): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(5): ReLU(inplace=True)
(6): Conv3d(256, 512, kernel_size=(3, 3, 3), stride=(1, 2, 2), padding=(1, 1, 1))
(7): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(8): ReLU(inplace=True)
(9): Dropout3d(p=0.2, inplace=False)
)
(block5): Sequential(
(0): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
(1): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU(inplace=True)
(3): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 2, 2), padding=(1, 1, 1))
(4): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(5): ReLU(inplace=True)
)
)
(clf_layers): Linear(in_features=512, out_features=3, bias=True)
)
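The drop from 23,384,430 to 1,539 trainable parameters is exactly the size of the new classification head: once every pretrained parameter has `requires_grad=False`, the only trainable module is the freshly constructed `nn.Linear(512, 3)`, since new modules default to `requires_grad=True`. A quick arithmetic check:

```python
in_features, num_classes = 512, 3
head_params = in_features * num_classes + num_classes   # weight matrix plus bias vector
print(head_params)  # 1539, matching "Num of trainable parameters after freezing"
```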
%% Cell type:code id:9fae288b tags:
```python
# define augmentation pipeline
upscale_size_train = int(config['input_spatial_size'] * config["upscale_factor_train"])
upscale_size_eval = int(config['input_spatial_size'] * config["upscale_factor_eval"])

# Random crop videos during training
transform_train_pre = ComposeMix([
        [RandomRotationVideo(15), "vid"],
        [Scale(upscale_size_train), "img"],
        [RandomCropVideo(config['input_spatial_size']), "vid"],
        ])

# Center crop videos during evaluation
transform_eval_pre = ComposeMix([
        [Scale(upscale_size_eval), "img"],
        [torchvision.transforms.ToPILImage(), "img"],
        [torchvision.transforms.CenterCrop(config['input_spatial_size']), "img"],
        ])

# Transforms common to train and eval sets and applied after "pre" transforms
transform_post = ComposeMix([
        [torchvision.transforms.ToTensor(), "img"],
        [torchvision.transforms.Normalize(
                   mean=[0.485, 0.456, 0.406],  # default values for imagenet
                   std=[0.229, 0.224, 0.225]), "img"]
        ])
```
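`ComposeMix`, `Scale`, `RandomCropVideo` and `RandomRotationVideo` come from `transforms_video` in the adapted repository and are not included in this commit. The "img"/"vid" tags pair each transform with its scope: per-frame versus whole-video application. A minimal sketch of that dispatch idea, assuming a video is simply a list of frames (an illustration only, not the repository's implementation):

```python
from typing import Any, Callable, List, Sequence, Tuple

Frame = Any  # e.g. a numpy array or a PIL image

def compose_mix(video: List[Frame],
                transforms: Sequence[Tuple[Callable, str]]) -> List[Frame]:
    """Apply 'img'-tagged transforms frame by frame and 'vid'-tagged transforms to the whole clip."""
    for transform, scope in transforms:
        if scope == "img":
            video = [transform(frame) for frame in video]
        elif scope == "vid":
            video = transform(video)
        else:
            raise ValueError(f"unknown transform scope: {scope}")
    return video
```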
%% Cell type:code id:3b5c018d tags:
```python
train_data = VideoFolder(root=config['data_folder'],
                         json_file_input=config['json_data_train'],
                         json_file_labels=config['json_file_labels'],
                         clip_size=config['clip_size'],
                         nclips=config['nclips_train'],
                         step_size=config['step_size_train'],
                         is_val=False,
                         transform_pre=transform_train_pre,
                         transform_post=transform_post,
                         augmentation_mappings_json=config['augmentation_mappings_json'],
                         augmentation_types_todo=config['augmentation_types_todo'],
                         get_item_id=False,
                         )

train_loader = torch.utils.data.DataLoader(
    train_data,
    batch_size=config['batch_size'], shuffle=True,
    num_workers=config['num_workers'], pin_memory=True,
    drop_last=True)

val_data = VideoFolder(root=config['data_folder'],
                       json_file_input=config['json_data_val'],
                       json_file_labels=config['json_file_labels'],
                       clip_size=config['clip_size'],
                       nclips=config['nclips_val'],
                       step_size=config['step_size_val'],
                       is_val=True,
                       transform_pre=transform_eval_pre,
                       transform_post=transform_post,
                       get_item_id=False,
                       )

val_loader = torch.utils.data.DataLoader(
    val_data,
    batch_size=config['batch_size'], shuffle=False,
    num_workers=config['num_workers'], pin_memory=True,
    drop_last=False)
```
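Before committing to a full training epoch it is worth pulling a single batch to confirm the tensor layout the helpers expect, i.e. `(batch, channels, frames, height, width)` when `nclips_train` is 1. A short sanity-check sketch (not part of the committed notebook):

```python
frames, labels = next(iter(train_loader))   # fetch one training batch
print(frames.shape)                         # expected: torch.Size([B, 3, T, H, W])
print(labels.shape)                         # expected: torch.Size([B])
```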
%% Cell type:code id:fafe4c40 tags:
```python
LR = 5e-2
OPTIMIZER = torch.optim.SGD(model.parameters(), LR)
CRITERION = nn.CrossEntropyLoss().to(DEVICE)
```
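Since every backbone parameter is frozen, SGD effectively updates only the `Linear(512, 3)` head even though the optimizer is constructed over `model.parameters()`. An equivalent, more explicit construction (a sketch, not the committed code) passes just the trainable parameters:

```python
trainable_params = [p for p in model.parameters() if p.requires_grad]
OPTIMIZER = torch.optim.SGD(trainable_params, lr=5e-2)
print(sum(p.numel() for p in trainable_params))   # 1539: the classifier head only
```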
%% Cell type:code id:bdbab144 tags:
```python
loss, acc = train(model, train_loader, OPTIMIZER, CRITERION, DEVICE)
print("Training loss: " + str(loss))
print("Training accuracy:" + str(acc))
```
%% Output
100%|█████████████████████████████████████████| 146/146 [04:35<00:00, 1.89s/it]
Training loss: 1.7852653726323011
Training accuracy:39.977298524404084
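The cell above runs a single fine-tuning epoch (loss 1.79, roughly 40% training accuracy over the three classes). Extending it to several epochs with simple checkpointing could look like the sketch below; the epoch budget and checkpoint filename are placeholders, not values from this commit:

```python
NUM_EPOCHS = 10                                     # hypothetical epoch budget
best_acc = 0.0

for epoch in range(NUM_EPOCHS):
    epoch_loss, epoch_acc = train(model, train_loader, OPTIMIZER, CRITERION, DEVICE)
    print(f"epoch {epoch}: loss {epoch_loss:.4f}, train accuracy {epoch_acc:.2f}%")

    if epoch_acc > best_acc:                        # crude selection on training accuracy
        best_acc = epoch_acc
        torch.save(model.state_dict(), "best_baseline.pth")   # placeholder path
```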
%% Cell type:code id:2dd0a6ae tags:
```python
torch.save(model.state_dict(), '/vol/research/TopDownVideo/aa03813/LabelsForBaseline/model.pkl')
model.load_state_dict(torch.load('/vol/research/TopDownVideo/aa03813/LabelsForBaseline/model.pkl'))
```
%% Output
<All keys matched successfully>
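The checkpoint is a plain `state_dict` written with `torch.save`; the `.pkl` extension is only a naming choice. Loading it on a CPU-only machine additionally needs `map_location`, since tensors saved from CUDA otherwise fail to deserialize. A sketch of the more portable load (not part of the committed notebook):

```python
state = torch.load('/vol/research/TopDownVideo/aa03813/LabelsForBaseline/model.pkl',
                   map_location=torch.device('cpu'))   # remap CUDA tensors to the CPU if needed
model.load_state_dict(state)
```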
%% Cell type:code id:b5990018 tags:
```python
eval_model(model, val_loader, DEVICE)
```