Commit 5200fd70 authored by Low, Low Jian He (PG/R - Comp Sci & Elec Eng)

Adding a version of the JSON extractor that writes uncompressed output and is better suited to long videos

parent 4603aa55
@@ -23,6 +23,26 @@ from vitpose_model import ViTPoseModel
 import json
 from typing import Dict, Optional
+import subprocess
+
+def print_gpu_usage():
+    try:
+        # Run the `nvidia-smi` command
+        result = subprocess.run(['nvidia-smi', '--query-gpu=utilization.gpu,memory.used,memory.total', '--format=csv,nounits,noheader'],
+                                stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+        if result.returncode != 0:
+            print("Error fetching GPU usage:", result.stderr)
+            return
+        # Process the output
+        usage_lines = result.stdout.strip().split('\n')
+        for i, line in enumerate(usage_lines):
+            gpu_util, mem_used, mem_total = map(int, line.split(', '))
+            print(f"GPU {i}: Utilization: {gpu_util}%, Memory: {mem_used}/{mem_total} MiB")
+    except FileNotFoundError:
+        print("nvidia-smi command not found. Ensure NVIDIA drivers are installed.")
+
 # def Produce_Mesh_from_Dict(out, batch, model, args):
 #     renderer = Renderer(model_cfg, faces=model.mano.faces)
 #     multiplier = (2*batch['right']-1)
@@ -124,7 +144,7 @@ def convert_tensors_to_lists(d):
             convert_tensors_to_lists(value)
     return d
 
-def main(args, model, renderer, device, cpm):
+def main(args, model, renderer, device):
     initial_start_time = time.time()
 
     # Load detector
@@ -161,7 +181,7 @@ def main(args, model, renderer, device, cpm):
         outfile = os.path.join(temp_dir.name, filename)
         os.makedirs(outfile, exist_ok=True)
-        os.system(f"ffmpeg -i {args.vid} -vf fps={fps} {temp_dir.name}/Frame%d.png")
+        os.system(f"ffmpeg -nostdin -i {args.vid} -vf fps={fps} {temp_dir.name}/Frame%d.png")
         image_folder = Path(temp_dir.name)
         print(f"Total Time for Video to Image: {(time.time()-start_time)} Seconds")
     else:
@@ -169,13 +189,14 @@ def main(args, model, renderer, device, cpm):
         image_folder = Path(args.img_folder)
 
     # Get all demo images ends with .jpg or .png
     img_paths = [img for end in args.file_type for img in image_folder.glob(end)]
     img_paths = sorted(img_paths,key = lambda x: int(os.path.basename(x).removesuffix('.png').removeprefix('Frame')))
     if args.bbox:
         # Detect humans in image
         img_cv2 = cv2.imread(str(img_paths[0]))
         det_out = detector(img_cv2)
         det_instances = det_out['instances']
+        print_gpu_usage()
         #Clearing memory
         del detector
         torch.cuda.empty_cache()
@@ -190,6 +211,10 @@ def main(args, model, renderer, device, cpm):
         else:
             print("No humans detected in the image")
             return
+
+        # keypoint detector
+        cpm = ViTPoseModel(device)
+        print_gpu_usage()
     else:
         # Hardcoded bbox. This assumes person is in the center and that there is always one person in the image
         # Values below strictly for Rachel's BM Videos
@@ -301,7 +326,7 @@ if __name__ == '__main__':
     parser.add_argument('--MANO_Output', type=bool, default=False, help= 'If set, generate output images')
     args = parser.parse_args()
 
     model, model_cfg = load_hamer(args.checkpoint, load_mesh=True) # Load Mesh False only if no vertices are needed
 
     # Setup HaMeR model
@@ -309,25 +334,21 @@ if __name__ == '__main__':
     model = model.to(device)
     model.eval()
     renderer = Renderer(model_cfg, faces=model.mano.faces)
+    print_gpu_usage()
-    # keypoint detector
-    cpm = ViTPoseModel(device)
 
     # args.in_folder = '/vol/research/SignMotion/2024.02.20_Capture_1/glosses'
-    Folder_List = os.listdir(args.in_folder)
-    Folder_Dirname = os.path.dirname(args.in_folder)
     Rachel_Path = '//vol//research//SignFeaturePool//Rachel_Gloss_Features//HaMeR_Features'
+    Folder_List = [os.path.join(args.in_folder, f) for f in os.listdir(args.in_folder) if f.endswith('.mp4') or f.endswith('.png') or f.endswith('.jpg')]
     args.out_folder = os.path.join(Rachel_Path, os.path.basename(args.in_folder.removesuffix('/glosses')))
     args.out_folder = os.path.join(args.out_folder, os.path.basename(args.in_folder))
 
     if Folder_List[0].endswith('.mp4'):
         for vid_file in Folder_List:
-            args.vid = f'{Folder_Dirname}//{vid_file}'
+            args.vid = vid_file
             print(args.vid)
             try:
-                main(args, model, renderer, device, cpm)
+                main(args, model, renderer, device)
             except Exception as e:
                 print(f"Error in processing {vid_file}: {e}")
     elif Folder_List[0].endswith('.png') or Folder_List[0].endswith('.jpg'):
         args.img_folder = args.in_folder
-        main(args, model, renderer, device, cpm)
+        main(args, model, renderer, device)
from pathlib import Path
import torch
import argparse
import os
import cv2
import numpy as np
import tempfile
import time
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import gc
from hamer.configs import CACHE_DIR_HAMER
from hamer.models import HAMER, download_models, load_hamer, DEFAULT_CHECKPOINT
from hamer.utils import recursive_to
from hamer.datasets.vitdet_dataset import ViTDetDataset, DEFAULT_MEAN, DEFAULT_STD
from hamer.utils.renderer import Renderer, cam_crop_to_full
LIGHT_BLUE=(0.65098039, 0.74117647, 0.85882353)
from vitpose_model import ViTPoseModel
import json
from typing import Dict, Optional
import subprocess
def print_gpu_usage():
try:
# Run the `nvidia-smi` command
result = subprocess.run(['nvidia-smi', '--query-gpu=utilization.gpu,memory.used,memory.total', '--format=csv,nounits,noheader'],
stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
if result.returncode != 0:
print("Error fetching GPU usage:", result.stderr)
return
# Process the output
usage_lines = result.stdout.strip().split('\n')
for i, line in enumerate(usage_lines):
gpu_util, mem_used, mem_total = map(int, line.split(', '))
print(f"GPU {i}: Utilization: {gpu_util}%, Memory: {mem_used}/{mem_total} MiB")
except FileNotFoundError:
print("nvidia-smi command not found. Ensure NVIDIA drivers are installed.")
# def Produce_Mesh_from_Dict(out, batch, model, args):
# renderer = Renderer(model_cfg, faces=model.mano.faces)
# multiplier = (2*batch['right']-1)
# pred_cam = out['pred_cam']
# pred_cam[:,1] = multiplier*pred_cam[:,1]
# box_center = batch["box_center"].float()
# box_size = batch["box_size"].float()
# img_size = batch["img_size"].float()
# multiplier = (2*batch['right']-1)
# scaled_focal_length = model_cfg.EXTRA.FOCAL_LENGTH / model_cfg.MODEL.IMAGE_SIZE * img_size.max()
# pred_cam_t_full = cam_crop_to_full(pred_cam, box_center, box_size, img_size, scaled_focal_length).detach().cpu().numpy()
# # Render the result
# batch_size = batch['img'].shape[0]
# for n in range(batch_size):
# start_time = time.time()
# # Get filename from path img_path
# img_fn, _ = os.path.splitext(os.path.basename(img_path))
# person_id = int(batch['personid'][n])
# white_img = (torch.ones_like(batch['img'][n]).cpu() - DEFAULT_MEAN[:,None,None]/255) / (DEFAULT_STD[:,None,None]/255)
# input_patch = batch['img'][n].cpu() * (DEFAULT_STD[:,None,None]/255) + (DEFAULT_MEAN[:,None,None]/255)
# input_patch = input_patch.permute(1,2,0).numpy()
# regression_img = renderer(out['pred_vertices'][n].detach().cpu().numpy(),
# out['pred_cam_t'][n].detach().cpu().numpy(),
# batch['img'][n],
# mesh_base_color=LIGHT_BLUE,
# scene_bg_color=(1, 1, 1),
# )
# if args.side_view:
# side_img = renderer(out['pred_vertices'][n].detach().cpu().numpy(),
# out['pred_cam_t'][n].detach().cpu().numpy(),
# white_img,
# mesh_base_color=LIGHT_BLUE,
# scene_bg_color=(1, 1, 1),
# side_view=True)
# final_img = np.concatenate([input_patch, regression_img, side_img], axis=1)
# else:
# final_img = np.concatenate([input_patch, regression_img], axis=1)
# cv2.imwrite(os.path.join(args.out_folder, f'{img_fn}_{person_id}.png'), 255*final_img[:, :, ::-1])
# # Add all verts and cams to list
# verts = out['pred_vertices'][n].detach().cpu().numpy()
# is_right = batch['right'][n].cpu().numpy()
# verts[:,0] = (2*is_right-1)*verts[:,0]
# cam_t = pred_cam_t_full[n]
# all_verts.append(verts)
# all_cam_t.append(cam_t)
# all_right.append(is_right)
# # Save all meshes to disk
# if args.save_mesh:
# camera_translation = cam_t.copy()
# tmesh = renderer.vertices_to_trimesh(verts, camera_translation, LIGHT_BLUE, is_right=is_right)
# tmesh.export(os.path.join(args.out_folder, f'{img_fn}_{person_id}.obj'))
# print(f"Total Time for rendering Meshes and Saving: {(time.time()-start_time)} Seconds")
# # Render front view
# if args.full_frame and len(all_verts) > 0:
# misc_args = dict(
# mesh_base_color=LIGHT_BLUE,
# scene_bg_color=(1, 1, 1),
# focal_length=scaled_focal_length,
# )
# cam_view = renderer.render_rgba_multiple(all_verts, cam_t=all_cam_t, render_res=img_size[n], is_right=all_right, **misc_args)
# # Overlay image
# input_img = img_cv2.astype(np.float32)[:,:,::-1]/255.0
# input_img = np.concatenate([input_img, np.ones_like(input_img[:,:,:1])], axis=2) # Add alpha channel
# input_img_overlay = input_img[:,:,:3] * (1-cam_view[:,:,3:]) + cam_view[:,:,:3] * cam_view[:,:,3:]
# cv2.imwrite(os.path.join(args.out_folder, f'{img_fn}_all.jpg'), 255*input_img_overlay[:, :, ::-1])
def show_bbox(img, bbox):
bbox_tensor = torch.tensor(bbox)
bbox = bbox_tensor.cpu().numpy()[0]
# Extract coordinates
x1, y1, x2, y2 = bbox
    # Prepare the image for display with Matplotlib
image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # Convert BGR to RGB for Matplotlib
# Create a figure and axis
fig, ax = plt.subplots(1)
# Display the image
ax.imshow(image)
# Create a Rectangle patch
rect = patches.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=2, edgecolor='g', facecolor='none')
# Add the patch to the Axes
ax.add_patch(rect)
plt.show()
def convert_tensors_to_lists(d):
for key, value in d.items():
if isinstance(value, torch.Tensor):
d[key] = value.tolist() # Convert tensor to list
elif isinstance(value, dict): # If there is a nested dictionary
convert_tensors_to_lists(value)
return d
def main(args, model, renderer, device):
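    """Run the HaMeR hand-mesh pipeline on a video or image folder and dump per-frame results as JSON.

    Steps (see below): optionally extract frames with ffmpeg, detect the person box with detectron2
    (or use a hard-coded box), detect hand keypoints with ViTPose, run HaMeR on each detected hand,
    and write the converted outputs as one uncompressed JSON file per frame/batch.
    """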
initial_start_time = time.time()
# Load detector
if args.bbox:
from hamer.utils.utils_detectron2 import DefaultPredictor_Lazy
if args.body_detector == 'vitdet':
from detectron2.config import LazyConfig
import hamer
cfg_path = Path(hamer.__file__).parent/'configs'/'cascade_mask_rcnn_vitdet_h_75ep.py'
detectron2_cfg = LazyConfig.load(str(cfg_path))
detectron2_cfg.train.init_checkpoint = "https://dl.fbaipublicfiles.com/detectron2/ViTDet/COCO/cascade_mask_rcnn_vitdet_h/f328730692/model_final_f05665.pkl"
for i in range(3):
detectron2_cfg.model.roi_heads.box_predictors[i].test_score_thresh = 0.25
detector = DefaultPredictor_Lazy(detectron2_cfg)
elif args.body_detector == 'regnety':
from detectron2 import model_zoo
from detectron2.config import get_cfg
detectron2_cfg = model_zoo.get_config('new_baselines/mask_rcnn_regnety_4gf_dds_FPN_400ep_LSJ.py', trained=True)
detectron2_cfg.model.roi_heads.box_predictor.test_score_thresh = 0.5
detectron2_cfg.model.roi_heads.box_predictor.test_nms_thresh = 0.4
detector = DefaultPredictor_Lazy(detectron2_cfg)
os.makedirs(args.out_folder, exist_ok=True)
print(f"Total Time for Initialization: {(time.time()-initial_start_time)} Seconds")
start_time = time.time()
if args.vid != '':
fps = args.fps
temp_dir = tempfile.TemporaryDirectory()
print(f"Temp directory created at {temp_dir.name}")
#Create Temp Out Folder
filename = os.path.basename(args.vid).removesuffix('.mp4')
outfile = os.path.join(args.out_folder, filename)
os.makedirs(outfile, exist_ok=True)
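        # -nostdin keeps ffmpeg from reading the terminal's stdin, which can otherwise stall batch jobs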
os.system(f"ffmpeg -nostdin -i {args.vid} -vf fps={fps} {temp_dir.name}/Frame%d.png")
image_folder = Path(temp_dir.name)
print(f"Total Time for Video to Image: {(time.time()-start_time)} Seconds")
else:
temp_dir = None
image_folder = Path(args.img_folder)
    # Get all demo images ending with .jpg or .png
img_paths = [img for end in args.file_type for img in image_folder.glob(end)]
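    # Sort frames numerically by their Frame<N> index (assumes the Frame%d.png naming used for extracted frames)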
img_paths = sorted(img_paths,key = lambda x: int(os.path.basename(x).removesuffix('.png').removeprefix('Frame')))
if args.bbox:
# Detect humans in image
img_cv2 = cv2.imread(str(img_paths[0]))
det_out = detector(img_cv2)
det_instances = det_out['instances']
print_gpu_usage()
#Clearing memory
del detector
torch.cuda.empty_cache()
gc.collect()
valid_idx = (det_instances.pred_classes==0) & (det_instances.scores > 0.5)
        if len(valid_idx) > 0 and valid_idx[0]:  # guard against zero detections before indexing
pred_bboxes=det_instances.pred_boxes.tensor[valid_idx].cpu().numpy()
pred_scores=det_instances.scores[valid_idx].cpu().numpy()
x1, y1, x2, y2 = pred_bboxes[0]
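            # Heuristically expand the detected person box (widening is proportional to its left/top offsets)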
pred_bboxes[0] = [x1-x1*0.275, y1-y1*0.1, x2+x1*0.275, y2]
else:
print("No humans detected in the image")
return
# keypoint detector
cpm = ViTPoseModel(device)
print_gpu_usage()
else:
# Hardcoded bbox. This assumes person is in the center and that there is always one person in the image
# Values below strictly for Rachel's BM Videos
pred_bboxes = np.array([[300, 100, 1620, 1075]])
# Force confidence to be 0.99 that human is present
pred_scores = np.array([0.99])
        img_cv2 = cv2.imread(str(img_paths[0]))
        # The hand keypoint detector is needed on this path too (it is otherwise only created in the bbox branch above)
        cpm = ViTPoseModel(device)
# Iterate over all images in folder
for img_path in img_paths:
start_time = time.time()
img_cv2 = cv2.imread(str(img_path))
img = img_cv2.copy()[:, :, ::-1]
# # Detect humans in image
# det_out = detector(img_cv2)
# det_instances = det_out['instances']
# valid_idx = (det_instances.pred_classes==0) & (det_instances.scores > 0.5)
# pred_bboxes=det_instances.pred_boxes.tensor[valid_idx].cpu().numpy()
# pred_scores=det_instances.scores[valid_idx].cpu().numpy()
start_time = time.time()
# Detect human keypoints for each person
vitposes_out = cpm.predict_pose(
img,
[np.concatenate([pred_bboxes, pred_scores[:, None]], axis=1)],
)
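        # vitposes_out: one dict per detected person; its 'keypoints' array holds the whole-body keypoints used below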
bboxes = []
is_right = []
start_time = time.time()
# Use hands based on hand keypoint detections
for vitposes in vitposes_out:
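            # The last 42 whole-body keypoints are the hands: 21 left then 21 right, each row (x, y, confidence)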
left_hand_keyp = vitposes['keypoints'][-42:-21]
right_hand_keyp = vitposes['keypoints'][-21:]
            # Reject low-confidence keypoint detections
keyp = left_hand_keyp
valid = keyp[:,2] > 0.5
if sum(valid) > 3:
bbox = [keyp[valid,0].min(), keyp[valid,1].min(), keyp[valid,0].max(), keyp[valid,1].max()]
bboxes.append(bbox)
is_right.append(0)
keyp = right_hand_keyp
valid = keyp[:,2] > 0.5
if sum(valid) > 3:
bbox = [keyp[valid,0].min(), keyp[valid,1].min(), keyp[valid,0].max(), keyp[valid,1].max()]
bboxes.append(bbox)
is_right.append(1)
if len(bboxes) == 0:
continue
boxes = np.stack(bboxes)
right = np.stack(is_right)
start_time = time.time()
# Run reconstruction on all detected hands
dataset = ViTDetDataset(model_cfg, img_cv2, boxes, right, rescale_factor=args.rescale_factor)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False, num_workers=0)
all_verts = []
all_cam_t = []
all_right = []
for count,batch in enumerate(dataloader):
start_time = time.time()
batch = recursive_to(batch, device)
with torch.no_grad():
out = model(batch)
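            # Shallow-copy the model output and turn tensors into nested lists so the result is JSON-serializable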
output = convert_tensors_to_lists(out.copy())
output['VitPose'] = vitposes_out[0]['keypoints'].tolist()
            if args.vid == '':
                json_path = os.path.join(args.out_folder, f"{os.path.basename(img_path).removesuffix('.png')}_{count}.json")
            else:
                json_path = os.path.join(outfile, f"{os.path.basename(img_path).removesuffix('.png')}_{count}.json")
with open(json_path, 'w') as f:
json.dump(output, f, indent=4)
if temp_dir:
temp_dir.cleanup()
print(f"Temp directory {temp_dir.name} cleaned up")
print(f"Total time taken: {(time.time()-initial_start_time)/60:.2f} minutes")
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='HaMeR demo code')
parser.add_argument('--checkpoint', type=str, default=DEFAULT_CHECKPOINT, help='Path to pretrained model checkpoint')
parser.add_argument('--vid', type=str, default='', help='Path to video file')
parser.add_argument('--img_folder', type=str, default='images', help='Folder with input images')
parser.add_argument('--out_folder', type=str, default='out_demo', help='Output folder to save rendered results')
    parser.add_argument('--fps', type=int, default=50, help='Frame rate used when extracting frames from the input video')
parser.add_argument('--side_view', dest='side_view', action='store_true', default=False, help='If set, render side view also')
parser.add_argument('--full_frame', dest='full_frame', action='store_true', default=True, help='If set, render all people together also')
parser.add_argument('--save_mesh', dest='save_mesh', action='store_true', default=False, help='If set, save meshes to disk also')
parser.add_argument('--batch_size', type=int, default=1, help='Batch size for inference/fitting')
parser.add_argument('--rescale_factor', type=float, default=2.0, help='Factor for padding the bbox')
parser.add_argument('--body_detector', type=str, default='vitdet', choices=['vitdet', 'regnety'], help='Using regnety improves runtime and reduces memory')
parser.add_argument('--file_type', nargs='+', default=['*.jpg', '*.png'], help='List of file extensions to consider')
parser.add_argument('--bbox', type=bool, default=True, help= 'If set, use provided bbox from ViT')
parser.add_argument('--MANO_Output', type=bool, default=False, help= 'If set, generate output images')
args = parser.parse_args()
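    # Group outputs under a folder named after the input video's parent directory (MeinDGS layout)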
args.out_folder = os.path.join(args.out_folder, os.path.basename(os.path.dirname(args.vid)))
print(f"Output folder: {args.out_folder}")
print(f'Warning: Script Format Currently for MeinDGS Processing! FPS set at {args.fps}')
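    # Example invocation (hypothetical paths; substitute the actual script and video names):
    #   python this_script.py --vid /path/to/clip.mp4 --fps 50 --out_folder out_demo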
model, model_cfg = load_hamer(args.checkpoint, load_mesh=True) # Load Mesh False only if no vertices are needed
# Setup HaMeR model
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)
model.eval()
renderer = Renderer(model_cfg, faces=model.mano.faces)
print_gpu_usage()
try:
main(args, model, renderer, device)
except Exception as e:
print(f"Error in processing {args.vid}: {e}")