diff --git a/Extract_HaMeR_Jsons.py b/Extract_Compressed_HaMeR.py
similarity index 89%
rename from Extract_HaMeR_Jsons.py
rename to Extract_Compressed_HaMeR.py
index b5471beea7fcf8b326f7d6a5f78138cb8f93edea..62465eea4f08336086cc9e5687d5ab438df432e2 100644
--- a/Extract_HaMeR_Jsons.py
+++ b/Extract_Compressed_HaMeR.py
@@ -23,6 +23,26 @@ from vitpose_model import ViTPoseModel
 import json
 from typing import Dict, Optional
 
+import subprocess
+
+def print_gpu_usage():
+    try:
+        # Run the `nvidia-smi` command
+        result = subprocess.run(['nvidia-smi', '--query-gpu=utilization.gpu,memory.used,memory.total', '--format=csv,nounits,noheader'],
+                                stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+
+        if result.returncode != 0:
+            print("Error fetching GPU usage:", result.stderr)
+            return
+
+        # Process the output
+        usage_lines = result.stdout.strip().split('\n')
+        for i, line in enumerate(usage_lines):
+            gpu_util, mem_used, mem_total = map(int, line.split(', '))
+            print(f"GPU {i}: Utilization: {gpu_util}%, Memory: {mem_used}/{mem_total} MiB")
+    except FileNotFoundError:
+        print("nvidia-smi command not found. Ensure NVIDIA drivers are installed.")
+
 # def Produce_Mesh_from_Dict(out, batch, model, args):
 #     renderer = Renderer(model_cfg, faces=model.mano.faces)
 #     multiplier = (2*batch['right']-1)
@@ -124,7 +144,7 @@ def convert_tensors_to_lists(d):
             convert_tensors_to_lists(value)
     return d
 
-def main(args, model, renderer, device, cpm):
+def main(args, model, renderer, device):
     initial_start_time = time.time()
 
     # Load detector
@@ -161,7 +181,7 @@ def main(args, model, renderer, device, cpm):
         outfile = os.path.join(temp_dir.name, filename)
         os.makedirs(outfile, exist_ok=True)
 
-        os.system(f"ffmpeg -i {args.vid} -vf fps={fps} {temp_dir.name}/Frame%d.png")
+        os.system(f"ffmpeg -nostdin -i {args.vid} -vf fps={fps} {temp_dir.name}/Frame%d.png")
         image_folder = Path(temp_dir.name)
         print(f"Total Time for Video to Image: {(time.time()-start_time)} Seconds")
     else:
@@ -169,13 +189,14 @@ def main(args, model, renderer, device, cpm):
         image_folder = Path(args.img_folder)
     # Get all demo images ends with .jpg or .png
     img_paths = [img for end in args.file_type for img in image_folder.glob(end)]
+    img_paths = sorted(img_paths,key = lambda x: int(os.path.basename(x).removesuffix('.png').removeprefix('Frame')))
 
     if args.bbox:
         # Detect humans in image
         img_cv2 = cv2.imread(str(img_paths[0]))
         det_out = detector(img_cv2)
         det_instances = det_out['instances']
-
+        print_gpu_usage()
         #Clearing memory
         del detector
         torch.cuda.empty_cache()
@@ -190,6 +211,10 @@ def main(args, model, renderer, device, cpm):
         else:
             print("No humans detected in the image")
             return
+
+        # keypoint detector
+        cpm = ViTPoseModel(device)
+        print_gpu_usage()
     else:
         # Hardcoded bbox. This assumes person is in the center and that there is always one person in the image
         # Values below strictly for Rachel's BM Videos
@@ -301,7 +326,7 @@ if __name__ == '__main__':
     parser.add_argument('--MANO_Output', type=bool, default=False, help= 'If set, generate output images')
 
     args = parser.parse_args()
-    
+
     model, model_cfg = load_hamer(args.checkpoint, load_mesh=True) # Load Mesh False only if no vertices are needed
 
     # Setup HaMeR model
@@ -309,25 +334,21 @@ if __name__ == '__main__':
     model = model.to(device)
     model.eval()
     renderer = Renderer(model_cfg, faces=model.mano.faces)
+    print_gpu_usage()
 
-    # keypoint detector
-    cpm = ViTPoseModel(device)
-
-    # args.in_folder = '/vol/research/SignMotion/2024.02.20_Capture_1/glosses'
-    Folder_List = os.listdir(args.in_folder)
-    Folder_Dirname = os.path.dirname(args.in_folder)
-    Rachel_Path = '//vol//research//SignFeaturePool//Rachel_Gloss_Features//HaMeR_Features'
+    Folder_List = [os.path.join(args.in_folder, f) for f in os.listdir(args.in_folder) if f.endswith('.mp4') or f.endswith('.png') or f.endswith('.jpg')]
 
-    args.out_folder = os.path.join(Rachel_Path, os.path.basename(args.in_folder.removesuffix('/glosses')))
+    args.out_folder = os.path.join(args.out_folder, os.path.basename(args.in_folder))
 
     if Folder_List[0].endswith('.mp4'):
         for vid_file in Folder_List:
-            args.vid = f'{Folder_Dirname}//{vid_file}'
+            args.vid = vid_file
+            print(args.vid)
             try:
-                main(args, model, renderer, device, cpm)
+                main(args, model, renderer, device)
             except Exception as e:
                 print(f"Error in processing {vid_file}: {e}")
     elif Folder_List[0].endswith('.png') or Folder_List[0].endswith('.jpg'):
         args.img_folder = args.in_folder
-        main(args, model, renderer, device, cpm)
+        main(args, model, renderer, device)
diff --git a/Extract_Uncompressed_HaMeR.py b/Extract_Uncompressed_HaMeR.py
new file mode 100644
index 0000000000000000000000000000000000000000..d40589ec530d2b0d23c615f86762664130c409d1
--- /dev/null
+++ b/Extract_Uncompressed_HaMeR.py
@@ -0,0 +1,334 @@
+from pathlib import Path
+import torch
+import argparse
+import os
+import cv2
+import numpy as np
+import tempfile
+import time
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+import gc
+
+from hamer.configs import CACHE_DIR_HAMER
+from hamer.models import HAMER, download_models, load_hamer, DEFAULT_CHECKPOINT
+from hamer.utils import recursive_to
+from hamer.datasets.vitdet_dataset import ViTDetDataset, DEFAULT_MEAN, DEFAULT_STD
+from hamer.utils.renderer import Renderer, cam_crop_to_full
+
+LIGHT_BLUE=(0.65098039, 0.74117647, 0.85882353)
+
+from vitpose_model import ViTPoseModel
+
+import json
+from typing import Dict, Optional
+
+import subprocess
+
+def print_gpu_usage():
+    try:
+        # Run the `nvidia-smi` command
+        result = subprocess.run(['nvidia-smi', '--query-gpu=utilization.gpu,memory.used,memory.total', '--format=csv,nounits,noheader'],
+                                stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+
+        if result.returncode != 0:
+            print("Error fetching GPU usage:", result.stderr)
+            return
+
+        # Process the output
+        usage_lines = result.stdout.strip().split('\n')
+        for i, line in enumerate(usage_lines):
+            gpu_util, mem_used, mem_total = map(int, line.split(', '))
+            print(f"GPU {i}: Utilization: {gpu_util}%, Memory: {mem_used}/{mem_total} MiB")
+    except FileNotFoundError:
+        print("nvidia-smi command not found. Ensure NVIDIA drivers are installed.")
+
+# def Produce_Mesh_from_Dict(out, batch, model, args):
+#     renderer = Renderer(model_cfg, faces=model.mano.faces)
+#     multiplier = (2*batch['right']-1)
+#     pred_cam = out['pred_cam']
+#     pred_cam[:,1] = multiplier*pred_cam[:,1]
+#     box_center = batch["box_center"].float()
+#     box_size = batch["box_size"].float()
+#     img_size = batch["img_size"].float()
+#     multiplier = (2*batch['right']-1)
+#     scaled_focal_length = model_cfg.EXTRA.FOCAL_LENGTH / model_cfg.MODEL.IMAGE_SIZE * img_size.max()
+#     pred_cam_t_full = cam_crop_to_full(pred_cam, box_center, box_size, img_size, scaled_focal_length).detach().cpu().numpy()
+
+#     # Render the result
+#     batch_size = batch['img'].shape[0]
+#     for n in range(batch_size):
+#         start_time = time.time()
+#         # Get filename from path img_path
+#         img_fn, _ = os.path.splitext(os.path.basename(img_path))
+#         person_id = int(batch['personid'][n])
+#         white_img = (torch.ones_like(batch['img'][n]).cpu() - DEFAULT_MEAN[:,None,None]/255) / (DEFAULT_STD[:,None,None]/255)
+#         input_patch = batch['img'][n].cpu() * (DEFAULT_STD[:,None,None]/255) + (DEFAULT_MEAN[:,None,None]/255)
+#         input_patch = input_patch.permute(1,2,0).numpy()
+
+#         regression_img = renderer(out['pred_vertices'][n].detach().cpu().numpy(),
+#                                   out['pred_cam_t'][n].detach().cpu().numpy(),
+#                                   batch['img'][n],
+#                                   mesh_base_color=LIGHT_BLUE,
+#                                   scene_bg_color=(1, 1, 1),
+#                                   )
+
+#         if args.side_view:
+#             side_img = renderer(out['pred_vertices'][n].detach().cpu().numpy(),
+#                                 out['pred_cam_t'][n].detach().cpu().numpy(),
+#                                 white_img,
+#                                 mesh_base_color=LIGHT_BLUE,
+#                                 scene_bg_color=(1, 1, 1),
+#                                 side_view=True)
+#             final_img = np.concatenate([input_patch, regression_img, side_img], axis=1)
+#         else:
+#             final_img = np.concatenate([input_patch, regression_img], axis=1)
+
+#         cv2.imwrite(os.path.join(args.out_folder, f'{img_fn}_{person_id}.png'), 255*final_img[:, :, ::-1])
+
+#         # Add all verts and cams to list
+#         verts = out['pred_vertices'][n].detach().cpu().numpy()
+#         is_right = batch['right'][n].cpu().numpy()
+#         verts[:,0] = (2*is_right-1)*verts[:,0]
+#         cam_t = pred_cam_t_full[n]
+#         all_verts.append(verts)
+#         all_cam_t.append(cam_t)
+#         all_right.append(is_right)
+
+#         # Save all meshes to disk
+#         if args.save_mesh:
+#             camera_translation = cam_t.copy()
+#             tmesh = renderer.vertices_to_trimesh(verts, camera_translation, LIGHT_BLUE, is_right=is_right)
+#             tmesh.export(os.path.join(args.out_folder, f'{img_fn}_{person_id}.obj'))
+#     print(f"Total Time for rendering Meshes and Saving: {(time.time()-start_time)} Seconds")
+
+#     # Render front view
+#     if args.full_frame and len(all_verts) > 0:
+#         misc_args = dict(
+#             mesh_base_color=LIGHT_BLUE,
+#             scene_bg_color=(1, 1, 1),
+#             focal_length=scaled_focal_length,
+#         )
+#         cam_view = renderer.render_rgba_multiple(all_verts, cam_t=all_cam_t, render_res=img_size[n], is_right=all_right, **misc_args)
+
+#         # Overlay image
+#         input_img = img_cv2.astype(np.float32)[:,:,::-1]/255.0
+#         input_img = np.concatenate([input_img, np.ones_like(input_img[:,:,:1])], axis=2) # Add alpha channel
+#         input_img_overlay = input_img[:,:,:3] * (1-cam_view[:,:,3:]) + cam_view[:,:,:3] * cam_view[:,:,3:]
+
+#         cv2.imwrite(os.path.join(args.out_folder, f'{img_fn}_all.jpg'), 255*input_img_overlay[:, :, ::-1])
+
+
+def show_bbox(img, bbox):
+    bbox_tensor = torch.tensor(bbox)
+    bbox = bbox_tensor.cpu().numpy()[0]
+    # Extract coordinates
+    x1, y1, x2, y2 = bbox
+    # Load an image (replace 'image.jpg' with your image file)
+    image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # Convert BGR to RGB for Matplotlib
+    # Create a figure and axis
+    fig, ax = plt.subplots(1)
+    # Display the image
+    ax.imshow(image)
+    # Create a Rectangle patch
+    rect = patches.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=2, edgecolor='g', facecolor='none')
+    # Add the patch to the Axes
+    ax.add_patch(rect)
+    plt.show()
+
+def convert_tensors_to_lists(d):
+    for key, value in d.items():
+        if isinstance(value, torch.Tensor):
+            d[key] = value.tolist() # Convert tensor to list
+        elif isinstance(value, dict): # If there is a nested dictionary
+            convert_tensors_to_lists(value)
+    return d
+
+def main(args, model, renderer, device):
+    initial_start_time = time.time()
+
+    # Load detector
+    if args.bbox:
+        from hamer.utils.utils_detectron2 import DefaultPredictor_Lazy
+        if args.body_detector == 'vitdet':
+            from detectron2.config import LazyConfig
+            import hamer
+            cfg_path = Path(hamer.__file__).parent/'configs'/'cascade_mask_rcnn_vitdet_h_75ep.py'
+            detectron2_cfg = LazyConfig.load(str(cfg_path))
+            detectron2_cfg.train.init_checkpoint = "https://dl.fbaipublicfiles.com/detectron2/ViTDet/COCO/cascade_mask_rcnn_vitdet_h/f328730692/model_final_f05665.pkl"
+            for i in range(3):
+                detectron2_cfg.model.roi_heads.box_predictors[i].test_score_thresh = 0.25
+            detector = DefaultPredictor_Lazy(detectron2_cfg)
+        elif args.body_detector == 'regnety':
+            from detectron2 import model_zoo
+            from detectron2.config import get_cfg
+            detectron2_cfg = model_zoo.get_config('new_baselines/mask_rcnn_regnety_4gf_dds_FPN_400ep_LSJ.py', trained=True)
+            detectron2_cfg.model.roi_heads.box_predictor.test_score_thresh = 0.5
+            detectron2_cfg.model.roi_heads.box_predictor.test_nms_thresh = 0.4
+            detector = DefaultPredictor_Lazy(detectron2_cfg)
+
+    os.makedirs(args.out_folder, exist_ok=True)
+
+    print(f"Total Time for Initialization: {(time.time()-initial_start_time)} Seconds")
+    start_time = time.time()
+    if args.vid != '':
+        fps = args.fps
+        temp_dir = tempfile.TemporaryDirectory()
+        print(f"Temp directory created at {temp_dir.name}")
+
+        #Create Temp Out Folder
+        filename = os.path.basename(args.vid).removesuffix('.mp4')
+        outfile = os.path.join(args.out_folder, filename)
+        os.makedirs(outfile, exist_ok=True)
+
+        os.system(f"ffmpeg -nostdin -i {args.vid} -vf fps={fps} {temp_dir.name}/Frame%d.png")
+        image_folder = Path(temp_dir.name)
+        print(f"Total Time for Video to Image: {(time.time()-start_time)} Seconds")
+    else:
+        temp_dir = None
+        image_folder = Path(args.img_folder)
+    # Get all demo images ends with .jpg or .png
+    img_paths = [img for end in args.file_type for img in image_folder.glob(end)]
+    img_paths = sorted(img_paths,key = lambda x: int(os.path.basename(x).removesuffix('.png').removeprefix('Frame')))
+
+    if args.bbox:
+        # Detect humans in image
+        img_cv2 = cv2.imread(str(img_paths[0]))
+        det_out = detector(img_cv2)
+        det_instances = det_out['instances']
+        print_gpu_usage()
+        #Clearing memory
+        del detector
+        torch.cuda.empty_cache()
+        gc.collect()
+
+        valid_idx = (det_instances.pred_classes==0) & (det_instances.scores > 0.5)
+        if valid_idx[0]:
+            pred_bboxes=det_instances.pred_boxes.tensor[valid_idx].cpu().numpy()
+            pred_scores=det_instances.scores[valid_idx].cpu().numpy()
+            x1, y1, x2, y2 = pred_bboxes[0]
+            pred_bboxes[0] = [x1-x1*0.275, y1-y1*0.1, x2+x1*0.275, y2]
+        else:
+            print("No humans detected in the image")
+            return
+
+        # keypoint detector
+        cpm = ViTPoseModel(device)
+        print_gpu_usage()
+    else:
+        # Hardcoded bbox. This assumes person is in the center and that there is always one person in the image
+        # Values below strictly for Rachel's BM Videos
+        pred_bboxes = np.array([[300, 100, 1620, 1075]])
+        # Force confidence to be 0.99 that human is present
+        pred_scores = np.array([0.99])
+        img_cv2 = cv2.imread(str(img_paths[0]))
+
+    # Iterate over all images in folder
+    for img_path in img_paths:
+        start_time = time.time()
+        img_cv2 = cv2.imread(str(img_path))
+        img = img_cv2.copy()[:, :, ::-1]
+
+        # # Detect humans in image
+        # det_out = detector(img_cv2)
+        # det_instances = det_out['instances']
+        # valid_idx = (det_instances.pred_classes==0) & (det_instances.scores > 0.5)
+        # pred_bboxes=det_instances.pred_boxes.tensor[valid_idx].cpu().numpy()
+        # pred_scores=det_instances.scores[valid_idx].cpu().numpy()
+
+        start_time = time.time()
+        # Detect human keypoints for each person
+        vitposes_out = cpm.predict_pose(
+            img,
+            [np.concatenate([pred_bboxes, pred_scores[:, None]], axis=1)],
+        )
+
+        bboxes = []
+        is_right = []
+        start_time = time.time()
+        # Use hands based on hand keypoint detections
+        for vitposes in vitposes_out:
+            left_hand_keyp = vitposes['keypoints'][-42:-21]
+            right_hand_keyp = vitposes['keypoints'][-21:]
+
+            # Rejecting not confident detections
+            keyp = left_hand_keyp
+            valid = keyp[:,2] > 0.5
+            if sum(valid) > 3:
+                bbox = [keyp[valid,0].min(), keyp[valid,1].min(), keyp[valid,0].max(), keyp[valid,1].max()]
+                bboxes.append(bbox)
+                is_right.append(0)
+            keyp = right_hand_keyp
+            valid = keyp[:,2] > 0.5
+            if sum(valid) > 3:
+                bbox = [keyp[valid,0].min(), keyp[valid,1].min(), keyp[valid,0].max(), keyp[valid,1].max()]
+                bboxes.append(bbox)
+                is_right.append(1)
+
+        if len(bboxes) == 0:
+            continue
+
+        boxes = np.stack(bboxes)
+        right = np.stack(is_right)
+        start_time = time.time()
+        # Run reconstruction on all detected hands
+        dataset = ViTDetDataset(model_cfg, img_cv2, boxes, right, rescale_factor=args.rescale_factor)
+        dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False, num_workers=0)
+
+        all_verts = []
+        all_cam_t = []
+        all_right = []
+
+        for count,batch in enumerate(dataloader):
+            start_time = time.time()
+            batch = recursive_to(batch, device)
+            with torch.no_grad():
+                out = model(batch)
+
+            output = convert_tensors_to_lists(out.copy())
+            output['VitPose'] = vitposes_out[0]['keypoints'].tolist()
+            json_path = os.path.join(args.out_folder,f"{os.path.basename(img_path).removesuffix('.png')}_{count}.json") if args.vid == '' else os.path.join(outfile,f"{os.path.basename(img_path).removesuffix('.png')}_{count}.json")
+            with open(json_path, 'w') as f:
+                json.dump(output, f, indent=4)
+
+    if temp_dir:
+        temp_dir.cleanup()
+        print(f"Temp directory {temp_dir.name} cleaned up")
+    print(f"Total time taken: {(time.time()-initial_start_time)/60:.2f} minutes")
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='HaMeR demo code')
+    parser.add_argument('--checkpoint', type=str, default=DEFAULT_CHECKPOINT, help='Path to pretrained model checkpoint')
+    parser.add_argument('--vid', type=str, default='', help='Path to video file')
+    parser.add_argument('--img_folder', type=str, default='images', help='Folder with input images')
+    parser.add_argument('--out_folder', type=str, default='out_demo', help='Output folder to save rendered results')
+    parser.add_argument('--fps', type=int, default=50, help='FPS for video output')
+    parser.add_argument('--side_view', dest='side_view', action='store_true', default=False, help='If set, render side view also')
+    parser.add_argument('--full_frame', dest='full_frame', action='store_true', default=True, help='If set, render all people together also')
+    parser.add_argument('--save_mesh', dest='save_mesh', action='store_true', default=False, help='If set, save meshes to disk also')
+    parser.add_argument('--batch_size', type=int, default=1, help='Batch size for inference/fitting')
+    parser.add_argument('--rescale_factor', type=float, default=2.0, help='Factor for padding the bbox')
+    parser.add_argument('--body_detector', type=str, default='vitdet', choices=['vitdet', 'regnety'], help='Using regnety improves runtime and reduces memory')
+    parser.add_argument('--file_type', nargs='+', default=['*.jpg', '*.png'], help='List of file extensions to consider')
+    parser.add_argument('--bbox', type=bool, default=True, help= 'If set, use provided bbox from ViT')
+    parser.add_argument('--MANO_Output', type=bool, default=False, help= 'If set, generate output images')
+
+    args = parser.parse_args()
+
+    args.out_folder = os.path.join(args.out_folder, os.path.basename(os.path.dirname(args.vid)))
+    print(f"Output folder: {args.out_folder}")
+    print(f'Warning: Script Format Currently for MeinDGS Processing! FPS set at {args.fps}')
+
+    model, model_cfg = load_hamer(args.checkpoint, load_mesh=True) # Load Mesh False only if no vertices are needed
+
+    # Setup HaMeR model
+    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+    model = model.to(device)
+    model.eval()
+    renderer = Renderer(model_cfg, faces=model.mano.faces)
+    print_gpu_usage()
+
+    try:
+        main(args, model, renderer, device)
+    except Exception as e:
+        print(f"Error in processing {args.vid}: {e}")