diff --git a/Extract_HaMeR_Single.py b/Extract_HaMeR_Single.py
index 69afc85883c7a6512acf74f3b68fae77d4d9273d..688aed9c10d671662d096aff54f086f8372049ed 100644
--- a/Extract_HaMeR_Single.py
+++ b/Extract_HaMeR_Single.py
@@ -9,6 +9,8 @@ import time
 import matplotlib.pyplot as plt
 import matplotlib.patches as patches
 import gc
+import shutil
+import lmdb, pickle
 
 from hamer.configs import CACHE_DIR_HAMER
 from hamer.models import HAMER, download_models, load_hamer, DEFAULT_CHECKPOINT
@@ -42,81 +44,6 @@ def print_gpu_usage():
             print(f"GPU {i}: Utilization: {gpu_util}%, Memory: {mem_used}/{mem_total} MiB")
     except FileNotFoundError:
         print("nvidia-smi command not found. Ensure NVIDIA drivers are installed.")
-
-# def Produce_Mesh_from_Dict(out, batch, model, args):
-#     renderer = Renderer(model_cfg, faces=model.mano.faces)
-#     multiplier = (2*batch['right']-1)
-#     pred_cam = out['pred_cam']
-#     pred_cam[:,1] = multiplier*pred_cam[:,1]
-#     box_center = batch["box_center"].float()
-#     box_size = batch["box_size"].float()
-#     img_size = batch["img_size"].float()
-#     multiplier = (2*batch['right']-1)
-#     scaled_focal_length = model_cfg.EXTRA.FOCAL_LENGTH / model_cfg.MODEL.IMAGE_SIZE * img_size.max()
-#     pred_cam_t_full = cam_crop_to_full(pred_cam, box_center, box_size, img_size, scaled_focal_length).detach().cpu().numpy()
-
-#     # Render the result
-#     batch_size = batch['img'].shape[0]
-#     for n in range(batch_size):
-#         start_time = time.time()
-#         # Get filename from path img_path
-#         img_fn, _ = os.path.splitext(os.path.basename(img_path))
-#         person_id = int(batch['personid'][n])
-#         white_img = (torch.ones_like(batch['img'][n]).cpu() - DEFAULT_MEAN[:,None,None]/255) / (DEFAULT_STD[:,None,None]/255)
-#         input_patch = batch['img'][n].cpu() * (DEFAULT_STD[:,None,None]/255) + (DEFAULT_MEAN[:,None,None]/255)
-#         input_patch = input_patch.permute(1,2,0).numpy()
-
-#         regression_img = renderer(out['pred_vertices'][n].detach().cpu().numpy(),
-#                                   out['pred_cam_t'][n].detach().cpu().numpy(),
-#                                   batch['img'][n],
-#                                   mesh_base_color=LIGHT_BLUE,
-#                                   scene_bg_color=(1, 1, 1),
-#                                   )
-
-#         if args.side_view:
-#             side_img = renderer(out['pred_vertices'][n].detach().cpu().numpy(),
-#                                 out['pred_cam_t'][n].detach().cpu().numpy(),
-#                                 white_img,
-#                                 mesh_base_color=LIGHT_BLUE,
-#                                 scene_bg_color=(1, 1, 1),
-#                                 side_view=True)
-#             final_img = np.concatenate([input_patch, regression_img, side_img], axis=1)
-#         else:
-#             final_img = np.concatenate([input_patch, regression_img], axis=1)
-
-#         cv2.imwrite(os.path.join(args.out_folder, f'{img_fn}_{person_id}.png'), 255*final_img[:, :, ::-1])
-
-#         # Add all verts and cams to list
-#         verts = out['pred_vertices'][n].detach().cpu().numpy()
-#         is_right = batch['right'][n].cpu().numpy()
-#         verts[:,0] = (2*is_right-1)*verts[:,0]
-#         cam_t = pred_cam_t_full[n]
-#         all_verts.append(verts)
-#         all_cam_t.append(cam_t)
-#         all_right.append(is_right)
-
-#         # Save all meshes to disk
-#         if args.save_mesh:
-#             camera_translation = cam_t.copy()
-#             tmesh = renderer.vertices_to_trimesh(verts, camera_translation, LIGHT_BLUE, is_right=is_right)
-#             tmesh.export(os.path.join(args.out_folder, f'{img_fn}_{person_id}.obj'))
-#     print(f"Total Time for rendering Meshes and Saving: {(time.time()-start_time)} Seconds")
-
-#     # Render front view
-#     if args.full_frame and len(all_verts) > 0:
-#         misc_args = dict(
-#             mesh_base_color=LIGHT_BLUE,
-#             scene_bg_color=(1, 1, 1),
-#             focal_length=scaled_focal_length,
-#         )
-#         cam_view = renderer.render_rgba_multiple(all_verts, cam_t=all_cam_t, render_res=img_size[n], is_right=all_right, **misc_args)
-
-#         # Overlay image
-#         input_img = img_cv2.astype(np.float32)[:,:,::-1]/255.0
-#         input_img = np.concatenate([input_img, np.ones_like(input_img[:,:,:1])], axis=2) # Add alpha channel
-#         input_img_overlay = input_img[:,:,:3] * (1-cam_view[:,:,3:]) + cam_view[:,:,:3] * cam_view[:,:,3:]
-
-#         cv2.imwrite(os.path.join(args.out_folder, f'{img_fn}_all.jpg'), 255*input_img_overlay[:, :, ::-1])
 
 def get_video_fps(video_path):
     video = cv2.VideoCapture(video_path)
@@ -125,23 +52,6 @@ def get_video_fps(video_path):
     fps = video.get(cv2.CAP_PROP_FPS)
     print(f"Video FPS: {fps}")
     return fps
-def show_bbox(img, bbox):
-    bbox_tensor = torch.tensor(bbox)
-    bbox = bbox_tensor.cpu().numpy()[0]
-    # Extract coordinates
-    x1, y1, x2, y2 = bbox
-    # Load an image (replace 'image.jpg' with your image file)
-    image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # Convert BGR to RGB for Matplotlib
-    # Create a figure and axis
-    fig, ax = plt.subplots(1)
-    # Display the image
-    ax.imshow(image)
-    # Create a Rectangle patch
-    rect = patches.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=2, edgecolor='g', facecolor='none')
-    # Add the patch to the Axes
-    ax.add_patch(rect)
-    plt.show()
-
 def convert_tensors_to_lists(d):
     for key, value in d.items():
         if isinstance(value, torch.Tensor):
@@ -154,7 +64,7 @@ def main(args, model, renderer, device):
     initial_start_time = time.time()
 
     # Load detector
-    if args.bbox:
+    if args.bbox == 'True':
         from hamer.utils.utils_detectron2 import DefaultPredictor_Lazy
         if args.body_detector == 'vitdet':
             from detectron2.config import LazyConfig
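The hunk above replaces the truthiness test if args.bbox: with an explicit string comparison, which matches the hard-coded override args.bbox = 'False' added near the bottom of this file: the flag is carried as a string, and any non-empty string (including 'False') is truthy, so the old check would have loaded the detector regardless. A minimal sketch of the same idea using a parse-time converter; the str2bool helper and the option declaration below are illustrative assumptions, not code from this repository:

import argparse

def str2bool(value: str) -> bool:
    # Hypothetical helper: map common spellings onto a real bool at parse time.
    if value.lower() in ('true', '1', 'yes'):
        return True
    if value.lower() in ('false', '0', 'no'):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean, got {value!r}")

parser = argparse.ArgumentParser()
parser.add_argument('--bbox', type=str2bool, default=False,
                    help='Run the ViTDet person detector instead of a fixed/custom bbox')
args = parser.parse_args(['--bbox', 'False'])
assert args.bbox is False  # the string 'False' no longer counts as truthy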
@@ -176,143 +86,175 @@ def main(args, model, renderer, device):
     os.makedirs(args.out_folder, exist_ok=True)
     print(f"Total Time for Initialization: {(time.time()-initial_start_time)} Seconds")
 
-    start_time = time.time()
-    if args.vid != '':
-        fps = get_video_fps(args.vid)
-        temp_dir = tempfile.TemporaryDirectory()
-        print(f"Temp directory created at {temp_dir.name}")
-
-        #Create Temp Out Folder
-        filename = os.path.basename(args.vid).removesuffix('.mp4')
-        outfile = os.path.join(temp_dir.name, filename)
-        os.makedirs(outfile, exist_ok=True)
-
-        os.system(f"ffmpeg -nostdin -i {args.vid} -vf fps={fps} {temp_dir.name}/Frame%d.png")
-        image_folder = Path(temp_dir.name)
-        print(f"Total Time for Video to Image: {(time.time()-start_time)} Seconds")
-    else:
-        temp_dir = None
-        image_folder = Path(args.img_folder)
-    # Get all demo images ends with .jpg or .png
-    img_paths = [img for end in args.file_type for img in image_folder.glob(end)]
-    img_paths = sorted(img_paths,key = lambda x: int(os.path.basename(x).removesuffix('.png').removeprefix('Frame')))
-
-    img_cv2 = cv2.imread(str(img_paths[0]))
-    if args.bbox == 'True':
-        # Detect humans in image
-        det_out = detector(img_cv2)
-        det_instances = det_out['instances']
-        print('Completed ViTDet for Human Bounding Box Aquisition')
-        print_gpu_usage()
-        #Clearing memory
-        del detector
-        torch.cuda.empty_cache()
-        gc.collect()
-
-        valid_idx = (det_instances.pred_classes==0) & (det_instances.scores > 0.5)
-        if valid_idx[0]:
-            pred_bboxes=det_instances.pred_boxes.tensor[valid_idx].cpu().numpy()
-            pred_scores=det_instances.scores[valid_idx].cpu().numpy()
-            x1, y1, x2, y2 = pred_bboxes[0]
-            pred_bboxes[0] = [x1-x1*0.35, y1-y1*0.1, x2+x1*0.35, y2]
-        else:
-            print("No humans detected in the image")
-            return
-    else:
-        # Hardcoded bbox. This assumes person is in the center and that there is always one person in the image
-        if args.custom_bbox == '':
-            h, w, _ = img_cv2.shape
-            pred_bboxes = np.array([[0, 0, w, h]])
-            print(f"Using hardcoded bbox, {pred_bboxes[0]}")
-        else:
-            pred_bboxes = np.array([list(map(int, args.custom_bbox.split(',')))])
-            print(f"Using custom bbox, {pred_bboxes[0]}")
-        # Force confidence to be 0.99 that human is present
-        pred_scores = np.array([0.99])
-
-    # keypoint detector
-    cpm = ViTPoseModel(device)
-    print('Loading ViTPose Model')
-    print_gpu_usage()
-
-    # Iterate over all images in folder
-    for img_path in img_paths:
-        start_time = time.time()
-        img_cv2 = cv2.imread(str(img_path))
-        img = img_cv2.copy()[:, :, ::-1]
-
-        # # Detect humans in image
-        # det_out = detector(img_cv2)
-        # det_instances = det_out['instances']
-        # valid_idx = (det_instances.pred_classes==0) & (det_instances.scores > 0.5)
-        # pred_bboxes=det_instances.pred_boxes.tensor[valid_idx].cpu().numpy()
-        # pred_scores=det_instances.scores[valid_idx].cpu().numpy()
-
-        start_time = time.time()
-        # Detect human keypoints for each person
-        vitposes_out = cpm.predict_pose(
-            img,
-            [np.concatenate([pred_bboxes, pred_scores[:, None]], axis=1)],
-        )
-
-        bboxes = []
-        is_right = []
-        start_time = time.time()
-        # Use hands based on hand keypoint detections
-        for vitposes in vitposes_out:
-            left_hand_keyp = vitposes['keypoints'][-42:-21]
-            right_hand_keyp = vitposes['keypoints'][-21:]
-
-            # Rejecting not confident detections
-            keyp = left_hand_keyp
-            valid = keyp[:,2] > 0.5
-            if sum(valid) > 3:
-                bbox = [keyp[valid,0].min(), keyp[valid,1].min(), keyp[valid,0].max(), keyp[valid,1].max()]
-                bboxes.append(bbox)
-                is_right.append(0)
-            keyp = right_hand_keyp
-            valid = keyp[:,2] > 0.5
-            if sum(valid) > 3:
-                bbox = [keyp[valid,0].min(), keyp[valid,1].min(), keyp[valid,0].max(), keyp[valid,1].max()]
-                bboxes.append(bbox)
-                is_right.append(1)
-
-        if len(bboxes) == 0:
-            continue
-
-        boxes = np.stack(bboxes)
-        right = np.stack(is_right)
-        start_time = time.time()
-        # Run reconstruction on all detected hands
-        dataset = ViTDetDataset(model_cfg, img_cv2, boxes, right, rescale_factor=args.rescale_factor)
-        dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False, num_workers=0)
-
-        all_verts = []
-        all_cam_t = []
-        all_right = []
-
-        for count,batch in enumerate(dataloader):
-            start_time = time.time()
-            batch = recursive_to(batch, device)
-            with torch.no_grad():
-                out = model(batch)
-
-            output = convert_tensors_to_lists(out.copy())
-            output['VitPose'] = vitposes_out[0]['keypoints'].tolist()
-            json_path = os.path.join(args.out_folder,f"{os.path.basename(img_path).removesuffix('.png')}_{count}.json") if args.vid == '' else os.path.join(outfile,f"{os.path.basename(img_path).removesuffix('.png')}_{count}.json")
-            with open(json_path, 'w') as f:
-                json.dump(output, f, indent=4)
-
-    if args.vid != '':
-        tmp_tar = os.path.join(temp_dir.name, "tmp.tar.xz")
-        tar_cmd = "tar -C " + temp_dir.name + " -cJf " + tmp_tar + f" {os.path.basename(outfile)}"
-        os.system(tar_cmd)
-        os.system(f"cp {tmp_tar} {os.path.join(args.out_folder, os.path.basename(outfile))}.tar.xz")
-        os.system(f"rm -rf {outfile}")
-
-    if temp_dir:
-        temp_dir.cleanup()
-        print(f"Temp directory {temp_dir.name} cleaned up")
+    with tempfile.TemporaryDirectory() as temp_dir:
+        start_time = time.time()
+        if args.vid != '':
+            fps = get_video_fps(args.vid)
+            image_folder = Path(temp_dir)
+            print(f"Temp directory created at {image_folder}")
+            os.system(f"ffmpeg -nostdin -i {args.vid} -vf fps={fps} {image_folder}/Frame%d.png")
+            print(f"Total Time for Video to Image: {(time.time()-start_time)} Seconds")
+        else:
+            image_folder = Path(args.img_folder)
+        # Get all demo images ends with .jpg or .png
+        img_paths = [img for end in args.file_type for img in image_folder.glob(end)]
+        img_paths = sorted(img_paths,key = lambda x: int(os.path.basename(x).removesuffix('.png').removeprefix('Frame')))
+
+        img_cv2 = cv2.imread(str(img_paths[0]))
+        if args.bbox == 'True':
+            # Detect humans in image
+            det_out = detector(img_cv2)
+            det_instances = det_out['instances']
+            print('Completed ViTDet for Human Bounding Box Aquisition')
+            print_gpu_usage()
+            #Clearing memory
+            del detector
+            torch.cuda.empty_cache()
+            gc.collect()
+
+            valid_idx = (det_instances.pred_classes==0) & (det_instances.scores > 0.5)
+            if valid_idx[0]:
+                pred_bboxes=det_instances.pred_boxes.tensor[valid_idx].cpu().numpy()
+                pred_scores=det_instances.scores[valid_idx].cpu().numpy()
+                x1, y1, x2, y2 = pred_bboxes[0]
+                pred_bboxes[0] = [x1-x1*0.35, y1-y1*0.1, x2+x1*0.35, y2]
+            else:
+                print("No humans detected in the image")
+                return
+        else:
+            # Hardcoded bbox. This assumes person is in the center and that there is always one person in the image
+            if args.custom_bbox == '':
+                h, w, _ = img_cv2.shape
+                pred_bboxes = np.array([[0, 0, w, h]])
+                print(f"Using hardcoded bbox, {pred_bboxes[0]}")
+            else:
+                pred_bboxes = np.array([list(map(int, args.custom_bbox.split(',')))])
+                print(f"Using custom bbox, {pred_bboxes[0]}")
+            # Force confidence to be 0.99 that human is present
+            pred_scores = np.array([0.99])
+
+        # keypoint detector
+        cpm = ViTPoseModel(device)
+        print('Loading ViTPose Model')
+        print_gpu_usage()
+
+        pred_cam_list, global_orient_list, hand_pose_list, betas_list = [], [], [], []
+        # Iterate over all images in folder
+        for img_path in img_paths:
+            img_cv2 = cv2.imread(str(img_path))
+            img = img_cv2.copy()[:, :, ::-1]
+
+            # Detect human keypoints for each person
+            vitposes_out = cpm.predict_pose(
+                img,
+                [np.concatenate([pred_bboxes, pred_scores[:, None]], axis=1)],
+            )
+
+            bboxes = []
+            is_right = []
+            # Use hands based on hand keypoint detections
+            for vitposes in vitposes_out:
+                left_hand_keyp = vitposes['keypoints'][-42:-21]
+                right_hand_keyp = vitposes['keypoints'][-21:]
+
+                # Rejecting not confident detections
+                keyp = left_hand_keyp
+                valid = keyp[:,2] > 0.5
+                if sum(valid) > 3:
+                    bbox = [keyp[valid,0].min(), keyp[valid,1].min(), keyp[valid,0].max(), keyp[valid,1].max()]
+                    bboxes.append(bbox)
+                    is_right.append(0)
+                keyp = right_hand_keyp
+                valid = keyp[:,2] > 0.5
+                if sum(valid) > 3:
+                    bbox = [keyp[valid,0].min(), keyp[valid,1].min(), keyp[valid,0].max(), keyp[valid,1].max()]
+                    bboxes.append(bbox)
+                    is_right.append(1)
+
+            if len(bboxes) == 0:
+                pred_cam_list.append(None)
+                global_orient_list.append(None)
+                hand_pose_list.append(None)
+                betas_list.append(None)
+                continue
+
+            boxes = np.stack(bboxes)
+            right = np.stack(is_right)
+
+            # Run reconstruction on all detected hands
+            dataset = ViTDetDataset(model_cfg, img_cv2, boxes, right, rescale_factor=args.rescale_factor)
+            dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False, num_workers=0)
+
+            all_verts = []
+            all_cam_t = []
+            all_right = []
+
+            for batch in dataloader:
+                batch = recursive_to(batch, device)
+                with torch.no_grad():
+                    out = model(batch)
+
+                output = convert_tensors_to_lists(out.copy())
+                pred_cam_list.append(output['pred_cam'])
+                global_orient_list.append(output['pred_mano_params']['global_orient'])
+                hand_pose_list.append(output['pred_mano_params']['hand_pose'])
+                betas_list.append(output['pred_mano_params']['betas'])
+                focal_length = output['focal_length']
+
+        assert len(pred_cam_list) == len(global_orient_list) == len(hand_pose_list) == len(betas_list), f"Length of lists are not equal pred_cam_list: {len(pred_cam_list)}, global_orient_list: {len(global_orient_list)}, hand_pose_list: {len(hand_pose_list)}, betas_list: {len(betas_list)}"
+        assert len(pred_cam_list) == len(img_paths), f"Length of output features and input images are not equal. features: {len(pred_cam_list)}, img_paths: {len(img_paths)}"
+
+        #Create Temp Out Folder for lmdb
+        File_Name = os.path.basename(args.vid).removesuffix(".mp4") if args.vid != '' else os.path.basename(args.img_folder)
+        Temp_Database = Path(temp_dir) / f"{File_Name}.lmdb"
+        Temp_Database.mkdir(parents=True, exist_ok=True)
+
+        print("Creating Database")
+        list_length = len(pred_cam_list)
+        print(f"Number of features: {list_length}\n")
+        # Set n_bytes to 1TB maximum
+        n_bytes = 2**40
+        # Pickling Protocol set to 4
+        protocol = 4
+
+        with lmdb.open(path=str(Temp_Database), map_size=n_bytes) as env:
+            with env.begin(write=True) as txn:
+                for index,feat in enumerate(global_orient_list):
+                    txn.put(
+                        key=f"{File_Name}_GOrient_{index}".encode("ascii"),
+                        value=pickle.dumps(feat, protocol=protocol),
+                        dupdata=False,
+                    )
+
+            with env.begin(write=True) as txn:
+                for index,feat in enumerate(hand_pose_list):
+                    txn.put(
+                        key=f"{File_Name}_HPose_{index}".encode("ascii"),
+                        value=pickle.dumps(feat, protocol=protocol),
+                        dupdata=False,
+                    )
+
+            with env.begin(write=True) as txn:
+                txn.put(
+                    key=("pred_cam").encode("ascii"),
+                    value=pickle.dumps(pred_cam_list, protocol=protocol),
+                    dupdata=False,
+                )
+
+            with env.begin(write=True) as txn:
+                txn.put(
+                    key=("betas").encode("ascii"),
+                    value=pickle.dumps(betas_list, protocol=protocol),
+                    dupdata=False,
+                )
+
+            with env.begin(write=True) as txn:
+                txn.put(
+                    key=("details").encode("ascii"),
+                    value=pickle.dumps({"num_features": list_length, "fps": fps, "focal_length": focal_length}, protocol=protocol),
+                    dupdata=False,
+                )
+        shutil.move(f"{Temp_Database}", args.out_folder)
 
     print(f"Total time taken: {(time.time()-initial_start_time)/60:.2f} minutes")
 
 if __name__ == '__main__':
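For reference, a minimal read-back sketch for the database written by the hunk above. The key layout comes straight from the diff (per-frame "<name>_GOrient_<i>" and "<name>_HPose_<i>" entries plus aggregate "pred_cam", "betas" and "details" records); the path and video name are placeholders, and frames with no detected hands were stored as None:

import lmdb, pickle

lmdb_path = "/path/to/1249951_1b1.lmdb"   # placeholder: wherever the .lmdb folder was moved
video_name = "1249951_1b1"                # must match the File_Name used at write time

with lmdb.open(lmdb_path, readonly=True, lock=False) as env:
    with env.begin(write=False) as txn:
        details = pickle.loads(txn.get("details".encode("ascii")))
        pred_cam = pickle.loads(txn.get("pred_cam".encode("ascii")))
        betas = pickle.loads(txn.get("betas".encode("ascii")))
        global_orient = [pickle.loads(txn.get(f"{video_name}_GOrient_{i}".encode("ascii")))
                         for i in range(details["num_features"])]
        hand_pose = [pickle.loads(txn.get(f"{video_name}_HPose_{i}".encode("ascii")))
                     for i in range(details["num_features"])]

# Frames without detections are None entries, so filter before stacking or batching.
print(details["fps"], details["num_features"], len(global_orient), len(hand_pose))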
@@ -334,17 +276,23 @@ if __name__ == '__main__':
 
     args = parser.parse_args()
 
+    args.vid = '/vol/vssp/datasets/mixedmode/mein-dgs-korpus/RawData/1249951/1249951_1b1.mp4'
+    args.bbox = 'False'
+    args.custom_bbox = '100,0,530,360'
+    args.out_folder = '/vol/research/signVision/Projects/BSLboundaries/MeinDGS_HaMeR_Feature'
+
+    args.out_folder = os.path.join(args.out_folder, os.path.basename(os.path.dirname(args.vid)))
     print(f"Output folder: {args.out_folder}")
-    print(f'Warning: Script Format Currently for MeinDGS Processing!')
 
-    model, model_cfg = load_hamer(args.checkpoint, load_mesh=True) # Load Mesh False only if no vertices are needed
+    model, model_cfg = load_hamer(args.checkpoint, load_mesh=False) # False sets model to not produce vertice on inference
 
     # Setup HaMeR model
     device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
     model = model.to(device)
     model.eval()
 
-    renderer = Renderer(model_cfg, faces=model.mano.faces)
+    # renderer = Renderer(model_cfg, faces=model.mano.faces)
+    renderer = None
     print_gpu_usage()
 
     try:
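The frame-extraction step above shells out with os.system(f"ffmpeg ..."), which ignores ffmpeg's exit code and is fragile with paths containing spaces. Below is a sketch of the same extraction through subprocess.run, offered as an alternative rather than a description of what the script currently does; it keeps the Frame%d.png naming so the existing numeric sort key still applies:

import subprocess
from pathlib import Path

def extract_frames(video_path: str, out_dir: str, fps: float) -> list:
    # Same ffmpeg invocation as above, but as an argument list with error checking.
    subprocess.run(
        ["ffmpeg", "-nostdin", "-i", video_path,
         "-vf", f"fps={fps}", f"{out_dir}/Frame%d.png"],
        check=True,  # raise CalledProcessError instead of silently continuing
    )
    return sorted(Path(out_dir).glob("Frame*.png"),
                  key=lambda p: int(p.stem.removeprefix("Frame")))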
diff --git a/Inject_Json.py b/Inject_Json.py
index 0f5af21b4dc9502a0f1b4610a99eae92d8c6d0bc..eccd598168de8e850b04f7808f4797daa60bb0aa 100644
--- a/Inject_Json.py
+++ b/Inject_Json.py
@@ -16,12 +16,35 @@ from hamer.utils.renderer import Renderer, cam_crop_to_full
 LIGHT_BLUE=(0.65098039, 0.74117647, 0.85882353)
 
 from vitpose_model import ViTPoseModel
-
-import json
 from typing import Dict, Optional
 import subprocess
 import gc
+import lmdb, pickle
+
+def load_hamer_features(lmdb_path, video_name, generate_vertices=False):
+    env = lmdb.open(
+        path=lmdb_path,
+        readonly=True,
+        readahead=False,
+        lock=False,
+        meminit=False,
+    )
+    betas, pred_cam = None, None
+    global_orient, hand_pose = [], []
+    with env.begin(write=False) as txn:
+        details = pickle.loads(txn.get(key=("details").encode("ascii")))
+        num_features = details['num_features']
+        print(num_features)
+        for index in range(num_features):
+            global_orient.append(pickle.loads(txn.get(key=f"{video_name}_GOrient_{index}".encode("ascii"))))
+            hand_pose.append(pickle.loads(txn.get(key=f"{video_name}_HPose_{index}".encode("ascii"))))
+
+        if generate_vertices:
+            betas = pickle.loads(txn.get(key=("betas").encode("ascii")))
+            pred_cam = pickle.loads(txn.get(key=("pred_cam").encode("ascii")))
+
+    return global_orient, hand_pose, betas, pred_cam, details
 
 def get_video_fps(video_path):
     video = cv2.VideoCapture(video_path)
@@ -60,6 +83,7 @@ def convert_lists_to_tensors(d):
     for key, value in d.items():
         if isinstance(value, list):
             d[key] = torch.tensor(value) # Convert list to tensor
+            d[key] = d[key].to(device)
         elif isinstance(value, dict): # If there is a nested dictionary
             convert_lists_to_tensors(value)
     return d
@@ -103,21 +127,24 @@ def main(args, model, device):
         temp_dir = None
         image_folder = Path(args.img_folder)
 
-    # Setup Image Paths and Json Paths Ranges
-    if args.json_folder.endswith('.tar.xz'):
-        cmd = f"tar -xf {args.json_folder} -C {temp_dir.name}"
-        os.system(cmd)
-        args.json_folder = os.path.join(temp_dir.name, os.path.basename(args.json_folder).removesuffix('.tar.xz'))
+    # Setup Image Paths and lmdb Paths Ranges
+    lmdb_files = os.listdir(args.lmdb_input)
+    if 'data.mdb' in lmdb_files and 'lock.mdb' in lmdb_files:
+        vid_name = os.path.basename(args.lmdb_input).removesuffix('.lmdb')
+        global_orient_list, hand_pose_list, betas_list, pred_cam_list, details = load_hamer_features(args.lmdb_input, vid_name, generate_vertices=True)
+    else:
+        raise ValueError("Input is not a lmdb file")
 
     img_paths = [img for end in args.file_type for img in image_folder.glob(end)]
     img_paths = sorted(img_paths,key = lambda x: int(os.path.basename(x).removesuffix('.png').removeprefix('Frame')))
-    json_list = [os.path.join(args.json_folder, f) for f in os.listdir(args.json_folder) if f.endswith('.json')]
-    json_list = sorted(json_list,key = lambda x: int(os.path.basename(x).removeprefix('Frame').split('_')[0]))
-    assert len(img_paths) == len(json_list), f"Number of images {len(img_paths)} and json files {len(json_list)} do not match"
+    assert len(img_paths) == len(hand_pose_list), f"Number of images {len(img_paths)} and lmdb files {len(hand_pose_list)} do not match"
 
     start_frame, end_frame = map(int, args.frame_range.split(','))
     img_paths = img_paths[start_frame:end_frame]
-    json_list = json_list[start_frame:end_frame]
+    global_orient_list = global_orient_list[start_frame:end_frame]
+    hand_pose_list = hand_pose_list[start_frame:end_frame]
+    betas_list = betas_list[start_frame:end_frame]
+    pred_cam_list = pred_cam_list[start_frame:end_frame]
 
     img_cv2 = cv2.imread(str(img_paths[0]))
     if args.bbox == True:
@@ -161,8 +188,7 @@ def main(args, model, device):
     print_gpu_usage()
 
     # Iterate over all images in folder
-    for img_path, json_input in zip(img_paths, json_list):
-        print(f"Processing {img_path} with {json_input}")
+    for index, img_path in enumerate(img_paths):
         img_cv2 = cv2.imread(str(img_path))
         img = img_cv2.copy()[:, :, ::-1]
 
@@ -210,13 +236,17 @@ def main(args, model, device):
 
         for batch in dataloader:
             batch = recursive_to(batch, device)
-            with open(json_input, 'r') as f:
-                json_data = json.load(f)
-
+            json_data = {
+                'pred_cam': list(pred_cam_list[index]),
+                'pred_mano_params': {
+                    'global_orient': global_orient_list[index],
+                    'hand_pose': hand_pose_list[index],
+                    'betas': betas_list[index],
+                }
+            }
             json_data = convert_lists_to_tensors(json_data)
-
             with torch.no_grad():
-                out = model.inject_json(json_data,args.injected_hand)
+                out = model.inject_json(json_data,batch['img'].shape[0])
 
             multiplier = (2*batch['right']-1)
             pred_cam = out['pred_cam']
@@ -287,7 +317,9 @@ def main(args, model, device):
             input_img = np.concatenate([input_img, np.ones_like(input_img[:,:,:1])], axis=2) # Add alpha channel
             input_img_overlay = input_img[:,:,:3] * (1-cam_view[:,:,3:]) + cam_view[:,:,:3] * cam_view[:,:,3:]
 
-            cv2.imwrite(os.path.join(args.out_folder, f'{img_fn}_all.jpg'), 255*input_img_overlay[:, :, ::-1])
+            Full_Frame_Folder = os.path.join(args.out_folder, 'Full_Frame')
+            os.makedirs(Full_Frame_Folder, exist_ok=True)
+            cv2.imwrite(os.path.join(Full_Frame_Folder, f'{img_fn}_all.jpg'), 255*input_img_overlay[:, :, ::-1])
 
     if temp_dir:
         temp_dir.cleanup()
@@ -310,16 +342,20 @@ if __name__ == '__main__':
     parser.add_argument('--injected_hand', type=int, default=2, help='Number of hands in the Video')
     parser.add_argument('--bbox', dest='bbox', action='store_true', default=True, help='If set, use detected bbox')
     parser.add_argument('--custom_bbox', type=str, default='', help='Custom bbox in the format x1,y1,x2,y2')
-    parser.add_argument('--json_folder', type=str, default='', help='Json file for input')
+    parser.add_argument('--lmdb_input', type=str, default='', help='lmdb file for input')
    parser.add_argument('--frame_range', type=str, default='0,-1', help='Frame range for input')
 
     args = parser.parse_args()
-    args.vid = '/vol/vssp/datasets/mixedmode/mein-dgs-korpus/RawData/1176549/1176549_1a1.mp4'
-    args.json_folder = '/vol/research/signVision/Projects/BSLboundaries/MeinDGS_HaMeR_Feature/1176549/1176549_1a1.tar.xz'
-    args.out_folder = '/vol/research/signVision/Projects/BSLboundaries/Test'
-    args.frame_range = '0,150'
+
+    FileName = '1176549/1176549_1a1'
+    args.vid = f'/vol/vssp/datasets/mixedmode/mein-dgs-korpus/RawData/{FileName}.mp4'
+    args.lmdb_input = f'/mnt/fast/nobackup/scratch4weeks/jl02958/MeinDGS_HaMeR/{FileName}.lmdb'
+    args.out_folder = f'/vol/research/signVision/Projects/BSLboundaries/Test/{FileName.split("/")[-1]}'
+
+    args.frame_range = '650,850'
     args.custom_bbox = '100,0,530,360'
     args.bbox = False
+    args.save_mesh = False
 
     # Download and load checkpoints
     # download_models(CACHE_DIR_HAMER)
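One detail of the convert_lists_to_tensors hunk above: the added d[key] = d[key].to(device) relies on a device name being resolvable from an enclosing scope. A self-contained sketch that passes the device explicitly; the changed signature is this edit's assumption, not the repository's API:

import torch

def convert_lists_to_tensors(d: dict, device: torch.device) -> dict:
    # Recursively turn JSON-style nested lists back into tensors on the target device.
    for key, value in d.items():
        if isinstance(value, list):
            d[key] = torch.tensor(value, device=device)
        elif isinstance(value, dict):
            convert_lists_to_tensors(value, device)
    return d

# Example: rebuild a per-frame payload of the shape fed to model.inject_json()
payload = {"pred_cam": [[0.9, 0.1, 0.2]],
           "pred_mano_params": {"betas": [[0.0] * 10]}}
payload = convert_lists_to_tensors(payload, torch.device("cuda" if torch.cuda.is_available() else "cpu"))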
diff --git a/hamer/models/hamer.py b/hamer/models/hamer.py
index 2272b50243b617420ab127159040a2b5a04b9100..9b16c50d9f8472e08530871b8b2cfac33729d1ae 100644
--- a/hamer/models/hamer.py
+++ b/hamer/models/hamer.py
@@ -50,7 +50,8 @@ class HAMER(pl.LightningModule):
 
         # Instantiate MANO model
         mano_cfg = {k.lower(): v for k,v in dict(cfg.MANO).items()}
-        self.mano = MANO(**mano_cfg)
+        if self.mesh:
+            self.mano = MANO(**mano_cfg)
 
         # Buffer that shows whetheer we need to initialize ActNorm layers
         self.register_buffer('initialized', torch.tensor(False))
@@ -100,7 +101,6 @@ class HAMER(pl.LightningModule):
         # Use RGB image as input
         x = batch['img']
         batch_size = x.shape[0]
-        print(f'Batch Size: {x.shape}')
 
         # Compute conditioning features using the backbone
         # if using ViT backbone, we need to use a different aspect ratio
@@ -376,12 +376,13 @@ class HAMER(pl.LightningModule):
         output['pred_mano_params'] = {k: v.clone() for k,v in pred_mano_params.items()}
 
         # Compute camera translation
-        focal_length = input_json['focal_length']
+        dtype = pred_mano_params['hand_pose'].dtype
+        focal_length = self.cfg.EXTRA.FOCAL_LENGTH * torch.ones(batch_size, 2, device=device, dtype=dtype)
         pred_cam_t = torch.stack([pred_cam[:, 1],
                                   pred_cam[:, 2],
                                   2*focal_length[:, 0]/(self.cfg.MODEL.IMAGE_SIZE * pred_cam[:, 0] +1e-9)],dim=-1)
         output['pred_cam_t'] = pred_cam_t
-        output['focal_length'] = input_json['focal_length']
+        output['focal_length'] = focal_length
 
         # Compute model vertices, joints and the projected joints
         pred_mano_params['global_orient'] = pred_mano_params['global_orient'].reshape(batch_size, -1, 3, 3)
@@ -393,10 +394,10 @@ class HAMER(pl.LightningModule):
         output['pred_keypoints_3d'] = pred_keypoints_3d.reshape(batch_size, -1, 3)
         output['pred_vertices'] = pred_vertices.reshape(batch_size, -1, 3)
         pred_cam_t = pred_cam_t.reshape(-1, 3)
-        focal_length = input_json['focal_length'].reshape(-1, 2)
+        focal_length = focal_length.reshape(-1, 2)
         pred_keypoints_2d = perspective_projection(pred_keypoints_3d,
                                                    translation=pred_cam_t,
                                                    focal_length=focal_length / self.cfg.MODEL.IMAGE_SIZE)
         output['pred_keypoints_2d'] = pred_keypoints_2d.reshape(batch_size, -1, 2)
 
-        return output
+        return output
\ No newline at end of file
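The inject_json changes above stop reading focal_length from the injected JSON and instead rebuild it from the model config, then convert the weak-perspective camera pred_cam = [s, tx, ty] into a full translation whose depth is tz = 2f / (S * s). A standalone sketch of that conversion; FOCAL_LENGTH and IMAGE_SIZE stand in for self.cfg.EXTRA.FOCAL_LENGTH and self.cfg.MODEL.IMAGE_SIZE, and the numeric values are illustrative assumptions:

import torch

FOCAL_LENGTH = 5000.0  # assumed stand-in for cfg.EXTRA.FOCAL_LENGTH
IMAGE_SIZE = 256       # assumed stand-in for cfg.MODEL.IMAGE_SIZE

def weak_perspective_to_translation(pred_cam: torch.Tensor) -> torch.Tensor:
    # pred_cam is (B, 3): [scale, tx, ty] in the crop frame.
    batch_size = pred_cam.shape[0]
    focal_length = FOCAL_LENGTH * torch.ones(batch_size, 2,
                                             device=pred_cam.device, dtype=pred_cam.dtype)
    # A larger predicted scale s means the hand fills more of the crop, i.e. sits closer to the camera.
    tz = 2 * focal_length[:, 0] / (IMAGE_SIZE * pred_cam[:, 0] + 1e-9)
    return torch.stack([pred_cam[:, 1], pred_cam[:, 2], tz], dim=-1)

print(weak_perspective_to_translation(torch.tensor([[0.9, 0.05, -0.02]])))  # tz is roughly 43.4 here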