Commit 5200fd70 authored by Low, Low Jian He (PG/R - Comp Sci & Elec Eng)

Adding a version of the JSON extractor that writes uncompressed output and is better suited to long videos

parent 4603aa55
@@ -23,6 +23,26 @@ from vitpose_model import ViTPoseModel
 import json
 from typing import Dict, Optional
+import subprocess
+
+def print_gpu_usage():
+    try:
+        # Run the `nvidia-smi` command
+        result = subprocess.run(['nvidia-smi', '--query-gpu=utilization.gpu,memory.used,memory.total', '--format=csv,nounits,noheader'],
+                                stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+        if result.returncode != 0:
+            print("Error fetching GPU usage:", result.stderr)
+            return
+        # Process the output
+        usage_lines = result.stdout.strip().split('\n')
+        for i, line in enumerate(usage_lines):
+            gpu_util, mem_used, mem_total = map(int, line.split(', '))
+            print(f"GPU {i}: Utilization: {gpu_util}%, Memory: {mem_used}/{mem_total} MiB")
+    except FileNotFoundError:
+        print("nvidia-smi command not found. Ensure NVIDIA drivers are installed.")
+
 # def Produce_Mesh_from_Dict(out, batch, model, args):
 #     renderer = Renderer(model_cfg, faces=model.mano.faces)
 #     multiplier = (2*batch['right']-1)
@@ -124,7 +144,7 @@ def convert_tensors_to_lists(d):
             convert_tensors_to_lists(value)
     return d
 
-def main(args, model, renderer, device, cpm):
+def main(args, model, renderer, device):
     initial_start_time = time.time()
 
     # Load detector
@@ -161,7 +181,7 @@ def main(args, model, renderer, device, cpm):
         outfile = os.path.join(temp_dir.name, filename)
         os.makedirs(outfile, exist_ok=True)
-        os.system(f"ffmpeg -i {args.vid} -vf fps={fps} {temp_dir.name}/Frame%d.png")
+        os.system(f"ffmpeg -nostdin -i {args.vid} -vf fps={fps} {temp_dir.name}/Frame%d.png")
         image_folder = Path(temp_dir.name)
         print(f"Total Time for Video to Image: {(time.time()-start_time)} Seconds")
     else:
@@ -169,13 +189,14 @@ def main(args, model, renderer, device, cpm):
         image_folder = Path(args.img_folder)
 
     # Get all demo images ends with .jpg or .png
     img_paths = [img for end in args.file_type for img in image_folder.glob(end)]
     img_paths = sorted(img_paths,key = lambda x: int(os.path.basename(x).removesuffix('.png').removeprefix('Frame')))
     if args.bbox:
         # Detect humans in image
         img_cv2 = cv2.imread(str(img_paths[0]))
         det_out = detector(img_cv2)
         det_instances = det_out['instances']
+        print_gpu_usage()
         #Clearing memory
         del detector
         torch.cuda.empty_cache()
@@ -190,6 +211,10 @@ def main(args, model, renderer, device, cpm):
         else:
             print("No humans detected in the image")
             return
+
+        # keypoint detector
+        cpm = ViTPoseModel(device)
+        print_gpu_usage()
     else:
         # Hardcoded bbox. This assumes person is in the center and that there is always one person in the image
         # Values below strictly for Rachel's BM Videos
@@ -301,7 +326,7 @@ if __name__ == '__main__':
     parser.add_argument('--MANO_Output', type=bool, default=False, help= 'If set, generate output images')
     args = parser.parse_args()
 
     model, model_cfg = load_hamer(args.checkpoint, load_mesh=True) # Load Mesh False only if no vertices are needed
 
     # Setup HaMeR model
@@ -309,25 +334,21 @@ if __name__ == '__main__':
     model = model.to(device)
     model.eval()
     renderer = Renderer(model_cfg, faces=model.mano.faces)
+    print_gpu_usage()
-    # keypoint detector
-    cpm = ViTPoseModel(device)
 
     # args.in_folder = '/vol/research/SignMotion/2024.02.20_Capture_1/glosses'
-    Folder_List = os.listdir(args.in_folder)
-    Folder_Dirname = os.path.dirname(args.in_folder)
     Rachel_Path = '//vol//research//SignFeaturePool//Rachel_Gloss_Features//HaMeR_Features'
+    Folder_List = [os.path.join(args.in_folder, f) for f in os.listdir(args.in_folder) if f.endswith('.mp4') or f.endswith('.png') or f.endswith('.jpg')]
     args.out_folder = os.path.join(Rachel_Path, os.path.basename(args.in_folder.removesuffix('/glosses')))
     args.out_folder = os.path.join(args.out_folder, os.path.basename(args.in_folder))
 
     if Folder_List[0].endswith('.mp4'):
         for vid_file in Folder_List:
-            args.vid = f'{Folder_Dirname}//{vid_file}'
+            args.vid = vid_file
             print(args.vid)
             try:
-                main(args, model, renderer, device, cpm)
+                main(args, model, renderer, device)
             except Exception as e:
                 print(f"Error in processing {vid_file}: {e}")
     elif Folder_List[0].endswith('.png') or Folder_List[0].endswith('.jpg'):
         args.img_folder = args.in_folder
-        main(args, model, renderer, device, cpm)
+        main(args, model, renderer, device)
from pathlib import Path
import torch
import argparse
import os
import cv2
import numpy as np
import tempfile
import time
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import gc
from hamer.configs import CACHE_DIR_HAMER
from hamer.models import HAMER, download_models, load_hamer, DEFAULT_CHECKPOINT
from hamer.utils import recursive_to
from hamer.datasets.vitdet_dataset import ViTDetDataset, DEFAULT_MEAN, DEFAULT_STD
from hamer.utils.renderer import Renderer, cam_crop_to_full
LIGHT_BLUE=(0.65098039, 0.74117647, 0.85882353)
from vitpose_model import ViTPoseModel
import json
from typing import Dict, Optional
import subprocess
def print_gpu_usage():
try:
# Run the `nvidia-smi` command
result = subprocess.run(['nvidia-smi', '--query-gpu=utilization.gpu,memory.used,memory.total', '--format=csv,nounits,noheader'],
stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
if result.returncode != 0:
print("Error fetching GPU usage:", result.stderr)
return
# Process the output
usage_lines = result.stdout.strip().split('\n')
for i, line in enumerate(usage_lines):
gpu_util, mem_used, mem_total = map(int, line.split(', '))
print(f"GPU {i}: Utilization: {gpu_util}%, Memory: {mem_used}/{mem_total} MiB")
except FileNotFoundError:
print("nvidia-smi command not found. Ensure NVIDIA drivers are installed.")
# def Produce_Mesh_from_Dict(out, batch, model, args):
# renderer = Renderer(model_cfg, faces=model.mano.faces)
# multiplier = (2*batch['right']-1)
# pred_cam = out['pred_cam']
# pred_cam[:,1] = multiplier*pred_cam[:,1]
# box_center = batch["box_center"].float()
# box_size = batch["box_size"].float()
# img_size = batch["img_size"].float()
# multiplier = (2*batch['right']-1)
# scaled_focal_length = model_cfg.EXTRA.FOCAL_LENGTH / model_cfg.MODEL.IMAGE_SIZE * img_size.max()
# pred_cam_t_full = cam_crop_to_full(pred_cam, box_center, box_size, img_size, scaled_focal_length).detach().cpu().numpy()
# # Render the result
# batch_size = batch['img'].shape[0]
# for n in range(batch_size):
# start_time = time.time()
# # Get filename from path img_path
# img_fn, _ = os.path.splitext(os.path.basename(img_path))
# person_id = int(batch['personid'][n])
# white_img = (torch.ones_like(batch['img'][n]).cpu() - DEFAULT_MEAN[:,None,None]/255) / (DEFAULT_STD[:,None,None]/255)
# input_patch = batch['img'][n].cpu() * (DEFAULT_STD[:,None,None]/255) + (DEFAULT_MEAN[:,None,None]/255)
# input_patch = input_patch.permute(1,2,0).numpy()
# regression_img = renderer(out['pred_vertices'][n].detach().cpu().numpy(),
# out['pred_cam_t'][n].detach().cpu().numpy(),
# batch['img'][n],
# mesh_base_color=LIGHT_BLUE,
# scene_bg_color=(1, 1, 1),
# )
# if args.side_view:
# side_img = renderer(out['pred_vertices'][n].detach().cpu().numpy(),
# out['pred_cam_t'][n].detach().cpu().numpy(),
# white_img,
# mesh_base_color=LIGHT_BLUE,
# scene_bg_color=(1, 1, 1),
# side_view=True)
# final_img = np.concatenate([input_patch, regression_img, side_img], axis=1)
# else:
# final_img = np.concatenate([input_patch, regression_img], axis=1)
# cv2.imwrite(os.path.join(args.out_folder, f'{img_fn}_{person_id}.png'), 255*final_img[:, :, ::-1])
# # Add all verts and cams to list
# verts = out['pred_vertices'][n].detach().cpu().numpy()
# is_right = batch['right'][n].cpu().numpy()
# verts[:,0] = (2*is_right-1)*verts[:,0]
# cam_t = pred_cam_t_full[n]
# all_verts.append(verts)
# all_cam_t.append(cam_t)
# all_right.append(is_right)
# # Save all meshes to disk
# if args.save_mesh:
# camera_translation = cam_t.copy()
# tmesh = renderer.vertices_to_trimesh(verts, camera_translation, LIGHT_BLUE, is_right=is_right)
# tmesh.export(os.path.join(args.out_folder, f'{img_fn}_{person_id}.obj'))
# print(f"Total Time for rendering Meshes and Saving: {(time.time()-start_time)} Seconds")
# # Render front view
# if args.full_frame and len(all_verts) > 0:
# misc_args = dict(
# mesh_base_color=LIGHT_BLUE,
# scene_bg_color=(1, 1, 1),
# focal_length=scaled_focal_length,
# )
# cam_view = renderer.render_rgba_multiple(all_verts, cam_t=all_cam_t, render_res=img_size[n], is_right=all_right, **misc_args)
# # Overlay image
# input_img = img_cv2.astype(np.float32)[:,:,::-1]/255.0
# input_img = np.concatenate([input_img, np.ones_like(input_img[:,:,:1])], axis=2) # Add alpha channel
# input_img_overlay = input_img[:,:,:3] * (1-cam_view[:,:,3:]) + cam_view[:,:,:3] * cam_view[:,:,3:]
# cv2.imwrite(os.path.join(args.out_folder, f'{img_fn}_all.jpg'), 255*input_img_overlay[:, :, ::-1])
def show_bbox(img, bbox):
bbox_tensor = torch.tensor(bbox)
bbox = bbox_tensor.cpu().numpy()[0]
# Extract coordinates
x1, y1, x2, y2 = bbox
    # Prepare the image for display with Matplotlib
image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # Convert BGR to RGB for Matplotlib
# Create a figure and axis
fig, ax = plt.subplots(1)
# Display the image
ax.imshow(image)
# Create a Rectangle patch
rect = patches.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=2, edgecolor='g', facecolor='none')
# Add the patch to the Axes
ax.add_patch(rect)
plt.show()
def convert_tensors_to_lists(d):
for key, value in d.items():
if isinstance(value, torch.Tensor):
d[key] = value.tolist() # Convert tensor to list
elif isinstance(value, dict): # If there is a nested dictionary
convert_tensors_to_lists(value)
return d
def main(args, model, renderer, device):
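    """Run the HaMeR hand-mesh pipeline on a video or image folder and dump per-frame results as JSON.

    Steps (see below): optionally extract frames with ffmpeg, detect the person box with detectron2
    (or use a hard-coded box), detect hand keypoints with ViTPose, run HaMeR on each detected hand,
    and write the converted outputs as one uncompressed JSON file per frame/batch.
    """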
initial_start_time = time.time()
# Load detector
if args.bbox:
from hamer.utils.utils_detectron2 import DefaultPredictor_Lazy
if args.body_detector == 'vitdet':
from detectron2.config import LazyConfig
import hamer
cfg_path = Path(hamer.__file__).parent/'configs'/'cascade_mask_rcnn_vitdet_h_75ep.py'
detectron2_cfg = LazyConfig.load(str(cfg_path))
detectron2_cfg.train.init_checkpoint = "https://dl.fbaipublicfiles.com/detectron2/ViTDet/COCO/cascade_mask_rcnn_vitdet_h/f328730692/model_final_f05665.pkl"
for i in range(3):
detectron2_cfg.model.roi_heads.box_predictors[i].test_score_thresh = 0.25
detector = DefaultPredictor_Lazy(detectron2_cfg)
elif args.body_detector == 'regnety':
from detectron2 import model_zoo
from detectron2.config import get_cfg
detectron2_cfg = model_zoo.get_config('new_baselines/mask_rcnn_regnety_4gf_dds_FPN_400ep_LSJ.py', trained=True)
detectron2_cfg.model.roi_heads.box_predictor.test_score_thresh = 0.5
detectron2_cfg.model.roi_heads.box_predictor.test_nms_thresh = 0.4
detector = DefaultPredictor_Lazy(detectron2_cfg)
os.makedirs(args.out_folder, exist_ok=True)
print(f"Total Time for Initialization: {(time.time()-initial_start_time)} Seconds")
start_time = time.time()
if args.vid != '':
fps = args.fps
temp_dir = tempfile.TemporaryDirectory()
print(f"Temp directory created at {temp_dir.name}")
#Create Temp Out Folder
filename = os.path.basename(args.vid).removesuffix('.mp4')
outfile = os.path.join(args.out_folder, filename)
os.makedirs(outfile, exist_ok=True)
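        # -nostdin keeps ffmpeg from reading the terminal's stdin, which can otherwise stall batch jobs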
os.system(f"ffmpeg -nostdin -i {args.vid} -vf fps={fps} {temp_dir.name}/Frame%d.png")
image_folder = Path(temp_dir.name)
print(f"Total Time for Video to Image: {(time.time()-start_time)} Seconds")
else:
temp_dir = None
image_folder = Path(args.img_folder)
    # Get all demo images ending with .jpg or .png
img_paths = [img for end in args.file_type for img in image_folder.glob(end)]
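    # Sort frames numerically by their Frame<N> index (assumes the Frame%d.png naming used for extracted frames)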
img_paths = sorted(img_paths,key = lambda x: int(os.path.basename(x).removesuffix('.png').removeprefix('Frame')))
if args.bbox:
# Detect humans in image
img_cv2 = cv2.imread(str(img_paths[0]))
det_out = detector(img_cv2)
det_instances = det_out['instances']
print_gpu_usage()
#Clearing memory
del detector
torch.cuda.empty_cache()
gc.collect()
valid_idx = (det_instances.pred_classes==0) & (det_instances.scores > 0.5)
        if len(valid_idx) > 0 and valid_idx[0]:  # guard against zero detections before indexing
pred_bboxes=det_instances.pred_boxes.tensor[valid_idx].cpu().numpy()
pred_scores=det_instances.scores[valid_idx].cpu().numpy()
x1, y1, x2, y2 = pred_bboxes[0]
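            # Heuristically expand the detected person box (widening is proportional to its left/top offsets)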
pred_bboxes[0] = [x1-x1*0.275, y1-y1*0.1, x2+x1*0.275, y2]
else:
print("No humans detected in the image")
return
# keypoint detector
cpm = ViTPoseModel(device)
print_gpu_usage()
else:
# Hardcoded bbox. This assumes person is in the center and that there is always one person in the image
# Values below strictly for Rachel's BM Videos
pred_bboxes = np.array([[300, 100, 1620, 1075]])
# Force confidence to be 0.99 that human is present
pred_scores = np.array([0.99])
        img_cv2 = cv2.imread(str(img_paths[0]))
        # The hand keypoint detector is needed on this path too (it is otherwise only created in the bbox branch above)
        cpm = ViTPoseModel(device)
# Iterate over all images in folder
for img_path in img_paths:
start_time = time.time()
img_cv2 = cv2.imread(str(img_path))
img = img_cv2.copy()[:, :, ::-1]
# # Detect humans in image
# det_out = detector(img_cv2)
# det_instances = det_out['instances']
# valid_idx = (det_instances.pred_classes==0) & (det_instances.scores > 0.5)
# pred_bboxes=det_instances.pred_boxes.tensor[valid_idx].cpu().numpy()
# pred_scores=det_instances.scores[valid_idx].cpu().numpy()
start_time = time.time()
# Detect human keypoints for each person
vitposes_out = cpm.predict_pose(
img,
[np.concatenate([pred_bboxes, pred_scores[:, None]], axis=1)],
)
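        # vitposes_out: one dict per detected person; its 'keypoints' array holds the whole-body keypoints used below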
bboxes = []
is_right = []
start_time = time.time()
# Use hands based on hand keypoint detections
for vitposes in vitposes_out:
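            # The last 42 whole-body keypoints are the hands: 21 left then 21 right, each row (x, y, confidence)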
left_hand_keyp = vitposes['keypoints'][-42:-21]
right_hand_keyp = vitposes['keypoints'][-21:]
            # Reject low-confidence keypoint detections
keyp = left_hand_keyp
valid = keyp[:,2] > 0.5
if sum(valid) > 3:
bbox = [keyp[valid,0].min(), keyp[valid,1].min(), keyp[valid,0].max(), keyp[valid,1].max()]
bboxes.append(bbox)
is_right.append(0)
keyp = right_hand_keyp
valid = keyp[:,2] > 0.5
if sum(valid) > 3:
bbox = [keyp[valid,0].min(), keyp[valid,1].min(), keyp[valid,0].max(), keyp[valid,1].max()]
bboxes.append(bbox)
is_right.append(1)
if len(bboxes) == 0:
continue
boxes = np.stack(bboxes)
right = np.stack(is_right)
start_time = time.time()
# Run reconstruction on all detected hands
dataset = ViTDetDataset(model_cfg, img_cv2, boxes, right, rescale_factor=args.rescale_factor)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False, num_workers=0)
all_verts = []
all_cam_t = []
all_right = []
for count,batch in enumerate(dataloader):
start_time = time.time()
batch = recursive_to(batch, device)
with torch.no_grad():
out = model(batch)
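            # Shallow-copy the model output and turn tensors into nested lists so the result is JSON-serializable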
output = convert_tensors_to_lists(out.copy())
output['VitPose'] = vitposes_out[0]['keypoints'].tolist()
            if args.vid == '':
                json_path = os.path.join(args.out_folder, f"{os.path.basename(img_path).removesuffix('.png')}_{count}.json")
            else:
                json_path = os.path.join(outfile, f"{os.path.basename(img_path).removesuffix('.png')}_{count}.json")
with open(json_path, 'w') as f:
json.dump(output, f, indent=4)
if temp_dir:
temp_dir.cleanup()
print(f"Temp directory {temp_dir.name} cleaned up")
print(f"Total time taken: {(time.time()-initial_start_time)/60:.2f} minutes")
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='HaMeR demo code')
parser.add_argument('--checkpoint', type=str, default=DEFAULT_CHECKPOINT, help='Path to pretrained model checkpoint')
parser.add_argument('--vid', type=str, default='', help='Path to video file')
parser.add_argument('--img_folder', type=str, default='images', help='Folder with input images')
parser.add_argument('--out_folder', type=str, default='out_demo', help='Output folder to save rendered results')
    parser.add_argument('--fps', type=int, default=50, help='Frame rate used when extracting frames from the input video')
parser.add_argument('--side_view', dest='side_view', action='store_true', default=False, help='If set, render side view also')
parser.add_argument('--full_frame', dest='full_frame', action='store_true', default=True, help='If set, render all people together also')
parser.add_argument('--save_mesh', dest='save_mesh', action='store_true', default=False, help='If set, save meshes to disk also')
parser.add_argument('--batch_size', type=int, default=1, help='Batch size for inference/fitting')
parser.add_argument('--rescale_factor', type=float, default=2.0, help='Factor for padding the bbox')
parser.add_argument('--body_detector', type=str, default='vitdet', choices=['vitdet', 'regnety'], help='Using regnety improves runtime and reduces memory')
parser.add_argument('--file_type', nargs='+', default=['*.jpg', '*.png'], help='List of file extensions to consider')
parser.add_argument('--bbox', type=bool, default=True, help= 'If set, use provided bbox from ViT')
parser.add_argument('--MANO_Output', type=bool, default=False, help= 'If set, generate output images')
args = parser.parse_args()
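    # Group outputs under a folder named after the input video's parent directory (MeinDGS layout)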
args.out_folder = os.path.join(args.out_folder, os.path.basename(os.path.dirname(args.vid)))
print(f"Output folder: {args.out_folder}")
print(f'Warning: Script Format Currently for MeinDGS Processing! FPS set at {args.fps}')
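    # Example invocation (hypothetical paths; substitute the actual script and video names):
    #   python this_script.py --vid /path/to/clip.mp4 --fps 50 --out_folder out_demo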
model, model_cfg = load_hamer(args.checkpoint, load_mesh=True) # Load Mesh False only if no vertices are needed
# Setup HaMeR model
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)
model.eval()
renderer = Renderer(model_cfg, faces=model.mano.faces)
print_gpu_usage()
try:
main(args, model, renderer, device)
except Exception as e:
print(f"Error in processing {args.vid}: {e}")