Skip to content
Snippets Groups Projects
Commit 6d15e6c4 authored by Chrol-Cannon, Joseph Dr (Computer Science)'s avatar Chrol-Cannon, Joseph Dr (Computer Science)
Browse files

finalize i3D processing in feature regen

parent 3ece3c41
No related branches found
No related tags found
No related merge requests found
......@@ -27,7 +27,7 @@ class FrameFV:
def __init__(self,path,args):
self.anno = annot_np(path)
self.net = VideoModelGlobalCoordLatent(args)
self.pre_resize_shape = (224, 224)
self.pre_resize_shape = (256, 340)
self.random_crop = gtransforms.GroupMultiScaleCrop(output_size=224,
scales=[1],
max_distort=0,
......@@ -39,23 +39,27 @@ class FrameFV:
# load video to ndarray list
img_array = load_av(finput)
print(img_array[0].shape)
#for i in range(len(img_array)):
# img_array[i] = cv2.resize(img_array[i],self.pre_resize_shape)
img_array = [cv2.resize(img, (self.pre_resize_shape[1], self.pre_resize_shape[0])) for img in img_array]
# convert BGR to RGB
frames = [cv2.cvtColor(img, cv2.COLOR_BGR2RGB) for img in img_array]
# convert ndarray to array of PIL Images for resize and cropping
frames = [Image.fromarray(img.astype('uint8'), 'RGB') for img in frames]
# resize
frames = [img.resize((self.pre_resize_shape[1], self.pre_resize_shape[0]), Image.BILINEAR) for img in frames]
# crop
frames, (offset_h, offset_w, crop_h, crop_w) = self.random_crop(frames)
# convert back from PIL to ndarray for cv2 channel separation
frames = [np.array(img) for img in frames]
# separate channels into R,G,B frame sequences
rs = []
gs = []
bs = []
for i in range(len(img_array)//3):
B, R, G = cv2.split(img_array[i])
for i in range(len(frames)):
R, G, B = cv2.split(frames[i])
rs.append(R)
gs.append(G)
bs.append(B)
frames = [rs, gs, bs]
#frames = [Image.fromarray(img.astype('uint8'), 'RGB') for img in img_array]
#frames, (offset_h, offset_w, crop_h, crop_w) = self.random_crop(frames)
# read frame annotations into Sequence
seq = SmthSequence()
......@@ -72,12 +76,8 @@ class FrameFV:
relations.append(fv)
relations = np.asarray(relations)
# TODO bb category embedding per frame
# i3D features per frame
#clip = torch.from_numpy(np.asarray([[img_array[0],img_array[1],img_array[2]]]))
clip = torch.from_numpy(np.asarray([frames]))
#clip = img_array
print(clip.shape)
clip = clip.float()
glo, vid = self.net.i3D(clip)
......@@ -111,7 +111,7 @@ if __name__ == '__main__':
help='primary image input size')
parser.add_argument('--batch_size', '-b', default=72, type=int,
metavar='N', help='mini-batch size (default: 72)')
parser.add_argument('--num_classes', default=50, type=int,
parser.add_argument('--num_classes', default=174, type=int,
help='num of class in the model')
parser.add_argument('--num_boxes', default=4, type=int,
help='num of boxes for each image')
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment