diff --git a/regen_frame_fv.py b/regen_frame_fv.py
index 59e6ab7ec83586e589567607f5241c4956efffe7..2e16af7ec85c90d93ea807b8632c4fafc378d23e 100644
--- a/regen_frame_fv.py
+++ b/regen_frame_fv.py
@@ -27,7 +27,7 @@ class FrameFV:
     def __init__(self,path,args):
         self.anno = annot_np(path)
         self.net = VideoModelGlobalCoordLatent(args)
-        self.pre_resize_shape = (224, 224)
+        self.pre_resize_shape = (256, 340)
         self.random_crop = gtransforms.GroupMultiScaleCrop(output_size=224,
                                                            scales=[1],
                                                            max_distort=0,
@@ -39,23 +39,27 @@ class FrameFV:
         # load video to ndarray list
         img_array = load_av(finput)
-        print(img_array[0].shape)
-        #for i in range(len(img_array)):
-        #    img_array[i] = cv2.resize(img_array[i],self.pre_resize_shape)
-        img_array = [cv2.resize(img, (self.pre_resize_shape[1], self.pre_resize_shape[0])) for img in img_array]
+        # convert BGR to RGB
+        frames = [cv2.cvtColor(img, cv2.COLOR_BGR2RGB) for img in img_array]
+        # convert ndarray to array of PIL Images for resize and cropping
+        frames = [Image.fromarray(img.astype('uint8'), 'RGB') for img in frames]
+        # resize
+        frames = [img.resize((self.pre_resize_shape[1], self.pre_resize_shape[0]), Image.BILINEAR) for img in frames]
+        # crop
+        frames, (offset_h, offset_w, crop_h, crop_w) = self.random_crop(frames)
+        # convert back from PIL to ndarray for cv2 channel separation
+        frames = [np.array(img) for img in frames]
+        # separate channels into R,G,B frame sequences
         rs = []
         gs = []
         bs = []
-        for i in range(len(img_array)//3):
-            B, R, G = cv2.split(img_array[i])
+        for i in range(len(frames)):
+            R, G, B = cv2.split(frames[i])
             rs.append(R)
             gs.append(G)
             bs.append(B)
         frames = [rs, gs, bs]
-
-        #frames = [Image.fromarray(img.astype('uint8'), 'RGB') for img in img_array]
-        #frames, (offset_h, offset_w, crop_h, crop_w) = self.random_crop(frames)
 
         # read frame annotations into Sequence
         seq = SmthSequence()
@@ -72,12 +76,8 @@ class FrameFV:
             relations.append(fv)
         relations = np.asarray(relations)
 
-        # TODO bb category embedding per frame
-        # i3D features per frame
-        #clip = torch.from_numpy(np.asarray([[img_array[0],img_array[1],img_array[2]]]))
         clip = torch.from_numpy(np.asarray([frames]))
-        #clip = img_array
         print(clip.shape)
         clip = clip.float()
         glo, vid = self.net.i3D(clip)
@@ -111,7 +111,7 @@ if __name__ == '__main__':
                         help='primary image input size')
     parser.add_argument('--batch_size', '-b', default=72, type=int, metavar='N',
                         help='mini-batch size (default: 72)')
-    parser.add_argument('--num_classes', default=50, type=int,
+    parser.add_argument('--num_classes', default=174, type=int,
                         help='num of class in the model')
    parser.add_argument('--num_boxes', default=4, type=int,
                        help='num of boxes for each image')