diff --git a/discarded_src/balance_dataset.py b/discarded_src/balance_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..cb2bc028cefe28bf9174664db9ba7cf9f52b8713 --- /dev/null +++ b/discarded_src/balance_dataset.py @@ -0,0 +1,101 @@ +import argparse +import json +import os +import random +import shutil + +from tqdm import tqdm + + +def load_labels(labels_path): + with open(labels_path, 'r') as f: + return json.load(f) + + +def get_video_paths(input_dir): + video_paths = {} + for part in ['part1', 'part2']: + part_dir = os.path.join(input_dir, part) + for video in os.listdir(part_dir): + video_paths[video] = os.path.join(part_dir, video) + return video_paths + + +def get_maximum_balanced_subset(labels, video_paths): + artefacts = set() + for video_labels in labels.values(): + artefacts.update(video_labels.keys()) + + balanced_subset = {} + + for artefact in artefacts: + positive_videos = [video for video, video_labels in labels.items() + if video in video_paths and video_labels.get(artefact, 0) == 1] + negative_videos = [video for video, video_labels in labels.items() + if video in video_paths and video_labels.get(artefact, 0) == 0] + + count_per_label = min(len(positive_videos), len(negative_videos)) + + selected_positive = set(random.sample(positive_videos, count_per_label)) + selected_negative = set(random.sample(negative_videos, count_per_label)) + + for video in selected_positive.union(selected_negative): + if video not in balanced_subset: + balanced_subset[video] = labels[video] + balanced_subset[video][artefact] = 1 if video in selected_positive else 0 + + return balanced_subset + + +def copy_videos(videos, video_paths, dst_dir): + os.makedirs(dst_dir, exist_ok=True) + for video in tqdm(videos, desc=f"Copying to {os.path.basename(dst_dir)}"): + src_path = video_paths[video] + dst_path = os.path.join(dst_dir, video) + shutil.copy2(src_path, dst_path) + + +def create_subset_labels(balanced_subset): + return balanced_subset + + +def main(): + parser = argparse.ArgumentParser( + description="Create a maximum balanced subset of videos for all artefacts and relocate them.") + parser.add_argument("--input_dir", type=str, required=True, help="Path to processed_BVIArtefact folder") + parser.add_argument("--output_dir", type=str, required=True, help="Path to output directory") + args = parser.parse_args() + + labels_path = os.path.join(args.input_dir, 'processed_labels.json') + labels = load_labels(labels_path) + + video_paths = get_video_paths(args.input_dir) + + balanced_subset = get_maximum_balanced_subset(labels, video_paths) + + copy_videos(balanced_subset.keys(), video_paths, args.output_dir) + + # Create and save the subset labels.json + subset_labels = create_subset_labels(balanced_subset) + labels_json_path = os.path.join(args.output_dir, 'labels.json') + with open(labels_json_path, 'w') as f: + json.dump(subset_labels, f, indent=4) + + print(f"Maximum balanced subset created in {args.output_dir}") + print(f"Total videos in subset: {len(balanced_subset)}") + print(f"Labels.json created at {labels_json_path}") + + artefacts = set() + for video_labels in balanced_subset.values(): + artefacts.update(video_labels.keys()) + + for artefact in sorted(artefacts): + presence_count = sum(1 for labels in balanced_subset.values() if labels.get(artefact, 0) == 1) + absence_count = sum(1 for labels in balanced_subset.values() if labels.get(artefact, 0) == 0) + print(f"{artefact}:") + print(f" Presence count: {presence_count}") + print(f" Absence count: {absence_count}") 
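+
+# Example invocation (a sketch; the paths below are illustrative, not part of the repo):
+#   python balance_dataset.py \
+#       --input_dir data/processed_BVIArtefact \
+#       --output_dir data/BVIArtefact_balanced_subset
+# --input_dir is expected to contain part1/, part2/ and processed_labels.json;
+# the selected videos plus a new labels.json are written to --output_dir.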
+ + +if __name__ == "__main__": + main() diff --git a/discarded_src/sample_code.py b/discarded_src/sample_code.py new file mode 100644 index 0000000000000000000000000000000000000000..975732c89a404ead168301857c0b435ec0d3e97a --- /dev/null +++ b/discarded_src/sample_code.py @@ -0,0 +1,260 @@ +# dataset structure: +''' +data/graininess_100_balanced_subset_split +├── test +│ ├── BirdsInCage_1920x1080_30fps_8bit_420_Pristine_QP32_FBT_1.avi +│ ├── Chimera1_4096x2160_60fps_10bit_420_graininess_QP32_FB_1.avi +│ ├── Chimera3_4096x2160_24fps_10bit_420_graininess_QP32_FT_1.avi +│ ├── ... +│ └── labels.json +├── train +│ ├── labels.json +│ ├── lamppost_1920x1080_120fps_8bit_420_Pristine_QP32_BT_3.avi +│ ├── lamppost_1920x1080_120fps_8bit_420_Pristine_QP47_SF_3.avi +│ ├── leaveswall_1920x1080_120fps_8bit_420_Motion_QP32_SB_1.avi +│ ├── leaveswall_1920x1080_120fps_8bit_420_Motion_QP32_SFB_4.avi +│ ├── library_1920x1080_120fps_8bit_420_aliasing_QP47_FT_1.avi +│ ├── ... +└── val + ├── Chimera2_4096x2160_60fps_10bit_420_Dark_QP32_BT_1.avi + ├── ... + ├── labels.json + ├── shields_1280x720_50fps_8bit_420_graininess_QP47_SFB_1.avi + ├── station_1920x1080_30fps_8bit_420_graininess_QP32_SB_1.avi + ├── svtmidnightsun_3840x2160_50fps_10bit_420_banding_QP47_SBT_3.avi + ├── svtmidnightsun_3840x2160_50fps_10bit_420_banding_QP47_SFT_1.avi + ├── svtsmokesauna_3840x2160_50fps_10bit_420_banding_QP32_F_4.avi + ├── svtwaterflyover_3840x2160_50fps_10bit_420_banding_QP32_T_3.avi + └── typing_1920x1080_120fps_8bit_420_aliasing_QP47_BT_4.avi + +4 directories, 103 files +''' + +''' +labels.json in each split is like: +{ + "Chimera1_4096x2160_60fps_10bit_420_graininess_QP47_FT_1.avi": { + "graininess": 1 + }, + "riverbed_1920x1080_25fps_8bit_420_banding_QP47_SBT_1.avi": { + "graininess": 0 + }, + "Meridian1_3840x2160_60fps_10bit_420_banding_QP47_SFT_1.avi": { + "graininess": 0 + }, + ''' + + +# Import necessary libraries +import os +import json +import torch +import numpy as np +from transformers import VivitImageProcessor, VivitForVideoClassification, TrainingArguments, Trainer +from datasets import Dataset, DatasetDict +from torchvision.io import read_video +from sklearn.metrics import accuracy_score, precision_recall_fscore_support +from multiprocessing import Pool +import functools + + +def load_video(video_path): + # Read the video file + video, _, info = read_video(video_path, pts_unit='sec') + + # Set the number of frames we want to sample + num_frames_to_sample = 32 + + # Get the total number of frames in the video + total_frames = video.shape[0] + + # Calculate the sampling rate to evenly distribute frames + sampling_rate = max(total_frames // num_frames_to_sample, 1) + + # Sample frames at the calculated rate + sampled_frames = video[::sampling_rate][:num_frames_to_sample] + + # If we don't have enough frames, pad with zeros + if sampled_frames.shape[0] < num_frames_to_sample: + padding = torch.zeros( + (num_frames_to_sample - sampled_frames.shape[0], *sampled_frames.shape[1:]), dtype=sampled_frames.dtype) + sampled_frames = torch.cat([sampled_frames, padding], dim=0) + + # Ensure we have exactly the number of frames we want + sampled_frames = sampled_frames[:num_frames_to_sample] + + # Convert to numpy array and change to channel-first format (C, H, W) + return sampled_frames.permute(0, 3, 1, 2).numpy() + + +def create_dataset(data_dir, split): + # Construct the path to the video directory and labels file + video_dir = os.path.join(data_dir, split) + json_path = os.path.join(video_dir, 'labels.json') + + # Load the 
labels from the JSON file + with open(json_path, 'r') as f: + labels = json.load(f) + + # Get all video files in the directory + video_files = [f for f in os.listdir(video_dir) if f.endswith('.avi')] + + # Create a dataset with video paths and their corresponding labels + dataset = Dataset.from_dict({ + 'video_path': [os.path.join(video_dir, f) for f in video_files], + 'label': [labels[f]['graininess'] for f in video_files] + }) + + return dataset + + +# Load the ViViT image processor +image_processor = VivitImageProcessor.from_pretrained( + "google/vivit-b-16x2-kinetics400") + + +def preprocess_video(example, image_processor): + # Load the video + video = load_video(example['video_path']) + + # Process the video frames using the ViViT image processor + inputs = image_processor(list(video), return_tensors="np") + + # Add the processed inputs to the example dictionary + for k, v in inputs.items(): + example[k] = v.squeeze() # Remove batch dimension + + return example + + +def preprocess_dataset(dataset, num_proc=4): + # Use multiprocessing to preprocess the dataset in parallel + return dataset.map( + functools.partial(preprocess_video, image_processor=image_processor), + remove_columns=['video_path'], + num_proc=num_proc + ) + + +# Define the path to the dataset +data_dir = 'graininess_100_balanced_subset_split' + +# Load the datasets for each split +dataset = DatasetDict({ + 'train': create_dataset(data_dir, 'train'), + 'validation': create_dataset(data_dir, 'val'), + 'test': create_dataset(data_dir, 'test') +}) + +# Define the path where the preprocessed dataset will be saved +preprocessed_path = './preprocessed_dataset' + +# Check if preprocessed dataset already exists +if os.path.exists(preprocessed_path): + print("Loading preprocessed dataset...") + # Load the preprocessed dataset from disk + preprocessed_dataset = DatasetDict.load_from_disk(preprocessed_path) +else: + print("Preprocessing dataset...") + # Preprocess each split of the dataset + preprocessed_dataset = DatasetDict({ + split: preprocess_dataset(dataset[split]) + for split in dataset.keys() + }) + # Save the preprocessed dataset to disk + preprocessed_dataset.save_to_disk(preprocessed_path) + print("Preprocessed dataset saved to disk.") + +# Load the ViViT model +model = VivitForVideoClassification.from_pretrained( + "google/vivit-b-16x2-kinetics400") + +# Modify the model for binary classification +model.classifier = torch.nn.Linear(model.config.hidden_size, 2) +model.num_labels = 2 + +# Set up training arguments +training_args = TrainingArguments( + output_dir="./results", # Directory to save the model checkpoints + num_train_epochs=3, # Number of training epochs + per_device_train_batch_size=2, # Batch size for training + per_device_eval_batch_size=2, # Batch size for evaluation + warmup_steps=500, # Number of warmup steps for learning rate scheduler + weight_decay=0.01, # Strength of weight decay + logging_dir='./logs', # Directory for storing logs + logging_steps=10, # Log every X updates steps + evaluation_strategy="steps", # Evaluate during training + eval_steps=100, # Evaluate every X steps + save_steps=1000, # Save checkpoint every X steps + # Load the best model when finished training (default metric is loss) + load_best_model_at_end=True, +) + +# Define function to compute evaluation metrics + + +def compute_metrics(eval_pred): + # Get the predictions and true labels + predictions = np.argmax(eval_pred.predictions, axis=1) + labels = eval_pred.label_ids + + # Compute precision, recall, and F1 score + precision, 
recall, f1, _ = precision_recall_fscore_support( + labels, predictions, average='binary') + + # Compute accuracy + accuracy = accuracy_score(labels, predictions) + + # Return all metrics + return { + 'accuracy': accuracy, + 'f1': f1, + 'precision': precision, + 'recall': recall + } + + +# Initialize the Trainer +trainer = Trainer( + model=model, # The instantiated model to be trained + args=training_args, # Training arguments, defined above + train_dataset=preprocessed_dataset['train'], # Training dataset + eval_dataset=preprocessed_dataset['validation'], # Evaluation dataset + compute_metrics=compute_metrics, # The function that computes metrics +) + +# Train the model +trainer.train() + +# Evaluate the model on the test set +evaluation_results = trainer.evaluate(preprocessed_dataset['test']) +print(evaluation_results) + +# Save the final model +trainer.save_model("./vivit_binary_classifier") + +# Function to predict on new videos + + +def predict_video(video_path): + # Load and preprocess the video + video = load_video(video_path) + inputs = image_processor(list(video), return_tensors="pt") + + # Make prediction + with torch.no_grad(): + outputs = model(**inputs) + + # Get probabilities and predicted class + probabilities = torch.softmax(outputs.logits, dim=1) + predicted_class = torch.argmax(probabilities, dim=1).item() + + return predicted_class, probabilities[0][predicted_class].item() + + + + +# Example usage of prediction function +# video_path = "path/to/your/video.avi" +# predicted_class, confidence = predict_video(video_path) +# print(f"Predicted class: {predicted_class}, Confidence: {confidence:.2f}") diff --git a/discarded_src/sample_code_try_2.py b/discarded_src/sample_code_try_2.py new file mode 100644 index 0000000000000000000000000000000000000000..2923397483e1e2860998c76c0cb7fb4cffa8a041 --- /dev/null +++ b/discarded_src/sample_code_try_2.py @@ -0,0 +1,231 @@ +''' +# dataset structure: +data/graininess_100_balanced_subset_split +├── test +│ ├── BirdsInCage_1920x1080_30fps_8bit_420_Pristine_QP32_FBT_1.avi +│ ├── Chimera1_4096x2160_60fps_10bit_420_graininess_QP32_FB_1.avi +│ ├── Chimera3_4096x2160_24fps_10bit_420_graininess_QP32_FT_1.avi +│ ├── ... +│ └── labels.json +├── train +│ ├── labels.json +│ ├── lamppost_1920x1080_120fps_8bit_420_Pristine_QP32_BT_3.avi +│ ├── lamppost_1920x1080_120fps_8bit_420_Pristine_QP47_SF_3.avi +│ ├── leaveswall_1920x1080_120fps_8bit_420_Motion_QP32_SB_1.avi +│ ├── leaveswall_1920x1080_120fps_8bit_420_Motion_QP32_SFB_4.avi +│ ├── library_1920x1080_120fps_8bit_420_aliasing_QP47_FT_1.avi +│ ├── ... +└── val + ├── Chimera2_4096x2160_60fps_10bit_420_Dark_QP32_BT_1.avi + ├── ... 
+ ├── labels.json + ├── shields_1280x720_50fps_8bit_420_graininess_QP47_SFB_1.avi + ├── station_1920x1080_30fps_8bit_420_graininess_QP32_SB_1.avi + ├── svtmidnightsun_3840x2160_50fps_10bit_420_banding_QP47_SBT_3.avi + ├── svtmidnightsun_3840x2160_50fps_10bit_420_banding_QP47_SFT_1.avi + ├── svtsmokesauna_3840x2160_50fps_10bit_420_banding_QP32_F_4.avi + ├── svtwaterflyover_3840x2160_50fps_10bit_420_banding_QP32_T_3.avi + └── typing_1920x1080_120fps_8bit_420_aliasing_QP47_BT_4.avi + +4 directories, 103 files +''' + +''' +labels.json in each split is like: +{ + "Chimera1_4096x2160_60fps_10bit_420_graininess_QP47_FT_1.avi": { + "graininess": 1 + }, + "riverbed_1920x1080_25fps_8bit_420_banding_QP47_SBT_1.avi": { + "graininess": 0 + }, + "Meridian1_3840x2160_60fps_10bit_420_banding_QP47_SFT_1.avi": { + "graininess": 0 + }, + ''' + +import os +import json +import torch +import numpy as np +from transformers import VivitImageProcessor, VivitForVideoClassification, TrainingArguments, Trainer +from datasets import Dataset, DatasetDict +from torchvision.io import read_video +import torchvision.transforms as T +from sklearn.metrics import accuracy_score, precision_recall_fscore_support +import albumentations as A +from albumentations.pytorch import ToTensorV2 +import cv2 +from functools import partial + + +def get_augmentation(): + return A.Compose([ + A.HorizontalFlip(p=0.5), + A.VerticalFlip(p=0.5), + A.RandomRotate90(p=0.5), + A.Transpose(p=0.5), + A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.05, rotate_limit=15, p=0.5), + ToTensorV2(), + ]) + + +def apply_augmentation(frames, augmentation): + aug_frames = [] + for frame in frames: + augmented = augmentation(image=frame) + aug_frames.append(augmented['image']) + return torch.stack(aug_frames) + + +def uniform_frame_sample(video, num_frames): + total_frames = len(video) + if total_frames <= num_frames: + return video + + indices = np.linspace(0, total_frames - 1, num_frames, dtype=int) + return video[indices] + + +def load_video(video_path, num_frames=32, augmentation=None): + video, _, info = read_video(video_path, pts_unit='sec') + + # Uniform sampling + sampled_frames = uniform_frame_sample(video, num_frames) + + if augmentation: + sampled_frames = apply_augmentation(sampled_frames, augmentation) + + return sampled_frames.permute(0, 3, 1, 2).float() / 255.0 + + +def create_dataset(data_dir, split): + video_dir = os.path.join(data_dir, split) + json_path = os.path.join(video_dir, 'labels.json') + with open(json_path, 'r') as f: + labels = json.load(f) + + video_files = [f for f in os.listdir(video_dir) if f.endswith('.avi')] + + dataset = Dataset.from_dict({ + 'video_path': [os.path.join(video_dir, f) for f in video_files], + 'label': [labels[f]['graininess'] for f in video_files] + }) + + return dataset + + +# Load the image processor +image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400") + + +def preprocess_video(example, image_processor, augmentation=None): + video = load_video(example['video_path'], augmentation=augmentation) + inputs = image_processor(list(video), return_tensors="pt") + for k, v in inputs.items(): + example[k] = v.squeeze() + return example + + +def preprocess_dataset(dataset, augmentation=None): + return dataset.map( + partial(preprocess_video, image_processor=image_processor, augmentation=augmentation), + remove_columns=['video_path'], + num_proc=4 + ) + + +# Load and preprocess the datasets +data_dir = 'graininess_100_balanced_subset_split' +dataset = DatasetDict({ + 'train': 
create_dataset(data_dir, 'train'), + 'validation': create_dataset(data_dir, 'val'), + 'test': create_dataset(data_dir, 'test') +}) + +augmentation = get_augmentation() + +preprocessed_path = './preprocessed_dataset_augmented' +if os.path.exists(preprocessed_path): + print("Loading preprocessed dataset...") + preprocessed_dataset = DatasetDict.load_from_disk(preprocessed_path) +else: + print("Preprocessing dataset with augmentation...") + preprocessed_dataset = DatasetDict({ + 'train': preprocess_dataset(dataset['train'], augmentation), + 'validation': preprocess_dataset(dataset['validation']), + 'test': preprocess_dataset(dataset['test']) + }) + preprocessed_dataset.save_to_disk(preprocessed_path) + print("Preprocessed dataset saved to disk.") + +# Load the model +model = VivitForVideoClassification.from_pretrained("google/vivit-b-16x2-kinetics400") +model.classifier = torch.nn.Linear(model.config.hidden_size, 2) +model.num_labels = 2 + +# Set up training arguments +training_args = TrainingArguments( + output_dir="./results", + num_train_epochs=5, + per_device_train_batch_size=4, + per_device_eval_batch_size=4, + warmup_steps=500, + weight_decay=0.01, + logging_dir='./logs', + logging_steps=10, + evaluation_strategy="steps", + eval_steps=100, + save_steps=1000, + load_best_model_at_end=True, + fp16=True, # Enable mixed precision training + gradient_accumulation_steps=2, # Accumulate gradients over 2 steps +) + + +def compute_metrics(eval_pred): + predictions = np.argmax(eval_pred.predictions, axis=1) + labels = eval_pred.label_ids + precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary') + accuracy = accuracy_score(labels, predictions) + return { + 'accuracy': accuracy, + 'f1': f1, + 'precision': precision, + 'recall': recall + } + + +# Initialize Trainer +trainer = Trainer( + model=model, + args=training_args, + train_dataset=preprocessed_dataset['train'], + eval_dataset=preprocessed_dataset['validation'], + compute_metrics=compute_metrics, +) + +# Train the model +trainer.train() + +# Evaluate the model +evaluation_results = trainer.evaluate(preprocessed_dataset['test']) +print(evaluation_results) + +# Save the model +trainer.save_model("./vivit_binary_classifier_augmented") + + +def predict_video(video_path): + video = load_video(video_path) + inputs = image_processor(list(video), return_tensors="pt") + with torch.no_grad(): + outputs = model(**inputs) + probabilities = torch.softmax(outputs.logits, dim=1) + predicted_class = torch.argmax(probabilities, dim=1).item() + return predicted_class, probabilities[0][predicted_class].item() + +# Example usage of prediction function +# video_path = "path/to/your/video.avi" +# predicted_class, confidence = predict_video(video_path) +# print(f"Predicted class: {predicted_class}, Confidence: {confidence:.2f}") diff --git a/discarded_src/test_run.py b/discarded_src/test_run.py new file mode 100644 index 0000000000000000000000000000000000000000..b2df458ad95edb73bb1248a3103a816f0f3d1a20 --- /dev/null +++ b/discarded_src/test_run.py @@ -0,0 +1,212 @@ +import os +import json +import torch +import torch.nn as nn +import torch.optim as optim +from torch.utils.data import Dataset, DataLoader +from torchvision import transforms, models +from torchvision.io import read_video +from torchvision.models import ResNet50_Weights + +# Set device +if torch.cuda.is_available(): + device = torch.device("cuda") +elif torch.backends.mps.is_available(): + device = torch.device("mps") +else: + device = torch.device("cpu") + 
+print(f"Using device: {device}") + +# Define paths +data_path = "data/graininess_100_balanced_subset_split" +train_path = os.path.join(data_path, "train") +val_path = os.path.join(data_path, "val") +test_path = os.path.join(data_path, "test") + +# Define artifact (can be extended for multi-task later) +artifact = "graininess" + + +# Helper function to load labels +def load_labels(split_path): + with open(os.path.join(split_path, "labels.json"), "r") as f: + return json.load(f) + + +# Custom dataset class +class VideoDataset(Dataset): + def __init__(self, root_dir, labels, artifact): + self.root_dir = root_dir + self.labels = labels + self.artifact = artifact + self.video_files = [f for f in os.listdir(root_dir) if f.endswith('.avi')] + self.transform = transforms.Compose([ + transforms.ConvertImageDtype(torch.float32), + transforms.Normalize(mean=[0.45, 0.45, 0.45], std=[0.225, 0.225, 0.225]) + ]) + + def __len__(self): + return len(self.video_files) + + def __getitem__(self, idx): + video_name = self.video_files[idx] + video_path = os.path.join(self.root_dir, video_name) + label = self.labels[video_name][self.artifact] + + # Load video using torchvision + video, _, _ = read_video(video_path, pts_unit='sec') + + # Subsample frames (adjust as needed) + video = video[::video.shape[0] // 16][:16] + + # Apply normalization + video = self.transform(video) + + # Rearrange dimensions to [C, T, H, W] + video = video.permute(3, 0, 1, 2) + + return video, torch.tensor(label, dtype=torch.float32) + + +# Create datasets +train_labels = load_labels(train_path) +val_labels = load_labels(val_path) +test_labels = load_labels(test_path) + +train_dataset = VideoDataset(train_path, train_labels, artifact) +val_dataset = VideoDataset(val_path, val_labels, artifact) +test_dataset = VideoDataset(test_path, test_labels, artifact) + +# Create data loaders +batch_size = 8 +train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4) +val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4) +test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4) + + +# Define model +class VideoClassifier(nn.Module): + def __init__(self, num_classes=1): + super(VideoClassifier, self).__init__() + self.resnet = models.resnet50(weights=ResNet50_Weights.DEFAULT) + self.resnet.conv1 = nn.Conv2d(16, 64, kernel_size=7, stride=2, padding=3, bias=False) + self.fc = nn.Linear(2048, num_classes) + + def forward(self, x): + b, c, t, h, w = x.shape + x = x.transpose(1, 2).reshape(b * t, c, h, w) + x = self.resnet.conv1(x) + x = self.resnet.bn1(x) + x = self.resnet.relu(x) + x = self.resnet.maxpool(x) + x = self.resnet.layer1(x) + x = self.resnet.layer2(x) + x = self.resnet.layer3(x) + x = self.resnet.layer4(x) + x = self.resnet.avgpool(x) + x = x.reshape(b, t, -1).mean(1) + x = self.fc(x) + return torch.sigmoid(x) + + +model = VideoClassifier().to(device) + +# Define loss function and optimizer +criterion = nn.BCELoss() +optimizer = optim.Adam(model.parameters(), lr=0.001) + + +# Training function +def train(model, train_loader, criterion, optimizer, device): + model.train() + running_loss = 0.0 + correct = 0 + total = 0 + + for videos, labels in train_loader: + videos, labels = videos.to(device), labels.to(device) + + optimizer.zero_grad() + outputs = model(videos) + loss = criterion(outputs.squeeze(), labels) + loss.backward() + optimizer.step() + + running_loss += loss.item() + predicted = (outputs.squeeze() > 0.5).float() + total += 
labels.size(0) + correct += (predicted == labels).sum().item() + + epoch_loss = running_loss / len(train_loader) + epoch_acc = correct / total + return epoch_loss, epoch_acc + + +# Validation function +def validate(model, val_loader, criterion, device): + model.eval() + running_loss = 0.0 + correct = 0 + total = 0 + + with torch.no_grad(): + for videos, labels in val_loader: + videos, labels = videos.to(device), labels.to(device) + + outputs = model(videos) + loss = criterion(outputs.squeeze(), labels) + + running_loss += loss.item() + predicted = (outputs.squeeze() > 0.5).float() + total += labels.size(0) + correct += (predicted == labels).sum().item() + + epoch_loss = running_loss / len(val_loader) + epoch_acc = correct / total + return epoch_loss, epoch_acc + + +# Training loop +num_epochs = 10 +for epoch in range(num_epochs): + train_loss, train_acc = train(model, train_loader, criterion, optimizer, device) + val_loss, val_acc = validate(model, val_loader, criterion, device) + + print(f"Epoch {epoch + 1}/{num_epochs}") + print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}") + print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}") + print() + + +# Test function +def test(model, test_loader, criterion, device): + model.eval() + running_loss = 0.0 + correct = 0 + total = 0 + + with torch.no_grad(): + for videos, labels in test_loader: + videos, labels = videos.to(device), labels.to(device) + + outputs = model(videos) + loss = criterion(outputs.squeeze(), labels) + + running_loss += loss.item() + predicted = (outputs.squeeze() > 0.5).float() + total += labels.size(0) + correct += (predicted == labels).sum().item() + + test_loss = running_loss / len(test_loader) + test_acc = correct / total + return test_loss, test_acc + + +# Evaluate on test set +test_loss, test_acc = test(model, test_loader, criterion, device) +print(f"Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}") + +# Save the model +torch.save(model.state_dict(), f"video_classifier_{artifact}.pth") +print(f"Model saved as video_classifier_{artifact}.pth") diff --git a/logs/first_run_logs/run-2024-08-25--18-57-46/events.out.tfevents.1724612332.26610f3d33fa.1894.0 b/logs/first_run_logs/run-2024-08-25--18-57-46/events.out.tfevents.1724612332.26610f3d33fa.1894.0 new file mode 100644 index 0000000000000000000000000000000000000000..cd6b3acc3dd59de5924f5c744c6dfed965de0bc9 Binary files /dev/null and b/logs/first_run_logs/run-2024-08-25--18-57-46/events.out.tfevents.1724612332.26610f3d33fa.1894.0 differ diff --git a/logs/first_run_logs/run-2024-08-25--18-57-46/events.out.tfevents.1724612413.26610f3d33fa.1894.1 b/logs/first_run_logs/run-2024-08-25--18-57-46/events.out.tfevents.1724612413.26610f3d33fa.1894.1 new file mode 100644 index 0000000000000000000000000000000000000000..1dcb305968855178b24dd6f7a555aee19ce2cfc5 Binary files /dev/null and b/logs/first_run_logs/run-2024-08-25--18-57-46/events.out.tfevents.1724612413.26610f3d33fa.1894.1 differ diff --git a/logs/first_run_logs/run-2024-08-25--19-00-48/events.out.tfevents.1724612452.26610f3d33fa.1894.2 b/logs/first_run_logs/run-2024-08-25--19-00-48/events.out.tfevents.1724612452.26610f3d33fa.1894.2 new file mode 100644 index 0000000000000000000000000000000000000000..3619b50087b5c9d5154ef9d48164fbb6b557b856 Binary files /dev/null and b/logs/first_run_logs/run-2024-08-25--19-00-48/events.out.tfevents.1724612452.26610f3d33fa.1894.2 differ diff --git a/logs/first_run_logs/run-2024-08-25--19-00-48/events.out.tfevents.1724612515.26610f3d33fa.1894.3 
b/logs/first_run_logs/run-2024-08-25--19-00-48/events.out.tfevents.1724612515.26610f3d33fa.1894.3 new file mode 100644 index 0000000000000000000000000000000000000000..bb878f89e4e3b7341f99e0c1014a936814064682 Binary files /dev/null and b/logs/first_run_logs/run-2024-08-25--19-00-48/events.out.tfevents.1724612515.26610f3d33fa.1894.3 differ diff --git a/logs/first_run_logs/run-2024-08-25--19-02-49/events.out.tfevents.1724612582.26610f3d33fa.5666.0 b/logs/first_run_logs/run-2024-08-25--19-02-49/events.out.tfevents.1724612582.26610f3d33fa.5666.0 new file mode 100644 index 0000000000000000000000000000000000000000..7588ca876fce4b56f15ca5f21534edb4b4f6fcc6 Binary files /dev/null and b/logs/first_run_logs/run-2024-08-25--19-02-49/events.out.tfevents.1724612582.26610f3d33fa.5666.0 differ diff --git a/logs/first_run_logs/run-2024-08-25--19-02-49/events.out.tfevents.1724612683.26610f3d33fa.5666.1 b/logs/first_run_logs/run-2024-08-25--19-02-49/events.out.tfevents.1724612683.26610f3d33fa.5666.1 new file mode 100644 index 0000000000000000000000000000000000000000..ef95393aca2d3ecbd709755be79cb946c7b69f49 Binary files /dev/null and b/logs/first_run_logs/run-2024-08-25--19-02-49/events.out.tfevents.1724612683.26610f3d33fa.5666.1 differ diff --git a/logs/first_run_logs/run-2024-08-25--19-05-27/events.out.tfevents.1724612737.26610f3d33fa.5666.2 b/logs/first_run_logs/run-2024-08-25--19-05-27/events.out.tfevents.1724612737.26610f3d33fa.5666.2 new file mode 100644 index 0000000000000000000000000000000000000000..6ea8859409230e96f44bb013b856c7203d32cb1b Binary files /dev/null and b/logs/first_run_logs/run-2024-08-25--19-05-27/events.out.tfevents.1724612737.26610f3d33fa.5666.2 differ diff --git a/logs/first_run_logs/run-2024-08-25--19-05-27/run_info.txt b/logs/first_run_logs/run-2024-08-25--19-05-27/run_info.txt new file mode 100644 index 0000000000000000000000000000000000000000..a22fb47aedf86c9898107e31b3e6e02f3e0ee33d --- /dev/null +++ b/logs/first_run_logs/run-2024-08-25--19-05-27/run_info.txt @@ -0,0 +1,138 @@ +Run Name: run-2024-08-25--19-05-27 +Model: MultiTaskTimeSformer +Training Arguments: + output_dir: ./results/run-2024-08-25--19-05-27 + overwrite_output_dir: False + do_train: False + do_eval: True + do_predict: False + eval_strategy: steps + prediction_loss_only: False + per_device_train_batch_size: 16 + per_device_eval_batch_size: 16 + per_gpu_train_batch_size: None + per_gpu_eval_batch_size: None + gradient_accumulation_steps: 2 + eval_accumulation_steps: None + eval_delay: 0 + learning_rate: 5e-05 + weight_decay: 0.01 + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_epsilon: 1e-08 + max_grad_norm: 1.0 + num_train_epochs: 3.0 + max_steps: 420 + lr_scheduler_type: linear + lr_scheduler_kwargs: {} + warmup_ratio: 0.1 + warmup_steps: 0 + log_level: passive + log_level_replica: warning + log_on_each_node: True + logging_dir: ./logs/run-2024-08-25--19-05-27 + logging_strategy: steps + logging_first_step: False + logging_steps: 20 + logging_nan_inf_filter: True + save_strategy: steps + save_steps: 100 + save_total_limit: 2 + save_safetensors: True + save_on_each_node: False + save_only_model: False + restore_callback_states_from_checkpoint: False + no_cuda: False + use_cpu: False + use_mps_device: False + seed: 42 + data_seed: None + jit_mode_eval: False + use_ipex: False + bf16: False + fp16: True + fp16_opt_level: O1 + half_precision_backend: auto + bf16_full_eval: False + fp16_full_eval: False + tf32: None + local_rank: 0 + ddp_backend: None + tpu_num_cores: None + tpu_metrics_debug: False + debug: [] + 
dataloader_drop_last: False + eval_steps: 50 + dataloader_num_workers: 12 + dataloader_prefetch_factor: None + past_index: -1 + run_name: run-2024-08-25--19-05-27 + disable_tqdm: False + remove_unused_columns: True + label_names: None + load_best_model_at_end: True + metric_for_best_model: f1 + greater_is_better: True + ignore_data_skip: False + fsdp: [] + fsdp_min_num_params: 0 + fsdp_config: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False} + fsdp_transformer_layer_cls_to_wrap: None + accelerator_config: AcceleratorConfig(split_batches=False, dispatch_batches=None, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False) + deepspeed: None + label_smoothing_factor: 0.0 + optim: adamw_torch + optim_args: None + adafactor: False + group_by_length: False + length_column_name: length + report_to: ['tensorboard'] + ddp_find_unused_parameters: None + ddp_bucket_cap_mb: None + ddp_broadcast_buffers: None + dataloader_pin_memory: True + dataloader_persistent_workers: False + skip_memory_metrics: True + use_legacy_prediction_loop: False + push_to_hub: False + resume_from_checkpoint: None + hub_model_id: None + hub_strategy: every_save + hub_token: None + hub_private_repo: False + hub_always_push: False + gradient_checkpointing: False + gradient_checkpointing_kwargs: None + include_inputs_for_metrics: False + eval_do_concat_batches: True + fp16_backend: auto + evaluation_strategy: None + push_to_hub_model_id: None + push_to_hub_organization: None + push_to_hub_token: None + mp_parameters: + auto_find_batch_size: False + full_determinism: False + torchdynamo: None + ray_scope: last + ddp_timeout: 1800 + torch_compile: False + torch_compile_backend: None + torch_compile_mode: None + dispatch_batches: None + split_batches: None + include_tokens_per_second: False + include_num_input_tokens_seen: False + neftune_noise_alpha: None + optim_target_modules: None + batch_eval_metrics: False + eval_on_start: False + distributed_state: Distributed environment: NO +Num processes: 1 +Process index: 0 +Local process index: 0 +Device: cuda + + _n_gpu: 1 + __cached__setup_devices: cuda:0 + deepspeed_plugin: None diff --git a/logs/logs_successful_run_2/run-2024-08-25--23-53-08/events.out.tfevents.1724630048.0826fd70f652.869.0 b/logs/logs_successful_run_2/run-2024-08-25--23-53-08/events.out.tfevents.1724630048.0826fd70f652.869.0 new file mode 100644 index 0000000000000000000000000000000000000000..6011474c6cca20208705553f872d20e212a29edd Binary files /dev/null and b/logs/logs_successful_run_2/run-2024-08-25--23-53-08/events.out.tfevents.1724630048.0826fd70f652.869.0 differ diff --git a/logs/logs_successful_run_2/run-2024-08-25--23-53-08/events.out.tfevents.1724630112.0826fd70f652.869.1 b/logs/logs_successful_run_2/run-2024-08-25--23-53-08/events.out.tfevents.1724630112.0826fd70f652.869.1 new file mode 100644 index 0000000000000000000000000000000000000000..552bc0441b298fc0199417237135023b8a741954 Binary files /dev/null and b/logs/logs_successful_run_2/run-2024-08-25--23-53-08/events.out.tfevents.1724630112.0826fd70f652.869.1 differ diff --git a/logs/logs_successful_run_2/run-2024-08-25--23-53-08/events.out.tfevents.1724630132.0826fd70f652.869.2 b/logs/logs_successful_run_2/run-2024-08-25--23-53-08/events.out.tfevents.1724630132.0826fd70f652.869.2 new file mode 100644 index 0000000000000000000000000000000000000000..6b9adbf3fc6f38a81768f98e594fa1eae681dc1e Binary files /dev/null and 
b/logs/logs_successful_run_2/run-2024-08-25--23-53-08/events.out.tfevents.1724630132.0826fd70f652.869.2 differ diff --git a/logs/logs_successful_run_2/run-2024-08-26--00-02-24/events.out.tfevents.1724630555.0826fd70f652.869.3 b/logs/logs_successful_run_2/run-2024-08-26--00-02-24/events.out.tfevents.1724630555.0826fd70f652.869.3 new file mode 100644 index 0000000000000000000000000000000000000000..8ca1d6ad846d489b7b578fb916e39aeabbb94a96 Binary files /dev/null and b/logs/logs_successful_run_2/run-2024-08-26--00-02-24/events.out.tfevents.1724630555.0826fd70f652.869.3 differ diff --git a/logs/logs_successful_run_2/run-2024-08-26--00-03-58/events.out.tfevents.1724630643.0826fd70f652.6853.0 b/logs/logs_successful_run_2/run-2024-08-26--00-03-58/events.out.tfevents.1724630643.0826fd70f652.6853.0 new file mode 100644 index 0000000000000000000000000000000000000000..52264f3345f36505026169c61baa8d7eee1185fe Binary files /dev/null and b/logs/logs_successful_run_2/run-2024-08-26--00-03-58/events.out.tfevents.1724630643.0826fd70f652.6853.0 differ diff --git a/logs/second_run_logs/run-2024-08-25--18-57-46/events.out.tfevents.1724612332.26610f3d33fa.1894.0 b/logs/second_run_logs/run-2024-08-25--18-57-46/events.out.tfevents.1724612332.26610f3d33fa.1894.0 new file mode 100644 index 0000000000000000000000000000000000000000..cd6b3acc3dd59de5924f5c744c6dfed965de0bc9 Binary files /dev/null and b/logs/second_run_logs/run-2024-08-25--18-57-46/events.out.tfevents.1724612332.26610f3d33fa.1894.0 differ diff --git a/logs/second_run_logs/run-2024-08-25--18-57-46/events.out.tfevents.1724612413.26610f3d33fa.1894.1 b/logs/second_run_logs/run-2024-08-25--18-57-46/events.out.tfevents.1724612413.26610f3d33fa.1894.1 new file mode 100644 index 0000000000000000000000000000000000000000..1dcb305968855178b24dd6f7a555aee19ce2cfc5 Binary files /dev/null and b/logs/second_run_logs/run-2024-08-25--18-57-46/events.out.tfevents.1724612413.26610f3d33fa.1894.1 differ diff --git a/logs/second_run_logs/run-2024-08-25--19-00-48/events.out.tfevents.1724612452.26610f3d33fa.1894.2 b/logs/second_run_logs/run-2024-08-25--19-00-48/events.out.tfevents.1724612452.26610f3d33fa.1894.2 new file mode 100644 index 0000000000000000000000000000000000000000..3619b50087b5c9d5154ef9d48164fbb6b557b856 Binary files /dev/null and b/logs/second_run_logs/run-2024-08-25--19-00-48/events.out.tfevents.1724612452.26610f3d33fa.1894.2 differ diff --git a/logs/second_run_logs/run-2024-08-25--19-00-48/events.out.tfevents.1724612515.26610f3d33fa.1894.3 b/logs/second_run_logs/run-2024-08-25--19-00-48/events.out.tfevents.1724612515.26610f3d33fa.1894.3 new file mode 100644 index 0000000000000000000000000000000000000000..bb878f89e4e3b7341f99e0c1014a936814064682 Binary files /dev/null and b/logs/second_run_logs/run-2024-08-25--19-00-48/events.out.tfevents.1724612515.26610f3d33fa.1894.3 differ diff --git a/logs/second_run_logs/run-2024-08-25--19-02-49/events.out.tfevents.1724612582.26610f3d33fa.5666.0 b/logs/second_run_logs/run-2024-08-25--19-02-49/events.out.tfevents.1724612582.26610f3d33fa.5666.0 new file mode 100644 index 0000000000000000000000000000000000000000..7588ca876fce4b56f15ca5f21534edb4b4f6fcc6 Binary files /dev/null and b/logs/second_run_logs/run-2024-08-25--19-02-49/events.out.tfevents.1724612582.26610f3d33fa.5666.0 differ diff --git a/logs/second_run_logs/run-2024-08-25--19-02-49/events.out.tfevents.1724612683.26610f3d33fa.5666.1 b/logs/second_run_logs/run-2024-08-25--19-02-49/events.out.tfevents.1724612683.26610f3d33fa.5666.1 new file mode 100644 index 
0000000000000000000000000000000000000000..ef95393aca2d3ecbd709755be79cb946c7b69f49 Binary files /dev/null and b/logs/second_run_logs/run-2024-08-25--19-02-49/events.out.tfevents.1724612683.26610f3d33fa.5666.1 differ diff --git a/logs/second_run_logs/run-2024-08-25--19-05-27/events.out.tfevents.1724612737.26610f3d33fa.5666.2 b/logs/second_run_logs/run-2024-08-25--19-05-27/events.out.tfevents.1724612737.26610f3d33fa.5666.2 new file mode 100644 index 0000000000000000000000000000000000000000..6ea8859409230e96f44bb013b856c7203d32cb1b Binary files /dev/null and b/logs/second_run_logs/run-2024-08-25--19-05-27/events.out.tfevents.1724612737.26610f3d33fa.5666.2 differ diff --git a/logs/second_run_logs/run-2024-08-25--19-05-27/run_info.txt b/logs/second_run_logs/run-2024-08-25--19-05-27/run_info.txt new file mode 100644 index 0000000000000000000000000000000000000000..a22fb47aedf86c9898107e31b3e6e02f3e0ee33d --- /dev/null +++ b/logs/second_run_logs/run-2024-08-25--19-05-27/run_info.txt @@ -0,0 +1,138 @@ +Run Name: run-2024-08-25--19-05-27 +Model: MultiTaskTimeSformer +Training Arguments: + output_dir: ./results/run-2024-08-25--19-05-27 + overwrite_output_dir: False + do_train: False + do_eval: True + do_predict: False + eval_strategy: steps + prediction_loss_only: False + per_device_train_batch_size: 16 + per_device_eval_batch_size: 16 + per_gpu_train_batch_size: None + per_gpu_eval_batch_size: None + gradient_accumulation_steps: 2 + eval_accumulation_steps: None + eval_delay: 0 + learning_rate: 5e-05 + weight_decay: 0.01 + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_epsilon: 1e-08 + max_grad_norm: 1.0 + num_train_epochs: 3.0 + max_steps: 420 + lr_scheduler_type: linear + lr_scheduler_kwargs: {} + warmup_ratio: 0.1 + warmup_steps: 0 + log_level: passive + log_level_replica: warning + log_on_each_node: True + logging_dir: ./logs/run-2024-08-25--19-05-27 + logging_strategy: steps + logging_first_step: False + logging_steps: 20 + logging_nan_inf_filter: True + save_strategy: steps + save_steps: 100 + save_total_limit: 2 + save_safetensors: True + save_on_each_node: False + save_only_model: False + restore_callback_states_from_checkpoint: False + no_cuda: False + use_cpu: False + use_mps_device: False + seed: 42 + data_seed: None + jit_mode_eval: False + use_ipex: False + bf16: False + fp16: True + fp16_opt_level: O1 + half_precision_backend: auto + bf16_full_eval: False + fp16_full_eval: False + tf32: None + local_rank: 0 + ddp_backend: None + tpu_num_cores: None + tpu_metrics_debug: False + debug: [] + dataloader_drop_last: False + eval_steps: 50 + dataloader_num_workers: 12 + dataloader_prefetch_factor: None + past_index: -1 + run_name: run-2024-08-25--19-05-27 + disable_tqdm: False + remove_unused_columns: True + label_names: None + load_best_model_at_end: True + metric_for_best_model: f1 + greater_is_better: True + ignore_data_skip: False + fsdp: [] + fsdp_min_num_params: 0 + fsdp_config: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False} + fsdp_transformer_layer_cls_to_wrap: None + accelerator_config: AcceleratorConfig(split_batches=False, dispatch_batches=None, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False) + deepspeed: None + label_smoothing_factor: 0.0 + optim: adamw_torch + optim_args: None + adafactor: False + group_by_length: False + length_column_name: length + report_to: ['tensorboard'] + ddp_find_unused_parameters: None + ddp_bucket_cap_mb: None + ddp_broadcast_buffers: None 
+ dataloader_pin_memory: True + dataloader_persistent_workers: False + skip_memory_metrics: True + use_legacy_prediction_loop: False + push_to_hub: False + resume_from_checkpoint: None + hub_model_id: None + hub_strategy: every_save + hub_token: None + hub_private_repo: False + hub_always_push: False + gradient_checkpointing: False + gradient_checkpointing_kwargs: None + include_inputs_for_metrics: False + eval_do_concat_batches: True + fp16_backend: auto + evaluation_strategy: None + push_to_hub_model_id: None + push_to_hub_organization: None + push_to_hub_token: None + mp_parameters: + auto_find_batch_size: False + full_determinism: False + torchdynamo: None + ray_scope: last + ddp_timeout: 1800 + torch_compile: False + torch_compile_backend: None + torch_compile_mode: None + dispatch_batches: None + split_batches: None + include_tokens_per_second: False + include_num_input_tokens_seen: False + neftune_noise_alpha: None + optim_target_modules: None + batch_eval_metrics: False + eval_on_start: False + distributed_state: Distributed environment: NO +Num processes: 1 +Process index: 0 +Local process index: 0 +Device: cuda + + _n_gpu: 1 + __cached__setup_devices: cuda:0 + deepspeed_plugin: None diff --git a/notebooks/data_prep.ipynb b/notebooks/data_prep.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..110b2e532ff30b031b073405edfd8a02e3459f30 --- /dev/null +++ b/notebooks/data_prep.ipynb @@ -0,0 +1,473 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "e006d00d0980cdb6", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "from pathlib import Path\n", + "import torch\n", + "from torch.utils.data import Dataset\n", + "import torchvision.transforms as transforms\n", + "from torchvision.io import read_video\n", + "\n", + "\n", + "class VideoNormalize(torch.nn.Module):\n", + " def __init__(self, mean, std):\n", + " super().__init__()\n", + " self.mean = torch.tensor(mean).view(3, 1, 1, 1)\n", + " self.std = torch.tensor(std).view(3, 1, 1, 1)\n", + "\n", + " def forward(self, video):\n", + " return (video - self.mean) / self.std\n", + "\n", + "\n", + "class VideoDataset(Dataset):\n", + " def __init__(self, root_dir, split, transform=None, clip_duration=5.0, target_fps=30):\n", + " self.root_dir = Path(root_dir) / split\n", + " self.transform = transform\n", + " self.clip_duration = clip_duration\n", + " self.target_fps = target_fps\n", + " self.target_frames = int(clip_duration * target_fps)\n", + " self.video_files = []\n", + " self.labels = {}\n", + "\n", + " # Load labels from labels.json\n", + " labels_path = self.root_dir / 'labels.json'\n", + " with open(labels_path, 'r') as f:\n", + " self.labels = json.load(f)\n", + "\n", + " # Collect video file paths\n", + " self.video_files = list(self.root_dir.glob('*.avi'))\n", + "\n", + " def __len__(self):\n", + " return len(self.video_files)\n", + "\n", + " def __getitem__(self, idx):\n", + " video_path = str(self.video_files[idx])\n", + " video_name = self.video_files[idx].name\n", + " label = self.labels[video_name]['graininess']\n", + "\n", + " # Read video using torchvision\n", + " video, audio, meta = read_video(video_path, pts_unit='sec')\n", + "\n", + " # Extract frame rate from metadata\n", + " fps = meta['video_fps']\n", + "\n", + " # Calculate the number of frames to sample based on the clip duration and video's fps\n", + " num_frames_to_sample = min(int(self.clip_duration * fps), video.shape[0])\n", + "\n", + " # Sample frames\n", + " if num_frames_to_sample < 
video.shape[0]:\n", + " start_idx = torch.randint(0, video.shape[0] - num_frames_to_sample + 1, (1,)).item()\n", + " video = video[start_idx:start_idx + num_frames_to_sample]\n", + "\n", + " # Resample to target FPS\n", + " if fps != self.target_fps:\n", + " indices = torch.linspace(0, video.shape[0] - 1, self.target_frames).long()\n", + " video = video[indices]\n", + "\n", + " # Ensure we have exactly target_frames\n", + " if video.shape[0] < self.target_frames:\n", + " video = torch.cat([video, video[-1].unsqueeze(0).repeat(self.target_frames - video.shape[0], 1, 1, 1)])\n", + " elif video.shape[0] > self.target_frames:\n", + " video = video[:self.target_frames]\n", + "\n", + " # Change from (T, H, W, C) to (C, T, H, W)\n", + " video = video.permute(3, 0, 1, 2)\n", + "\n", + " if self.transform:\n", + " video = self.transform(video)\n", + "\n", + " return video, torch.tensor(label, dtype=torch.long)\n", + "\n", + "\n", + "# Example usage\n", + "transform = transforms.Compose([\n", + " transforms.Lambda(lambda x: x.float() / 255.0), # Normalize to [0, 1]\n", + " VideoNormalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a21a7b0a8e86913c", + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-19T16:31:42.133858Z", + "start_time": "2024-08-19T16:31:42.128809Z" + } + }, + "outputs": [], + "source": [ + "# Path to the dataset\n", + "data_root = Path('/Users/sv7/Projects/mtl-video-classification/data/graininess_100_balanced_subset_split')\n", + "\n", + "train_dataset = VideoDataset(data_root,\n", + " split='train',\n", + " transform=transform)\n", + "\n", + "test_dataset = VideoDataset(data_root,\n", + " split='test',\n", + " transform=transform)\n", + "\n", + "val_dataset = VideoDataset(data_root,\n", + " split='val',\n", + " transform=transform)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a9092ed9c5027597", + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-19T16:31:42.761488Z", + "start_time": "2024-08-19T16:31:42.759166Z" + } + }, + "outputs": [], + "source": [ + "# DataLoader example\n", + "from torch.utils.data import DataLoader\n", + "import os\n", + "\n", + "batch_size = 4\n", + "num_workers = os.cpu_count()\n", + "\n", + "train_loader = DataLoader(train_dataset,\n", + " batch_size=batch_size,\n", + " shuffle=True,\n", + " num_workers=num_workers)\n", + "\n", + "test_loader = DataLoader(test_dataset,\n", + " batch_size=batch_size,\n", + " shuffle=False,\n", + " num_workers=num_workers)\n", + "\n", + "val_loader = DataLoader(val_dataset,\n", + " batch_size=batch_size,\n", + " shuffle=False,\n", + " num_workers=num_workers)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "77d2d43a9fe4c2c2", + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-19T16:55:37.595972Z", + "start_time": "2024-08-19T16:55:36.873079Z" + } + }, + "outputs": [], + "source": [ + "import json\n", + "from pathlib import Path\n", + "\n", + "train_data_path = Path('/Users/sv7/Projects/mtl-video-classification/data/graininess_100_balanced_subset_split/train')\n", + "labels_path = train_data_path / 'labels.json'\n", + "\n", + "# /Users/sv7/Projects/mtl-video-classification/data/graininess_100_balanced_subset_split/train/labels.json\n", + "video_files = list(train_data_path.glob('*.avi'))\n", + "with open(labels_path) as f:\n", + " labels = json.load(f)\n", + "\n", + "video_path = str(video_files[5])\n", + "video, audio, meta = read_video(video_path, pts_unit='sec')" + ] 
+ }, + { + "cell_type": "code", + "execution_count": 32, + "id": "f7d927a0c9c73948", + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-19T16:57:53.317039Z", + "start_time": "2024-08-19T16:57:53.314764Z" + } + }, + "outputs": [], + "source": [ + "clip_duration = 5.0\n", + "\n", + "# Extract frame rate from metadata\n", + "fps = meta['video_fps']\n", + "\n", + "# Calculate the number of frames to sample based on the clip duration and video's fps\n", + "num_frames_to_sample = min(int(clip_duration * fps), video.shape[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "b2c6a74027e9f3", + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-19T17:13:49.049139Z", + "start_time": "2024-08-19T17:13:49.046501Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "300" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "num_frames_to_sample" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "4d960113bee6e247", + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-19T18:51:49.590079Z", + "start_time": "2024-08-19T18:51:19.547632Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of VivitForVideoClassification were not initialized from the model checkpoint at google/vivit-b-16x2-kinetics400 and are newly initialized because the shapes did not match:\n", + "- classifier.weight: found shape torch.Size([400, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated\n", + "- classifier.bias: found shape torch.Size([400]) in the checkpoint and torch.Size([2]) in the model instantiated\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "/Users/sv7/opt/anaconda3/envs/video_classification/lib/python3.12/site-packages/transformers/feature_extraction_utils.py:142: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_new.cpp:281.)\n", + " return torch.tensor(value)\n" + ] + }, + { + "ename": "RuntimeError", + "evalue": "MPS backend out of memory (MPS allocated: 17.77 GB, other allocations: 40.66 MB, max allowed: 18.13 GB). Tried to allocate 1.76 GB on private pool. 
Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[41], line 124\u001b[0m\n\u001b[1;32m 116\u001b[0m trainer \u001b[38;5;241m=\u001b[39m Trainer(\n\u001b[1;32m 117\u001b[0m model\u001b[38;5;241m=\u001b[39mmodel,\n\u001b[1;32m 118\u001b[0m args\u001b[38;5;241m=\u001b[39mtraining_args,\n\u001b[1;32m 119\u001b[0m train_dataset\u001b[38;5;241m=\u001b[39mtrain_dataset,\n\u001b[1;32m 120\u001b[0m eval_dataset\u001b[38;5;241m=\u001b[39mval_dataset,\n\u001b[1;32m 121\u001b[0m )\n\u001b[1;32m 123\u001b[0m \u001b[38;5;66;03m# Cell 8: Train the model\u001b[39;00m\n\u001b[0;32m--> 124\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 126\u001b[0m \u001b[38;5;66;03m# Cell 9: Evaluate on test set\u001b[39;00m\n\u001b[1;32m 127\u001b[0m test_results \u001b[38;5;241m=\u001b[39m trainer\u001b[38;5;241m.\u001b[39mevaluate(test_dataset)\n", + "File \u001b[0;32m~/opt/anaconda3/envs/video_classification/lib/python3.12/site-packages/transformers/trainer.py:1948\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 1946\u001b[0m hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n\u001b[1;32m 1947\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1948\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1949\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1950\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1951\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1952\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1953\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/opt/anaconda3/envs/video_classification/lib/python3.12/site-packages/transformers/trainer.py:2289\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 2286\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_step_begin(args, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n\u001b[1;32m 2288\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39maccumulate(model):\n\u001b[0;32m-> 2289\u001b[0m tr_loss_step \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2291\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 2292\u001b[0m args\u001b[38;5;241m.\u001b[39mlogging_nan_inf_filter\n\u001b[1;32m 2293\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_xla_available()\n\u001b[1;32m 2294\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m (torch\u001b[38;5;241m.\u001b[39misnan(tr_loss_step) \u001b[38;5;129;01mor\u001b[39;00m torch\u001b[38;5;241m.\u001b[39misinf(tr_loss_step))\n\u001b[1;32m 2295\u001b[0m ):\n\u001b[1;32m 2296\u001b[0m \u001b[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[1;32m 2297\u001b[0m tr_loss \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m tr_loss \u001b[38;5;241m/\u001b[39m (\u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_globalstep_last_logged)\n", + "File \u001b[0;32m~/opt/anaconda3/envs/video_classification/lib/python3.12/site-packages/transformers/trainer.py:3328\u001b[0m, in \u001b[0;36mTrainer.training_step\u001b[0;34m(self, model, inputs)\u001b[0m\n\u001b[1;32m 3325\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m loss_mb\u001b[38;5;241m.\u001b[39mreduce_mean()\u001b[38;5;241m.\u001b[39mdetach()\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[1;32m 3327\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompute_loss_context_manager():\n\u001b[0;32m-> 3328\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_loss\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3330\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m inputs\n\u001b[1;32m 3331\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 3332\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mtorch_empty_cache_steps \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 3333\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m%\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mtorch_empty_cache_steps \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 3334\u001b[0m ):\n", + "File \u001b[0;32m~/opt/anaconda3/envs/video_classification/lib/python3.12/site-packages/transformers/trainer.py:3373\u001b[0m, in \u001b[0;36mTrainer.compute_loss\u001b[0;34m(self, model, inputs, return_outputs)\u001b[0m\n\u001b[1;32m 3371\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 3372\u001b[0m labels \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 3373\u001b[0m outputs \u001b[38;5;241m=\u001b[39m 
\u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3374\u001b[0m \u001b[38;5;66;03m# Save past state if it exists\u001b[39;00m\n\u001b[1;32m 3375\u001b[0m \u001b[38;5;66;03m# TODO: this needs to be fixed and made cleaner later.\u001b[39;00m\n\u001b[1;32m 3376\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mpast_index \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n", + "File \u001b[0;32m~/opt/anaconda3/envs/video_classification/lib/python3.12/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/opt/anaconda3/envs/video_classification/lib/python3.12/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1560\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m~/opt/anaconda3/envs/video_classification/lib/python3.12/site-packages/transformers/models/vivit/modeling_vivit.py:759\u001b[0m, in \u001b[0;36mVivitForVideoClassification.forward\u001b[0;34m(self, pixel_values, 
head_mask, labels, output_attentions, output_hidden_states, interpolate_pos_encoding, return_dict)\u001b[0m\n\u001b[1;32m 673\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 674\u001b[0m \u001b[38;5;124;03mlabels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):\u001b[39;00m\n\u001b[1;32m 675\u001b[0m \u001b[38;5;124;03m Labels for computing the image classification/regression loss. Indices should be in `[0, ...,\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 755\u001b[0m \u001b[38;5;124;03mLABEL_116\u001b[39;00m\n\u001b[1;32m 756\u001b[0m \u001b[38;5;124;03m```\"\"\"\u001b[39;00m\n\u001b[1;32m 757\u001b[0m return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[0;32m--> 759\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvivit\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 760\u001b[0m \u001b[43m \u001b[49m\u001b[43mpixel_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 761\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 762\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 763\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 764\u001b[0m \u001b[43m \u001b[49m\u001b[43minterpolate_pos_encoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minterpolate_pos_encoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 765\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 766\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 768\u001b[0m sequence_output \u001b[38;5;241m=\u001b[39m outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 770\u001b[0m logits \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclassifier(sequence_output[:, \u001b[38;5;241m0\u001b[39m, :])\n", + "File \u001b[0;32m~/opt/anaconda3/envs/video_classification/lib/python3.12/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File 
\u001b[0;32m~/opt/anaconda3/envs/video_classification/lib/python3.12/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1560\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m~/opt/anaconda3/envs/video_classification/lib/python3.12/site-packages/transformers/models/vivit/modeling_vivit.py:611\u001b[0m, in \u001b[0;36mVivitModel.forward\u001b[0;34m(self, pixel_values, head_mask, output_attentions, output_hidden_states, interpolate_pos_encoding, return_dict)\u001b[0m\n\u001b[1;32m 607\u001b[0m head_mask \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_head_mask(head_mask, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mnum_hidden_layers)\n\u001b[1;32m 609\u001b[0m embedding_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membeddings(pixel_values, interpolate_pos_encoding\u001b[38;5;241m=\u001b[39minterpolate_pos_encoding)\n\u001b[0;32m--> 611\u001b[0m encoder_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoder\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 612\u001b[0m \u001b[43m \u001b[49m\u001b[43membedding_output\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 613\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 614\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 615\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 616\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 617\u001b[0m 
\u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 618\u001b[0m sequence_output \u001b[38;5;241m=\u001b[39m encoder_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 619\u001b[0m sequence_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlayernorm(sequence_output)\n", + "File \u001b[0;32m~/opt/anaconda3/envs/video_classification/lib/python3.12/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/opt/anaconda3/envs/video_classification/lib/python3.12/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1560\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m~/opt/anaconda3/envs/video_classification/lib/python3.12/site-packages/transformers/models/vivit/modeling_vivit.py:378\u001b[0m, in \u001b[0;36mVivitEncoder.forward\u001b[0;34m(self, hidden_states, head_mask, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 371\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_gradient_checkpointing_func(\n\u001b[1;32m 372\u001b[0m 
layer_module\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__call__\u001b[39m,\n\u001b[1;32m 373\u001b[0m hidden_states,\n\u001b[1;32m 374\u001b[0m layer_head_mask,\n\u001b[1;32m 375\u001b[0m output_attentions,\n\u001b[1;32m 376\u001b[0m )\n\u001b[1;32m 377\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 378\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[43mlayer_module\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlayer_head_mask\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 380\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m layer_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 382\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m output_attentions:\n", + "File \u001b[0;32m~/opt/anaconda3/envs/video_classification/lib/python3.12/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/opt/anaconda3/envs/video_classification/lib/python3.12/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1560\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File 
\u001b[0;32m~/opt/anaconda3/envs/video_classification/lib/python3.12/site-packages/transformers/models/vivit/modeling_vivit.py:321\u001b[0m, in \u001b[0;36mVivitLayer.forward\u001b[0;34m(self, hidden_states, head_mask, output_attentions)\u001b[0m\n\u001b[1;32m 320\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, hidden_states, head_mask\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, output_attentions\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m):\n\u001b[0;32m--> 321\u001b[0m self_attention_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mattention\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 322\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# in Vivit, layernorm is applied before self-attention\u001b[39;49;00m\n\u001b[1;32m 323\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlayernorm_before\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 324\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 325\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 326\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 327\u001b[0m attention_output \u001b[38;5;241m=\u001b[39m self_attention_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 328\u001b[0m \u001b[38;5;66;03m# add self attentions if we output attention weights\u001b[39;00m\n", + "File \u001b[0;32m~/opt/anaconda3/envs/video_classification/lib/python3.12/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/opt/anaconda3/envs/video_classification/lib/python3.12/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks 
\u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1560\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m~/opt/anaconda3/envs/video_classification/lib/python3.12/site-packages/transformers/models/vivit/modeling_vivit.py:265\u001b[0m, in \u001b[0;36mVivitAttention.forward\u001b[0;34m(self, hidden_states, head_mask, output_attentions)\u001b[0m\n\u001b[1;32m 259\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\n\u001b[1;32m 260\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 261\u001b[0m hidden_states: torch\u001b[38;5;241m.\u001b[39mTensor,\n\u001b[1;32m 262\u001b[0m head_mask: Optional[torch\u001b[38;5;241m.\u001b[39mTensor] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 263\u001b[0m output_attentions: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 264\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Union[Tuple[torch\u001b[38;5;241m.\u001b[39mTensor, torch\u001b[38;5;241m.\u001b[39mTensor], Tuple[torch\u001b[38;5;241m.\u001b[39mTensor]]:\n\u001b[0;32m--> 265\u001b[0m self_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mattention\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 267\u001b[0m attention_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput(self_outputs[\u001b[38;5;241m0\u001b[39m], hidden_states)\n\u001b[1;32m 269\u001b[0m outputs \u001b[38;5;241m=\u001b[39m (attention_output,) \u001b[38;5;241m+\u001b[39m self_outputs[\u001b[38;5;241m1\u001b[39m:] \u001b[38;5;66;03m# add attentions if we output them\u001b[39;00m\n", + "File \u001b[0;32m~/opt/anaconda3/envs/video_classification/lib/python3.12/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/opt/anaconda3/envs/video_classification/lib/python3.12/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1560\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m~/opt/anaconda3/envs/video_classification/lib/python3.12/site-packages/transformers/models/vivit/modeling_vivit.py:188\u001b[0m, in \u001b[0;36mVivitSelfAttention.forward\u001b[0;34m(self, hidden_states, head_mask, output_attentions)\u001b[0m\n\u001b[1;32m 185\u001b[0m query_layer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtranspose_for_scores(mixed_query_layer)\n\u001b[1;32m 187\u001b[0m \u001b[38;5;66;03m# Take the dot product between \"query\" and \"key\" to get the raw attention scores.\u001b[39;00m\n\u001b[0;32m--> 188\u001b[0m attention_scores \u001b[38;5;241m=\u001b[39m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmatmul\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery_layer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey_layer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtranspose\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 190\u001b[0m attention_scores \u001b[38;5;241m=\u001b[39m attention_scores \u001b[38;5;241m/\u001b[39m math\u001b[38;5;241m.\u001b[39msqrt(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mattention_head_size)\n\u001b[1;32m 192\u001b[0m \u001b[38;5;66;03m# Normalize the attention scores to 
probabilities.\u001b[39;00m\n", + "\u001b[0;31mRuntimeError\u001b[0m: MPS backend out of memory (MPS allocated: 17.77 GB, other allocations: 40.66 MB, max allowed: 18.13 GB). Tried to allocate 1.76 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure)." + ] + } + ], + "source": [ + "# Cell 1: Import necessary libraries\n", + "import os\n", + "import json\n", + "import random\n", + "import numpy as np\n", + "import torch\n", + "from torch.utils.data import Dataset, DataLoader\n", + "from torchvision.io import read_video\n", + "from transformers import VivitImageProcessor, VivitForVideoClassification, TrainingArguments, Trainer\n", + "\n", + "\n", + "# Cell 2: Set random seed for reproducibility\n", + "def set_seed(seed):\n", + " random.seed(seed)\n", + " np.random.seed(seed)\n", + " torch.manual_seed(seed)\n", + " torch.cuda.manual_seed_all(seed)\n", + "\n", + "\n", + "set_seed(42)\n", + "\n", + "\n", + "# Cell 3: Define custom dataset class\n", + "# Cell 3: Define custom dataset class\n", + "class VideoDataset(Dataset):\n", + " def __init__(self, data_dir, split, processor, max_frames=32):\n", + " self.data_dir = os.path.join(data_dir, split)\n", + " self.processor = processor\n", + " self.max_frames = max_frames\n", + " \n", + " with open(os.path.join(self.data_dir, 'labels.json'), 'r') as f:\n", + " self.labels = json.load(f)\n", + " \n", + " self.video_files = list(self.labels.keys())\n", + " \n", + " def __len__(self):\n", + " return len(self.video_files)\n", + " \n", + " def __getitem__(self, idx):\n", + " video_file = self.video_files[idx]\n", + " video_path = os.path.join(self.data_dir, video_file)\n", + " \n", + " # Read video\n", + " video, _, _ = read_video(video_path, pts_unit='sec')\n", + " \n", + " # Sample frames\n", + " num_frames = video.shape[0]\n", + " if num_frames > self.max_frames:\n", + " start = random.randint(0, num_frames - self.max_frames)\n", + " video = video[start:start+self.max_frames]\n", + " else:\n", + " video = video[:self.max_frames]\n", + " \n", + " # Ensure we have 3 channels (RGB)\n", + " if video.shape[-1] != 3:\n", + " video = video.expand(-1, -1, -1, 3)\n", + " \n", + " # Convert to numpy array and ensure correct shape\n", + " video = video.numpy()\n", + " \n", + " # Ensure the video has the correct shape (num_frames, height, width, channels)\n", + " if video.shape[1] == 3: # If channels are in the second dimension\n", + " video = np.transpose(video, (0, 2, 3, 1))\n", + " \n", + " # Process frames\n", + " pixel_values = self.processor(\n", + " list(video),\n", + " return_tensors=\"pt\",\n", + " do_resize=True,\n", + " size={\"shortest_edge\": 224}, # Adjust this size as needed\n", + " do_center_crop=True,\n", + " crop_size={\"height\": 224, \"width\": 224}, # Adjust this size as needed\n", + " ).pixel_values\n", + " \n", + " # Get label\n", + " label = self.labels[video_file]['graininess']\n", + " \n", + " return {'pixel_values': pixel_values.squeeze(), 'label': torch.tensor(label)}\n", + "\n", + "\n", + "# Cell 4: Initialize ViViT model and processor\n", + "model_name = \"google/vivit-b-16x2-kinetics400\"\n", + "processor = VivitImageProcessor.from_pretrained(model_name,\n", + " ignore_mismatched_sizes=True)\n", + "model = VivitForVideoClassification.from_pretrained(model_name, num_labels=2,\n", + " ignore_mismatched_sizes=True)\n", + "\n", + "# Cell 5: Prepare datasets and dataloaders\n", + "data_dir = 
\"/Users/sv7/Projects/mtl-video-classification/data/graininess_100_balanced_subset_split\"\n", + "batch_size = 4\n", + "\n", + "train_dataset = VideoDataset(data_dir, 'train', processor)\n", + "val_dataset = VideoDataset(data_dir, 'val', processor)\n", + "test_dataset = VideoDataset(data_dir, 'test', processor)\n", + "\n", + "train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)\n", + "val_dataloader = DataLoader(val_dataset, batch_size=batch_size)\n", + "test_dataloader = DataLoader(test_dataset, batch_size=batch_size)\n", + "\n", + "# Cell 6: Define training arguments\n", + "training_args = TrainingArguments(\n", + " output_dir=\"./results\",\n", + " num_train_epochs=3,\n", + " per_device_train_batch_size=batch_size,\n", + " per_device_eval_batch_size=batch_size,\n", + " warmup_steps=500,\n", + " weight_decay=0.01,\n", + " logging_dir='./logs',\n", + " logging_steps=10,\n", + " eval_strategy=\"epoch\",\n", + " save_strategy=\"epoch\",\n", + " load_best_model_at_end=True,\n", + ")\n", + "\n", + "# Cell 7: Define Trainer\n", + "trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=train_dataset,\n", + " eval_dataset=val_dataset,\n", + ")\n", + "\n", + "# Cell 8: Train the model\n", + "trainer.train()\n", + "\n", + "# Cell 9: Evaluate on test set\n", + "test_results = trainer.evaluate(test_dataset)\n", + "print(test_results)\n", + "\n", + "# Cell 10: Save the model\n", + "model.save_pretrained(\"./vivit_graininess_classifier\")\n", + "processor.save_pretrained(\"./vivit_graininess_classifier\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c239dc3cc6e29490", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Cell 11: Inference example\n", + "def predict_video(video_path):\n", + " video, _, _ = read_video(video_path, pts_unit='sec')\n", + " inputs = processor(list(video.permute(0, 2, 3, 1).numpy()), return_tensors=\"pt\")\n", + "\n", + " with torch.no_grad():\n", + " outputs = model(**inputs)\n", + " logits = outputs.logits\n", + " predicted_class = logits.argmax(-1).item()\n", + "\n", + " return \"Grainy\" if predicted_class == 1 else \"Not Grainy\"\n", + "\n", + "\n", + "# Example usage\n", + "example_video_path = \"path/to/example/video.avi\"\n", + "prediction = predict_video(example_video_path)\n", + "print(f\"The video is predicted to be: {prediction}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/subset_for_patching.ipynb b/notebooks/subset_for_patching.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..b5852cfb8d24e8db6e164e8488ea7a8dc8f15c74 --- /dev/null +++ b/notebooks/subset_for_patching.ipynb @@ -0,0 +1,293 @@ +{ + "cells": [ + { + "cell_type": "code", + "id": "initial_id", + "metadata": { + "collapsed": true, + "ExecuteTime": { + "end_time": "2024-08-23T23:46:55.159025Z", + "start_time": "2024-08-23T23:46:55.155910Z" + } + }, + "source": [ + "files_to_use = ['Tennis_1920x1080_24fps_8bit_420_Motion_QP47_SFB_1.avi',\n", + " 'Tennis_1920x1080_24fps_8bit_420_Motion_QP32_BT_1.avi',\n", + " 'DanceKiss_1920x1080_25fps_8bit_420_Dark_QP47_FB_4.avi',\n", + " 
'DanceKiss_1920x1080_25fps_8bit_420_Dark_QP32_SB_4.avi',\n", + " 'Kimono1_1920x1080_24fps_8bit_420_graininess_QP47_B_4.avi',\n", + " 'Kimono1_1920x1080_24fps_8bit_420_graininess_QP32_FB_1.avi',\n", + " 'OldTownCross_1920x1080_25fps_8bit_420_graininess_QP47_SB_4.avi',\n", + " 'OldTownCross_1920x1080_25fps_8bit_420_graininess_QP32_SBT_2.avi',\n", + " 'BirdsInCage_1920x1080_30fps_8bit_420_Pristine_QP47_SFB_3.avi',\n", + " 'BirdsInCage_1920x1080_30fps_8bit_420_Pristine_QP32_FBT_1.avi',\n", + " 'ElFuente1_1920x1080_30fps_8bit_420_aliasing_QP47_SFB_1.avi',\n", + " 'ElFuente1_1920x1080_30fps_8bit_420_aliasing_QP32_FB_4.avi',\n", + " 'ElFuente2_1920x1080_30fps_8bit_420_graininess_QP47_SFB_3.avi',\n", + " 'ElFuente2_1920x1080_30fps_8bit_420_graininess_QP32_S_2.avi',\n", + " 'BQTerrace_1920x1080_30fps_8bit_420_aliasing_QP47_FB_3.avi',\n", + " 'BQTerrace_1920x1080_30fps_8bit_420_aliasing_QP32_SF_4.avi',\n", + " 'CrowdRun_1920x1080_25fps_8bit_420_aliasing_QP47_SFT_4.avi',\n", + " 'CrowdRun_1920x1080_25fps_8bit_420_aliasing_QP32_SF_1.avi',\n", + " 'Seeking_1920x1080_25fps_8bit_420_graininess_QP47_SF_2.avi',\n", + " 'Seeking_1920x1080_25fps_8bit_420_graininess_QP32_SFT_1.avi',\n", + " 'riverbed_1920x1080_25fps_8bit_420_banding_QP47_SFBT_2.avi',\n", + " 'riverbed_1920x1080_25fps_8bit_420_banding_QP32_S_3.avi',\n", + " 'station_1920x1080_30fps_8bit_420_graininess_QP47_SBT_2.avi',\n", + " 'station_1920x1080_30fps_8bit_420_graininess_QP32_SB_1.avi',\n", + " 'shields_1280x720_50fps_8bit_420_graininess_QP47_SBT_3.avi',\n", + " 'shields_1280x720_50fps_8bit_420_graininess_QP32_SFBT_2.avi']" + ], + "outputs": [], + "execution_count": 1 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-23T23:47:04.814760Z", + "start_time": "2024-08-23T23:47:04.812533Z" + } + }, + "cell_type": "code", + "source": "from pathlib import Path", + "id": "f68ef83150ac3734", + "outputs": [], + "execution_count": 2 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-23T23:50:25.116050Z", + "start_time": "2024-08-23T23:50:25.090048Z" + } + }, + "cell_type": "code", + "source": [ + "dataset_path = Path('/Volumes/SSD/BVIArtefact')\n", + "\n", + "parts = ['part1', 'part2']\n", + "\n", + "# file paths of all files in files_to_use in part1 and part2\n", + "file_paths = []\n", + "for part in parts:\n", + " file_path = dataset_path / part\n", + " all_files = list(file_path.glob('*.avi'))\n", + " for file in all_files:\n", + " if file.name in files_to_use:\n", + " file_paths.append(file) " + ], + "id": "fdfacf937f9f286e", + "outputs": [], + "execution_count": 3 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-23T23:50:36.713565Z", + "start_time": "2024-08-23T23:50:36.711235Z" + } + }, + "cell_type": "code", + "source": "len(file_paths)", + "id": "b4c910a7e71b9503", + "outputs": [ + { + "data": { + "text/plain": [ + "26" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 5 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-23T23:51:31.282402Z", + "start_time": "2024-08-23T23:51:05.913927Z" + } + }, + "cell_type": "code", + "source": [ + "# copy files to a new folder\n", + "import shutil\n", + "\n", + "new_folder = Path('/Volumes/SSD/BVIArtefact/subset_for_patching')\n", + "new_folder.mkdir(exist_ok=True)\n", + "for file in file_paths:\n", + " shutil.copy(file, new_folder)" + ], + "id": "fa2b07cf8f56b3c6", + "outputs": [], + "execution_count": 6 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": 
"2024-08-23T23:53:20.804168Z", + "start_time": "2024-08-23T23:53:20.793023Z" + } + }, + "cell_type": "code", + "source": [ + "# copy labels of files in file from /Volumes/SSD/BVIArtefact/processed_labels.json to /Volumes/SSD/BVIArtefact/subset_for_patching\n", + "import json\n", + "\n", + "with open(dataset_path / 'processed_labels.json', 'r') as f:\n", + " labels = json.load(f)\n", + " \n", + "new_labels = {}\n", + "for file in file_paths:\n", + " new_labels[file.name] = labels[file.name]\n", + " \n", + "with open(new_folder / 'labels.json', 'w') as f:\n", + " json.dump(new_labels, f)" + ], + "id": "3ab6eaf72d2ebf1c", + "outputs": [], + "execution_count": 7 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-24T00:02:44.629506Z", + "start_time": "2024-08-24T00:02:44.547315Z" + } + }, + "cell_type": "code", + "source": [ + "import os\n", + "import random\n", + "\n", + "# Paths (Assuming the script is in the same directory as the dataset)\n", + "dataset_dir = '/Volumes/SSD/subsets/subset_for_patching'\n", + "labels_file = os.path.join(dataset_dir, 'labels.json')\n", + "\n", + "# Load the labels\n", + "with open(labels_file, 'r') as f:\n", + " labels = json.load(f)\n", + "\n", + "# Split ratios\n", + "train_ratio = 0.7\n", + "val_ratio = 0.15\n", + "test_ratio = 0.15\n", + "\n", + "# Ensure the output directories exist\n", + "train_dir = os.path.join(dataset_dir, 'train')\n", + "val_dir = os.path.join(dataset_dir, 'val')\n", + "test_dir = os.path.join(dataset_dir, 'test')\n", + "\n", + "os.makedirs(train_dir, exist_ok=True)\n", + "os.makedirs(val_dir, exist_ok=True)\n", + "os.makedirs(test_dir, exist_ok=True)\n", + "\n", + "# Get list of all video files\n", + "video_files = [f for f in os.listdir(dataset_dir) if f.endswith('.avi')]\n", + "\n", + "# Shuffle the dataset\n", + "random.shuffle(video_files)\n", + "\n", + "# Calculate the split indices\n", + "train_idx = int(len(video_files) * train_ratio)\n", + "val_idx = train_idx + int(len(video_files) * val_ratio)\n", + "\n", + "# Split the files\n", + "train_files = video_files[:train_idx]\n", + "val_files = video_files[train_idx:val_idx]\n", + "test_files = video_files[val_idx:]\n", + "\n", + "# Helper function to move files and save labels\n", + "def move_files_and_save_labels(files, destination_dir, label_dict):\n", + " dest_labels = {}\n", + " for file in files:\n", + " # Skip hidden files or files not present in the label_dict\n", + " if file not in label_dict:\n", + " print(f\"Skipping {file} as it is not found in labels.json\")\n", + " continue\n", + " src_path = os.path.join(dataset_dir, file)\n", + " dest_path = os.path.join(destination_dir, file)\n", + " shutil.move(src_path, dest_path)\n", + " dest_labels[file] = label_dict[file]\n", + " \n", + " # Save the labels file\n", + " labels_file_path = os.path.join(destination_dir, 'labels.json')\n", + " with open(labels_file_path, 'w') as f:\n", + " json.dump(dest_labels, f, indent=4)\n", + "\n", + "# Move the files and save the corresponding labels\n", + "move_files_and_save_labels(train_files, train_dir, labels)\n", + "move_files_and_save_labels(val_files, val_dir, labels)\n", + "move_files_and_save_labels(test_files, test_dir, labels)\n", + "\n", + "print(\"Dataset has been reorganized successfully!\")" + ], + "id": "9b909bde7c2e0915", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Skipping ._Kimono1_1920x1080_24fps_8bit_420_graininess_QP32_FB_1.avi as it is not found in labels.json\n", + "Skipping 
._ElFuente1_1920x1080_30fps_8bit_420_aliasing_QP32_FB_4.avi as it is not found in labels.json\n", + "Skipping ._BQTerrace_1920x1080_30fps_8bit_420_aliasing_QP32_SF_4.avi as it is not found in labels.json\n", + "Skipping ._Seeking_1920x1080_25fps_8bit_420_graininess_QP47_SF_2.avi as it is not found in labels.json\n", + "Skipping ._BirdsInCage_1920x1080_30fps_8bit_420_Pristine_QP32_FBT_1.avi as it is not found in labels.json\n", + "Skipping ._riverbed_1920x1080_25fps_8bit_420_banding_QP32_S_3.avi as it is not found in labels.json\n", + "Skipping ._station_1920x1080_30fps_8bit_420_graininess_QP32_SB_1.avi as it is not found in labels.json\n", + "Skipping ._shields_1280x720_50fps_8bit_420_graininess_QP32_SFBT_2.avi as it is not found in labels.json\n", + "Skipping ._DanceKiss_1920x1080_25fps_8bit_420_Dark_QP32_SB_4.avi as it is not found in labels.json\n", + "Skipping ._DanceKiss_1920x1080_25fps_8bit_420_Dark_QP47_FB_4.avi as it is not found in labels.json\n", + "Skipping ._riverbed_1920x1080_25fps_8bit_420_banding_QP47_SFBT_2.avi as it is not found in labels.json\n", + "Skipping ._Seeking_1920x1080_25fps_8bit_420_graininess_QP32_SFT_1.avi as it is not found in labels.json\n", + "Skipping ._BQTerrace_1920x1080_30fps_8bit_420_aliasing_QP47_FB_3.avi as it is not found in labels.json\n", + "Skipping ._shields_1280x720_50fps_8bit_420_graininess_QP47_SBT_3.avi as it is not found in labels.json\n", + "Skipping ._BirdsInCage_1920x1080_30fps_8bit_420_Pristine_QP47_SFB_3.avi as it is not found in labels.json\n", + "Skipping ._Tennis_1920x1080_24fps_8bit_420_Motion_QP32_BT_1.avi as it is not found in labels.json\n", + "Skipping ._ElFuente1_1920x1080_30fps_8bit_420_aliasing_QP47_SFB_1.avi as it is not found in labels.json\n", + "Skipping ._OldTownCross_1920x1080_25fps_8bit_420_graininess_QP47_SB_4.avi as it is not found in labels.json\n", + "Skipping ._ElFuente2_1920x1080_30fps_8bit_420_graininess_QP32_S_2.avi as it is not found in labels.json\n", + "Skipping ._CrowdRun_1920x1080_25fps_8bit_420_aliasing_QP32_SF_1.avi as it is not found in labels.json\n", + "Skipping ._ElFuente2_1920x1080_30fps_8bit_420_graininess_QP47_SFB_3.avi as it is not found in labels.json\n", + "Skipping ._Kimono1_1920x1080_24fps_8bit_420_graininess_QP47_B_4.avi as it is not found in labels.json\n", + "Skipping ._Tennis_1920x1080_24fps_8bit_420_Motion_QP47_SFB_1.avi as it is not found in labels.json\n", + "Dataset has been reorganized successfully!\n" + ] + } + ], + "execution_count": 10 + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "", + "id": "e52181730c5b3138" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..74a4f29ab4f1aec7e302f2968d5aae5c25c62db0 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,12 @@ +albumentations==1.4.14 +av==12.3.0 +datasets==2.20.0 +numpy==2.1.0 +opencv_python==4.10.0.84 +paramiko==3.4.1 +scikit_learn==1.5.1 +torch==2.4.0 +torchmetrics==1.4.1 +torchvision==0.19.0 +tqdm==4.66.5 +transformers==4.44.0 diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/data_prep_utils/__init__.py b/src/data_prep_utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/data_prep_utils/data_setup.py b/src/data_prep_utils/data_setup.py new file mode 100644 index 0000000000000000000000000000000000000000..2122d13d8963579ddc0ff83b612e406ba79d060e --- /dev/null +++ b/src/data_prep_utils/data_setup.py @@ -0,0 +1,31 @@ +""" +Contains functionality for creating PyTorch DataLoaders +""" + +import os + +from torchvision import datasets, transforms +from torch.utils.data import DataLoader + +NUM_WORKERS = os.cpu_count() + + +def create_dataloaders( + train_dir: str, + test_dir: str, + transform: transforms.Compose, + batch_size: int, + num_workers: int = NUM_WORKERS +): + # todo: implement + # return train_dataloader, test_dataloader, class_names + pass + + +""" +Usage: +from going_modular import data_setup + +# Create train/test dataloader and get class names as a list +train_dataloader, test_dataloader, class_names = data_setup.create_dataloaders(... +""" diff --git a/src/data_prep_utils/preprocess.py b/src/data_prep_utils/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..bdee0ea6e9a37e0e8c753f93fe443db80ff63a87 --- /dev/null +++ b/src/data_prep_utils/preprocess.py @@ -0,0 +1,92 @@ +import os +import torch +import random +import json +from torchvision.io import read_video +from transformers import VideoMAEImageProcessor +from pathlib import Path + +# Load the VideoMAE image processor +model_ckpt = "MCG-NJU/videomae-base" +image_processor = VideoMAEImageProcessor.from_pretrained(model_ckpt, + do_rescale=False) + + +def random_spatio_temporal_crop(video, num_frames=16, height=224, width=224): + T, H, W, C = video.shape + + # Random temporal crop + start_frame = random.randint(0, T - num_frames) + video = video[start_frame:start_frame + num_frames] + + # Random spatial crop + if H > height and W > width: + top = random.randint(0, H - height) + left = random.randint(0, W - width) + video = video[:, top:top + height, left:left + width, :] + else: + video = torch.nn.functional.interpolate(video.permute(0, 3, 1, 2), size=(height, width)).permute(0, 2, 3, 1) + + return video + + +def preprocess_video(video_path, num_crops=6, num_frames=16, height=224, width=224): + video, _, _ = read_video(video_path, pts_unit="sec") + video = video.float() / 255.0 # Normalize to [0, 1] + + crops = [] + for _ in range(num_crops): + crop = random_spatio_temporal_crop(video, num_frames, height, width) + # Apply VideoMAE preprocessing + crop = image_processor(list(crop.permute(0, 3, 1, 2)), return_tensors="pt")["pixel_values"] + crops.append(crop.squeeze(0)) # Remove batch dimension + + return torch.stack(crops) # Stack all crops + + +def main(): + dataset_root_path = "/Volumes/SSD/BVIArtefact" + output_root_path = "/Volumes/SSD/BVIArtefact_preprocessed" + os.makedirs(output_root_path, exist_ok=True) + + # Load original labels + with open(os.path.join(dataset_root_path, "processed_labels.json"), "r") as f: + original_labels = json.load(f) + + # New labels dictionary + new_labels = {} + + # Process videos + for part in ["part1", "part2"]: + part_dir = os.path.join(dataset_root_path, part) + for video_name in os.listdir(part_dir): + if video_name.endswith('.avi'): + video_path = os.path.join(part_dir, video_name) + + if video_name in original_labels: + try: + 
preprocessed_crops = preprocess_video(video_path) + + # Save preprocessed video crops + output_name = f"{Path(video_name).stem}_crops.pt" + output_path = os.path.join(output_root_path, output_name) + torch.save(preprocessed_crops, output_path) + + # Add to new labels dictionary + new_labels[output_name] = original_labels[video_name] + + print(f"Processed {video_name}") + except Exception as e: + print(f"Error processing {video_name}: {str(e)}") + else: + print(f"Skipping {video_name} - not found in labels") + + # Save the new labels + with open(os.path.join(output_root_path, "preprocessed_labels.json"), "w") as f: + json.dump(new_labels, f) + + print("Preprocessing complete.") + + +if __name__ == "__main__": + main() diff --git a/src/data_prep_utils/resize_bvi_artefact.py b/src/data_prep_utils/resize_bvi_artefact.py new file mode 100644 index 0000000000000000000000000000000000000000..ca8d064efd2aa3e27881e4c672fbf2dd99048921 --- /dev/null +++ b/src/data_prep_utils/resize_bvi_artefact.py @@ -0,0 +1,108 @@ +# resize_bvi_artefact.py + +import multiprocessing +import os +import re +import shutil + +import ffmpeg +from tqdm import tqdm + + +def resize_video(input_path, output_path, width=224, height=224): + try: + ( + ffmpeg + .input(input_path) + .filter('scale', width, height) + .output(output_path) + .overwrite_output() + .run(capture_stdout=True, capture_stderr=True) + ) + return None # Success + except ffmpeg.Error as e: + return f"Error processing {input_path}: {e.stderr.decode()}" + + +def get_new_filename(old_filename, width, height): + pattern = r'(.+)_(\d+x\d+)_(\d+fps)_(.+)\.avi' + match = re.match(pattern, old_filename) + + if match: + video_name, old_resolution, fps, rest = match.groups() + return f"{video_name}_{old_resolution}_to_{width}x{height}_{fps}_{rest}.avi" + else: + name, ext = os.path.splitext(old_filename) + return f"{name}_to_{width}x{height}{ext}" + + +def process_video(args): + input_path, output_dir, relative_path, width, height = args + file = os.path.basename(input_path) + new_filename = get_new_filename(file, width, height) + output_path = os.path.join(output_dir, relative_path, new_filename) + + os.makedirs(os.path.dirname(output_path), exist_ok=True) + return resize_video(input_path, output_path, width, height) + + +def preprocess_dataset(input_dir, output_dir, width=560, height=560, num_processes=None): + if num_processes is None: + num_processes = multiprocessing.cpu_count() + + video_files = [] + for part in ['part1', 'part2']: + part_dir = os.path.join(input_dir, part) + print(f"Searching for videos in: {part_dir}") + if not os.path.exists(part_dir): + print(f"Directory not found: {part_dir}") + continue + for root, _, files in os.walk(part_dir): + for file in files: + if file.endswith('.avi'): + relative_path = os.path.relpath(root, input_dir) + input_path = os.path.join(root, file) + video_files.append((input_path, output_dir, relative_path, width, height)) + + print(f"Found {len(video_files)} video files to process.") + + if not video_files: + print("No video files found. 
Please check the input directory.") + return + + with multiprocessing.Pool(processes=num_processes) as pool: + results = list(tqdm(pool.imap(process_video, video_files), total=len(video_files), desc="Processing videos")) + + # Print any errors that occurred + errors = [error for error in results if error is not None] + for error in errors: + print(error) + + # Copy json files to the output directory + json_files = ['labels.json', 'processed_labels.json', 'subsets.json'] + for json_file in json_files: + src = os.path.join(input_dir, json_file) + dst = os.path.join(output_dir, json_file) + if os.path.exists(src): + shutil.copy2(src, dst) + else: + print(f"Warning: {json_file} not found in {input_dir}") + + print(f"Preprocessing completed! Processed {len(video_files)} videos with {len(errors)} errors.") + + +if __name__ == "__main__": + input_dir = "/Volumes/SSD/BVIArtefact" + output_dir = "/Volumes/SSD/preprocessed_BVIArtefact" + + # Get the full path of the current script + script_dir = os.path.dirname(os.path.abspath(__file__)) + + # Construct full paths for input and output directories + input_dir = os.path.join(script_dir, input_dir) + output_dir = os.path.join(script_dir, output_dir) + + print(f"Input directory: {input_dir}") + print(f"Output directory: {output_dir}") + + preprocess_dataset(input_dir, output_dir) diff --git a/src/data_prep_utils/split_dataset.py b/src/data_prep_utils/split_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..9bb73114ceed7e0004e0e99f4e4db55e5541aea0 --- /dev/null +++ b/src/data_prep_utils/split_dataset.py @@ -0,0 +1,92 @@ +import random +import os +import json +import shutil +from collections import defaultdict +from pathlib import Path + + +def split_dataset(preprocessed_dir, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15): + # Load labels + with open(os.path.join(preprocessed_dir, 'preprocessed_labels.json'), 'r') as f: + labels = json.load(f) + + # Group crops by artifacts + artifact_crops = defaultdict(lambda: {'positive': set(), 'negative': set()}) + for crop, artifacts in labels.items(): + for artifact, value in artifacts.items(): + if value == 1: + artifact_crops[artifact]['positive'].add(crop) + else: + artifact_crops[artifact]['negative'].add(crop) + + # Find the minimum number of crops for any artifact + min_pos = min(len(crops['positive']) for crops in artifact_crops.values()) + min_neg = min(len(crops['negative']) for crops in artifact_crops.values()) + min_crops = min(min_pos, min_neg) * 2 # Ensure balance between positive and negative + + # Calculate the number of crops for each split + train_size = int(min_crops * train_ratio) + val_size = int(min_crops * val_ratio) + test_size = min_crops - train_size - val_size + + splits = {'train': set(), 'val': set(), 'test': set()} + split_artifacts = {split: defaultdict(lambda: {'positive': set(), 'negative': set()}) for split in splits} + + # Distribute crops ensuring balance for each artifact in each split + for split, size in [('train', train_size), ('val', val_size), ('test', test_size)]: + pos_count = size // 2 + neg_count = size - pos_count + + for artifact, crops in artifact_crops.items(): + pos_crops = list(crops['positive']) + neg_crops = list(crops['negative']) + random.shuffle(pos_crops) + random.shuffle(neg_crops) + + for _ in range(pos_count): + if pos_crops: + crop = pos_crops.pop() + if crop not in splits['train'] and crop not in splits['val'] and crop not in splits['test']: + splits[split].add(crop) + split_artifacts[split][artifact]['positive'].add(crop) 
+ + for _ in range(neg_count): + if neg_crops: + crop = neg_crops.pop() + if crop not in splits['train'] and crop not in splits['val'] and crop not in splits['test']: + splits[split].add(crop) + split_artifacts[split][artifact]['negative'].add(crop) + + # Create directories and move crops + preprocessed_dir_path = Path(preprocessed_dir) + data_split_path = preprocessed_dir_path.parent / str(preprocessed_dir_path.name + "_split") + + for split, crops in splits.items(): + os.makedirs(data_split_path / split, exist_ok=True) + split_labels = {} + for crop in crops: + src = os.path.join(preprocessed_dir, crop) + dst = os.path.join(data_split_path, split, crop) + shutil.copy(src, dst) # Use copy instead of move to preserve original data + split_labels[crop] = labels[crop] + with open(os.path.join(data_split_path, split, 'labels.json'), 'w') as f: + json.dump(split_labels, f, indent=2) + + print("Dataset split complete") + print(f"Train set: {len(splits['train'])} crops") + print(f"Validation set: {len(splits['val'])} crops") + print(f"Test set: {len(splits['test'])} crops") + + # Print balance information for each artifact in each split + for split in splits: + print(f"\n{split.capitalize()} set balance:") + for artifact in artifact_crops: + pos = len(split_artifacts[split][artifact]['positive']) + neg = len(split_artifacts[split][artifact]['negative']) + print(f" {artifact}: Positive: {pos}, Negative: {neg}") + + +if __name__ == "__main__": + preprocessed_dir = "/Volumes/SSD/BVIArtefact_crops" # Update this to your preprocessed dataset path + split_dataset(preprocessed_dir) diff --git a/src/data_prep_utils/subset_and_process.py b/src/data_prep_utils/subset_and_process.py new file mode 100644 index 0000000000000000000000000000000000000000..d7332750ceb7a305efac46ef6b2ea9d487b79ae3 --- /dev/null +++ b/src/data_prep_utils/subset_and_process.py @@ -0,0 +1,274 @@ +import os +import json +import random +from collections import Counter + +import torch +import cv2 +import numpy as np +from pathlib import Path +from tqdm import tqdm +import argparse +from sklearn.model_selection import train_test_split + +# Argument parser +parser = argparse.ArgumentParser(description='Preprocess BVIArtefact dataset') +parser.add_argument('--input_dir', type=str, default="/Volumes/SSD/BVIArtefact", + help='Input directory containing BVIArtefact dataset') +parser.add_argument('--output_dir', type=str, default="/Volumes/SSD/BVIArtefact_8_crops_all_videos", + help='Output directory for preprocessed data') +parser.add_argument('--num_samples', type=int, default=None, help='Number of videos to sample (None for all)') +parser.add_argument('--crop_size', type=int, default=224, help='Size of spatial crop') +parser.add_argument('--num_frames', type=int, default=8, help='Number of frames to extract') +parser.add_argument('--crops_per_video', type=int, default=4, help='Number of crops to extract per video') +parser.add_argument('--train_ratio', type=float, default=0.7, help='Ratio of videos for training set') +parser.add_argument('--val_ratio', type=float, default=0.15, help='Ratio of videos for validation set') +args = parser.parse_args() + +# Configuration +INPUT_DIR = args.input_dir +OUTPUT_DIR = args.output_dir +LABELS_FILE = os.path.join(INPUT_DIR, "labels.json") +CROP_SIZE = (args.crop_size, args.crop_size) +NUM_FRAMES = args.num_frames +NUM_CROPS_PER_VIDEO = args.crops_per_video + +random.seed(42) + +# Create output directories +for split in ['train', 'val', 'test']: + os.makedirs(os.path.join(OUTPUT_DIR, split), 
exist_ok=True) + +# Load labels +with open(LABELS_FILE, 'r') as f: + labels = json.load(f) + + +def parse_size(size_str): + """Convert size string to bytes""" + size = float(size_str[:-1]) + unit = size_str[-1] + if unit == 'G': + return int(size * 1e9) + elif unit == 'M': + return int(size * 1e6) + else: + return int(size) + + +def read_file_sizes(filename): + """Read file sizes from text file""" + sizes = {} + with open(filename, 'r') as f: + for line in f: + parts = line.strip().split() + if len(parts) == 2: + sizes[parts[0]] = parse_size(parts[1]) + return sizes + + +def extract_random_crop(frames, num_frames, crop_size): + """Extract a random spatio-temporal crop from the frames.""" + t, h, w, _ = frames.shape + + if t < num_frames: + raise ValueError(f"Video has fewer frames ({t}) than required ({num_frames})") + + start_frame = random.randint(0, t - num_frames) + top = random.randint(0, h - crop_size[0]) + left = random.randint(0, w - crop_size[1]) + + crop = frames[start_frame:start_frame + num_frames, + top:top + crop_size[0], + left:left + crop_size[1]] + + return crop + + +def normalize(video, mean, std): + """Normalize the video tensor""" + mean = torch.tensor(mean).view(1, 3, 1, 1) + std = torch.tensor(std).view(1, 3, 1, 1) + return (video - mean) / std + + +def process_videos(video_list, split): + """Process videos and save crops for a specific split""" + preprocessed_labels = {} + label_counts = Counter() + total_crops = 0 + + for video_file, video_name in tqdm(video_list, desc=f"Processing {split} set"): + video_path = os.path.join(INPUT_DIR, video_file) + + # Skip if video is not in labels + if video_name not in labels: + print(f"Skipping {video_file}: No labels found") + continue + + video_labels = labels[video_name] + + try: + # Read video + cap = cv2.VideoCapture(video_path) + frames = [] + while len(frames) < NUM_FRAMES * 2: # Read more frames than needed + ret, frame = cap.read() + if not ret: + break + frames.append(frame) + cap.release() + + if len(frames) < NUM_FRAMES: + print(f"Warning: {video_file} has fewer than {NUM_FRAMES} frames. 
Skipping.") + continue + + frames = np.array(frames) + + for i in range(NUM_CROPS_PER_VIDEO): + # Extract random crop + crop = extract_random_crop(frames, NUM_FRAMES, CROP_SIZE) + + # Convert to torch tensor and normalize + crop = torch.from_numpy(crop).permute(0, 3, 1, 2).float() / 255.0 + + # Normalize using ImageNet stats + crop = normalize(crop, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + + # Generate unique filename for the crop + crop_filename = f"{Path(video_name).stem}_crop_{i}.pt" + crop_path = os.path.join(OUTPUT_DIR, split, crop_filename) + + # Save crop as .pt file + torch.save(crop, crop_path) + + # Store labels for the crop + preprocessed_labels[crop_filename] = video_labels + + total_crops += 1 + + # Update label counts + for artifact, present in video_labels.items(): + if present == 1: + label_counts[f"{artifact}_Positive"] += NUM_CROPS_PER_VIDEO + else: + label_counts[f"{artifact}_Negative"] += NUM_CROPS_PER_VIDEO + + except Exception as e: + print(f"Error processing {video_file}: {str(e)}") + + # Save preprocessed labels + labels_path = os.path.join(OUTPUT_DIR, split, "labels.json") + with open(labels_path, 'w') as f: + json.dump(preprocessed_labels, f, indent=4) + + print(f"\n{split} set statistics:") + print(f"Total crops generated: {total_crops}") + print(f"Number of entries in labels JSON: {len(preprocessed_labels)}") + + # Check if numbers match + if total_crops == len(preprocessed_labels): + print("✅ Numbers match!") + else: + print("⌠Numbers don't match. There might be an issue.") + + return label_counts, total_crops + + +def check_split_overlap(output_dir): + splits = ['train', 'val', 'test'] + parent_videos = {split: set() for split in splits} + + for split in splits: + labels_path = Path(output_dir) / split / "labels.json" + with open(labels_path, 'r') as f: + labels = json.load(f) + + for crop_filename in labels.keys(): + # Extract parent video name by removing the "_crop_{i}.pt" suffix + parent_video = crop_filename.rsplit('_crop_', 1)[0] + parent_videos[split].add(parent_video) + + # Check for overlap between splits + for i, split1 in enumerate(splits): + for split2 in splits[i + 1:]: + overlap = parent_videos[split1].intersection(parent_videos[split2]) + if overlap: + print(f"⌠Overlap found between {split1} and {split2} splits:") + print(f" Common parent videos: {overlap}") + else: + print(f"✅ No overlap found between {split1} and {split2} splits") + + # Print summary + print("\nSummary:") + for split in splits: + print(f"{split} split: {len(parent_videos[split])} unique parent videos") + + +def print_label_balance(label_counts, split_name): + print(f"\n{split_name} set balance:") + artifacts = ['black_screen', 'frame_drop', 'spatial_blur', 'transmission_error', 'aliasing', 'banding', + 'dark_scenes', 'graininess', 'motion_blur'] + for artifact in artifacts: + positive = label_counts[f"{artifact}_Positive"] + negative = label_counts[f"{artifact}_Negative"] + print(f" {artifact}: Positive: {positive}, Negative: {negative}") + + +# Read file sizes +part1_sizes = read_file_sizes(os.path.join(INPUT_DIR, "part1_files_sizes.txt")) +part2_sizes = read_file_sizes(os.path.join(INPUT_DIR, "part2_files_sizes.txt")) + +all_sizes = {**part1_sizes, **part2_sizes} + +# Sort videos by size +sorted_videos = sorted(all_sizes.items(), key=lambda x: x[1]) + +# Sample videos if num_samples is specified +if args.num_samples is not None: + sampled_videos = sorted_videos[:args.num_samples] +else: + sampled_videos = sorted_videos + +# Extract video files and their 
corresponding folders +video_files = [(os.path.join('part1' if f in part1_sizes else 'part2', f), f) for f, _ in sampled_videos] + +# Split videos into train, validation, and test sets +train_videos, temp_videos = train_test_split(video_files, train_size=args.train_ratio, random_state=42) +val_ratio = args.val_ratio / (1 - args.train_ratio) +val_videos, test_videos = train_test_split(temp_videos, train_size=val_ratio, random_state=42) + +# Process each split and collect per-label statistics +train_label_counts, train_crops = process_videos(train_videos, 'train') +val_label_counts, val_crops = process_videos(val_videos, 'val') +test_label_counts, test_crops = process_videos(test_videos, 'test') + +# Final summary +print("\nFinal Summary:") +print(f"Total crops - Train: {train_crops}, Val: {val_crops}, Test: {test_crops}") +total_crops = train_crops + val_crops + test_crops +print(f"Total crops across all splits: {total_crops}") + +# Check total number of label entries +train_labels = json.load(open(os.path.join(OUTPUT_DIR, 'train', 'labels.json'))) +val_labels = json.load(open(os.path.join(OUTPUT_DIR, 'val', 'labels.json'))) +test_labels = json.load(open(os.path.join(OUTPUT_DIR, 'test', 'labels.json'))) + +total_label_entries = len(train_labels) + len(val_labels) + len(test_labels) +print(f"Total label entries across all splits: {total_label_entries}") + +if total_crops == total_label_entries: + print("✅ Total crops match total label entries!") +else: + print("❌ Total crops and total label entries don't match. There might be an issue.") + +print_label_balance(train_label_counts, "Train") +print_label_balance(val_label_counts, "Val") +print_label_balance(test_label_counts, "Test") + +check_split_overlap(OUTPUT_DIR) + +print("Preprocessing completed.") + +# sample usage of this script: +# python src/subset_and_process.py --input_dir /Volumes/SSD/BVIArtefact --output_dir /Volumes/SSD/BVIArtefact_crops --num_samples 100 --crop_size 224 --num_frames 8 --crops_per_video 2 --train_ratio 0.7 --val_ratio 0.15 diff --git a/src/data_prep_utils/subset_data.py b/src/data_prep_utils/subset_data.py new file mode 100644 index 0000000000000000000000000000000000000000..f30aa02daf86311a08b964cd1ac31e960888bbc8 --- /dev/null +++ b/src/data_prep_utils/subset_data.py @@ -0,0 +1,158 @@ +import argparse +import json +import os +import shutil +from collections import defaultdict +from pathlib import Path +from tqdm import tqdm +from src.data_prep_utils.split_dataset import split_dataset + +# Configuration +local_labels_path = 'data/bviArtefactMetaInfo/processed_labels.json' +artefacts_to_choose = ['graininess', 'aliasing', 'banding', 'motion_blur'] # Add more labels as needed +size_limit_gb = 4 # Size limit in GB + +part1_sizes_path = 'data/bviArtefactMetaInfo/part1_files_sizes.txt' +part2_sizes_path = 'data/bviArtefactMetaInfo/part2_files_sizes.txt' + + +def convert_to_bytes(size_str): + size_unit = size_str[-1] + size_value = float(size_str[:-1]) + if size_unit == 'G': + return int(size_value * 1e9) + elif size_unit == 'M': + return int(size_value * 1e6) + elif size_unit == 'K': + return int(size_value * 1e3) + else: + return int(size_value) + + +def load_file_sizes(file_path): + file_sizes = {} + with open(file_path, 'r') as f: + for line in f: + parts = line.strip().split() + file_name = parts[0] + file_size = convert_to_bytes(parts[1]) + file_sizes[file_name] = file_size + return file_sizes + + +def get_balanced_videos(labels, artefacts, size_limit): + video_labels = defaultdict(dict) + for
video, details in labels.items(): + for artefact in artefacts: + video_labels[video][artefact] = details.get(artefact, 0) + + # Separate positive and negative videos + positive_videos = [v for v, l in video_labels.items() if all(l[a] == 1 for a in artefacts)] + negative_videos = [v for v, l in video_labels.items() if all(l[a] == 0 for a in artefacts)] + + # Sort videos by size (smallest to largest) + positive_videos.sort(key=lambda x: file_sizes.get(x, 0)) + negative_videos.sort(key=lambda x: file_sizes.get(x, 0)) + + balanced_videos = [] + total_size = 0 + + print(f"Size limit: {size_limit / 1e9:.2f} GB") + print(f"Total positive videos available: {len(positive_videos)}") + print(f"Total negative videos available: {len(negative_videos)}") + + # Select videos while maintaining balance and respecting size limit + for pos, neg in zip(positive_videos, negative_videos): + pos_size = file_sizes.get(pos, 0) + neg_size = file_sizes.get(neg, 0) + + if total_size + pos_size + neg_size <= size_limit: + balanced_videos.extend([pos, neg]) + total_size += pos_size + neg_size + else: + break + + final_subset = {video: video_labels[video] for video in balanced_videos} + + final_size = sum(file_sizes.get(video, 0) for video in final_subset) + print(f"\nFinal balanced dataset:") + print(f"Size: {final_size / 1e9:.2f} GB") + print(f"Total videos: {len(final_subset)}") + print(f"Positive videos: {len(final_subset) // 2}") + print(f"Negative videos: {len(final_subset) // 2}") + + return final_subset + + +def copy_videos_local(subset_videos, source_base_path, destination_base_path): + progress_bar = tqdm(total=len(subset_videos), desc="Copying videos", unit="file", dynamic_ncols=True) + + for video in subset_videos: + found = False + for part in ['part1', 'part2']: + source_path = os.path.join(source_base_path, part, video) + destination_path = os.path.join(destination_base_path, video) + if os.path.exists(source_path): + progress_bar.set_postfix(file=video) + shutil.copy2(source_path, destination_path) + found = True + break + if not found: + print(f"Video {video} not found in either part1 or part2.") + progress_bar.update(1) + + progress_bar.close() + + +def main(): + parser = argparse.ArgumentParser(description="Create a balanced subset of videos for multi-label classification.") + parser.add_argument("--local", help="Path to local bviDataset folder", type=str, required=True) + parser.add_argument("--size_limit", help="Size limit in GB", type=float, default=2.0) + args = parser.parse_args() + + global size_limit_gb + size_limit_gb = args.size_limit + + # Load file sizes + part1_file_sizes = load_file_sizes(part1_sizes_path) + part2_file_sizes = load_file_sizes(part2_sizes_path) + global file_sizes + file_sizes = {**part1_file_sizes, **part2_file_sizes} + + # Load labels + with open(local_labels_path, 'r') as f: + labels = json.load(f) + + size_limit_bytes = size_limit_gb * 1e9 + balanced_subset = get_balanced_videos(labels, artefacts_to_choose, size_limit_bytes) + + # Create the local download directory + local_download_dir = f'/Volumes/SSD/subsets/{"_".join([art for art in artefacts_to_choose])}_subset_{int(size_limit_gb)}_GB' + os.makedirs(local_download_dir, exist_ok=True) + + # Save the subset list locally + subset_file_path = f'{local_download_dir}/labels.json' + with open(subset_file_path, 'w') as f: + json.dump(balanced_subset, f, indent=4) + + print(f"Balanced subset saved to {subset_file_path}") + + # Verify the balance of the subset labels + for artefact in artefacts_to_choose: + presence_count = 
sum(1 for labels in balanced_subset.values() if labels[artefact] == 1) + absence_count = sum(1 for labels in balanced_subset.values() if labels[artefact] == 0) + print(f"{artefact}:") + print(f" Presence count: {presence_count}") + print(f" Absence count: {absence_count}") + + # Use local dataset + print(f"Using local dataset at: {args.local}") + copy_videos_local(balanced_subset.keys(), args.local, local_download_dir) + + print(f"All raw videos copied to {local_download_dir}") + + split_dataset(local_download_dir) + + +if __name__ == "__main__": + main() diff --git a/src/data_prep_utils/subset_processed_dataset.py b/src/data_prep_utils/subset_processed_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..5a28d06d4fbd9999ab6a3e1e4280582f071c3448 --- /dev/null +++ b/src/data_prep_utils/subset_processed_dataset.py @@ -0,0 +1,113 @@ +import argparse +import json +import os +import random +import shutil +from collections import defaultdict + +from tqdm import tqdm + + +def load_labels(labels_path): + with open(labels_path, 'r') as f: + return json.load(f) + + +def get_balanced_subset(labels, artefacts, count_per_label): + video_labels = defaultdict(dict) + for video, details in labels.items(): + for artefact in artefacts: + video_labels[video][artefact] = details.get(artefact, 0) + + final_subset = {} + artefact_counts = {artefact: {'positive': 0, 'negative': 0} for artefact in artefacts} + + # Shuffle videos to ensure random selection + shuffled_videos = list(video_labels.keys()) + random.shuffle(shuffled_videos) + + for video in shuffled_videos: + include_video = True + for artefact in artefacts: + label = video_labels[video][artefact] + if label == 1 and artefact_counts[artefact]['positive'] >= count_per_label: + include_video = False + break + elif label == 0 and artefact_counts[artefact]['negative'] >= count_per_label: + include_video = False + break + + if include_video: + final_subset[video] = video_labels[video] + for artefact in artefacts: + if video_labels[video][artefact] == 1: + artefact_counts[artefact]['positive'] += 1 + else: + artefact_counts[artefact]['negative'] += 1 + + # Check if we have reached the target count for all artefacts + if all(counts['positive'] >= count_per_label and counts['negative'] >= count_per_label + for counts in artefact_counts.values()): + break + + return final_subset + + +def copy_videos(videos, src_dir, dst_dir): + os.makedirs(dst_dir, exist_ok=True) + for video in tqdm(videos, desc=f"Copying to {os.path.basename(dst_dir)}"): + src_path_part1 = os.path.join(src_dir, 'part1', video) + src_path_part2 = os.path.join(src_dir, 'part2', video) + dst_path = os.path.join(dst_dir, video) + + if os.path.exists(src_path_part1): + shutil.copy2(src_path_part1, dst_path) + elif os.path.exists(src_path_part2): + shutil.copy2(src_path_part2, dst_path) + else: + print(f"Warning: Video {video} not found in either part1 or part2.") + + +def main(): + parser = argparse.ArgumentParser(description="Create a balanced subset of videos and relocate them.") + parser.add_argument("--input_dir", type=str, required=True, help="Path to processed_BVIArtefact folder") + parser.add_argument("--output_dir", type=str, required=True, help="Path to output directory") + parser.add_argument("--count_per_label", type=int, default=500, + help="Number of videos per label (positive/negative)") + args = parser.parse_args() + + # Load labels + labels_path = os.path.join(args.input_dir, 'processed_labels.json') + labels = load_labels(labels_path) + + # Define 
artefacts + artefacts = [''] # Add more labels as needed + + # Get balanced subset + balanced_subset = get_balanced_subset(labels, artefacts, args.count_per_label) + + # Copy videos to output directory + copy_videos(balanced_subset.keys(), args.input_dir, args.output_dir) + + # Save the subset labels + subset_labels_path = os.path.join(args.output_dir, 'labels.json') + with open(subset_labels_path, 'w') as f: + json.dump(balanced_subset, f, indent=4) + + print(f"Balanced subset created in {args.output_dir}") + print(f"Total videos in subset: {len(balanced_subset)}") + + # Verify the balance of the subset labels + for artefact in artefacts: + presence_count = sum(1 for labels in balanced_subset.values() if labels[artefact] == 1) + absence_count = sum(1 for labels in balanced_subset.values() if labels[artefact] == 0) + print(f"{artefact}:") + print(f" Presence count: {presence_count}") + print(f" Absence count: {absence_count}") + + +if __name__ == "__main__": + main() + + # sample usage of the script + # python subset_processed_dataset.py --input_dir /Volumes/SSD/preprocessed_BVIArtefact --output_dir /Volumes/SSD/balanced_subset --count_per_label 500 diff --git a/src/data_prep_utils/subset_random.py b/src/data_prep_utils/subset_random.py new file mode 100644 index 0000000000000000000000000000000000000000..c2b94521837756d5b937b420067fc08592cc1e28 --- /dev/null +++ b/src/data_prep_utils/subset_random.py @@ -0,0 +1,82 @@ +import os +import json +import random + + +def get_file_sizes(file_path): + sizes = {} + with open(file_path, 'r') as f: + for line in f: + parts = line.strip().split() + if len(parts) == 2: + filename, size = parts + sizes[filename] = int(size[:-1]) # Remove 'M' and convert to int + return sizes + + +def create_dataset(labels_file, part1_sizes, part2_sizes, target_size_gb): + # Load labels + with open(labels_file, 'r') as f: + labels = json.load(f) + + # Combine file sizes + all_sizes = {**part1_sizes, **part2_sizes} + + # Create a list of (filename, size) tuples, sorted by size + sorted_files = sorted(all_sizes.items(), key=lambda x: x[1]) + + target_size_mb = target_size_gb * 1024 + selected_files = [] + current_size = 0 + + # Randomly select files, prioritizing smaller ones + while current_size < target_size_mb and sorted_files: + # Randomly choose from the smallest 10% of remaining files + chunk_size = max(1, len(sorted_files) // 10) + chosen_file, file_size = random.choice(sorted_files[:chunk_size]) + + if chosen_file in labels and (current_size + file_size) <= target_size_mb: + selected_files.append(chosen_file) + current_size += file_size + + sorted_files.remove((chosen_file, file_size)) + + # Create a new labels dictionary with only the selected files + selected_labels = {file: labels[file] for file in selected_files if file in labels} + + return selected_files, selected_labels, current_size / 1024 # Convert back to GB + + +# File paths +labels_file = '/Volumes/SSD/BVIArtefact/processed_labels.json' +part1_sizes_file = '/Volumes/SSD/BVIArtefact/part1_files_sizes.txt' +part2_sizes_file = '/Volumes/SSD/BVIArtefact/part2_files_sizes.txt' + +# Target dataset size in GB +target_size_gb = 2 # Change this to your desired size + +# Get file sizes +part1_sizes = get_file_sizes(part1_sizes_file) +part2_sizes = get_file_sizes(part2_sizes_file) + +# Create the dataset +selected_files, selected_labels, actual_size_gb = create_dataset( + labels_file, part1_sizes, part2_sizes, target_size_gb +) + +# Print results +print(f"Selected {len(selected_files)} files") +print(f"Total size:
{actual_size_gb:.2f} GB") + +# Save the new labels to a file +output_dir = '/Volumes/SSD/BVIArtefact' +with open(os.path.join(output_dir, 'selected_labels.json'), 'w') as f: + json.dump(selected_labels, f, indent=2) + +# Save the list of selected files +with open(os.path.join(output_dir, 'selected_files.txt'), 'w') as f: + for file in selected_files: + f.write(f"{file}\n") + +print("Selected labels saved to 'selected_labels.json'") +print("Selected files list saved to 'selected_files.txt'") diff --git a/src/plots.py b/src/plots.py new file mode 100644 index 0000000000000000000000000000000000000000..baec0570115a0767f7a24068c58f5cf6c9447e1d --- /dev/null +++ b/src/plots.py @@ -0,0 +1,123 @@ +import json +import matplotlib.pyplot as plt +import seaborn as sns +import pandas as pd +import numpy as np +import os + + +def load_json_labels(file_path): + with open(file_path, 'r') as f: + return json.load(f) + + +def create_label_df(json_data): + return pd.DataFrame.from_dict(json_data, orient='index') + + +def plot_label_balance_stacked(df, title, save_path): + """ + Plot the positive/negative balance for each label using stacked bars and save as PNG. + """ + label_balance = df.mean() + label_balance_negative = 1 - label_balance + + plt.figure(figsize=(14, 6)) + bar_width = 0.8 + + labels = label_balance.index + pos_bars = plt.bar(labels, label_balance, bar_width, label='Positive', color='#2ecc71') + neg_bars = plt.bar(labels, label_balance_negative, bar_width, bottom=label_balance, label='Negative', + color='#e74c3c') + + plt.title(f'Label Balance - {title}') + plt.xlabel('Labels') + plt.ylabel('Proportion') + plt.legend(title='Class') + plt.xticks(rotation=45, ha='right') + + # Add percentage labels on the bars + for i, (pos, neg) in enumerate(zip(label_balance, label_balance_negative)): + plt.text(i, pos / 2, f'{pos:.1%}', ha='center', va='center', color='white', fontweight='bold') + plt.text(i, pos + neg / 2, f'{neg:.1%}', ha='center', va='center', color='white', fontweight='bold') + + plt.tight_layout() + plt.savefig(save_path, dpi=300, bbox_inches='tight') + plt.close() + + +def plot_label_distribution_across_splits_stacked(train_df, val_df, test_df, save_path): + """ + Plot the distribution of positive and negative labels across train, validation, and test splits and save as PNG. + """ + train_dist = train_df.mean() + val_dist = val_df.mean() + test_dist = test_df.mean() + + df = pd.DataFrame({ + 'Train Positive': train_dist, + 'Train Negative': 1 - train_dist, + 'Validation Positive': val_dist, + 'Validation Negative': 1 - val_dist, + 'Test Positive': test_dist, + 'Test Negative': 1 - test_dist + }) + + plt.figure(figsize=(16, 6)) + df.plot(kind='bar', stacked=True, width=0.8) + plt.title('Label Distribution Across Splits') + plt.xlabel('Labels') + plt.ylabel('Proportion') + plt.xticks(rotation=45, ha='right') + plt.legend(title='Split and Class', bbox_to_anchor=(1.05, 1), loc='upper left') + plt.tight_layout() + plt.savefig(save_path, dpi=300, bbox_inches='tight') + plt.close() + + +def plot_sample_counts(train_df, val_df, test_df, save_path): + """ + Plot the number of samples in each split and save as PNG. 
+ """ + counts = [len(train_df), len(val_df), len(test_df)] + splits = ['Train', 'Validation', 'Test'] + + plt.figure(figsize=(4, 6)) + bars = plt.bar(splits, counts) + plt.title('Number of Samples in Each Split') + plt.ylabel('Number of Samples') + + # Add value labels on the bars + for bar in bars: + height = bar.get_height() + plt.text(bar.get_x() + bar.get_width() / 2., height, + f'{height:,}', + ha='center', va='bottom') + + plt.tight_layout() + plt.savefig(save_path, dpi=300, bbox_inches='tight') + plt.close() + + +# Get the directory of the JSON files +json_dir = os.path.dirname('/Volumes/SSD/BVIArtefact_8_crops_all_videos/train_labels.json') + +# Load the data +train_data = load_json_labels(os.path.join(json_dir, 'train_labels.json')) +val_data = load_json_labels(os.path.join(json_dir, 'val_labels.json')) +test_data = load_json_labels(os.path.join(json_dir, 'test_labels.json')) + +# Create DataFrames +train_df = create_label_df(train_data) +val_df = create_label_df(val_data) +test_df = create_label_df(test_data) + +# Generate and save plots +plot_label_balance_stacked(train_df, 'Train Set', os.path.join(json_dir, 'label_balance_train.png')) +plot_label_balance_stacked(val_df, 'Validation Set', os.path.join(json_dir, 'label_balance_val.png')) +plot_label_balance_stacked(test_df, 'Test Set', os.path.join(json_dir, 'label_balance_test.png')) +plot_label_distribution_across_splits_stacked(train_df, val_df, test_df, + os.path.join(json_dir, 'label_distribution_across_splits.png')) +plot_sample_counts(train_df, val_df, test_df, os.path.join(json_dir, 'sample_counts.png')) + +print(f"Plots have been saved in the directory: {json_dir}")