diff --git a/vitookit/datasets/ffcv_transform.py b/vitookit/datasets/ffcv_transform.py
index 89d749e18474bf6bb8deeb490582271d6d415bc9..a8ea5890d7a92f5d25b1efe78e13a481d80c98c1 100644
--- a/vitookit/datasets/ffcv_transform.py
+++ b/vitookit/datasets/ffcv_transform.py
@@ -29,6 +29,7 @@ from ffcv.fields.decoders import IntDecoder, RandomResizedCropRGBImageDecoder, S
 
 import torch
 import torchvision.transforms as tfms
+from torchvision.transforms import functional as F
 from torch import nn
 
 IMAGENET_MEAN = np.array([0.485, 0.456, 0.406]) * 255
@@ -315,6 +316,69 @@ class Solarization(Operation):
         return previous_state, None
   
 
+@njit
+def generate_gaussian_filter(sigma: int | float,filter_shape: list | tuple = (3, 3)):
+    """Return a normalized 2-D Gaussian kernel (float32) of shape ``filter_shape``.
+
+    ``sigma`` is the standard deviation of the Gaussian and must be non-zero.
+    Offsets are measured from the geometric centre of the window, so even
+    window sizes stay inside the kernel array; odd sizes are unchanged.
+    """
+    m, n = filter_shape
+    y_centre = (m - 1) / 2.0
+    x_centre = (n - 1) / 2.0
+    gaussian_filter = np.zeros((m, n), np.float32)
+    k = 1 / (2.0 * sigma**2.0)
+    for row in range(m):
+        for col in range(n):
+            dist_sq = (col - x_centre)**2 + (row - y_centre)**2
+            gaussian_filter[row, col] = np.exp(-dist_sq * k)
+    return gaussian_filter/gaussian_filter.sum()
+
+@njit
+def convolution(image: np.ndarray, kernel: np.ndarray, output: np.ndarray) -> np.ndarray:
+    '''
+    Apply a 2-D ``kernel`` to every channel of a 3-D (rows, cols, channels) image.
+
+    Filtered values are written into ``output`` (indexed like ``image``, so it
+    is expected to have the same shape). Pixels too close to the border for the
+    window to fit are copied through unfiltered. ``output`` is returned.
+    ### Example
+    >>> import numpy as np
+    >>> from PIL import Image
+    >>>
+    >>> kernel = np.array(
+    >>>     [[-1, 0, 1],
+    >>>     [-2, 0, 2],
+    >>>     [-1, 0, 1]], np.float32
+    >>> )
+    >>> img = np.array(Image.open('./lenna.png'))
+    >>> res = convolution(img, kernel, np.empty_like(img))
+    '''
+    if len(image.shape) == 3:
+        m_i, n_i, c_i = image.shape
+    else:
+        raise Exception('Shape of image not supported')
+
+    # The kernel is 2-D (as built by generate_gaussian_filter), so its shape has two entries.
+    m_k, n_k = kernel.shape
+    y_strides = m_i - m_k + 1  # possible number of strides in y direction
+    x_strides = n_i - n_k + 1  # possible number of strides in x direction
+    pad_y = (m_k - 1) // 2
+    pad_x = (n_k - 1) // 2
+    # center region
+    for i in range(y_strides):
+        for j in range(x_strides):
+            for c in range(c_i): # looping over the all channels
+                output[i+pad_y,j+pad_x, c] = np.sum(image[i:i+m_k, j:j+n_k,c] * kernel)
+    # no filter for the border; explicit end offsets keep a zero-width border empty
+    output[:pad_y] = image[:pad_y]
+    output[m_i - (m_k - 1 - pad_y):] = image[m_i - (m_k - 1 - pad_y):]
+    output[:, :pad_x] = image[:, :pad_x]
+    output[:, n_i - (n_k - 1 - pad_x):] = image[:, n_i - (n_k - 1 - pad_x):]
+    return output
+
+
 class ThreeAugmentation(Operation):
     def __init__(
         self, threshold=128, radius_min=0.1, radius_max=2.
@@ -330,32 +394,35 @@ class ThreeAugmentation(Operation):
         radius_min = self.radius_min
         radius_max = self.radius_max
         
-        def randchoice(images, _):
+        def randchoice(images, dst):
             for i in my_range(images.shape[0]):
                 idx = random.randint(0, 2)
                 if idx == 0:       
                     # solarize             
                     mask = images[i] >= threshold
-                    images[i] = np.where(mask, 255 - images[i], images[i])
+                    dst[i] = np.where(mask, 255 - images[i], images[i])
                 elif idx == 1:
                     # grayscale
-                    images[i] = (
+                    dst[i] = (
                         0.2989 * images[i, ..., 0:1]
                         + 0.5870 * images[i, ..., 1:2]
                         + 0.1140 * images[i, ..., 2:3]
                     )
                 else:
-                    # TODO: GaussianBlur
-                    radius = np.random.uniform(radius_min, radius_max)
-                    # images[i] = gaussian_filter(images[i], radius)
-            return images
-        # randchoice.is_parallel = True
-        
+                    sigma = np.random.uniform(radius_min, radius_max)                    
+                    kernel = generate_gaussian_filter(sigma,filter_shape=(5, 5))
+                    convolution(images[i], kernel, dst[i])
+                    
+            return dst
+        randchoice.is_parallel = True        
         return randchoice
 
     def declare_state_and_memory(self, previous_state: State) -> Tuple[State, Optional[AllocationQuery]]:
-        # No updates to state or extra memory necessary!
-        return previous_state, None
+        # State is unchanged; a buffer with the previous state's shape and dtype is requested (presumably the `dst` argument of the generated function - confirm against FFCV).
+        mem_alloc = AllocationQuery(previous_state.shape,dtype=previous_state.dtype)
+        return previous_state, mem_alloc
+
+
 
 @gin.configurable
 def ThreeAugmentPipeline(img_size=224,scale=(0.08,1), color_jitter=None):
@@ -376,7 +443,7 @@ def ThreeAugmentPipeline(img_size=224,scale=(0.08,1), color_jitter=None):
                 # ToDevice(torch.device('cuda')),        
                 ToTorchImage(),
             ])
-    label_pipeline = [IntDecoder(), ToTensor(),ToDevice(torch.device('cuda')),View(-1)]
+    label_pipeline = [IntDecoder(), ToTensor(),View(-1)]
     # Pipeline for each data field
     pipelines = {
         'image': image_pipeline,
diff --git a/vitookit/datasets/transform.py b/vitookit/datasets/transform.py
index 4fdfb0f384d9e218e1fd047bc1a7da72f2697084..24f877c932987a09d0cdb08036bc7f785d16e4ad 100644
--- a/vitookit/datasets/transform.py
+++ b/vitookit/datasets/transform.py
@@ -13,7 +13,7 @@ IMAGENET_MEAN = np.array([0.485, 0.456, 0.406]) * 255
 IMAGENET_STD = np.array([0.229, 0.224, 0.225]) * 255
 
 @gin.configurable
-def SimplePipeline(img_size=224,scale=(0.2,1), ratio=(3.0/4.0, 4.0/3.0)):
+def SimplePipeline(img_size=224,scale=(0.2,1), ratio=(3.0/4.0, 4.0/3.0),blur=False):
     image_pipeline = [
             RandomResizedCropRGBImageDecoder((img_size, img_size), scale=scale,ratio=ratio),
             RandomHorizontalFlip(),
@@ -22,6 +22,8 @@ def SimplePipeline(img_size=224,scale=(0.2,1), ratio=(3.0/4.0, 4.0/3.0)):
             ToDevice(torch.device('cuda')),        
             ToTorchImage(),
             ]
+    if blur:
+        image_pipeline.append(transforms.GaussianBlur(3))
     label_pipeline = [IntDecoder(), ToTensor(),ToDevice(torch.device('cuda'))]
     # Pipeline for each data field
     pipelines = {
diff --git a/vitookit/evaluation/eval_cls.py b/vitookit/evaluation/eval_cls.py
index d06c688632b723d5ea4deadf2613218a5006904d..e0896cff0c0291a6bb21aff0117389deb8c1a646 100644
--- a/vitookit/evaluation/eval_cls.py
+++ b/vitookit/evaluation/eval_cls.py
@@ -335,7 +335,10 @@ def main(args):
     if args.pretrained_weights:
         load_pretrained_weights(model, args.pretrained_weights, checkpoint_key=args.checkpoint_key, prefix=args.prefix)
     if args.compile:
-        model = torch.compile(model)    
+        model = torch.compile(model)   
+        from torch import _dynamo  # not `import torch._dynamo`: that would rebind `torch` as a local of main()
+        _dynamo.config.suppress_errors = True
+        
     trunc_normal_(model.head.weight, std=2e-5)
     
     model.to(device)
diff --git a/vitookit/evaluation/eval_cls_ffcv.py b/vitookit/evaluation/eval_cls_ffcv.py
index 9a477d16fb5ec0ac310b2b0396d0e87f8ac607f5..b71e3aa99207fba4aa80c0868d3e58c27ae10bd5 100644
--- a/vitookit/evaluation/eval_cls_ffcv.py
+++ b/vitookit/evaluation/eval_cls_ffcv.py
@@ -106,7 +106,7 @@ def get_args_parser():
                         help='LR decay rate (default: 0.1)')
 
     # Augmentation parameters
-    parser.add_argument('--ThreeAugment', action='store_true', default=False) #3augment
+    parser.add_argument('--ThreeAugment', action=argparse.BooleanOptionalAction, default=True) #3augment; on by default, disable with --no-ThreeAugment
     parser.add_argument('--src',action='store_true', default=False, 
                         help="Use Simple Random Crop (SRC) or Random Resized Crop (RRC). Use SRC when there is less risk of overfitting, such as on ImageNet-21k.")
     parser.add_argument('--color_jitter', type=float, default=None, metavar='PCT',
@@ -300,6 +300,8 @@ def main(args):
         load_pretrained_weights(model, args.pretrained_weights, checkpoint_key=args.checkpoint_key, prefix=args.prefix)
     if args.compile:
         model = torch.compile(model)    
+        from torch import _dynamo  # not `import torch._dynamo`: that would rebind `torch` as a local of main()
+        _dynamo.config.suppress_errors = True
     trunc_normal_(model.head.weight, std=2e-5)
     
     model.to(device)
diff --git a/vitookit/evaluation/eval_linear.py b/vitookit/evaluation/eval_linear.py
index 8a2ec0eea121f5ce162e83c417b0aa80b563d6e1..aeac274b79e0354d5abe18357364804a6db6f861 100644
--- a/vitookit/evaluation/eval_linear.py
+++ b/vitookit/evaluation/eval_linear.py
@@ -213,6 +213,8 @@ def main(args):
         
     if args.compile:
         model = torch.compile(model)    
+        from torch import _dynamo  # not `import torch._dynamo`: that would rebind `torch` as a local of main()
+        _dynamo.config.suppress_errors = True
     model.to(device)
 
     model_without_ddp = model
diff --git a/vitookit/evaluation/eval_linear_ffcv.py b/vitookit/evaluation/eval_linear_ffcv.py
index 5368cc36fa1a4b30cbdeb968844c936fbe2c036b..b37c0792811624006d8bd680178117c8213feff7 100644
--- a/vitookit/evaluation/eval_linear_ffcv.py
+++ b/vitookit/evaluation/eval_linear_ffcv.py
@@ -167,7 +167,9 @@ def main(args):
 
     if args.compile:
         model = torch.compile(model)    
-
+        from torch import _dynamo  # not `import torch._dynamo`: that would rebind `torch` as a local of main()
+        _dynamo.config.suppress_errors = True
+        
     model.to(device)
 
     model_without_ddp = model