Commit 8818db1c authored by Patel, Shrey G (UG - Comp Sci & Elec Eng)

Added MFCC code

Added MFCC code to codebase as audio_processing() function

Improved debug print() output

Removed vid_num argument from frame_processing() function
parent 9971f0f1
import cv2
import numpy as np
import os
import shutil
from python_speech_features import mfcc
import scipy.io.wavfile as wav
#DATASET_SIZE = 20
DATASET_SIZE = 2


def frame_processing(img, face_cascade, eye_cascade):
    dimensions = img.shape
    #print(dimensions)
    y = dimensions[0]
    x = dimensions[1]
    #print(y, x)
    # This assumes that the image is never taller than it is wide
    if (y > 1920):
        pixel_y_scale = 1080 / y
        # scaling to maintain vertical FOV
        img = cv2.resize(img, None, fx=pixel_y_scale,
                         fy=pixel_y_scale, interpolation=cv2.INTER_AREA)
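    # INTER_AREA resampling is the method the OpenCV docs recommend for
    # shrinking an image, as it gives moire-free results when decimating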
    #winname = 'image' + str(img_num).zfill(4) + '.jpg'
    # Converting the image into grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Creating variable faces
    faces = face_cascade.detectMultiScale(gray, 1.1, 4)
    # Defining and drawing the rectangle around the face
    for (x, y, w, h) in faces:
        #print("faces: x =", x, "y =", y, "w =", w, "h =", h)
        #cv2.rectangle(img, (x, y), (x+w, y+h), (0, 255, 0), 3)
        pass
    # Creating two regions of interest
    # (x, y, w, h) carry over from the loop above, i.e. the last face detected
    #print("ROI: x =", x, "y =", y, "w =", w, "h =", h)
    roi_gray = gray[y:(y+h), x:(x+w)]
    roi_color = img[y:(y+h), x:(x+w)]
    # Creating variable eyes
    eyes = eye_cascade.detectMultiScale(roi_gray, 1.1, 4)
    # eye_1 and eye_2 below are only defined if at least two eyes were found
    if len(eyes) < 2:
        raise Exception("Error! Could not detect two eyes!")
    index = 0
    # Creating for loop in order to divide one eye from another
    for (ex, ey, ew, eh) in eyes:
        if index == 0:
            eye_1 = (ex, ey, ew, eh)
        elif index == 1:
            eye_2 = (ex, ey, ew, eh)
        # Drawing rectangles around the eyes
        #cv2.rectangle(roi_color, (ex, ey), (ex+ew, ey+eh), (0, 0, 255), 3)
        index = index + 1
    #cv2.imshow(winname, img)
    # Assumes that the left eye has the smaller x-coordinate!!! - need to relook at this later
    if eye_1[0] < eye_2[0]:
        left_eye = eye_1
        right_eye = eye_2
    else:
        left_eye = eye_2
        right_eye = eye_1
    # Calculating the coordinates of the central points of the rectangles
    left_eye_center = (int(left_eye[0] + (left_eye[2] / 2)),
                       int(left_eye[1] + (left_eye[3] / 2)))
    left_eye_x = left_eye_center[0]
    left_eye_y = left_eye_center[1]
    right_eye_center = (
        int(right_eye[0] + (right_eye[2]/2)), int(right_eye[1] + (right_eye[3]/2)))
    right_eye_x = right_eye_center[0]
    right_eye_y = right_eye_center[1]
    """cv2.circle(roi_color, left_eye_center, 5, (255, 0, 0), -1)
    cv2.circle(roi_color, right_eye_center, 5, (255, 0, 0), -1)
    cv2.line(roi_color, right_eye_center, left_eye_center, (0, 200, 200), 3)"""
    if left_eye_y > right_eye_y:
        A = (right_eye_x, left_eye_y)
        # Integer -1 indicates that the image will rotate in the clockwise direction
        direction = -1
    else:
        A = (left_eye_x, right_eye_y)
        # Integer 1 indicates that the image will rotate in the counter-clockwise
        # direction
        direction = 1
    """cv2.circle(roi_color, A, 5, (255, 0, 0), -1)
    cv2.line(roi_color, right_eye_center, left_eye_center, (0, 200, 200), 3)
    cv2.line(roi_color, left_eye_center, A, (0, 200, 200), 3)
    cv2.line(roi_color, right_eye_center, A, (0, 200, 200), 3)
    #cv2.imshow(winname, img)"""
    delta_x = right_eye_x - left_eye_x
    delta_y = right_eye_y - left_eye_y
    # arctan2 avoids a ZeroDivisionError when the eye centres share an x-coordinate
    angle = np.arctan2(delta_y, delta_x)
    angle = (angle * 180) / np.pi
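    # cv2.getRotationMatrix2D() below expects the angle in degrees, with
    # positive values rotating the image counter-clockwise, hence the
    # radians-to-degrees conversion above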
    # Width and height of the image
    h, w = img.shape[:2]
    #print("img: h = ", h, "w = ", w)
    # Calculating a center point of the image
    # Integer division "//" ensures that we receive whole numbers
    center = (w // 2, h // 2)
    # Defining a matrix M by calling the
    # cv2.getRotationMatrix2D method
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    # Applying the rotation to our image using the
    # cv2.warpAffine method
    rotated = cv2.warpAffine(img, M, (w, h))
    # Converting the rotated image into grayscale, so that the face is
    # re-detected in the same image that gets cropped
    crop_gray = cv2.cvtColor(rotated, cv2.COLOR_BGR2GRAY)
    # Creating variable faces
    crop_faces = face_cascade.detectMultiScale(crop_gray, 1.1, 4)
    # Defining and drawing the rectangle around the face
    for (x, y, w, h) in crop_faces:
        #print("crop_faces: x =", x, "y =", y, "w =", w, "h =", h)
        # Crops around face
        cropped = rotated[y:(y+h), x:(x+w)]
        #cv2.rectangle(img, (x, y), (x+w, y+h), (0, 255, 0), 3)
    # Recalculate the width and height of the image
    h, w = cropped.shape[:2]
    #print("cropped: h = ", h, "w = ", w)
    if (w != h):
        raise Exception("Error! Crop region is not square!")
    # note that by default python uses double precision floats (i.e. 64 bit representation)
    # numpy specific floats will still be limited to 64 bits, due to the C compiler... check logbook for more information
    scaledown_ratio = 256 / h
    #print("scaledown_ratio =", scaledown_ratio)
    # print(np.finfo(np.longdouble))
    #resized = cv2.resize(img, (256, 256), interpolation = cv2.INTER_AREA)
    resized = cv2.resize(cropped, None, fx=scaledown_ratio,
                         fy=scaledown_ratio, interpolation=cv2.INTER_AREA)
    # cv2.destroyAllWindows()
    return resized


def audio_processing(clip_num):
    (rate, sig) = wav.read("vids/dataset/audio" + str(clip_num) + ".wav")
    frame_time = 1 / 30
    print("frame_time = " + str(frame_time) + "s")
    #Returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
    #Parameters: numcep - the number of cepstra to return, default 13
    #For a very basic understanding, the cepstrum captures the rate of change across spectral bands
    #mfcc(signal, samplerate=16000, winlen=0.025, winstep=0.01, numcep=13, nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97, ceplifter=22, appendEnergy=True, winfunc=<function <lambda>>)
    mfcc_feat = mfcc(sig, rate, winlen=0.025, winstep=frame_time,
                     numcep=13, nfilt=26, nfft=1200)
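    # With winstep equal to the video frame time (1/30 s), mfcc_feat holds one
    # 13-coefficient feature vector per video frame, e.g. roughly 300 rows for
    # a 10-second clip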
    #numpy.save(file, arr, allow_pickle=True, fix_imports=True)
    #Save an array to a binary file in NumPy .npy format.
    #Notes: Any data saved to the file is appended to the end of the file.
    return mfcc_feat


def data_processing(vid_num, face_cascade, eye_cascade):
    # Create a VideoCapture object and read from input file
    # If the input is the camera, pass 0 instead of the video file name
    #cap = cv2.VideoCapture('D:vids/dataset/video1.mp4')
    #print(vid_num)
    print('vids/dataset/video' + str(vid_num) + '.mp4')
    cap = cv2.VideoCapture('vids/dataset/video' + str(vid_num) + '.mp4')
    # Check if the video opened successfully
    if not cap.isOpened():
        raise Exception("Error opening video stream or file")
    new_path = 'data/clip' + str(vid_num) + '/images/'
    # Creates data folder and other subdirectories, throws an error if they already exist
    os.makedirs(new_path, exist_ok=False)
    # Read until the video is completed
    i = 0
    while cap.isOpened():
        # Capture frame-by-frame
        ret, frame = cap.read()
        if ret:
            print("vid", vid_num, "frame", i)
            processed_frame = frame_processing(
                frame, face_cascade, eye_cascade)
            cv2.imwrite(
                (new_path + 'image' + str(i).zfill(4) + '.jpg'), processed_frame)
            i += 1
        else:
            # Break the loop once the last frame has been read
            break
    # When everything is done, release the video capture object
    cap.release()
    mfcc_array = audio_processing(vid_num)
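    # np.save() appends the .npy extension automatically, so this writes
    # data/clipN/mfcc.npy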
    np.save("data/clip" + str(vid_num) + "/mfcc", mfcc_array)
    print("mfcc for audio" + str(vid_num) + " saved!")
    return


print("about to create cascades")
# Creating face_cascade and eye_cascade objects
face_cascade = cv2.CascadeClassifier("haarcascade_frontalface_default.xml")
eye_cascade = cv2.CascadeClassifier("haarcascade_eye.xml")
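# these Haar cascade XML files ship with OpenCV (see cv2.data.haarcascades)
# and are assumed to have been copied into the working directory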
print("created cascades")
print("about to delete data folder")
# NEED TO ADD ERROR CHECKING / EXCEPTION HANDLING HERE!!!
# Deletes any previous data folder and old frames
try:
    shutil.rmtree('data/')
    print("data folder deleted")
except FileNotFoundError:
    print("data folder did not exist")
for i in range(DATASET_SIZE):
    data_processing(i+1, face_cascade, eye_cascade)
print("Complete!")
@@ -4,7 +4,7 @@ import scipy.io.wavfile as wav
 import numpy as np
 frame_rate = 30
-clip_num = 2
+clip_num = 1
 (rate, sig) = wav.read("vids/dataset/audio" + str(clip_num) + ".wav")