I have trained a CNN model in Google Colab for facial expression detection on the FER2013 dataset, which contains 7 emotion classes ('Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral').
I used the Flask framework to build a web application, and OpenCV's haarcascade_frontalface_default.xml to detect faces. With this I can stream live video from my laptop's webcam and detect the facial expressions in real time.
But when I added audio corresponding to the expressions together with the text message, the live video stream started lagging (getting stuck in between) and the video and audio went out of sync. How can I keep them in sync, or deliver the text and audio for the detected expression every 3 seconds (i.e. while the video stream is running, take a frame every 3 seconds and deliver the audio message corresponding to that particular frame together with the text message)?
Any help would be much appreciated, thanks in advance.
I referred to the link below for app.py and index.html when creating the web app:
https://levelup.gitconnected.com/how-to-build-a-real-time-emotion-detection-web-app-ce7e3ed7b7de
That version shows the text message corresponding to the facial expression (frame by frame), but I want the audio as well (e.g. 'happy.mp3' should play for a happy expression). The audio files I use are between 2 and 4 KB each.
My Python file: app.py
```python
from flask import Flask, render_template, Response
import cv2
import numpy as np
from playsound import playsound
from tensorflow.keras.models import model_from_json
from tensorflow.keras.preprocessing import image

# load model
model = model_from_json(open(r'C:\Users\HP\emotion_model.json', 'r').read())
# load weights
model.load_weights(r'C:\Users\HP\emotion_model.h5')

face_haar_cascade = cv2.CascadeClassifier(r'C:\Users\HP\haarcascade_frontalface_default.xml')

app = Flask(__name__)
camera = cv2.VideoCapture(0)


def gen_frames():  # generate frame by frame from camera
    while True:
        # Capture frame by frame
        success, frame = camera.read()
        if not success:
            break
        else:
            gray_img = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            faces_detected = face_haar_cascade.detectMultiScale(gray_img, 1.32, 5)
            for (x, y, w, h) in faces_detected:
                print('WORKING')
                cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), thickness=4)
                roi_gray = gray_img[y:y + h, x:x + w]  # crop the region of interest, i.e. the face area
                roi_gray = cv2.resize(roi_gray, (48, 48))
                img_pixels = image.img_to_array(roi_gray)
                img_pixels = np.expand_dims(img_pixels, axis=0)
                img_pixels /= 255
                predictions = model.predict(img_pixels)
                # find the index of the highest score
                max_index = np.argmax(predictions[0])
                emotions = ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']
                predicted_emotion = emotions[max_index]
                print(predicted_emotion)
                cv2.putText(frame, predicted_emotion, (int(x), int(y)),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                # the code below is commented out because it makes the live video
                # stream get stuck in between (audio and video/text message not in sync)
                """if predicted_emotion == 'Happy':
                    playsound(r'C:\Users\HP\emotions_audio\happy.mp3')
                elif predicted_emotion == 'Angry':
                    playsound(r'C:\Users\HP\emotions_audio\angry.mp3')"""
            resized_img = cv2.resize(frame, (1000, 700))  # note: not used below
            ret, buffer = cv2.imencode('.jpg', frame)
            frame = buffer.tobytes()
            # concatenate frames one by one and show the result
            yield (b'--frame\r\n'
                   b'Content-Type: image/jpeg\r\n\r\n' + frame + b'\r\n')


@app.route('/video_feed')
def video_feed():
    # Video streaming route. Put this in the src attribute of an img tag.
    return Response(gen_frames(), mimetype='multipart/x-mixed-replace; boundary=frame')


@app.route('/')
def index():
    return render_template('index.html')


if __name__ == '__main__':
    app.run(debug=True)
```
My HTML file in the templates folder: index.html
```html
<!doctype html>
<html lang="en">
<head>
    <!-- Required meta tags -->
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
    <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
    <meta name="description" content="">
    <link rel="stylesheet" href="css/bootstrap.min.css">
    <link rel="stylesheet" href="css/fontAwesome.css">
    <link rel="stylesheet" href="css/templatemo-main.css">
    <!-- Bootstrap CSS -->
    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/css/bootstrap.min.css"
          integrity="sha384-MCw98/SFnGE8fJT3GXwEOngsV7Zt27NXFoaoApmYm81iuXoPkFOJwJ8ERdknLPMO"
          crossorigin="anonymous">
    <title>Real Time Emotion Detection</title>
</head>
<body style="background-color:#002147;">
    <div class="parallax-content baner-content" id="home">
        <div class="container">
            <div class="row">
                <div class="col-lg-8 offset-lg-2">
                    <h3 class="mt-5"><font color="white" style="font-family:verdana;"><center>Real-Time Emotion Detection</center></font></h3>
                    <center><img src="{{ url_for('video_feed') }}" width="80%"></center>
                </div>
            </div>
        </div>
    </div>
</body>
</html>
```
Answer
This is because the playsound execution is synchronous: before moving on to the next frame's detection, playsound must run to completion, which introduces the lag you experience.
playsound does have a block flag:
```python
import playsound

playsound.playsound('test.mp3', block=False)
```
This works on Linux but not on Windows; I have tested it myself.
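On Windows, a minimal workaround is to launch each blocking call in a short-lived daemon thread so the caller returns immediately (the play_async helper below is my own sketch, not part of the playsound API):

```python
import threading
from playsound import playsound


def play_async(path):
    # Run the blocking playsound call in a throwaway daemon thread so the
    # caller (e.g. the video loop) never waits for the audio to finish.
    threading.Thread(target=playsound, args=(path,), daemon=True).start()
```

Note that this gives you no control over overlap: two quickly detected emotions will play on top of each other.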
Either way, I would strongly suggest using separate threads/processes for the video processing and the audio playback. Something like this:
```python
import playsound
import threading
import time


def sound(play_sound: threading.Event):
    while True:
        play_sound.wait()  # sleep until the video thread signals us
        print('playing sounds...')
        time.sleep(3)  # stand-in for the actual playback
        # playsound.playsound('test.mp3')
        print('playing sounds...done')
        play_sound.clear()  # ready for the next signal


def video(play_sound: threading.Event):
    i = 1
    while i < 10:
        print(f'processing frame {i}')
        # processing camera info here
        if i % 4 == 0:  # your facial detection activates
            if not play_sound.is_set():
                play_sound.set()
        time.sleep(1)
        i += 1


def main():
    play_sound = threading.Event()
    sound_thread = threading.Thread(target=sound, args=(play_sound,))
    video_thread = threading.Thread(target=video, args=(play_sound,))
    # make the sound thread a daemon, otherwise the program
    # won't terminate together with the video thread
    sound_thread.daemon = True
    video_thread.start()
    sound_thread.start()
    # it is sufficient to wait for the video thread to terminate
    video_thread.join()


if __name__ == '__main__':
    main()
```
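To connect this to your app and to the "every 3 seconds" requirement, here is a hedged sketch (EMOTION_AUDIO, sound_worker, and deliver are illustrative names of mine, not from the linked article): a background worker consumes the latest predicted emotion from a one-slot queue, and gen_frames calls deliver(predicted_emotion) right after the prediction, which forwards at most one emotion every 3 seconds and never blocks the stream.

```python
import queue
import threading
import time

from playsound import playsound

# Hypothetical mapping from predicted emotion to mp3 file; the paths are
# illustrative, point them at your actual audio files.
EMOTION_AUDIO = {
    'Happy': r'C:\Users\HP\emotions_audio\happy.mp3',
    'Angry': r'C:\Users\HP\emotions_audio\angry.mp3',
}

emotion_queue = queue.Queue(maxsize=1)  # holds at most the latest emotion
last_delivery = 0.0                     # time of the last audio/text delivery


def sound_worker():
    # Background consumer: plays one clip at a time; blocking inside
    # this thread never stalls the video loop.
    while True:
        emotion = emotion_queue.get()
        path = EMOTION_AUDIO.get(emotion)
        if path:
            playsound(path)


def deliver(predicted_emotion):
    # Call this from gen_frames() right after predicted_emotion is computed.
    # It forwards at most one emotion every 3 seconds and silently drops the
    # rest, so the stream itself never waits on audio.
    global last_delivery
    now = time.time()
    if now - last_delivery >= 3:
        try:
            emotion_queue.put_nowait(predicted_emotion)
            last_delivery = now
        except queue.Full:
            pass  # a sound is still pending; skip this one


# start the worker once, e.g. next to `camera = cv2.VideoCapture(0)`:
threading.Thread(target=sound_worker, daemon=True).start()
```

The same 3-second gate can drive the cv2.putText message as well, so the on-screen text and the audio always refer to the same frame.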