Predictive-Code-Repo/main_pipeline_whisper.py at main · LouisBrammer/Predictive-Code-Repo · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
from llm_api import get_sentiment_and_emotion
import pickle
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from prediction_pipeline import prediction_pipeline
import sys
import re
import emoji
import contractions
import whisper
import sounddevice as sd
import numpy as np
import scipy.io.wavfile as wav
import os
import time
import signal
from datetime import datetime

# Load the Keras models and the tokenizer once, at import time.
# The paths are relative, so they are resolved against the current working
# directory of the process.
sentiment_model = keras.models.load_model('imdb_gru.keras')
emotion_model = keras.models.load_model('emotion_model_transformer.keras')

# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load a tokenizer file that comes from a trusted source.
with open("tokenizer1.pkl", "rb") as f:
    tokenizer = pickle.load(f)
max_len = 100  # Should match what was used in training

# Set the microphone device index here
MICROPHONE_DEVICE_INDEX = 0  # 0 = MacBook Air microphone

def signal_handler(sig, frame):
    """Announce shutdown and terminate the process with exit status 0.

    The two parameters are the signal number and the interrupted stack
    frame supplied by the ``signal`` module; neither is inspected.
    """
    print("\nStopping the recording process...")
    # Equivalent to sys.exit(0): unwinds the stack via SystemExit.
    raise SystemExit(0)

def record_audio(duration=10, sample_rate=16000):
    """Record mono audio from the configured microphone.

    The input stream is read in chunks of at most one second so a live
    audio-level meter can be printed while recording.

    Args:
        duration: Length of the recording in seconds; may be fractional.
        sample_rate: Sampling rate in Hz.

    Returns:
        A float32 NumPy array of shape ``(int(duration * sample_rate), 1)``.
    """
    print(f"\nRecording for {duration} seconds...")

    # Pre-allocate the full buffer; it is filled chunk by chunk below.
    total_frames = int(duration * sample_rate)
    recording = np.zeros((total_frames, 1), dtype='float32')

    # Start recording
    with sd.InputStream(samplerate=sample_rate, channels=1, dtype='float32', device=MICROPHONE_DEVICE_INDEX) as stream:
        for start in range(0, total_frames, sample_rate):
            # The last chunk is shorter than one second whenever the total
            # frame count is not a multiple of sample_rate. Reading only the
            # remaining frames keeps the chunk and the destination slice the
            # same length (otherwise the assignment fails to broadcast).
            frames = min(sample_rate, total_frames - start)
            chunk, _ = stream.read(frames)
            recording[start:start + frames] = chunk

            # Calculate and print audio level
            level = np.abs(chunk).mean()
            print(f"Audio level: {level:.4f}", end='\r')

    print("\nRecording finished!")

    # Check if we got any audio
    avg_level = np.abs(recording).mean()
    if avg_level < 0.01:
        print("\n⚠️  WARNING: Very low audio levels detected!")
        print("Please try:")
        print("1. Speaking louder")
        print("2. Moving closer to the microphone")
        print("3. Checking your system's microphone settings")
        print(f"Current audio level: {avg_level:.4f}")

    return recording

def save_audio(recording, sample_rate=16000):
    """Save the recording to a timestamped temporary WAV file.

    Args:
        recording: Float audio samples, nominally in the range [-1.0, 1.0].
        sample_rate: Sampling rate in Hz written to the WAV header.

    Returns:
        The name of the WAV file, created in the current working directory.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"temp_recording_{timestamp}.wav"
    # Convert float32 to int16. Clip first: a sample outside [-1, 1] would
    # otherwise overflow int16 and wrap around to the opposite polarity.
    pcm = (np.clip(recording, -1.0, 1.0) * 32767).astype(np.int16)
    wav.write(filename, sample_rate, pcm)
    return filename

def transcribe_audio(model, audio_file):
    """Transcribe an audio file and delete it afterwards.

    Args:
        model: Object exposing ``transcribe(path)`` that returns a mapping
            with a ``"text"`` entry (a loaded Whisper model in this script).
        audio_file: Path of the temporary audio file to transcribe.

    Returns:
        The transcribed text.

    Raises:
        Whatever ``model.transcribe`` raises; the temporary file is removed
        in that case as well.
    """
    print("Transcribing...")
    try:
        result = model.transcribe(audio_file)
    finally:
        # Clean up the temporary file even when transcription fails, so
        # failed runs do not leave recordings behind on disk.
        try:
            os.remove(audio_file)
        except FileNotFoundError:
            # Already gone; nothing left to clean up.
            pass
    return result["text"]

def preprocess_text(text):
    """
    Normalize raw text before tokenization.

    The steps run in this order:
    1. Emojis are replaced by their textual names
    2. Contractions are expanded
    3. A few acronyms / misspellings are rewritten
    4. Everything is lowercased
    5. Runs of three or more identical characters are cut down to two

    Whitespace is collapsed at the end. Non-string input yields "".
    """
    # Anything that is not a string is treated as empty input.
    if not isinstance(text, str):
        return ""

    # Ordered (pattern, replacement) pairs for step 3; matched
    # case-insensitively.
    slang_fixes = (
        (r'\b(Cuz|coz)\b', 'because'),
        (r'\b(Ikr)\b', 'I know right'),
        (r'\b(Faux pas)\b', 'mistake'),
    )

    # Steps 1 and 2: emoji names are space-delimited, then contractions
    # are expanded.
    cleaned = contractions.fix(emoji.demojize(text, delimiters=(" ", " ")))

    # Step 3
    for pattern, replacement in slang_fixes:
        cleaned = re.sub(pattern, replacement, cleaned, flags=re.IGNORECASE)

    # Step 4
    cleaned = cleaned.lower()

    # Step 5
    cleaned = re.sub(r'(.)\1{2,}', r'\1\1', cleaned)

    # Collapse whitespace runs and trim both ends.
    return re.sub(r'\s+', ' ', cleaned).strip()

# Emotion labels, indexed by model output position (these should match the
# training data).
_EMOTION_LABELS = (
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
    'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment',
    'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism',
    'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral',
)


def predict_emotion(text_input, model, tokenizer_instance, max_len_sequences):
    """Predict the top 3 emotions for a given text.

    Args:
        text_input: Raw text to classify.
        model: Object whose ``predict`` returns one row of scores per input.
        tokenizer_instance: Object exposing ``texts_to_sequences``.
        max_len_sequences: Length the token sequence is padded/truncated to.

    Returns:
        A list of up to three ``(label, score)`` tuples ordered from highest
        to lowest score, or an empty list when the text yields no tokens.
    """
    # Pre-process the input text
    processed_text_input = preprocess_text(text_input)

    # Tokenize the input text
    sequence = tokenizer_instance.texts_to_sequences([processed_text_input])

    # Check the tokenizer output rather than the padded array: padding always
    # produces exactly one row, so a row-count check can never detect text
    # that produced no tokens.
    if not sequence or not sequence[0]:
        print("Warning: Text could not be tokenized effectively.")
        return []

    padded_sequence = pad_sequences(sequence, maxlen=max_len_sequences, padding='post', truncating='post')

    prediction_probs = model.predict(padded_sequence)[0]

    # Indices of the three highest scores, best first
    top_3_indices = prediction_probs.argsort()[-3:][::-1]

    top_3_emotions_with_scores = []
    for idx in top_3_indices:
        if idx < len(_EMOTION_LABELS):
            top_3_emotions_with_scores.append((_EMOTION_LABELS[idx], prediction_probs[idx]))
        else:
            # The model emitted more classes than there are known labels.
            print(f"Warning: Predicted index {idx} is out of bounds for emotion labels.")

    return top_3_emotions_with_scores

def main():
    """Run the record, transcribe, analyse loop until interrupted.

    Each iteration records 10 seconds of audio, transcribes it with Whisper
    and, when the transcription is non-empty, prints the Keras sentiment,
    the top emotions and the LLM sentiment/emotion.
    """
    # Route Ctrl+C through signal_handler, which prints a message and exits.
    signal.signal(signal.SIGINT, signal_handler)

    print("Loading Whisper model...")
    asr_model = whisper.load_model("tiny")

    # Show which input device is going to be used.
    mic = sd.query_devices(MICROPHONE_DEVICE_INDEX)
    print(f"\nUsing audio device: {mic['name']}")
    print(f"Input channels: {mic['max_input_channels']}")

    print("\nStarting continuous recording and analysis...")
    print("Press Ctrl+C to stop")

    try:
        while True:
            # Capture audio, write it to disk, then transcribe the file.
            wav_path = save_audio(record_audio(duration=10))
            spoken_text = transcribe_audio(asr_model, wav_path)

            if not spoken_text.strip():
                print("\nNo speech detected in this recording.")
            else:
                print(f"\nTranscription: {spoken_text}")

                # Three independent analyses of the same transcription.
                keras_sentiment = prediction_pipeline(spoken_text, sentiment_model, tokenizer, max_len)
                top_emotions = predict_emotion(spoken_text, emotion_model, tokenizer, max_len)
                llm_sentiment, llm_emotion = get_sentiment_and_emotion(spoken_text)

                print(f"\nKeras Model Sentiment: {keras_sentiment}")
                print("Top 3 Emotions:")
                for label, score in top_emotions:
                    print(f"- {label}: {score:.4f}")
                print(f"LLM Sentiment: {llm_sentiment}, LLM Emotion: {llm_emotion}\n")

            # Small pause before next recording
            time.sleep(0.5)

    except KeyboardInterrupt:
        print("\nStopping the recording process...")
        sys.exit(0)

if __name__ == "__main__":
    main()