feat: refine meeting auto-start, silence timeout (25s) and improve transcription logging

2026-01-24 14:16:55 +01:00
parent a3e4fa4ec7
commit 9a65f42f51
5 changed files with 135 additions and 112 deletions
@@ -1,5 +1,6 @@
 use std::sync::{Arc, Mutex};
 use tauri::{AppHandle, Emitter};
+use crate::emit_log;
 use cpal::Sample;
 use hound::WavWriter;
 use rubato::{Resampler, FastFixedIn, PolynomialDegree};
@@ -42,6 +43,9 @@ pub struct AudioProcessor {
    
    // System Audio Queue for Mixing
    pub system_queue: Arc<Mutex<std::collections::VecDeque<f32>>>,
+    
+    // Recording Mode (voice or meeting)
+    recording_mode: String,
 }

 impl AudioProcessor {
@@ -50,7 +54,8 @@ impl AudioProcessor {
        channel_count: u16,
        writer: Arc<Mutex<WavWriter<std::io::BufWriter<std::fs::File>>>>,
        app_handle: AppHandle,
-        wait_for_speech: bool
+        wait_for_speech: bool,
+        recording_mode: String,
    ) -> Result<Self, String> {
        let vad_sample_rate = 16000;
        let vad_chunk_size = 512; 
@@ -100,30 +105,51 @@ impl AudioProcessor {
            app_handle: Some(app_handle),
            last_event_time: std::time::Instant::now(),
            system_queue: Arc::new(Mutex::new(std::collections::VecDeque::new())),
+            recording_mode,
        })
    }

    pub fn process(&mut self, input_data: &[f32]) {
        // MIXING LOGIC:
        // We have `input_data` (Microphone). We check `system_queue` for System Audio.
-        // We mix them: Out = Mic + System.
+        // System Audio is hardcoded to 2 channels (Stereo) in sc_audio.rs.
+        // Microphone `self.channel_count` can be 1 (Mono) or 2 (Stereo).
+        
+        let mic_channels = self.channel_count as usize;
        let mut mixed_data = input_data.to_vec();
        let mut max_system_energy = 0.0;
        
+        let gain_mic = 1.0;
+        let gain_sys = 0.8; // Slightly lower system audio to prioritize speaker
+        
        if let Ok(mut queue) = self.system_queue.lock() {
-             for i in 0..mixed_data.len() {
-                 if let Some(sys_sample) = queue.pop_front() {
-                     // Track system energy for trigger logic
-                     let abs_sample = sys_sample.abs();
-                     if abs_sample > max_system_energy {
-                         max_system_energy = abs_sample;
-                     }
-                     
-                     // Simple addition mixing with clamping to avoid clipping
-                     let mixed = mixed_data[i] + sys_sample;
-                     mixed_data[i] = mixed.max(-1.0).min(1.0);
-                 }
-             }
+            let frames = mixed_data.len() / mic_channels;
+            
+            for f in 0..frames {
+                // system_queue is always stereo (L, R, L, R...)
+                if let (Some(l), Some(r)) = (queue.pop_front(), queue.pop_front()) {
+                    let abs_l = l.abs();
+                    let abs_r = r.abs();
+                    let current_sys_max = if abs_l > abs_r { abs_l } else { abs_r };
+                    if current_sys_max > max_system_energy {
+                        max_system_energy = current_sys_max;
+                    }
+
+                    if mic_channels == 1 {
+                        // Mic is Mono: Mix System L+R down to Mono
+                        let sys_mono = (l + r) / 2.0;
+                        let mixed = (mixed_data[f] * gain_mic) + (sys_mono * gain_sys);
+                        mixed_data[f] = mixed.max(-1.0).min(1.0);
+                    } else {
+                        // Mic is Stereo: Mix L-to-L and R-to-R
+                        let f_start = f * 2;
+                        let mixed_l = (mixed_data[f_start] * gain_mic) + (l * gain_sys);
+                        let mixed_r = (mixed_data[f_start + 1] * gain_mic) + (r * gain_sys);
+                        mixed_data[f_start] = mixed_l.max(-1.0).min(1.0);
+                        mixed_data[f_start + 1] = mixed_r.max(-1.0).min(1.0);
+                    }
+                }
+            }
        }
        
        let data = &mixed_data;
@@ -181,11 +207,7 @@ impl AudioProcessor {
            // Run Detection
            let probability = self.vad.predict(vad_chunk.clone());
            
-            // Hybrid VAD: Probability > 0.9 OR System Audio Active
-            // We want to keep recording if there is meaningful audio from the system (Call in progress),
-            // even if the VAD doesn't strictly classify it as 'speech' (e.g. ringing, laughter, noise).
-            
-            let system_is_active = max_system_energy > 0.01; // Same threshold as trigger
+            let system_is_active = max_system_energy > 0.005; // Lowered to match trigger
            let is_speech = probability > 0.9; 

            if is_speech || system_is_active {
@@ -219,23 +241,23 @@ impl AudioProcessor {
            // 2. AND System Audio has energy (Meaning audio is coming from the PC, i.e., Call started)
            // Threshold 0.01 is roughly -40dB, should cover ringtones/speech easily but ignore silence/hiss.
            
-            let system_active = max_system_energy > 0.01;
+            let system_active = max_system_energy > 0.005;
            
-            // Special Case: If System Audio acts like a Ringtone (Constant high energy but maybe not VAD speech?)
-            // We trust VAD for speech. But we also trust "Loud System Sound" = Call.
-            // If system is consistently loud, it's likely a call.
-            
-            // For now, Strict Mode:
-            // Trigger if: (Speech Detected) AND (System Audio Present)
-            // This prevents "User talking alone" -> No trigger (System silent).
-            // This allows "Partner talking" -> Trigger (Speech + System).
-            
-            // What about Ringtone? Ringtone has energy but maybe no speech.
-            // If we want to record the ringtone, we should trigger on `system_active` alone?
-            // "erst wen der call startet" -> usually ringing.
-            // Let's be generous: If System Audio is loud (> 0.05), we trigger regardless of VAD.
-            
-            let trigger = (self.is_speech_active && system_active) || (max_system_energy > 0.05);
+            // Periodically log energy to help debug why meeting mode might not start
+            if self.last_event_time.elapsed().as_millis() > 2000 && self.recording_mode == "meeting" {
+                if let Some(app) = &self.app_handle {
+                    emit_log(app, "DEBUG", &format!("Waiting for Meeting... Current System Energy: {:.4} (Threshold: 0.005)", max_system_energy));
+                }
+            }
+
+            // MODE-SPECIFIC TRIGGER LOGIC:
+            // "voice" -> Trigger if user speaks (is_speech_active)
+            // "meeting" -> Trigger ONLY if system audio energy detected (Call starting)
+            let trigger = if self.recording_mode == "voice" {
+                self.is_speech_active
+            } else {
+                system_active
+            };

            if trigger {
                // Trigger Detected!
@@ -271,7 +293,13 @@ impl AudioProcessor {
        // Standard Recording Logic (Active or Hangover)
        let time_since_speech = self.total_processed_samples.saturating_sub(self.last_speech_time);
        
-        if self.is_speech_active || time_since_speech < self.hangover_samples {
+        // We write to file if:
+        // 1. VAD thinks someone is speaking (Mic or System)
+        // 2. OR System audio energy is currently above threshold (Ensures calls are captured)
+        // 3. OR we are within the hangover period
+        let system_is_active = max_system_energy > 0.005;
+        
+        if self.is_speech_active || system_is_active || time_since_speech < self.hangover_samples {
             let mut guard = self.writer.lock().unwrap();
             for &sample in data {
                 let amplitude = i16::MAX as f32;
@@ -37,7 +37,7 @@ struct LogEvent {
    timestamp: String,
 }

-fn emit_log(app: &AppHandle, level: &str, message: &str) {
+pub(crate) fn emit_log(app: &AppHandle, level: &str, message: &str) {
    let log = LogEvent {
        level: level.to_string(),
        message: message.to_string(),
@@ -73,8 +73,8 @@ fn get_input_devices() -> Result<Vec<AudioDevice>, String> {


 #[tauri::command]
-async fn start_recording(app: AppHandle, state: State<'_, AppState>, device_id: String, save_path: Option<String>, custom_filename: Option<String>, wait_for_speech: Option<bool>) -> Result<(), String> {
-    emit_log(&app, "INFO", &format!("Starting recording on device: {}", device_id));
+async fn start_recording(app: AppHandle, state: State<'_, AppState>, device_id: String, save_path: Option<String>, custom_filename: Option<String>, wait_for_speech: Option<bool>, mode: String) -> Result<(), String> {
+    emit_log(&app, "INFO", &format!("Starting recording [Mode: {}] on device: {}", mode, device_id));
    let host = cpal::default_host();
    
    // Find device by name (using name as ID)
@@ -143,10 +143,10 @@ async fn start_recording(app: AppHandle, state: State<'_, AppState>, device_id:
    // We pass the writer to it.
    let should_wait = wait_for_speech.unwrap_or(false);
    if should_wait {
-        emit_log(&app, "INFO", "Recording started in WAITING mode (buffer-only until speech).");
+        emit_log(&app, "INFO", &format!("Recording started in WAITING mode (Trigger: {}).", if mode == "voice" { "Speech" } else { "System Audio" }));
    }

-    let processor = AudioProcessor::new(config.sample_rate(), config.channels(), writer.clone(), app.clone(), should_wait)
+    let processor = AudioProcessor::new(config.sample_rate(), config.channels(), writer.clone(), app.clone(), should_wait, mode)
        .map_err(|e| format!("Failed to create AudioProcessor: {}", e))?;
        
    // Wrap processor in Arc<Mutex> so we can share/move it into callback
@@ -158,61 +158,40 @@ async fn start_recording(app: AppHandle, state: State<'_, AppState>, device_id:
    let processor_clone = processor.clone();

    // --- SYSTEM AUDIO CAPTURE START ---
-    let mut sys_capture = sc_audio::SystemAudioCapture::new(config.sample_rate());
+    // Prevent Doubling: If user selected an aggregate device (Hearbit Audio/BlackHole), 
+    // it ALREADY contains system audio. In that case, we don't need internal SCK capture.
+    let is_aggregate = device_id.contains("Hearbit") || device_id.contains("BlackHole");
    
-    // Get the queue to share with the capture callback
-    let queue_clone = {
-        let p = processor.lock().unwrap();
-        p.system_queue.clone() // Access the pub field we added
-    };
+    if is_aggregate {
+        emit_log(&app, "INFO", "Aggregate device detected. Disabling internal System Audio Capture to prevent doubling.");
+    } else {
+        let mut sys_capture = sc_audio::SystemAudioCapture::new(config.sample_rate());
+        
+        // Get the queue to share with the capture callback
+        let queue_clone = {
+            let p = processor.lock().unwrap();
+            p.system_queue.clone() // Access the pub field we added
+        };

-    let sys_handle = app.clone();
-    let sys_callback = move |data: &[f32]| {
-        // Push to queue
-        if let Ok(mut q) = queue_clone.lock() {
-            q.extend(data.iter());
-            
-            // Limit queue size to avoid memory leaks if main process loop is slow
-            while q.len() > 48000 * 5 { // 5 seconds buffer
-               q.pop_front();
+        let sys_callback = move |data: &[f32]| {
+            // Push to queue
+            if let Ok(mut q) = queue_clone.lock() {
+                q.extend(data.iter());
+                
+                // Limit queue size to avoid memory leaks if main process loop is slow
+                while q.len() > 48000 * 5 { // 5 seconds buffer
+                   q.pop_front();
+                }
            }
-        }
-    };
+        };

-    // Need to run async start in sync command? 
-    // Tauri commands are async if they return Future, but here we returned Result.
-    // We should probably spawn it.
-    // Actually, SystemAudioCapture::start is async.
-    // We can spawn a tokio task to start it. But we need to keep the object alive.
-    // The start method modifies self.stream.
-    // If we make start synchronous or use block_in_place?
-    // Better: change start_recording to async fn (it is not currently async in signature used by tauri::command macros? No, tauri supports async commands).
-    // Let's check line 76: `fn start_recording`... it is NOT async. 
-    // We should make it `async fn start_recording`.
-    
-    // However, changing to async might affect how state is passed or other things.
-    // Actually Tauri works fine with async commands.
-    // But then we need to await `sys_capture.start`.
-    
-    // Wait, let's look at `SystemAudioCapture::start`. It takes `&mut self`.
-    // We can't easily spawn it away properly if we want to keep `sys_capture` in State.
-    // The `sys_capture` struct holds the `SCStream` which must be kept alive.
-    
-    // Let's assume we can make `start_recording` into `async fn`.
-    
-    // TEMPORARY: Just putting placeholder for logic flow. 
-    // We will need to change the function signature of start_recording to async first in a separate step or assume I can do it here if I replace the whole signature.
-    // The replace_file_content replaces a block. 
-    // I will replace line 76 in a separate call to make it async.
-    
-    // For this block, I will assume it's async context.
-    
-    match sys_capture.start(sys_callback) {
-        Ok(_) => emit_log(&app, "INFO", "System Audio Capture started."),
-        Err(e) => emit_log(&app, "WARN", &format!("System Audio Capture failed (Permissions?): {}", e)),
+        match sys_capture.start(sys_callback).await {
+            Ok(_) => emit_log(&app, "INFO", "System Audio Capture started."),
+            Err(e) => emit_log(&app, "WARN", &format!("System Audio Capture failed (Permissions?): {}", e)),
+        }
+        
+        *state.system_capture.lock().unwrap() = Some(sys_capture);
    }
-    
-    *state.system_capture.lock().unwrap() = Some(sys_capture);
    // --- SYSTEM AUDIO CAPTURE END ---

    let app_handle = app.clone();
@@ -585,8 +564,9 @@ async fn poll_transcription(app: &AppHandle, client: &reqwest::Client, api_key:
                         return Err(format!("Download failed: {}", dl_res.status()));
                     }
                } else if status == "failed" || status == "error" {
-                    emit_log(app, "ERROR", &format!("Batch processing failed: {:?}", json));
-                    return Err(format!("Batch processing failed: {:?}", json));
+                    let err_msg = format!("Batch processing failed [Status: {}]. Full Response: {:?}", status, json);
+                    emit_log(app, "ERROR", &err_msg);
+                    return Err(err_msg);
                }
                // If 'processing' or 'pending', continue loop
            }
@@ -56,7 +56,7 @@ impl SystemAudioCapture {
        Self { stream: None, sample_rate }
    }

-    pub fn start<F>(&mut self, callback: F) -> Result<(), String> 
+    pub async fn start<F>(&mut self, callback: F) -> Result<(), String> 
    where F: Fn(&[f32]) + Send + Sync + 'static {
        
        let content = UnsafeSCShareableContent::get().map_err(|e| format!("Failed to get content"))?;