From 6584136cb042057501722cf4c994e15f34d8e49a Mon Sep 17 00:00:00 2001 From: "michael.borak" Date: Fri, 6 Feb 2026 13:13:31 +0100 Subject: [PATCH] feat: meeting mode auto-start with sustained speech fallback Meeting mode trigger now has two paths: - Primary: system audio energy > 0.005 (immediate, catches most apps) - Fallback: ~3 seconds sustained speech detection via VAD counter (catches Electron/WebRTC apps like Nextcloud Talk where ScreenCaptureKit may not capture audio) Brief sounds or momentary speech won't trigger - only a real conversation lasting ~3s will activate the fallback. The counter decays during silence to prevent accumulation of brief detections. Co-Authored-By: Claude Opus 4.5 --- src-tauri/src/audio_processor.rs | 42 +++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/src-tauri/src/audio_processor.rs b/src-tauri/src/audio_processor.rs index 39d00d3..e7e5e92 100644 --- a/src-tauri/src/audio_processor.rs +++ b/src-tauri/src/audio_processor.rs @@ -46,6 +46,10 @@ pub struct AudioProcessor { // Recording Mode (voice or meeting) recording_mode: String, + + // Meeting mode: consecutive speech detection counter. + // Prevents false triggers from brief sounds; requires sustained speech. + meeting_speech_frames: u32, } impl AudioProcessor { @@ -106,6 +110,7 @@ impl AudioProcessor { last_event_time: std::time::Instant::now(), system_queue: Arc::new(Mutex::new(std::collections::VecDeque::new())), recording_mode, + meeting_speech_frames: 0, }) } @@ -247,29 +252,44 @@ impl AudioProcessor { if self.last_event_time.elapsed().as_millis() > 2000 && self.recording_mode == "meeting" { if let Some(app) = &self.app_handle { emit_log(app, "DEBUG", &format!( - "Waiting for Meeting... SysEnergy: {:.4} (thr: 0.005), VAD Speech: {} | SysQueue empty: {}", - max_system_energy, self.is_speech_active, + "Waiting... 
SysEnergy: {:.4} (thr: 0.005), Speech: {}, SpeechFrames: {}/90 | SysQ empty: {}", + max_system_energy, self.is_speech_active, self.meeting_speech_frames, if let Ok(q) = self.system_queue.lock() { q.is_empty() } else { true } )); } } // MODE-SPECIFIC TRIGGER LOGIC: - // "voice" -> Trigger if user speaks (VAD) - // "meeting" -> Trigger ONLY on system audio energy above threshold. - // Speech alone NEVER triggers meeting mode to prevent - // false starts when user speaks near mic without a call. - // The threshold (0.005) is low enough to catch ringtones, - // call audio, and notification sounds from any app. + // "voice" -> Trigger if user speaks (VAD) - immediate + // "meeting" -> Primary: system audio energy above threshold (immediate) + // Fallback: sustained speech for ~3 seconds (catches apps like + // Nextcloud Talk where ScreenCaptureKit may not capture audio). + // The counter decays during silence, so brief speech cannot trigger. + // + // The sustained speech counter uses VAD chunks (~32ms each at 16kHz/512). + // A net count of ~90 chunks ≈ ~3 seconds of sustained speech. + const MEETING_SPEECH_THRESHOLD: u32 = 90; + + if self.recording_mode == "meeting" { + if self.is_speech_active || system_active { + self.meeting_speech_frames += 1; + } else { + // Decay by 3 per silent chunk (not a hard reset) - prevents brief sounds from accumulating + self.meeting_speech_frames = self.meeting_speech_frames.saturating_sub(3); + } + } + let trigger = if self.recording_mode == "voice" { self.is_speech_active } else { - system_active + // Immediate trigger on system audio energy + // OR sustained speech (~3s) as fallback for Electron/WebRTC apps + system_active || self.meeting_speech_frames >= MEETING_SPEECH_THRESHOLD }; if trigger { - // Trigger Detected! - println!("Auto-Start: Trigger! (Mode: {}, SysEnergy: {:.4}, Speech: {})", self.recording_mode, max_system_energy, self.is_speech_active); + let reason = if system_active { "SystemAudio" } else { "SustainedSpeech" }; + println!("Auto-Start: Trigger! 
(Mode: {}, Reason: {}, SysEnergy: {:.4}, SpeechFrames: {})", self.recording_mode, reason, max_system_energy, self.meeting_speech_frames); self.waiting_for_speech = false; // Flush Ring Buffer (Orderly: from ring_pos to end, then 0 to ring_pos)