feat: meeting mode auto-start with sustained speech fallback

Meeting mode trigger now has two paths:
- Primary: system audio energy > 0.005 (immediate, catches most apps)
- Fallback: ~3 seconds sustained speech detection via VAD counter
  (catches Electron/WebRTC apps like Nextcloud Talk where
  ScreenCaptureKit may not capture audio)

Brief sounds or momentary speech won't trigger - only a real
conversation lasting ~3s will activate the fallback. The counter
decays during silence to prevent accumulation of brief detections.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
michael.borak
2026-02-06 13:13:31 +01:00
parent 2c93afce9e
commit 6584136cb0

View File

@@ -46,6 +46,10 @@ pub struct AudioProcessor {
// Recording Mode (voice or meeting) // Recording Mode (voice or meeting)
recording_mode: String, recording_mode: String,
// Meeting mode: sustained speech detection counter.
// Incremented on each active chunk and decremented (by 3, saturating at 0) on each silent chunk.
// Prevents false triggers from brief sounds; requires sustained speech.
meeting_speech_frames: u32,
} }
impl AudioProcessor { impl AudioProcessor {
@@ -106,6 +110,7 @@ impl AudioProcessor {
last_event_time: std::time::Instant::now(), last_event_time: std::time::Instant::now(),
system_queue: Arc::new(Mutex::new(std::collections::VecDeque::new())), system_queue: Arc::new(Mutex::new(std::collections::VecDeque::new())),
recording_mode, recording_mode,
meeting_speech_frames: 0,
}) })
} }
@@ -247,29 +252,44 @@ impl AudioProcessor {
if self.last_event_time.elapsed().as_millis() > 2000 && self.recording_mode == "meeting" { if self.last_event_time.elapsed().as_millis() > 2000 && self.recording_mode == "meeting" {
if let Some(app) = &self.app_handle { if let Some(app) = &self.app_handle {
emit_log(app, "DEBUG", &format!( emit_log(app, "DEBUG", &format!(
"Waiting for Meeting... SysEnergy: {:.4} (thr: 0.005), VAD Speech: {} | SysQueue empty: {}", "Waiting... SysEnergy: {:.4} (thr: 0.005), Speech: {}, SpeechFrames: {}/90 | SysQ empty: {}",
max_system_energy, self.is_speech_active, max_system_energy, self.is_speech_active, self.meeting_speech_frames,
if let Ok(q) = self.system_queue.lock() { q.is_empty() } else { true } if let Ok(q) = self.system_queue.lock() { q.is_empty() } else { true }
)); ));
} }
} }
// MODE-SPECIFIC TRIGGER LOGIC: // MODE-SPECIFIC TRIGGER LOGIC:
// "voice" -> Trigger if user speaks (VAD) // "voice" -> Trigger if user speaks (VAD) - immediate
// "meeting" -> Trigger ONLY on system audio energy above threshold. // "meeting" -> Primary: system audio energy above threshold (immediate)
// Speech alone NEVER triggers meeting mode to prevent // Fallback: sustained speech for ~3 seconds (catches apps like
// false starts when user speaks near mic without a call. // Nextcloud Talk where ScreenCaptureKit may not capture audio).
// The threshold (0.005) is low enough to catch ringtones, // Brief speech decays away during silence instead of accumulating, to prevent false triggers.
// call audio, and notification sounds from any app. //
// The sustained speech counter uses VAD chunks (~32ms each at 16kHz/512).
// ~90 consecutive chunks ≈ ~3 seconds of sustained speech.
const MEETING_SPEECH_THRESHOLD: u32 = 90;
if self.recording_mode == "meeting" {
if self.is_speech_active || system_active {
self.meeting_speech_frames += 1;
} else {
// Decay (not a hard reset) when silence is detected - prevents brief sounds from accumulating
self.meeting_speech_frames = self.meeting_speech_frames.saturating_sub(3);
}
}
let trigger = if self.recording_mode == "voice" { let trigger = if self.recording_mode == "voice" {
self.is_speech_active self.is_speech_active
} else { } else {
system_active // Immediate trigger on system audio energy
// OR sustained speech (~3s) as fallback for Electron/WebRTC apps
system_active || self.meeting_speech_frames >= MEETING_SPEECH_THRESHOLD
}; };
if trigger { if trigger {
// Trigger Detected! let reason = if system_active { "SystemAudio" } else { "SustainedSpeech" };
println!("Auto-Start: Trigger! (Mode: {}, SysEnergy: {:.4}, Speech: {})", self.recording_mode, max_system_energy, self.is_speech_active); println!("Auto-Start: Trigger! (Mode: {}, Reason: {}, SysEnergy: {:.4}, SpeechFrames: {})", self.recording_mode, reason, max_system_energy, self.meeting_speech_frames);
self.waiting_for_speech = false; self.waiting_for_speech = false;
// Flush Ring Buffer (Orderly: from ring_pos to end, then 0 to ring_pos) // Flush Ring Buffer (Orderly: from ring_pos to end, then 0 to ring_pos)