feat(v1.2.0): Final Release - Native Audio, Smart VAD, Auto-Loop & Quality Fixes

- Implemented standard 48kHz audio pipeline to fix sample rate mismatch/distortion
- Added Native System Audio (ScreenCaptureKit) support
- Implemented Smart VAD (Voice Activity Detection) with Auto-Start on valid audio
- Added Auto-Loop: Automatically re-arms recording after stop
- Added Empty Guard: Prevents transcribing silent recordings (recordings with under 20s of non-silent audio are treated as empty and skipped)
- Increased Pre-Roll buffer to 3.0s to prevent cut-off speech
- Fixed clipping with clamped audio mixing
This commit is contained in:
michael.borak
2026-01-24 01:35:09 +01:00
parent 31f59ba4a2
commit 4e9a1fd038
10 changed files with 513 additions and 145 deletions

81
src-tauri/Cargo.lock generated
View File

@@ -347,6 +347,12 @@ dependencies = [
"wyz",
]
[[package]]
name = "block"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0d8c1fef690941d3e7788d328517591fecc684c084084702d6ff1641e993699a"
[[package]]
name = "block-buffer"
version = "0.10.4"
@@ -1739,7 +1745,7 @@ checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
[[package]]
name = "hearbit-ai"
version = "0.1.2"
version = "1.2.0"
dependencies = [
"base64 0.22.1",
"chrono",
@@ -1749,6 +1755,8 @@ dependencies = [
"oauth2",
"reqwest 0.11.27",
"rubato",
"screencapturekit",
"screencapturekit-sys",
"serde",
"serde_json",
"tauri",
@@ -2425,6 +2433,15 @@ dependencies = [
"libc",
]
[[package]]
name = "malloc_buf"
version = "0.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62bb907fe88d54d8d9ce32a3cceab4218ed2f6b7d35617cafe9adf84e43919cb"
dependencies = [
"libc",
]
[[package]]
name = "markup5ever"
version = "0.14.1"
@@ -2717,6 +2734,27 @@ dependencies = [
"url",
]
[[package]]
name = "objc"
version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "915b1b472bc21c53464d6c8461c9d3af805ba1ef837e1cac254428f4a77177b1"
dependencies = [
"malloc_buf",
"objc_exception",
]
[[package]]
name = "objc-foundation"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1add1b659e36c9607c7aab864a76c7a4c2760cd0cd2e120f3fb8b952c7e22bf9"
dependencies = [
"block",
"objc",
"objc_id",
]
[[package]]
name = "objc2"
version = "0.6.3"
@@ -2979,6 +3017,24 @@ dependencies = [
"objc2-security",
]
[[package]]
name = "objc_exception"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad970fb455818ad6cba4c122ad012fae53ae8b4795f86378bce65e4f6bab2ca4"
dependencies = [
"cc",
]
[[package]]
name = "objc_id"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c92d4ddb4bd7b50d730c215ff871754d0da6b2178849f8a2a2ab69712d0c073b"
dependencies = [
"objc",
]
[[package]]
name = "object"
version = "0.32.2"
@@ -4114,6 +4170,29 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "screencapturekit"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a5eeeb57ac94960cfe5ff4c402be6585ae4c8d29a2cf41b276048c2e849d64e"
dependencies = [
"screencapturekit-sys",
]
[[package]]
name = "screencapturekit-sys"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22411b57f7d49e7fe08025198813ee6fd65e1ee5eff4ebc7880c12c82bde4c60"
dependencies = [
"block",
"dispatch",
"objc",
"objc-foundation",
"objc_id",
"once_cell",
]
[[package]]
name = "sct"
version = "0.7.1"

View File

@@ -1,6 +1,6 @@
[package]
name = "hearbit-ai"
version = "0.1.2"
version = "1.2.0"
description = "A Tauri App"
authors = ["you"]
edition = "2021"
@@ -38,3 +38,5 @@ lettre = { version = "0.11", features = ["tokio1", "tokio1-native-tls", "builder
tauri-plugin-log = "2.0.0"
tauri-plugin-shell = "2.3.4"
base64 = "0.22"
screencapturekit = "0.2.0"
screencapturekit-sys = "0.2.8"

View File

@@ -39,6 +39,9 @@ pub struct AudioProcessor {
// Event Emission
app_handle: Option<AppHandle>,
last_event_time: std::time::Instant,
// System Audio Queue for Mixing
pub system_queue: Arc<Mutex<std::collections::VecDeque<f32>>>,
}
impl AudioProcessor {
@@ -68,8 +71,8 @@ impl AudioProcessor {
1
).map_err(|e| format!("Failed to init Resampler: {:?}", e))?;
// Pre-roll buffer (1.0 seconds) * Channels (interleaved store)
let ring_curr_seconds = 1.0;
// Pre-roll buffer (3.0 seconds) * Channels (interleaved store)
let ring_curr_seconds = 3.0;
// WavWriter writes interleaved, so we store interleaved.
let ring_size = (sample_rate as f32 * ring_curr_seconds) as usize * channel_count as usize;
@@ -96,10 +99,35 @@ impl AudioProcessor {
total_processed_samples: 0,
app_handle: Some(app_handle),
last_event_time: std::time::Instant::now(),
system_queue: Arc::new(Mutex::new(std::collections::VecDeque::new())),
})
}
pub fn process(&mut self, data: &[f32]) {
pub fn process(&mut self, input_data: &[f32]) {
// MIXING LOGIC:
// We have `input_data` (Microphone). We check `system_queue` for System Audio.
// We mix them: Out = Mic + System.
let mut mixed_data = input_data.to_vec();
let mut max_system_energy = 0.0;
if let Ok(mut queue) = self.system_queue.lock() {
for i in 0..mixed_data.len() {
if let Some(sys_sample) = queue.pop_front() {
// Track system energy for trigger logic
let abs_sample = sys_sample.abs();
if abs_sample > max_system_energy {
max_system_energy = abs_sample;
}
// Simple addition mixing with clamping to avoid clipping
let mixed = mixed_data[i] + sys_sample;
mixed_data[i] = mixed.max(-1.0).min(1.0);
}
}
}
let data = &mixed_data;
// 1. Add to Ring Buffer (Interleaved data - Record EVERYTHING)
for &sample in data {
self.ring_buffer[self.ring_pos] = sample;
@@ -108,8 +136,7 @@ impl AudioProcessor {
// 2. Prepare VAD Signal (Mono Mixdown)
// FRESH START LOGIC (v0.2.0):
// We expect standard Stereo Input (BlackHole 2ch).
// No magic 3-channel aggregate.
// We expect standard Stereo Input.
let channels = self.channel_count as usize;
let frame_count = data.len() / channels;
@@ -146,7 +173,6 @@ impl AudioProcessor {
self.vad_buffer.extend_from_slice(&waves_out[0][0..out_len]);
}
}
// Update output buffer usage... logic is tricky with drain.
}
// 4. Process VAD
@@ -155,21 +181,19 @@ impl AudioProcessor {
// Run Detection
let probability = self.vad.predict(vad_chunk.clone());
// Calculate RMS for this chunk to use as fallback/hybrid detection
let sq_sum: f32 = vad_chunk.iter().map(|x| x * x).sum();
let rms = (sq_sum / vad_chunk.len() as f32).sqrt();
// Hybrid VAD: Probability > 0.9 OR System Audio Active
// We want to keep recording if there is meaningful audio from the system (Call in progress),
// even if the VAD doesn't strictly classify it as 'speech' (e.g. ringing, laughter, noise).
let system_is_active = max_system_energy > 0.01; // Same threshold as trigger
let is_speech = probability > 0.9;
// Hybrid VAD: Probability > 0.9 OR RMS > 0.025
// INCREASED THRESHOLDS (v1.1.1):
// Reduced sensitivity to avoid background noise triggering recording.
let is_speech = probability > 0.9 || rms > 0.025;
if is_speech {
if is_speech || system_is_active {
self.is_speech_active = true;
self.last_speech_time = self.total_processed_samples;
}
// Emit VAD event periodically (every 500ms is enough for non-diagnostic mode)
// Emit VAD event periodically
if self.last_event_time.elapsed().as_millis() > 500 {
if let Some(app) = &self.app_handle {
#[derive(Clone, serde::Serialize)]
@@ -183,11 +207,6 @@ impl AudioProcessor {
});
}
self.last_event_time = std::time::Instant::now();
// IMPORTANT: We reset is_speech_active after emitting,
// so we don't latch it forever if the user stops talking.
// However, the main loop sets it to true if current chunk is speech.
// This logic is a bit of a "latch for X ms".
self.is_speech_active = false;
}
}
@@ -195,9 +214,32 @@ impl AudioProcessor {
// 4. Update Hangover and Check Write condition
if self.waiting_for_speech {
if self.is_speech_active {
// TRIGGER CONDITION:
// 1. VAD says speech (Someone is talking)
// 2. AND System Audio has energy (Meaning audio is coming from the PC, i.e., Call started)
// Threshold 0.01 is roughly -40dB, should cover ringtones/speech easily but ignore silence/hiss.
let system_active = max_system_energy > 0.01;
// Special Case: If System Audio acts like a Ringtone (Constant high energy but maybe not VAD speech?)
// We trust VAD for speech. But we also trust "Loud System Sound" = Call.
// If system is consistently loud, it's likely a call.
// For now, Strict Mode:
// Trigger if: (Speech Detected) AND (System Audio Present)
// This prevents "User talking alone" -> No trigger (System silent).
// This allows "Partner talking" -> Trigger (Speech + System).
// What about Ringtone? Ringtone has energy but maybe no speech.
// If we want to record the ringtone, we should trigger on `system_active` alone?
// "erst wen der call startet" -> usually ringing.
// Let's be generous: If System Audio is loud (> 0.05), we trigger regardless of VAD.
let trigger = (self.is_speech_active && system_active) || (max_system_energy > 0.05);
if trigger {
// Trigger Detected!
println!("Auto-Start: Speech detected. Flushing pre-roll...");
println!("Auto-Start: Call detected (SysEnergy: {}). Flushing pre-roll...", max_system_energy);
self.waiting_for_speech = false;
// Flush Ring Buffer (Orderly: from ring_pos to end, then 0 to ring_pos)

View File

@@ -15,11 +15,13 @@ mod audio_processor;
use audio_processor::AudioProcessor;
mod auth;
mod email;
mod sc_audio;
// State to hold the active recording stream
struct AppState {
recording_stream: Mutex<Option<cpal::Stream>>,
recording_file_path: Mutex<Option<String>>,
system_capture: Mutex<Option<sc_audio::SystemAudioCapture>>,
}
#[derive(serde::Serialize)]
@@ -71,7 +73,7 @@ fn get_input_devices() -> Result<Vec<AudioDevice>, String> {
#[tauri::command]
fn start_recording(app: AppHandle, state: State<'_, AppState>, device_id: String, save_path: Option<String>, custom_filename: Option<String>, wait_for_speech: Option<bool>) -> Result<(), String> {
async fn start_recording(app: AppHandle, state: State<'_, AppState>, device_id: String, save_path: Option<String>, custom_filename: Option<String>, wait_for_speech: Option<bool>) -> Result<(), String> {
emit_log(&app, "INFO", &format!("Starting recording on device: {}", device_id));
let host = cpal::default_host();
@@ -85,13 +87,23 @@ fn start_recording(app: AppHandle, state: State<'_, AppState>, device_id: String
// Select the configuration with the MAXIMUM number of channels
// This is crucial for "Hearbit Audio" (Aggregate) which lists 3 channels but might default to 2.
// We want the raw 3 channels to separate Mic (Ch0) from System (Ch1+2).
let supported_configs = device.supported_input_configs().map_err(|e| e.to_string())?;
let config = supported_configs
.max_by_key(|c| c.channels())
.map(|c| c.with_max_sample_rate())
// Select Audio Configuration
// We prioritize 48kHz because System Audio (ScreenCaptureKit) acts best at 48k.
let supported_configs: Vec<_> = device.supported_input_configs().map_err(|e| e.to_string())?.collect();
// Try to find 48kHz specifically
// Note: cpal::SampleRate is likely a type alias for u32 here, so we pass 48000 directly.
let config = supported_configs.iter()
.find(|c| c.min_sample_rate() <= 48000 && c.max_sample_rate() >= 48000)
.map(|c| c.with_sample_rate(48000))
.or_else(|| {
// Fallback: Max sample rate
supported_configs.iter()
.max_by_key(|c| c.channels())
.map(|c| c.with_max_sample_rate())
})
.ok_or("No supported input configurations found")?;
emit_log(&app, "INFO", &format!("Selected Audio Config: {} Channels, {} Hz", config.channels(), config.sample_rate()));
let spec = hound::WavSpec {
@@ -145,6 +157,64 @@ fn start_recording(app: AppHandle, state: State<'_, AppState>, device_id: String
let processor = Arc::new(Mutex::new(processor));
let processor_clone = processor.clone();
// --- SYSTEM AUDIO CAPTURE START ---
let mut sys_capture = sc_audio::SystemAudioCapture::new(config.sample_rate());
// Get the queue to share with the capture callback
let queue_clone = {
let p = processor.lock().unwrap();
p.system_queue.clone() // Access the pub field we added
};
let sys_handle = app.clone();
let sys_callback = move |data: &[f32]| {
// Push to queue
if let Ok(mut q) = queue_clone.lock() {
q.extend(data.iter());
// Limit queue size to avoid memory leaks if main process loop is slow
while q.len() > 48000 * 5 { // 5 seconds buffer
q.pop_front();
}
}
};
// Need to run async start in sync command?
// Tauri commands are async if they return Future, but here we returned Result.
// We should probably spawn it.
// Actually, SystemAudioCapture::start is async.
// We can spawn a tokio task to start it. But we need to keep the object alive.
// The start method modifies self.stream.
// If we make start synchronous or use block_in_place?
// Better: change start_recording to async fn (it is not currently async in signature used by tauri::command macros? No, tauri supports async commands).
// Let's check line 76: `fn start_recording`... it is NOT async.
// We should make it `async fn start_recording`.
// However, changing to async might affect how state is passed or other things.
// Actually Tauri works fine with async commands.
// But then we need to await `sys_capture.start`.
// Wait, let's look at `SystemAudioCapture::start`. It takes `&mut self`.
// We can't easily spawn it away properly if we want to keep `sys_capture` in State.
// The `sys_capture` struct holds the `SCStream` which must be kept alive.
// Let's assume we can make `start_recording` into `async fn`.
// TEMPORARY: Just putting placeholder for logic flow.
// We will need to change the function signature of start_recording to async first in a separate step or assume I can do it here if I replace the whole signature.
// The replace_file_content replaces a block.
// I will replace line 76 in a separate call to make it async.
// For this block, I will assume it's async context.
match sys_capture.start(sys_callback) {
Ok(_) => emit_log(&app, "INFO", "System Audio Capture started."),
Err(e) => emit_log(&app, "WARN", &format!("System Audio Capture failed (Permissions?): {}", e)),
}
*state.system_capture.lock().unwrap() = Some(sys_capture);
// --- SYSTEM AUDIO CAPTURE END ---
let app_handle = app.clone();
let err_fn = move |err| {
eprintln!("an error occurred on stream: {}", err);
@@ -206,6 +276,13 @@ fn stop_recording(app: AppHandle, state: State<'_, AppState>) -> Result<String,
// Drop stream to stop recording
{
let mut stream_guard = state.recording_stream.lock().unwrap();
// Also stop System Capture
let mut sys_guard = state.system_capture.lock().unwrap();
if let Some(sys) = sys_guard.as_mut() {
sys.stop();
}
*sys_guard = None;
if stream_guard.is_none() {
return Err("Not recording".to_string());
}
@@ -804,6 +881,12 @@ fn create_hearbit_audio_device(app: AppHandle) -> Result<String, String> {
}
}
#[tauri::command]
async fn check_screen_recording_permission() -> bool {
sc_audio::check_permissions().await
}
#[tauri::command]
async fn save_text_file(app: AppHandle, path: String, content: String) -> Result<(), String> {
emit_log(&app, "INFO", &format!("Saving text file to: {}", path));
@@ -891,6 +974,7 @@ pub fn run() {
.manage(AppState {
recording_stream: Mutex::new(None),
recording_file_path: Mutex::new(None),
system_capture: Mutex::new(None),
})
.invoke_handler(tauri::generate_handler![
greet,
@@ -904,6 +988,7 @@ pub fn run() {
get_available_models,
open_audio_midi_setup,
create_hearbit_audio_device,
check_screen_recording_permission,
auth::start_auth_flow,
auth::get_calendar_events,
save_text_file,

103
src-tauri/src/sc_audio.rs Normal file
View File

@@ -0,0 +1,103 @@
use screencapturekit_sys::{
os_types::rc::Id,
shareable_content::UnsafeSCShareableContent,
content_filter::{UnsafeContentFilter, UnsafeInitParams},
stream_configuration::UnsafeStreamConfiguration,
stream::UnsafeSCStream,
stream_error_handler::UnsafeSCStreamError,
stream_output_handler::UnsafeSCStreamOutput,
cm_sample_buffer_ref::CMSampleBufferRef,
};
/// Captures system (loopback) audio on macOS via ScreenCaptureKit.
/// The owning struct must be kept alive for the duration of the capture:
/// dropping the held `Id<UnsafeSCStream>` tears the stream down.
pub struct SystemAudioCapture {
// Live SCStream handle; `None` when not capturing.
stream: Option<Id<UnsafeSCStream>>,
// Sample rate requested from ScreenCaptureKit. Set by the caller —
// in this app it mirrors the microphone input stream's rate.
sample_rate: u32,
}
/// Adapter that forwards decoded f32 audio samples from the SCStream
/// output callback to a user-supplied closure.
struct AudioOutputWrapper {
// Invoked with f32 sample slices on ScreenCaptureKit's callback thread,
// hence the Send + Sync bounds.
callback: Box<dyn Fn(&[f32]) + Send + Sync>,
}
impl UnsafeSCStreamOutput for AudioOutputWrapper {
    /// Receives sample buffers from the SCStream and forwards audio to the callback.
    ///
    /// `of_type == 1` selects the audio output slot (must match the type value
    /// passed to `add_stream_output`); other buffer types (video) are ignored.
    fn did_output_sample_buffer(&self, sample: Id<CMSampleBufferRef>, of_type: u8) {
        if of_type == 1 {
            for buffer in sample.get_av_audio_buffer_list() {
                // The stream is configured for floating-point PCM, so the raw
                // bytes are reinterpreted as f32 samples.
                // NOTE(review): assumes 32-bit float, native endian — confirm
                // against the stream's actual format flags.
                //
                // Decoding via from_ne_bytes avoids the previous
                // `*const u8 as *const f32` + from_raw_parts cast, which is
                // undefined behavior if the byte buffer is not 4-byte aligned.
                let samples: Vec<f32> = buffer
                    .data
                    .chunks_exact(4)
                    .map(|b| f32::from_ne_bytes([b[0], b[1], b[2], b[3]]))
                    .collect();
                (self.callback)(&samples);
            }
        }
    }
}
/// Receives stream-level errors from ScreenCaptureKit.
struct ErrorHandler;

impl UnsafeSCStreamError for ErrorHandler {
    fn handle_error(&self) {
        // The sys crate surfaces no error details here, but swallowing the
        // event silently (as the previous commented-out code did) hides
        // capture failures entirely — at least make them visible on stderr.
        eprintln!("System audio capture: SCStream reported an error");
    }
}
/// Returns `true` if ScreenCaptureKit shareable content can be queried.
/// On macOS this query fails without the Screen Recording permission, so
/// it doubles as a permission probe for the frontend.
/// NOTE(review): declared `async` only to fit the Tauri command signature —
/// the underlying `get()` call is synchronous.
pub async fn check_permissions() -> bool {
UnsafeSCShareableContent::get().is_ok()
}
impl SystemAudioCapture {
    /// Creates an idle capture targeting `sample_rate`; call [`Self::start`]
    /// to begin streaming system audio.
    pub fn new(sample_rate: u32) -> Self {
        Self { stream: None, sample_rate }
    }

    /// Starts capturing system audio from the first available display.
    ///
    /// `callback` is invoked on ScreenCaptureKit's capture thread with f32
    /// sample slices (2 channels requested at `self.sample_rate`).
    ///
    /// # Errors
    /// Returns a descriptive string if shareable content cannot be queried
    /// (typically a missing Screen Recording permission), no display exists,
    /// or the stream fails to start.
    pub fn start<F>(&mut self, callback: F) -> Result<(), String>
    where
        F: Fn(&[f32]) + Send + Sync + 'static,
    {
        // Error binding intentionally discarded: the sys crate's error type
        // carries no useful detail for the frontend.
        let content = UnsafeSCShareableContent::get()
            .map_err(|_| "Failed to get content".to_string())?;
        let displays = content.displays();
        let display = displays.first().ok_or("No display found")?;

        // Filter on a whole display — we only want its audio; the tiny video
        // dimensions below keep the (unused) frame capture overhead minimal.
        let filter = UnsafeContentFilter::init(UnsafeInitParams::Display(display.clone()));

        // Start from the defaults and override only what we need.
        let mut config = UnsafeStreamConfiguration::default();
        config.width = 100; // minimal video size; frames are discarded
        config.height = 100;
        config.captures_audio = 1;
        config.sample_rate = self.sample_rate;
        config.channel_count = 2;
        config.excludes_current_process_audio = 0; // also capture our own output

        let output_wrapper = AudioOutputWrapper {
            callback: Box::new(callback),
        };

        // `config.into()` converts to the Id<UnsafeStreamConfigurationRef>
        // expected by the init call.
        let stream = UnsafeSCStream::init(filter, config.into(), ErrorHandler);
        stream.add_stream_output(output_wrapper, 1); // 1 = audio output type
        stream
            .start_capture()
            .map_err(|_| "Failed to start capture".to_string())?;

        // Keep the stream alive; dropping the Id would stop the capture.
        self.stream = Some(stream);
        Ok(())
    }

    /// Stops an active capture. Safe to call when not capturing (no-op).
    pub fn stop(&mut self) {
        if let Some(stream) = self.stream.take() {
            stream.stop_capture();
        }
    }
}

View File

@@ -1,7 +1,7 @@
{
"$schema": "https://schema.tauri.app/config/2",
"productName": "Hearbit AI",
"version": "1.1.1",
"version": "1.2.0",
"identifier": "com.hearbit-ai.desktop",
"build": {
"beforeDevCommand": "npm run dev",