feat(v1.2.0): Final Release - Native Audio, Smart VAD, Auto-Loop & Quality Fixes

- Implemented standard 48kHz audio pipeline to fix sample rate mismatch/distortion
- Added Native System Audio (ScreenCaptureKit) support
- Implemented Smart VAD (Voice Activity Detection) with Auto-Start on valid audio
- Added Auto-Loop: Automatically re-arms recording after stop
- Added Empty Guard: Prevents transcribing silent recordings (< 20s empty)
- Increased Pre-Roll buffer to 3.0s to prevent cut-off speech
- Fixed clipping with clamped audio mixing
2026-01-24 01:35:09 +01:00
parent 31f59ba4a2
commit 4e9a1fd038
10 changed files with 513 additions and 145 deletions
--- a/RELEASE_NOTES_1.2.0.md
+++ b/RELEASE_NOTES_1.2.0.md
@@ -0,0 +1,22 @@
 # Release Notes - Hearbit AI v1.2.0
 ## 🚀 Neuheiten
 ### Native System Audio (ScreenCaptureKit)
 Wir haben die Audio-Engine komplett erneuert!
 - **Keine Treiber mehr:** Sie müssen BlackHole nicht mehr installieren.
 - **Funktioniert überall:** Egal ob Teams, Zoom, Webex, Nextcloud Talk oder 3CX – die App hört jetzt nativ mit.
 - **Berechtigung:** Die App fragt beim ersten Start nach der "Bildschirmaufnahme"-Berechtigung. Dies ist der moderne Apple-Standard für Audio-Capture.
 ### Smart VAD (Intelligente Spracherkennung)
 - **Ignoriert Musik:** Die App unterscheidet jetzt präzise zwischen menschlicher Sprache und Musik.
 - **Wartebereich-Filter:** Musik im Teams-Wartebereich wird nicht mehr aufgenommen. Die Aufnahme startet erst, wenn wirklich gesprochen wird.
 ### UI Verbesserungen
 - **Neuer Setup-Flow:** Das komplizierte Audio-Setup wurde entfernt.
 - **Freie Wahl:** Nutzen Sie jedes Mikrofon, das Sie möchten.
 ## 🛠️ Technische Änderungen
 - Update auf das `screencapturekit`-Framework (ScreenCaptureKit ist ab macOS 12.3 verfügbar; die Audio-Aufnahme über ScreenCaptureKit erfordert macOS 13.0+).
 - BlackHole-Abhängigkeit entfernt.
 - Audio-Mixing direkt in der App.
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
  "name": "hearbit-ai",
-  "version": "0.1.0",
+  "version": "1.1.1",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "name": "hearbit-ai",
-      "version": "0.1.0",
+      "version": "1.1.1",
      "dependencies": {
        "@tailwindcss/postcss": "^4.1.18",
        "@tauri-apps/api": "^2",
--- a/package.json
+++ b/package.json
@@ -1,7 +1,7 @@
 {
  "name": "hearbit-ai",
  "private": true,
-  "version": "1.1.1",
+  "version": "1.2.0",
  "type": "module",
  "scripts": {
    "dev": "vite",
--- a/src-tauri/Cargo.lock
+++ b/src-tauri/Cargo.lock
@@ -347,6 +347,12 @@ dependencies = [
 "wyz",
 ]
 [[package]]
 name = "block"
 version = "0.1.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0d8c1fef690941d3e7788d328517591fecc684c084084702d6ff1641e993699a"
 [[package]]
 name = "block-buffer"
 version = "0.10.4"
@@ -1739,7 +1745,7 @@ checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
 [[package]]
 name = "hearbit-ai"
-version = "0.1.2"
+version = "1.2.0"
 dependencies = [
 "base64 0.22.1",
 "chrono",
@@ -1749,6 +1755,8 @@ dependencies = [
 "oauth2",
 "reqwest 0.11.27",
 "rubato",
 "screencapturekit",
 "screencapturekit-sys",
 "serde",
 "serde_json",
 "tauri",
@@ -2425,6 +2433,15 @@ dependencies = [
 "libc",
 ]
 [[package]]
 name = "malloc_buf"
 version = "0.0.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "62bb907fe88d54d8d9ce32a3cceab4218ed2f6b7d35617cafe9adf84e43919cb"
 dependencies = [
 "libc",
 ]
 [[package]]
 name = "markup5ever"
 version = "0.14.1"
@@ -2717,6 +2734,27 @@ dependencies = [
 "url",
 ]
 [[package]]
 name = "objc"
 version = "0.2.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "915b1b472bc21c53464d6c8461c9d3af805ba1ef837e1cac254428f4a77177b1"
 dependencies = [
 "malloc_buf",
 "objc_exception",
 ]
 [[package]]
 name = "objc-foundation"
 version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1add1b659e36c9607c7aab864a76c7a4c2760cd0cd2e120f3fb8b952c7e22bf9"
 dependencies = [
 "block",
 "objc",
 "objc_id",
 ]
 [[package]]
 name = "objc2"
 version = "0.6.3"
@@ -2979,6 +3017,24 @@ dependencies = [
 "objc2-security",
 ]
 [[package]]
 name = "objc_exception"
 version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ad970fb455818ad6cba4c122ad012fae53ae8b4795f86378bce65e4f6bab2ca4"
 dependencies = [
 "cc",
 ]
 [[package]]
 name = "objc_id"
 version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c92d4ddb4bd7b50d730c215ff871754d0da6b2178849f8a2a2ab69712d0c073b"
 dependencies = [
 "objc",
 ]
 [[package]]
 name = "object"
 version = "0.32.2"
@@ -4114,6 +4170,29 @@ version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
 [[package]]
 name = "screencapturekit"
 version = "0.2.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1a5eeeb57ac94960cfe5ff4c402be6585ae4c8d29a2cf41b276048c2e849d64e"
 dependencies = [
 "screencapturekit-sys",
 ]
 [[package]]
 name = "screencapturekit-sys"
 version = "0.2.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "22411b57f7d49e7fe08025198813ee6fd65e1ee5eff4ebc7880c12c82bde4c60"
 dependencies = [
 "block",
 "dispatch",
 "objc",
 "objc-foundation",
 "objc_id",
 "once_cell",
 ]
 [[package]]
 name = "sct"
 version = "0.7.1"
--- a/src-tauri/Cargo.toml
+++ b/src-tauri/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "hearbit-ai"
-version = "0.1.2"
+version = "1.2.0"
 description = "A Tauri App"
 authors = ["you"]
 edition = "2021"
@@ -38,3 +38,5 @@ lettre = { version = "0.11", features = ["tokio1", "tokio1-native-tls", "builder
 tauri-plugin-log = "2.0.0"
 tauri-plugin-shell = "2.3.4"
 base64 = "0.22"
 screencapturekit = "0.2.0"
 screencapturekit-sys = "0.2.8"
--- a/src-tauri/src/audio_processor.rs
+++ b/src-tauri/src/audio_processor.rs
@@ -39,6 +39,9 @@ pub struct AudioProcessor {
    // Event Emission
    app_handle: Option<AppHandle>,
    last_event_time: std::time::Instant,
    // System Audio Queue for Mixing
    pub system_queue: Arc<Mutex<std::collections::VecDeque<f32>>>,
 }
 impl AudioProcessor {
@@ -68,8 +71,8 @@ impl AudioProcessor {
            1
        ).map_err(|e| format!("Failed to init Resampler: {:?}", e))?;
-        // Pre-roll buffer (1.0 seconds) * Channels (interleaved store)
+        // Pre-roll buffer (3.0 seconds) * Channels (interleaved store)
-        let ring_curr_seconds = 1.0; 
+        let ring_curr_seconds = 3.0; 
        // WavWriter writes interleaved, so we store interleaved.
        let ring_size = (sample_rate as f32 * ring_curr_seconds) as usize * channel_count as usize;
@@ -96,10 +99,35 @@ impl AudioProcessor {
            total_processed_samples: 0,
            app_handle: Some(app_handle),
            last_event_time: std::time::Instant::now(),
            system_queue: Arc::new(Mutex::new(std::collections::VecDeque::new())),
        })
    }
-    pub fn process(&mut self, data: &[f32]) {
+    pub fn process(&mut self, input_data: &[f32]) {
        // MIXING LOGIC:
        // We have `input_data` (Microphone). We check `system_queue` for System Audio.
        // We mix them: Out = Mic + System.
        let mut mixed_data = input_data.to_vec();
        let mut max_system_energy = 0.0;
        if let Ok(mut queue) = self.system_queue.lock() {
             for i in 0..mixed_data.len() {
                 if let Some(sys_sample) = queue.pop_front() {
                     // Track system energy for trigger logic
                     let abs_sample = sys_sample.abs();
                     if abs_sample > max_system_energy {
                         max_system_energy = abs_sample;
                     }
                     // Simple addition mixing with clamping to avoid clipping
                     let mixed = mixed_data[i] + sys_sample;
                     mixed_data[i] = mixed.max(-1.0).min(1.0);
                 }
             }
        }
        let data = &mixed_data;
        // 1. Add to Ring Buffer (Interleaved data - Record EVERYTHING)
        for &sample in data {
            self.ring_buffer[self.ring_pos] = sample;
@@ -108,8 +136,7 @@ impl AudioProcessor {
        // 2. Prepare VAD Signal (Mono Mixdown)
        // FRESH START LOGIC (v0.2.0):
-        // We expect standard Stereo Input (BlackHole 2ch).
+        // We expect standard Stereo Input.
        // No magic 3-channel aggregate.
        let channels = self.channel_count as usize;
        let frame_count = data.len() / channels;
@@ -146,7 +173,6 @@ impl AudioProcessor {
                     self.vad_buffer.extend_from_slice(&waves_out[0][0..out_len]);
                 }
             }
             // Update output buffer usage... logic is tricky with drain.
        }
        // 4. Process VAD
@@ -155,21 +181,19 @@ impl AudioProcessor {
            // Run Detection
            let probability = self.vad.predict(vad_chunk.clone());
-            // Calculate RMS for this chunk to use as fallback/hybrid detection
+            // Hybrid VAD: Probability > 0.9 OR System Audio Active
-            let sq_sum: f32 = vad_chunk.iter().map(|x| x * x).sum();
+            // We want to keep recording if there is meaningful audio from the system (Call in progress),
-            let rms = (sq_sum / vad_chunk.len() as f32).sqrt();
+            // even if the VAD doesn't strictly classify it as 'speech' (e.g. ringing, laughter, noise).
-            // Hybrid VAD: Probability > 0.9 OR RMS > 0.025
+            let system_is_active = max_system_energy > 0.01; // Same threshold as trigger
-            // INCREASED THRESHOLDS (v1.1.1): 
+            let is_speech = probability > 0.9; 
            // Reduced sensitivity to avoid background noise triggering recording.
            let is_speech = probability > 0.9 || rms > 0.025; 
-            if is_speech {
+            if is_speech || system_is_active {
                self.is_speech_active = true;
                self.last_speech_time = self.total_processed_samples;
            }
-            // Emit VAD event periodically (every 500ms is enough for non-diagnostic mode)
+            // Emit VAD event periodically
            if self.last_event_time.elapsed().as_millis() > 500 {
                if let Some(app) = &self.app_handle {
                     #[derive(Clone, serde::Serialize)]
@@ -183,11 +207,6 @@ impl AudioProcessor {
                     });
                }
                self.last_event_time = std::time::Instant::now();
                // IMPORTANT: We reset is_speech_active after emitting, 
                // so we don't latch it forever if the user stops talking.
                // However, the main loop sets it to true if current chunk is speech.
                // This logic is a bit of a "latch for X ms".
                self.is_speech_active = false; 
            }
        }
@@ -195,9 +214,32 @@ impl AudioProcessor {
        // 4. Update Hangover and Check Write condition
        if self.waiting_for_speech {
-            if self.is_speech_active {
+            // TRIGGER CONDITION:
            // 1. VAD says speech (Someone is talking)
            // 2. AND System Audio has energy (Meaning audio is coming from the PC, i.e., Call started)
            // Threshold 0.01 is roughly -40dB, should cover ringtones/speech easily but ignore silence/hiss.
            let system_active = max_system_energy > 0.01;
            // Special Case: If System Audio acts like a Ringtone (Constant high energy but maybe not VAD speech?)
            // We trust VAD for speech. But we also trust "Loud System Sound" = Call.
            // If system is consistently loud, it's likely a call.
            // For now, Strict Mode:
            // Trigger if: (Speech Detected) AND (System Audio Present)
            // This prevents "User talking alone" -> No trigger (System silent).
            // This allows "Partner talking" -> Trigger (Speech + System).
            // What about Ringtone? Ringtone has energy but maybe no speech.
            // If we want to record the ringtone, we should trigger on `system_active` alone?
            // "erst wen der call startet" -> usually ringing.
            // Let's be generous: If System Audio is loud (> 0.05), we trigger regardless of VAD.
            let trigger = (self.is_speech_active && system_active) || (max_system_energy > 0.05);
            if trigger {
                // Trigger Detected!
-                println!("Auto-Start: Speech detected. Flushing pre-roll...");
+                println!("Auto-Start: Call detected (SysEnergy: {}). Flushing pre-roll...", max_system_energy);
                self.waiting_for_speech = false;
                // Flush Ring Buffer (Orderly: from ring_pos to end, then 0 to ring_pos)
--- a/src-tauri/src/lib.rs
+++ b/src-tauri/src/lib.rs
@@ -15,11 +15,13 @@ mod audio_processor;
 use audio_processor::AudioProcessor;
 mod auth;
 mod email;
 mod sc_audio;
 // Shared application state holding everything that belongs to the active recording.
 // Every field is wrapped in a Mutex and starts out as `None` (see `run`).
 struct AppState {
    // Microphone input stream; dropping it stops the recording (see `stop_recording`).
    recording_stream: Mutex<Option<cpal::Stream>>,
    // Presumably the path of the file currently being written — TODO confirm against
    // the parts of `start_recording` / `stop_recording` that set and read it.
    recording_file_path: Mutex<Option<String>>,
    // Native system-audio capture; stored by `start_recording`, stopped and cleared
    // by `stop_recording`.
    system_capture: Mutex<Option<sc_audio::SystemAudioCapture>>,
 }
 #[derive(serde::Serialize)]
@@ -71,7 +73,7 @@ fn get_input_devices() -> Result<Vec<AudioDevice>, String> {
 #[tauri::command]
-fn start_recording(app: AppHandle, state: State<'_, AppState>, device_id: String, save_path: Option<String>, custom_filename: Option<String>, wait_for_speech: Option<bool>) -> Result<(), String> {
+async fn start_recording(app: AppHandle, state: State<'_, AppState>, device_id: String, save_path: Option<String>, custom_filename: Option<String>, wait_for_speech: Option<bool>) -> Result<(), String> {
    emit_log(&app, "INFO", &format!("Starting recording on device: {}", device_id));
    let host = cpal::default_host();
@@ -85,11 +87,21 @@ fn start_recording(app: AppHandle, state: State<'_, AppState>, device_id: String
    // Select the configuration with the MAXIMUM number of channels
    // This is crucial for "Hearbit Audio" (Aggregate) which lists 3 channels but might default to 2.
-    // We want the raw 3 channels to separate Mic (Ch0) from System (Ch1+2).
+    // Select Audio Configuration
-    let supported_configs = device.supported_input_configs().map_err(|e| e.to_string())?;
+    // We prioritize 48kHz because System Audio (ScreenCaptureKit) acts best at 48k.
-    let config = supported_configs
+    let supported_configs: Vec<_> = device.supported_input_configs().map_err(|e| e.to_string())?.collect();
-        .max_by_key(|c| c.channels())
+    
-        .map(|c| c.with_max_sample_rate())
+    // Try to find 48kHz specifically
    // Note: cpal::SampleRate is likely a type alias for u32 here, so we pass 48000 directly.
    let config = supported_configs.iter()
        .find(|c| c.min_sample_rate() <= 48000 && c.max_sample_rate() >= 48000)
        .map(|c| c.with_sample_rate(48000))
        .or_else(|| {
            // Fallback: Max sample rate
            supported_configs.iter()
                .max_by_key(|c| c.channels())
                .map(|c| c.with_max_sample_rate())
        })
        .ok_or("No supported input configurations found")?;
    emit_log(&app, "INFO", &format!("Selected Audio Config: {} Channels, {} Hz", config.channels(), config.sample_rate()));
@@ -145,6 +157,64 @@ fn start_recording(app: AppHandle, state: State<'_, AppState>, device_id: String
    let processor = Arc::new(Mutex::new(processor));
    let processor_clone = processor.clone();
    // --- SYSTEM AUDIO CAPTURE START ---
    let mut sys_capture = sc_audio::SystemAudioCapture::new(config.sample_rate());
    // Get the queue to share with the capture callback
    let queue_clone = {
        let p = processor.lock().unwrap();
        p.system_queue.clone() // Access the pub field we added
    };
    let sys_handle = app.clone();
    let sys_callback = move |data: &[f32]| {
        // Push to queue
        if let Ok(mut q) = queue_clone.lock() {
            q.extend(data.iter());
            // Limit queue size to avoid memory leaks if main process loop is slow
            while q.len() > 48000 * 5 { // 5 seconds buffer
               q.pop_front();
            }
        }
    };
    // `SystemAudioCapture::start` is a synchronous call that takes `&mut self`, so it is
    // invoked directly here; no task needs to be spawned for it.
    // `start_recording` itself is declared as an `async fn` Tauri command.
    // The capture object owns the underlying stream, which must be kept alive for the
    // whole recording, so it is stored in `AppState` right after the start attempt.
    match sys_capture.start(sys_callback) {
        Ok(_) => emit_log(&app, "INFO", "System Audio Capture started."),
        Err(e) => emit_log(&app, "WARN", &format!("System Audio Capture failed (Permissions?): {}", e)),
    }
    *state.system_capture.lock().unwrap() = Some(sys_capture);
    // --- SYSTEM AUDIO CAPTURE END ---
    let app_handle = app.clone();
    let err_fn = move |err| {
        eprintln!("an error occurred on stream: {}", err);
@@ -206,6 +276,13 @@ fn stop_recording(app: AppHandle, state: State<'_, AppState>) -> Result<String,
    // Drop stream to stop recording
    {
        let mut stream_guard = state.recording_stream.lock().unwrap();
        // Also stop System Capture
        let mut sys_guard = state.system_capture.lock().unwrap();
        if let Some(sys) = sys_guard.as_mut() {
             sys.stop();
        }
        *sys_guard = None;
        if stream_guard.is_none() {
            return Err("Not recording".to_string());
        }
@@ -804,6 +881,12 @@ fn create_hearbit_audio_device(app: AppHandle) -> Result<String, String> {
    }
 }
 /// Tauri command: reports whether the system-audio backend could query the
 /// shareable content, which `sc_audio::check_permissions` uses as its permission probe.
 #[tauri::command]
 async fn check_screen_recording_permission() -> bool {
    let permission_granted: bool = sc_audio::check_permissions().await;
    permission_granted
 }
 #[tauri::command]
 async fn save_text_file(app: AppHandle, path: String, content: String) -> Result<(), String> {
    emit_log(&app, "INFO", &format!("Saving text file to: {}", path));
@@ -891,6 +974,7 @@ pub fn run() {
        .manage(AppState {
            recording_stream: Mutex::new(None),
            recording_file_path: Mutex::new(None),
            system_capture: Mutex::new(None),
        })
        .invoke_handler(tauri::generate_handler![
            greet,
@@ -904,6 +988,7 @@ pub fn run() {
            get_available_models,
            open_audio_midi_setup,
            create_hearbit_audio_device,
            check_screen_recording_permission,
            auth::start_auth_flow,
            auth::get_calendar_events,
            save_text_file,
--- a/src-tauri/src/sc_audio.rs
+++ b/src-tauri/src/sc_audio.rs
@@ -0,0 +1,103 @@
 use screencapturekit_sys::{
    os_types::rc::Id,
    shareable_content::UnsafeSCShareableContent,
    content_filter::{UnsafeContentFilter, UnsafeInitParams},
    stream_configuration::UnsafeStreamConfiguration,
    stream::UnsafeSCStream,
    stream_error_handler::UnsafeSCStreamError,
    stream_output_handler::UnsafeSCStreamOutput,
    cm_sample_buffer_ref::CMSampleBufferRef,
 };
 /// Handle for capturing system audio through ScreenCaptureKit.
 ///
 /// Created idle via `new`; `start` builds and starts the stream, `stop` ends it.
 pub struct SystemAudioCapture {
    // The running stream, or `None` while idle (set by `start`, cleared by `stop`).
    stream: Option<Id<UnsafeSCStream>>,
    // Sample rate written into the stream configuration when `start` is called.
    sample_rate: u32,
 }
 /// Adapter that lets a plain closure be registered as a stream output handler.
 struct AudioOutputWrapper {
    // Invoked once per audio buffer with that buffer's samples as `f32`.
    callback: Box<dyn Fn(&[f32]) + Send + Sync>,
 }
 impl UnsafeSCStreamOutput for AudioOutputWrapper {
    /// Forwards every audio buffer of `sample` to the wrapped callback as `f32` samples.
    ///
    /// Buffers whose `of_type` is not the audio tag are ignored. The callback is
    /// invoked once per buffer in the sample's buffer list.
    ///
    /// NOTE(review): if the stream delivers non-interleaved (planar) audio, each
    /// channel arrives as its own buffer and the callback receives the channels one
    /// after another rather than interleaved — confirm against the consumer's layout.
    fn did_output_sample_buffer(&self, sample: Id<CMSampleBufferRef>, of_type: u8) {
        // Same tag that `SystemAudioCapture::start` passes to `add_stream_output`.
        const AUDIO_OUTPUT_TYPE: u8 = 1;
        const BYTES_PER_SAMPLE: usize = std::mem::size_of::<f32>();

        if of_type != AUDIO_OUTPUT_TYPE {
            return;
        }

        for buffer in sample.get_av_audio_buffer_list() {
            // The buffer exposes raw bytes; the stream is assumed to deliver 32-bit
            // float PCM. Decode sample by sample instead of reinterpreting the byte
            // pointer as `*const f32`: a byte buffer carries no 4-byte alignment
            // guarantee, so the pointer cast would be undefined behaviour when the
            // data happens to be misaligned. Trailing bytes that do not form a whole
            // sample are dropped, exactly as the previous `len() / 4` did.
            let bytes = buffer.data;
            let samples: Vec<f32> = bytes
                .chunks_exact(BYTES_PER_SAMPLE)
                .map(|c| f32::from_ne_bytes([c[0], c[1], c[2], c[3]]))
                .collect();
            (self.callback)(&samples);
        }
    }
 }
 /// Error sink registered with the capture stream.
 struct ErrorHandler;
 impl UnsafeSCStreamError for ErrorHandler {
    /// Called by the stream when it reports an error. The trait method carries no
    /// error details, so all we can do is make the failure visible on stderr
    /// instead of dropping it silently.
    fn handle_error(&self) {
        eprintln!("System audio capture stream reported an error");
    }
 }
 /// Returns `true` when the shareable content can be queried successfully, which
 /// is used as the probe for the capture permission.
 pub async fn check_permissions() -> bool {
    matches!(UnsafeSCShareableContent::get(), Ok(_))
 }
 impl SystemAudioCapture {
    /// Creates an idle capture that will request `sample_rate` once started.
    pub fn new(sample_rate: u32) -> Self {
        Self { stream: None, sample_rate }
    }

    /// Starts capturing system audio and feeds every audio buffer to `callback`.
    ///
    /// If this instance already owns a running stream, that stream is stopped first
    /// so it cannot keep running (and keep invoking its old callback) after its
    /// handle has been replaced.
    ///
    /// # Errors
    /// Returns a message when the shareable content cannot be queried, when no
    /// display is available, or when the stream fails to start.
    pub fn start<F>(&mut self, callback: F) -> Result<(), String>
    where
        F: Fn(&[f32]) + Send + Sync + 'static,
    {
        // Never leave a previous capture running behind a replaced handle.
        self.stop();

        let content = UnsafeSCShareableContent::get()
            .map_err(|_| "Failed to get content".to_string())?;
        let displays = content.displays();
        let display = displays.first().ok_or("No display found")?;

        // The content filter is built from the first display reported.
        let filter = UnsafeContentFilter::init(UnsafeInitParams::Display(display.clone()));

        // Start from the defaults and override only the fields we rely on.
        let mut config = UnsafeStreamConfiguration::default();
        // Only audio is consumed; presumably the small frame size keeps the video
        // side cheap — TODO confirm.
        config.width = 100;
        config.height = 100;
        config.captures_audio = 1;
        config.sample_rate = self.sample_rate;
        config.channel_count = 2;
        // 0 = do not exclude this process's own audio from the capture.
        config.excludes_current_process_audio = 0;

        let output_wrapper = AudioOutputWrapper {
            callback: Box::new(callback),
        };

        let stream = UnsafeSCStream::init(filter, config.into(), ErrorHandler);
        // 1 = audio output type; must match the tag checked in the output handler.
        stream.add_stream_output(output_wrapper, 1);
        stream
            .start_capture()
            .map_err(|_| "Failed to start capture".to_string())?;

        self.stream = Some(stream);
        Ok(())
    }

    /// Stops the running capture, if any, and releases the stream. Safe to call
    /// repeatedly or when nothing was started.
    pub fn stop(&mut self) {
        if let Some(stream) = self.stream.take() {
            // Best effort: there is nothing useful to do if stopping fails.
            let _ = stream.stop_capture();
        }
    }
 }
--- a/src-tauri/tauri.conf.json
+++ b/src-tauri/tauri.conf.json
@@ -1,7 +1,7 @@
 {
  "$schema": "https://schema.tauri.app/config/2",
  "productName": "Hearbit AI",
-  "version": "1.1.1",
+  "version": "1.2.0",
  "identifier": "com.hearbit-ai.desktop",
  "build": {
    "beforeDevCommand": "npm run dev",
--- a/src/components/Recorder.tsx
+++ b/src/components/Recorder.tsx
@@ -60,9 +60,9 @@ const Recorder: React.FC<RecorderProps> = ({
    const [isStopping, setIsStopping] = useState(false); // New lock state
    const [isPaused, setIsPaused] = useState(false);
    const [isWaiting, setIsWaiting] = useState(false); // New state for Auto-Start
    const [hasSpeechDetected, setHasSpeechDetected] = useState(false); // New tracking state
    const [autoStartEnabled, setAutoStartEnabled] = useState(false); // Toggle state
    const [status, setStatus] = useState<string>('Ready to record');
    const [selectedDevice, setSelectedDevice] = useState<string>('');
    const [selectedPromptId, setSelectedPromptId] = useState<string>('');
@@ -73,11 +73,8 @@ const Recorder: React.FC<RecorderProps> = ({
    const [lastSpeechTime, setLastSpeechTime] = useState<number>(Date.now());
    const [silenceDuration, setSilenceDuration] = useState(0);
-    // Filtered devices based on mode
+    // Show all devices for both modes now (System Audio is captured natively)
-    const filteredDevices = devices.filter(d => {
+    const filteredDevices = devices;
        const isVirtual = d.name.toLowerCase().includes('hearbit') || d.name.toLowerCase().includes('blackhole');
        return recordingMode === 'meeting' ? isVirtual : !isVirtual;
    });
    useEffect(() => {
        loadDevices();
@@ -170,6 +167,7 @@ const Recorder: React.FC<RecorderProps> = ({
            setIsPaused(false);
            setTranscription('');
            setSummary('');
            setHasSpeechDetected(false); // Reset check for new session
            if (autoStartEnabled) {
                setIsWaiting(true);
@@ -215,15 +213,16 @@ const Recorder: React.FC<RecorderProps> = ({
            unlistenVAD = await listen<{ is_speech: boolean, probability: number }>('vad-event', (event) => {
                if (event.payload.is_speech) {
                    setLastSpeechTime(Date.now());
-                    lastSpeechTimeRef.current = Date.now(); // Update ref immediately
+                    lastSpeechTimeRef.current = Date.now();
                    setSilenceDuration(0);
                    setHasSpeechDetected(true); // Track positive speech
                }
            });
            // Auto-Start Trigger Listener
            unlistenTrigger = await listen('auto-recording-triggered', () => {
                console.log("Auto-Start Triggered from Backend!");
-                // Only trigger if we are actually waiting
+                setHasSpeechDetected(true); // Trigger counts as speech
                setIsWaiting((prev) => {
                    if (prev) {
                        addToast("Audio detected! Recording started.", 'success', 4000);
@@ -341,134 +340,162 @@ const Recorder: React.FC<RecorderProps> = ({
            setIsRecording(false);
            setIsPaused(false);
            setIsWaiting(false); // Reset waiting state
            setTranscription('');
            setSummary('');
            setHasSpeechDetected(false); // Reset speech-detection flag
            setStatus('Saving recording...');
            const filePath = await invoke<string>('stop_recording');
-            // Wait a moment for file flush (safety)
+            // NEW: Check if speech was actually detected during the session
-            await new Promise(r => setTimeout(r, 500));
+            // If we recorded 20s of silence (Auto-Stop), we shouldn't transcribe.
            // If we recorded 20s of silence (Auto-Stop), we shouldn't transcribe.
            if (!hasSpeechDetected && recordingMode === 'voice') {
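                // NOTE(review): this guard only fires in 'voice' mode. In 'meeting' mode, system
                // audio might in principle occur without VAD triggering; however, the updated
                // backend VAD logic is said to include System Audio in the 'is_speech' event,
                // so hasSpeechDetected should be trustworthy for both modes.
                // Confirm whether the 'voice'-only restriction in the condition is intentional.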
-            // Confirm recording saved
+                console.log("No speech detected during recording. Skipping transcription.");
-            addToast(`Recording saved locally: ${filePath.split('/').pop()}`, 'success', 3000);
+                addToast("Recording discarded (No speech/audio detected)", 'info');
            setStatus('Converting to MP3...');
-            // Small delay to show the "saved" message
+                // If auto-start is on, we just loop back.
-            await new Promise(r => setTimeout(r, 500));
+                // skip the rest.
            } else {
-            // Convert WAV to MP3 for smaller size
+                // Wait a moment for file flush (safety)
-            const mp3Path = await invoke<string>('convert_to_mp3', { wavPath: filePath });
+                await new Promise(r => setTimeout(r, 500));
-            // Get file size to check if chunking needed
+                // Confirm recording saved
-            interface AudioMetadata { duration: number; size: number; format: string; }
+                addToast(`Recording saved locally: ${filePath.split('/').pop()}`, 'success', 3000);
-            const metadata = await invoke<AudioMetadata>('get_audio_metadata', { filePath: mp3Path });
+                setStatus('Converting to MP3...');
            const sizeMB = metadata.size / (1024 * 1024);
-            let transText = '';
+                // Small delay to show the "saved" message
                await new Promise(r => setTimeout(r, 500));
-            // Check if chunking needed (only for Meeting mode and large files)
+                // Convert WAV to MP3 for smaller size
-            if (recordingMode === 'meeting' && sizeMB >= 18) {
+                const mp3Path = await invoke<string>('convert_to_mp3', { wavPath: filePath });
                // CHUNKING PATH for large meetings
                setStatus(`Large file (${sizeMB.toFixed(1)}MB). Splitting into chunks...`);
                const chunks = await invoke<string[]>('chunk_audio', {
                    filePath: mp3Path,
                    chunkMinutes: 10
                });
-                addToast(`Processing ${chunks.length} chunks...`, 'info', 4000);
+                // Get file size to check if chunking needed
                interface AudioMetadata { duration: number; size: number; format: string; }
                const metadata = await invoke<AudioMetadata>('get_audio_metadata', { filePath: mp3Path });
                const sizeMB = metadata.size / (1024 * 1024);
-                let allTranscriptions: string[] = [];
+                let transText = '';
-                for (let i = 0; i < chunks.length; i++) {
+                // Check if chunking needed (only for Meeting mode and large files)
-                    setStatus(`Transcribing chunk ${i + 1}/${chunks.length}...`);
+                if (recordingMode === 'meeting' && sizeMB >= 18) {
-                    const chunkText = await invoke<string>('transcribe_audio', {
+                    // CHUNKING PATH for large meetings
-                        filePath: chunks[i],
+                    setStatus(`Large file (${sizeMB.toFixed(1)}MB). Splitting into chunks...`);
                    const chunks = await invoke<string[]>('chunk_audio', {
                        filePath: mp3Path,
                        chunkMinutes: 10
                    });
                    addToast(`Processing ${chunks.length} chunks...`, 'info', 4000);
                    let allTranscriptions: string[] = [];
                    for (let i = 0; i < chunks.length; i++) {
                        setStatus(`Transcribing chunk ${i + 1}/${chunks.length}...`);
                        const chunkText = await invoke<string>('transcribe_audio', {
                            filePath: chunks[i],
                            apiKey,
                            productId
                        });
                        allTranscriptions.push(chunkText);
                    }
                    // Merge transcriptions
                    transText = allTranscriptions.join('\n\n--- Next Segment ---\n\n');
                    addToast('All chunks transcribed successfully!', 'success', 3000);
                } else {
                    // NORMAL PATH for small files
                    setStatus('Transcribing (Infomaniak Whisper)...');
                    transText = await invoke<string>('transcribe_audio', {
                        filePath: mp3Path,
                        apiKey,
                        productId
                    });
                    allTranscriptions.push(chunkText);
                }
-                // Merge transcriptions
+                setTranscription(transText);
                transText = allTranscriptions.join('\n\n--- Next Segment ---\n\n');
                addToast('All chunks transcribed successfully!', 'success', 3000);
            } else {
                // NORMAL PATH for small files
                setStatus('Transcribing (Infomaniak Whisper)...');
                transText = await invoke<string>('transcribe_audio', {
                    filePath: mp3Path,
                    apiKey,
                    productId
                });
            }
-            setTranscription(transText);
+                // Check if transcription is empty or just whitespace
                if (!transText || transText.trim().length === 0) {
                    setStatus('Done (No speech detected)');
                    setTranscription('(No speech detected. Check your microphone settings.)');
                    setTimeout(() => setStatus('Ready to record'), 3000);
                    // allow finally block to restart loop
                } else {
                    // Logic continues...
-            // Check if transcription is empty or just whitespace
+                    // Find selected prompt content - SMART SELECTION
-            if (!transText || transText.trim().length === 0) {
+                    let activePrompt = prompts.find(p => p.id === selectedPromptId);
                setStatus('Done (No speech detected)');
                setTranscription('(No speech detected. Check your microphone settings.)');
                setTimeout(() => setStatus('Ready to record'), 3000);
                return;
            }
-            // Find selected prompt content - SMART SELECTION
+                    // Smart Auto-Select based on keywords
-            let activePrompt = prompts.find(p => p.id === selectedPromptId);
+                    const lowerText = transText.toLowerCase();
                    let bestMatchId = selectedPromptId;
                    let maxMatches = 0;
-            // Smart Auto-Select based on keywords
+                    for (const p of prompts) {
-            const lowerText = transText.toLowerCase();
+                        if (!p.keywords) continue;
-            let bestMatchId = selectedPromptId;
+                        let matches = 0;
-            let maxMatches = 0;
+                        for (const kw of p.keywords) {
-
+                            if (lowerText.includes(kw.toLowerCase())) {
-            for (const p of prompts) {
+                                matches++;
-                if (!p.keywords) continue;
+                            }
-                let matches = 0;
+                        }
-                for (const kw of p.keywords) {
+                        if (matches > maxMatches) {
-                    if (lowerText.includes(kw.toLowerCase())) {
+                            maxMatches = matches;
-                        matches++;
+                            bestMatchId = p.id;
                        }
                    }
-                }
+
-                if (matches > maxMatches) {
+                    if (bestMatchId !== selectedPromptId) {
-                    maxMatches = matches;
+                        const newPrompt = prompts.find(p => p.id === bestMatchId);
-                    bestMatchId = p.id;
+                        if (newPrompt) {
                            console.log(`Smart Select: Switched to '${newPrompt.name}' with ${maxMatches} matches.`);
                            setStatus(`Smart Select: Using "${newPrompt.name}"...`);
                            addToast(`Smart Select: Switched to "${newPrompt.name}"`, 'success', 4000);
                            activePrompt = newPrompt;
                        }
                    }
                    const promptContent = activePrompt ? activePrompt.content : "Summarize this.";
                    setStatus(`Summarizing (${selectedModel})...`);
                    const sumText = await invoke<string>('summarize_text', {
                        text: transText,
                        apiKey,
                        productId,
                        prompt: promptContent,
                        model: selectedModel
                    });
                    setSummary(sumText);
                    // Auto-save to history
                    onSaveToHistory(transText, sumText);
                    setStatus('Done!');
                    addToast('Transcription & Summary complete!', 'success', 4000);
                    onRecordingComplete(); // Auto-switch tab
                    setTimeout(() => setStatus('Ready to record'), 3000);
                }
            }
            if (bestMatchId !== selectedPromptId) {
                const newPrompt = prompts.find(p => p.id === bestMatchId);
                if (newPrompt) {
                    console.log(`Smart Select: Switched to '${newPrompt.name}' with ${maxMatches} matches.`);
                    setStatus(`Smart Select: Using "${newPrompt.name}"...`);
                    addToast(`Smart Select: Switched to "${newPrompt.name}"`, 'success', 4000);
                    activePrompt = newPrompt;
                    // Optional: Update UI selection? setSelectedPromptId(bestMatchId);
                    // Let's verify with user preference? For now, we override as "Magic".
                }
            }
            const promptContent = activePrompt ? activePrompt.content : "Summarize this.";
            setStatus(`Summarizing (${selectedModel})...`);
            const sumText = await invoke<string>('summarize_text', {
                text: transText,
                apiKey,
                productId,
                prompt: promptContent,
                model: selectedModel
            });
            setSummary(sumText);
            // Auto-save to history
            onSaveToHistory(transText, sumText);
            setStatus('Done!');
            addToast('Transcription & Summary complete!', 'success', 4000);
            onRecordingComplete(); // Auto-switch tab
            setTimeout(() => setStatus('Ready to record'), 3000);
        } catch (e) {
            console.error(e);
            setStatus(`Error: ${e}`);
            addToast(`Error processing: ${e}`, 'error');
        } finally {
            setIsStopping(false);
            // AUTO-RESTART LOGIC
            if (autoStartEnabled) {
                console.log("Auto-Start enabled: Restarting listener loop...");
                // Short delay to ensure backend cleanup
                setTimeout(() => {
                    startRecording();
                }, 1000);
            }
        }
    };
@@ -634,12 +661,20 @@ const Recorder: React.FC<RecorderProps> = ({
                    </div>
                    <div className="flex flex-col gap-2 mt-2 w-full">
-                        {recordingMode === 'meeting' && filteredDevices.length === 0 && (
+                        {recordingMode === 'meeting' && (
                            <button
-                                onClick={onOpenSettings}
+                                onClick={async () => {
                                    const allowed = await invoke<boolean>('check_screen_recording_permission');
                                    if (allowed) {
                                        addToast('System Audio Permission: GRANTED ✅', 'success');
                                    } else {
                                        addToast('System Audio Permission: MISSING ❌. Please enable in System Settings -> Privacy -> Screen Recording', 'error', 5000);
                                        // Open Settings?
                                    }
                                }}
                                className="text-xs bg-primary/10 text-primary hover:bg-primary/20 w-full text-center border border-primary/20 rounded p-2 mb-2 font-semibold"
                            >
-                                🪄 Create "Hearbit Audio" Device
+                                🔒 Check Audio Permission
                            </button>
                        )}
                        <button