Release 1.1.0: Add Import Audio Files feature

- New Import tab with drag-and-drop support for audio files
- Support for 8 formats: MP3, MP4, WAV, M4A, FLAC, OGG, AAC, WMA
- File metadata display (duration, size, format)
- Editable meeting titles
- Progress tracking with visual indicators
- Smart template selection
- Auto-navigation to Transcription view
- Updated README with BlackHole requirement and Teams config
- Added get_audio_metadata Rust command
- Version bump to 1.1.0
2026-01-21 09:08:56 +01:00
parent 79f509951c
commit a06e473e85
12 changed files with 1041 additions and 171 deletions
@@ -1739,7 +1739,7 @@ checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"

 [[package]]
 name = "hearbit-ai"
-version = "1.1.0"
+version = "0.1.2"
 dependencies = [
 "chrono",
 "cpal",
@@ -1757,6 +1757,7 @@ dependencies = [
 "tauri-plugin-log",
 "tauri-plugin-oauth",
 "tauri-plugin-opener",
+ "tauri-plugin-shell",
 "tokio",
 "url",
 "voice_activity_detector",
@@ -3089,6 +3090,16 @@ dependencies = [
 "ureq",
 ]

+[[package]]
+name = "os_pipe"
+version = "1.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7d8fae84b431384b68627d0f9b3b1245fcf9f46f6c0e3dc902e9dce64edd1967"
+dependencies = [
+ "libc",
+ "windows-sys 0.61.2",
+]
+
 [[package]]
 name = "pango"
 version = "0.18.3"
@@ -4361,12 +4372,44 @@ dependencies = [
 "digest",
 ]

+[[package]]
+name = "shared_child"
+version = "1.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e362d9935bc50f019969e2f9ecd66786612daae13e8f277be7bfb66e8bed3f7"
+dependencies = [
+ "libc",
+ "sigchld",
+ "windows-sys 0.60.2",
+]
+
 [[package]]
 name = "shlex"
 version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"

+[[package]]
+name = "sigchld"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47106eded3c154e70176fc83df9737335c94ce22f821c32d17ed1db1f83badb1"
+dependencies = [
+ "libc",
+ "os_pipe",
+ "signal-hook",
+]
+
+[[package]]
+name = "signal-hook"
+version = "0.3.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d881a16cf4426aa584979d30bd82cb33429027e42122b169753d6ef1085ed6e2"
+dependencies = [
+ "libc",
+ "signal-hook-registry",
+]
+
 [[package]]
 name = "signal-hook-registry"
 version = "1.4.8"
@@ -4951,6 +4994,27 @@ dependencies = [
 "zbus",
 ]

+[[package]]
+name = "tauri-plugin-shell"
+version = "2.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "39b76f884a3937e04b631ffdc3be506088fa979369d25147361352f2f352e5ed"
+dependencies = [
+ "encoding_rs",
+ "log",
+ "open",
+ "os_pipe",
+ "regex",
+ "schemars 0.8.22",
+ "serde",
+ "serde_json",
+ "shared_child",
+ "tauri",
+ "tauri-plugin",
+ "thiserror 2.0.18",
+ "tokio",
+]
+
 [[package]]
 name = "tauri-runtime"
 version = "2.9.2"
@@ -1,6 +1,6 @@
 [package]
 name = "hearbit-ai"
-version = "1.1.0"
+version = "0.1.2"
 description = "A Tauri App"
 authors = ["you"]
 edition = "2021"
@@ -18,7 +18,7 @@ crate-type = ["staticlib", "cdylib", "rlib"]
 tauri-build = { version = "2", features = [] }

 [dependencies]
-tauri = { version = "2", features = [] }
+tauri = { version = "2", features = ["tray-icon"] }
 tauri-plugin-opener = "2"
 tauri-plugin-dialog = "2"
 serde = { version = "1", features = ["derive"] }
@@ -36,3 +36,4 @@ oauth2 = "4.4"
 url = "2.5"
 lettre = { version = "0.11", features = ["tokio1", "tokio1-native-tls", "builder"] }
 tauri-plugin-log = "2.0.0"
+tauri-plugin-shell = "2.3.4"
@@ -110,6 +110,9 @@ func createAggregateDevice() {
    }
    print("Found BlackHole 2ch (ID: \(blackHoleID))")
    
+    // --- PART 1: Hearbit Audio (Input: Mic + BlackHole) ---
+    print("\n--- Creating 'Hearbit Audio' (Input) ---")
+    
    // Default Input
    var defaultInputID: AudioObjectID = 0
    var size = UInt32(MemoryLayout<AudioObjectID>.size)
@@ -125,19 +128,14 @@ func createAggregateDevice() {
    }
    print("Found Default Input (ID: \(defaultInputID))")
    
-    // Check for existing "Hearbit Audio" by UID
-    let targetUID = "hearbit_audio_aggregate_v1"
-    if let existingID = findDeviceByUID(targetUID) {
-        print("Found existing Hearbit Audio device (ID: \(existingID)). Destroying to recreate...")
-        if AudioHardwareDestroyAggregateDevice(existingID) != noErr {
-            print("Warning: Failed to destroy existing device.")
-        } else {
-            print("Existing device destroyed.")
-        }
+    // Check for existing "Hearbit Audio"
+    let inputUID = "hearbit_audio_aggregate_v1"
+    if let existingID = findDeviceByUID(inputUID) {
+        print("Found existing Hearbit Audio (ID: \(existingID)). Destroying...")
+        AudioHardwareDestroyAggregateDevice(existingID)
        Thread.sleep(forTimeInterval: 0.5)
    }
    
-    // Build SubDevice List
    guard let bhUID = getStringProperty(objectID: blackHoleID, selector: kAudioDevicePropertyDeviceUID) else {
        print("Error: Could not get BlackHole UID.")
        exit(1)
@@ -147,36 +145,47 @@ func createAggregateDevice() {
        exit(1)
    }
    
-    // Dedup: if Mic IS BlackHole (user set BlackHole as default), don't duplicate
    var subDevicesUIDs = [bhUID]
    if micUID != bhUID {
        subDevicesUIDs.append(micUID)
    }
    
-    let subDevicesArray = subDevicesUIDs.map { 
-        [kAudioSubDeviceUIDKey: $0]
-    }
-    
-    let desc: [String: Any] = [
+    let subDevicesArray = subDevicesUIDs.map { [kAudioSubDeviceUIDKey: $0] }
+    let inputDesc: [String: Any] = [
        kAudioAggregateDeviceNameKey: "Hearbit Audio",
-        kAudioAggregateDeviceUIDKey: targetUID,
+        kAudioAggregateDeviceUIDKey: inputUID,
        kAudioAggregateDeviceIsPrivateKey: Int(0),
        kAudioAggregateDeviceIsStackedKey: Int(0),
        kAudioAggregateDeviceSubDeviceListKey: subDevicesArray
    ]
    
-    print("Creating Aggregate Device with UIDs: \(subDevicesUIDs)")
-    
-    var outID: AudioObjectID = 0
-    let err = AudioHardwareCreateAggregateDevice(desc as CFDictionary, &outID)
-    
-    if err == noErr {
-        print("Success! Created 'Hearbit Audio' with ID: \(outID)")
-        exit(0)
+    var outInputID: AudioObjectID = 0
+    let errIn = AudioHardwareCreateAggregateDevice(inputDesc as CFDictionary, &outInputID)
+    if errIn == noErr {
+        print("Success! Created 'Hearbit Audio' with ID: \(outInputID)")
    } else {
-        print("Failed to create device. Error code: \(err) (\(err.fourCC))")
-        exit(1)
+        print("Failed to create 'Hearbit Audio'. Error: \(errIn)")
    }
+
+
+    // --- PART 2: Cleanup Unstable "Hearbit Speakers" ---
+    // The previous "Hearbit Speakers" device caused MS Teams to crash.
+    // We strictly remove it here to restore stability.
+    print("\n--- Cleaning up Unstable Devices ---")
+    let stopOutputUID = "hearbit_speakers_aggregate_v1"
+    if let existingOutID = findDeviceByUID(stopOutputUID) {
+        print("Found unstable 'Hearbit Speakers' (ID: \(existingOutID)). Removing to fix Teams crash...")
+        let errDist = AudioHardwareDestroyAggregateDevice(existingOutID)
+        if errDist == noErr {
+            print("Successfully removed unstable device.")
+        } else {
+             print("Warning: Failed to remove device. Error: \(errDist)")
+        }
+    } else {
+        print("No unstable 'Hearbit Speakers' found. System is clean.")
+    }
+    
+    exit(0)
 }

 createAggregateDevice()
@@ -11,6 +11,9 @@ pub struct AudioProcessor {
    vad_chunk_size: usize,
    vad_buffer: Vec<f32>,
    
+    // Audio Config
+    channel_count: u16,
+    
    // Resampler
    resampler: FastFixedIn<f32>,
    resample_input_buffer: Vec<f32>,
@@ -21,6 +24,9 @@ pub struct AudioProcessor {
    last_speech_time: u64, // In samples or frames
    hangover_samples: u64,
    
+    // Waiting Mode
+    waiting_for_speech: bool,
+
    // Ring Buffer (for pre-roll)
    ring_buffer: Vec<f32>,
    ring_pos: usize,
@@ -37,12 +43,14 @@ pub struct AudioProcessor {

 impl AudioProcessor {
    pub fn new(
-        sample_rate: u32, 
+        sample_rate: u32,
+        channel_count: u16,
        writer: Arc<Mutex<WavWriter<std::io::BufWriter<std::fs::File>>>>,
-        app_handle: AppHandle
+        app_handle: AppHandle,
+        wait_for_speech: bool
    ) -> Result<Self, String> {
        let vad_sample_rate = 16000;
-        let vad_chunk_size = 512; // Silero usually likes ~30ms which is 512 at 16k? No 16000 * 0.032 = 512.
+        let vad_chunk_size = 512; 
        
        // Initialize VAD
        let vad = VoiceActivityDetector::builder()
@@ -51,8 +59,7 @@ impl AudioProcessor {
            .build()
            .map_err(|e| format!("Failed to init VAD: {:?}", e))?;

-        // Initialize Resampler (Input Rate -> 16000) using FastFixedIn for speed/simplicity
-        // new(f_ratio, max_resample_ratio_relative, polyn_deg, chunk_size, channels)
+        // Initialize Resampler (Input Rate -> 16000)
        let resampler = FastFixedIn::<f32>::new(
            16000.0 / sample_rate as f64,
            1.0, 
@@ -61,20 +68,26 @@ impl AudioProcessor {
            1
        ).map_err(|e| format!("Failed to init Resampler: {:?}", e))?;

-        // Pre-roll buffer (e.g. 0.5 seconds of high quality audio)
+        // Pre-roll buffer (1.0 seconds) * Channels (interleaved store)
        let ring_curr_seconds = 1.0; 
-        let ring_size = (sample_rate as f32 * ring_curr_seconds) as usize;
+        // WavWriter writes interleaved, so we store interleaved.
+        let ring_size = (sample_rate as f32 * ring_curr_seconds) as usize * channel_count as usize;

        Ok(Self {
            vad,
            vad_chunk_size,
            vad_buffer: Vec::new(),
+            channel_count,
            resampler,
            resample_input_buffer: Vec::new(),
            resample_output_buffer: Vec::new(),
            is_speech_active: false,
            last_speech_time: 0,
-            hangover_samples: (sample_rate as f32 * 1.5) as u64, // 1.5s hangover
+            // The hangover is measured in interleaved samples (frames * channels), not in frames.
+            // In audio terminology "samples processed" usually means frames, but total_processed_samples
+            // counts individual elements here, so the hangover is scaled by channel_count to match it.
+            hangover_samples: (sample_rate as f32 * 1.5 * channel_count as f32) as u64, 
+            waiting_for_speech: wait_for_speech,
            ring_buffer: vec![0.0; ring_size],
            ring_pos: 0,
            ring_size,
@@ -87,30 +100,39 @@ impl AudioProcessor {
    }

    pub fn process(&mut self, data: &[f32]) {
-        // 1. Add to Ring Buffer (always, for pre-roll)
+        // 1. Add to Ring Buffer (Interleaved data - Record EVERYTHING)
        for &sample in data {
            self.ring_buffer[self.ring_pos] = sample;
            self.ring_pos = (self.ring_pos + 1) % self.ring_size;
        }

-        // 2. Resample for VAD
-        // We append new data to input buffer for resampler
-        self.resample_input_buffer.extend_from_slice(data);
+        // 2. Prepare VAD Signal (Mono Mixdown)
+        // FRESH START LOGIC (v0.2.0):
+        // We expect standard Stereo Input (BlackHole 2ch).
+        // No magic 3-channel aggregate.
        
-        // Process in chunks compatible with resampler
-        // Actually rubato process_into_buffer needs waves of input.
-        // Simplified: SincFixedIn wants a fixed number of input frames? 
-        // Docs: "retrieve result... input buffer must contain needed number of frames"
-        // SincFixedIn: "input buffer used for resampling... must receive a fixed number of frames"
-        // Wait, SincFixedIn is fixed INPUT size. SincFixedOut is fixed OUTPUT size.
-        // We want to feed whatever we get.
-        // For simplicity, let's use a simpler resampling strategy or accept rubato's constraints.
-        // Rubato SincFixedIn: we must provide `input_frames_next` frames.
+        let channels = self.channel_count as usize;
+        let frame_count = data.len() / channels;
+        let mut vad_input_chunk = Vec::with_capacity(frame_count);
+
+        for i in 0..frame_count {
+            let frame_start = i * channels;
+            
+            let mix_sample = if channels >= 2 {
+                // Stereo -> Average L + R
+                (data[frame_start] + data[frame_start + 1]) / 2.0
+            } else {
+                // Mono -> Take as is
+                data[frame_start]
+            };
+            
+            vad_input_chunk.push(mix_sample);
+        }
+
+
+        // 3. Resample for VAD
+        self.resample_input_buffer.extend_from_slice(&vad_input_chunk);
        
-        // Let's defer strict resampling and just use decimation if sample rate is multiple?
-        // No, user devices vary.
-        
-        // Handling Resampling properly:
        let needed = self.resampler.input_frames_next();
        while self.resample_input_buffer.len() >= needed {
             let chunk: Vec<f32> = self.resample_input_buffer.drain(0..needed).collect();
@@ -127,63 +149,87 @@ impl AudioProcessor {
             // Update output buffer usage... logic is tricky with drain.
        }

-        // 3. Process VAD
+        // 4. Process VAD
        while self.vad_buffer.len() >= self.vad_chunk_size {
            let vad_chunk: Vec<f32> = self.vad_buffer.drain(0..self.vad_chunk_size).collect();
            // Run Detection
-            // Run Detection
            let probability = self.vad.predict(vad_chunk.clone());
            
            // Calculate RMS for this chunk to use as fallback/hybrid detection
            let sq_sum: f32 = vad_chunk.iter().map(|x| x * x).sum();
            let rms = (sq_sum / vad_chunk.len() as f32).sqrt();

-            // Hybrid VAD: Probability > 0.4 OR RMS > 0.005 (approx -46dB)
-            let is_speech = probability > 0.4 || rms > 0.005;
+            // Hybrid VAD: Probability > 0.8 OR RMS > 0.015
+            // INCREASED THRESHOLDS (v1.9.0): 
+            // Now that routing works, we must filter out system notifications (beeps) and noise floor.
+            let is_speech = probability > 0.8 || rms > 0.015; 

            if is_speech {
                self.is_speech_active = true;
                self.last_speech_time = self.total_processed_samples;
            }
            
-            // Emit VAD event periodically (every 500ms)
+            // Emit VAD event periodically (every 500ms is enough for non-diagnostic mode)
            if self.last_event_time.elapsed().as_millis() > 500 {
-                // Calculate simple RMS of the current chunk for debugging
-                let sq_sum: f32 = vad_chunk.iter().map(|x| x * x).sum();
-                let rms = (sq_sum / vad_chunk.len() as f32).sqrt();
-                
-                // Print debug info to stdout (viewable in terminal)
-                println!("VAD Debug: Prob={:.4}, RMS={:.6}, Speech={}", probability, rms, is_speech);
-
                if let Some(app) = &self.app_handle {
-                     // Just sending probability is enough for now
-                     #[derive(serde::Serialize, Clone)]
+                     #[derive(Clone, serde::Serialize)]
                     struct VadEvent {
-                         probability: f32,
                         is_speech: bool,
+                         probability: f32,
                     }
-                     let _ = app.emit("vad-event", VadEvent { probability, is_speech });
+                     let _ = app.emit("vad-event", VadEvent { 
+                         probability, 
+                         is_speech: self.is_speech_active,
+                     });
                }
                self.last_event_time = std::time::Instant::now();
+                
+                // IMPORTANT: is_speech_active is cleared after each emit so that it does not
+                // stay latched forever once the user stops talking. The enclosing VAD loop
+                // sets it back to true on the next chunk classified as speech, so the flag
+                // effectively latches speech for one emit interval (~500 ms).
+                self.is_speech_active = false; 
            }
        }
+
        
        // 4. Update Hangover and Check Write condition
+        if self.waiting_for_speech {
+            if self.is_speech_active {
+                // Trigger Detected!
+                println!("Auto-Start: Speech detected. Flushing pre-roll...");
+                self.waiting_for_speech = false;
+
+                // Flush ring buffer in chronological order: ring_pos..end holds the oldest samples, 0..ring_pos the newest
+                let mut guard = self.writer.lock().unwrap();
+                let amplitude = i16::MAX as f32;
+                
+                // Part 1: ring_pos to end
+                for i in self.ring_pos..self.ring_size {
+                     let sample = self.ring_buffer[i];
+                     guard.write_sample((sample * amplitude) as i16).ok();
+                }
+                // Part 2: 0 to ring_pos
+                for i in 0..self.ring_pos {
+                    let sample = self.ring_buffer[i];
+                    guard.write_sample((sample * amplitude) as i16).ok();
+                }
+
+                // Emit event to notify frontend that "real" recording started
+                if let Some(app) = &self.app_handle {
+                    let _ = app.emit("auto-recording-triggered", ());
+                }
+
+            } else {
+                // Still waiting, do not write to file.
+                return; 
+            }
+        }
+
+        // Standard Recording Logic (Active or Hangover)
        let time_since_speech = self.total_processed_samples.saturating_sub(self.last_speech_time);
        
        if self.is_speech_active || time_since_speech < self.hangover_samples {
-             // We are recording!
-             // Check if we just started (transition)
-             // Ideally we dump the ring buffer here if we just switched state.
-             // Implementing perfect ring buffer dump is complex (need to track state changes better).
-             // MVP: Just Write Current Data if in state.
-             
-             // Improvement: If we are in hangover, we just write.
-             // If we just detected speech (was not speech?), dump ring buffer? 
-             // We'd need to know if we 'wrote' the ring buffer already.
-             
-             // Simple Logic: just write all incoming data if (Now - LastSpeech < Hangover)
-             
             let mut guard = self.writer.lock().unwrap();
             for &sample in data {
                 let amplitude = i16::MAX as f32;
@@ -1,4 +1,9 @@
-use tauri::{AppHandle, Manager, State, Emitter};
+use tauri::{
+    AppHandle, Manager, State, Emitter, 
+    menu::{Menu, MenuItem},
+    tray::{TrayIconBuilder, TrayIconEvent},
+    WindowEvent
+};
 use std::sync::{Arc, Mutex};
 use std::process::Command;
 use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
@@ -65,7 +70,7 @@ fn get_input_devices() -> Result<Vec<AudioDevice>, String> {


 #[tauri::command]
-fn start_recording(app: AppHandle, state: State<'_, AppState>, device_id: String, save_path: Option<String>, custom_filename: Option<String>) -> Result<(), String> {
+fn start_recording(app: AppHandle, state: State<'_, AppState>, device_id: String, save_path: Option<String>, custom_filename: Option<String>, wait_for_speech: Option<bool>) -> Result<(), String> {
    emit_log(&app, "INFO", &format!("Starting recording on device: {}", device_id));
    let host = cpal::default_host();
    
@@ -77,16 +82,17 @@ fn start_recording(app: AppHandle, state: State<'_, AppState>, device_id: String
        .or_else(|| host.default_input_device())
        .ok_or("No input device found")?;

-    let config = device.default_input_config().map_err(|e| e.to_string())?;
-    
-    // VAD requires 16Hz or 8kHz, typically. Silero likes 16k.
-    // We might need to resample or just check if the device supports it. 
-    // For MVP VAD, we will try to stick to standard rates.
-    // Actually, simple energy VAD is easier to start with if Silero is too heavy or requires ONNX runtime.
-    // Let's check the crate docs or usage first. 
-    // Wait, the user wants to IGNORE music. Energy VAD will fail on music.
-    // voice_activity_detector crate usually uses Silero or similar.
-    
+    // Select the configuration with the MAXIMUM number of channels
+    // This is crucial for "Hearbit Audio" (Aggregate) which lists 3 channels but might default to 2.
+    // We want the raw 3 channels to separate Mic (Ch0) from System (Ch1+2).
+    let supported_configs = device.supported_input_configs().map_err(|e| e.to_string())?;
+    let config = supported_configs
+        .max_by_key(|c| c.channels())
+        .map(|c| c.with_max_sample_rate())
+        .ok_or("No supported input configurations found")?;
+
+    emit_log(&app, "INFO", &format!("Selected Audio Config: {} Channels, {} Hz", config.channels(), config.sample_rate()));
+
    let spec = hound::WavSpec {
        channels: config.channels(),
        sample_rate: config.sample_rate(),
@@ -122,7 +128,12 @@ fn start_recording(app: AppHandle, state: State<'_, AppState>, device_id: String
    
    // Initialize AudioProcessor (VAD)
    // We pass the writer to it.
-    let processor = AudioProcessor::new(config.sample_rate(), writer.clone(), app.clone())
+    let should_wait = wait_for_speech.unwrap_or(false);
+    if should_wait {
+        emit_log(&app, "INFO", "Recording started in WAITING mode (buffer-only until speech).");
+    }
+
+    let processor = AudioProcessor::new(config.sample_rate(), config.channels(), writer.clone(), app.clone(), should_wait)
        .map_err(|e| format!("Failed to create AudioProcessor: {}", e))?;
        
    // Wrap processor in Arc<Mutex> so we can share/move it into callback
@@ -560,6 +571,62 @@ async fn summarize_text(app: AppHandle, text: String, api_key: String, product_i
    }
 }

+#[derive(serde::Serialize)]
+struct AudioMetadata {
+    duration: f64,
+    size: u64,
+    format: String,
+}
+
+#[tauri::command]
+fn get_audio_metadata(app: AppHandle, file_path: String) -> Result<AudioMetadata, String> {
+    emit_log(&app, "INFO", &format!("Getting metadata for: {}", file_path));
+    
+    // Get file size
+    let metadata = std::fs::metadata(&file_path).map_err(|e| e.to_string())?;
+    let size = metadata.len();
+    
+    // Extract format from extension
+    let path = std::path::Path::new(&file_path);
+    let format = path.extension()
+        .and_then(|e| e.to_str())
+        .unwrap_or("unknown")
+        .to_string();
+    
+    // Get duration via ffprobe (ships with ffmpeg); falls back to 0.0 if ffprobe is missing, fails, or prints a non-numeric duration
+    let duration = match Command::new("ffprobe")
+        .args([
+            "-v", "error",
+            "-show_entries", "format=duration",
+            "-of", "default=noprint_wrappers=1:nokey=1",
+            &file_path
+        ])
+        .output()
+    {
+        Ok(output) => {
+            if output.status.success() {
+                let duration_str = String::from_utf8_lossy(&output.stdout);
+                duration_str.trim().parse::<f64>().unwrap_or(0.0)
+            } else {
+                emit_log(&app, "WARN", "ffprobe failed, duration = 0");
+                0.0
+            }
+        },
+        Err(_) => {
+            emit_log(&app, "WARN", "ffprobe not found, duration = 0");
+            0.0
+        }
+    };
+    
+    emit_log(&app, "SUCCESS", &format!("Metadata: {}s, {} bytes", duration, size));
+    
+    Ok(AudioMetadata {
+        duration,
+        size,
+        format,
+    })
+}
+
 #[tauri::command]
 fn open_audio_midi_setup() -> Result<(), String> {
    Command::new("open")
@@ -640,6 +707,49 @@ async fn read_log_file(app: AppHandle) -> Result<String, String> {
 #[cfg_attr(mobile, tauri::mobile_entry_point)]
 pub fn run() {
    tauri::Builder::default()
+        .setup(|app| {
+            // Setup Tray Icon
+            let quit_i = MenuItem::with_id(app, "quit", "Quit Hearbit AI", true, None::<&str>).unwrap();
+            let show_i = MenuItem::with_id(app, "show", "Show Window", true, None::<&str>).unwrap();
+            let menu = Menu::with_items(app, &[&show_i, &quit_i]).unwrap();
+
+            let _tray = TrayIconBuilder::new()
+                .icon(app.default_window_icon().unwrap().clone())
+                .menu(&menu)
+                .show_menu_on_left_click(true) 
+                .on_menu_event(|app, event| {
+                    match event.id.as_ref() {
+                        "quit" => app.exit(0),
+                        "show" => {
+                            if let Some(window) = app.get_webview_window("main") {
+                                let _ = window.show();
+                                let _ = window.set_focus();
+                            }
+                        }
+                        _ => {}
+                    }
+                })
+                .on_tray_icon_event(|tray, event| {
+                     if let TrayIconEvent::Click { .. } = event {
+                          let app = tray.app_handle();
+                          if let Some(window) = app.get_webview_window("main") {
+                               let _ = window.show();
+                               let _ = window.set_focus();
+                          }
+                     }
+                })
+                .build(app)?;
+
+            Ok(())
+        })
+        .on_window_event(|window, event| {
+            if let WindowEvent::CloseRequested { api, .. } = event {
+                // Prevent window from closing, just hide it
+                window.hide().unwrap();
+                api.prevent_close();
+            }
+        })
+        .plugin(tauri_plugin_shell::init())
        .plugin(tauri_plugin_log::Builder::default()
            .targets([
                tauri_plugin_log::Target::new(tauri_plugin_log::TargetKind::Stdout),
@@ -670,6 +780,7 @@ pub fn run() {
            auth::get_calendar_events,
            save_text_file,
            read_log_file,
+            get_audio_metadata,
            email::send_smtp_email
        ])
        .run(tauri::generate_context!())