// hearbit-ai-app/src-tauri/src/audio_processor.rs
use std::sync::{Arc, Mutex};
use tauri::{AppHandle, Emitter};
use crate::emit_log;
use cpal::Sample;
use hound::WavWriter;
use rubato::{Resampler, FastFixedIn, PolynomialDegree};
use voice_activity_detector::VoiceActivityDetector;
/// VAD-gated audio recorder: mixes microphone input with queued system
/// audio, runs voice-activity detection on a resampled mono mixdown, and
/// writes 16-bit PCM to a shared WAV writer, with a pre-roll ring buffer
/// flushed when recording auto-starts.
pub struct AudioProcessor {
// VAD (runs on 16 kHz mono audio in fixed-size chunks)
vad: VoiceActivityDetector,
vad_chunk_size: usize,
vad_buffer: Vec<f32>,
// Audio Config: interleaved channel count of the microphone stream
channel_count: u16,
// Resampler: input rate -> 16 kHz mono, feeding the VAD
resampler: FastFixedIn<f32>,
resample_input_buffer: Vec<f32>,
// NOTE(review): never read or written in this file — candidate for removal.
resample_output_buffer: Vec<f32>,
// State
is_speech_active: bool,
last_speech_time: u64, // In interleaved elements (frames * channels) — same unit as total_processed_samples
hangover_samples: u64, // Keep writing this long after the last detected activity (same unit)
// Waiting Mode: while true, nothing is written until a trigger fires
waiting_for_speech: bool,
// Ring Buffer (pre-roll, stores interleaved samples; ring_pos is the next write index)
ring_buffer: Vec<f32>,
ring_pos: usize,
ring_size: usize,
// Output: shared 16-bit PCM WAV sink
writer: Arc<Mutex<WavWriter<std::io::BufWriter<std::fs::File>>>>,
sample_rate: u32,
total_processed_samples: u64,
// Event Emission: last_event_time throttles "vad-event" emits
app_handle: Option<AppHandle>,
last_event_time: std::time::Instant,
// System Audio Queue for Mixing — pushed as interleaved stereo (L, R, ...) by sc_audio.rs
pub system_queue: Arc<Mutex<std::collections::VecDeque<f32>>>,
// Recording Mode ("voice" or "meeting")
recording_mode: String,
}
impl AudioProcessor {
    /// Builds a processor that records VAD-gated audio to `writer`.
    ///
    /// * `sample_rate` / `channel_count` — format of the microphone stream.
    /// * `writer` — shared 16-bit PCM WAV sink.
    /// * `app_handle` — used to emit "vad-event" / "auto-recording-triggered".
    /// * `wait_for_speech` — when true, nothing is written until a
    ///   mode-specific trigger fires (pre-roll is then flushed).
    /// * `recording_mode` — "voice" (trigger on mic speech) or "meeting"
    ///   (trigger on system-audio energy).
    ///
    /// Returns `Err` with a message if the VAD or resampler fails to build.
    pub fn new(
        sample_rate: u32,
        channel_count: u16,
        writer: Arc<Mutex<WavWriter<std::io::BufWriter<std::fs::File>>>>,
        app_handle: AppHandle,
        wait_for_speech: bool,
        recording_mode: String,
    ) -> Result<Self, String> {
        // The VAD expects 16 kHz mono audio in 512-sample chunks.
        let vad_sample_rate = 16000;
        let vad_chunk_size = 512;
        let vad = VoiceActivityDetector::builder()
            .sample_rate(vad_sample_rate as u32)
            .chunk_size(vad_chunk_size)
            .build()
            .map_err(|e| format!("Failed to init VAD: {:?}", e))?;
        // Resampler: input rate -> 16 kHz, mono, fixed 1024-frame input blocks.
        let resampler = FastFixedIn::<f32>::new(
            16000.0 / sample_rate as f64,
            1.0,
            PolynomialDegree::Septic,
            1024,
            1,
        )
        .map_err(|e| format!("Failed to init Resampler: {:?}", e))?;
        // Pre-roll buffer (3.0 seconds). WavWriter writes interleaved samples,
        // so the ring buffer stores interleaved elements (frames * channels).
        let ring_curr_seconds = 3.0;
        let ring_size =
            (sample_rate as f32 * ring_curr_seconds) as usize * channel_count as usize;
        Ok(Self {
            vad,
            vad_chunk_size,
            vad_buffer: Vec::new(),
            channel_count,
            resampler,
            resample_input_buffer: Vec::new(),
            resample_output_buffer: Vec::new(),
            is_speech_active: false,
            last_speech_time: 0,
            // `total_processed_samples` counts interleaved elements
            // (frames * channels), so the 1.5 s hangover is expressed in the
            // same unit to keep the comparison in `process` consistent.
            hangover_samples: (sample_rate as f32 * 1.5 * channel_count as f32) as u64,
            waiting_for_speech: wait_for_speech,
            ring_buffer: vec![0.0; ring_size],
            ring_pos: 0,
            ring_size,
            writer,
            sample_rate,
            total_processed_samples: 0,
            app_handle: Some(app_handle),
            last_event_time: std::time::Instant::now(),
            system_queue: Arc::new(Mutex::new(std::collections::VecDeque::new())),
            recording_mode,
        })
    }

    /// Processes one microphone callback buffer (interleaved f32 samples):
    /// mixes queued system audio in, feeds a mono mixdown through the
    /// resampler into the VAD, and writes gated audio to the WAV file.
    pub fn process(&mut self, input_data: &[f32]) {
        // MIXING:
        // `input_data` is the microphone. `system_queue` carries system audio,
        // pushed as interleaved stereo (L, R, L, R, ...) by sc_audio.rs.
        // The microphone itself may be mono or multi-channel.
        let mic_channels = self.channel_count as usize;
        let mut mixed_data = input_data.to_vec();
        let mut max_system_energy = 0.0;
        let gain_mic = 1.0;
        let gain_sys = 0.8; // Slightly lower system audio to prioritize speaker
        if let Ok(mut queue) = self.system_queue.lock() {
            let frames = mixed_data.len() / mic_channels;
            for f in 0..frames {
                // system_queue is always stereo (L, R, L, R...)
                if let (Some(l), Some(r)) = (queue.pop_front(), queue.pop_front()) {
                    // Track the loudest system sample seen in this buffer; it
                    // drives the "system is active" gates below.
                    let current_sys_max = l.abs().max(r.abs());
                    if current_sys_max > max_system_energy {
                        max_system_energy = current_sys_max;
                    }
                    if mic_channels == 1 {
                        // Mic is mono: mix system L+R down to mono.
                        let sys_mono = (l + r) / 2.0;
                        let mixed = (mixed_data[f] * gain_mic) + (sys_mono * gain_sys);
                        mixed_data[f] = mixed.clamp(-1.0, 1.0);
                    } else {
                        // Mic has >= 2 channels: mix system L/R into the first
                        // two channels of the frame. BUGFIX: was `f * 2`,
                        // which writes the wrong interleaved positions for any
                        // mic with more than two channels.
                        let f_start = f * mic_channels;
                        let mixed_l = (mixed_data[f_start] * gain_mic) + (l * gain_sys);
                        let mixed_r = (mixed_data[f_start + 1] * gain_mic) + (r * gain_sys);
                        mixed_data[f_start] = mixed_l.clamp(-1.0, 1.0);
                        mixed_data[f_start + 1] = mixed_r.clamp(-1.0, 1.0);
                    }
                }
            }
        }
        let data = &mixed_data;
        // 1. Append everything (speech or not) to the pre-roll ring buffer.
        for &sample in data {
            self.ring_buffer[self.ring_pos] = sample;
            self.ring_pos = (self.ring_pos + 1) % self.ring_size;
        }
        // 2. Mono mixdown for the VAD (average of the first two channels).
        let channels = self.channel_count as usize;
        let frame_count = data.len() / channels;
        let mut vad_input_chunk = Vec::with_capacity(frame_count);
        for i in 0..frame_count {
            let frame_start = i * channels;
            let mix_sample = if channels >= 2 {
                // Stereo (or more) -> average L + R
                (data[frame_start] + data[frame_start + 1]) / 2.0
            } else {
                // Mono -> take as is
                data[frame_start]
            };
            vad_input_chunk.push(mix_sample);
        }
        // 3. Resample the mixdown to 16 kHz for the VAD.
        self.resample_input_buffer.extend_from_slice(&vad_input_chunk);
        let needed = self.resampler.input_frames_next();
        while self.resample_input_buffer.len() >= needed {
            let chunk: Vec<f32> = self.resample_input_buffer.drain(0..needed).collect();
            let waves_in = vec![chunk];
            // Output capacity from the resampling ratio, padded for rounding.
            let out_capacity =
                (needed as f64 * (16000.0 / self.sample_rate as f64)).ceil() as usize + 10;
            let mut waves_out = vec![vec![0.0; out_capacity]; 1];
            if let Ok((_in_len, out_len)) =
                self.resampler.process_into_buffer(&waves_in, &mut waves_out, None)
            {
                if out_len > 0 {
                    self.vad_buffer.extend_from_slice(&waves_out[0][0..out_len]);
                }
            }
        }
        // 4. Run the VAD over every complete chunk buffered so far.
        while self.vad_buffer.len() >= self.vad_chunk_size {
            let vad_chunk: Vec<f32> = self.vad_buffer.drain(0..self.vad_chunk_size).collect();
            // `predict` takes the samples by value; no clone needed.
            let probability = self.vad.predict(vad_chunk);
            let system_is_active = max_system_energy > 0.005; // Lowered to match trigger
            let is_speech = probability > 0.9;
            if is_speech || system_is_active {
                self.is_speech_active = true;
                self.last_speech_time = self.total_processed_samples;
            }
            // Emit a VAD event at most every 500 ms. Emitting also clears
            // `is_speech_active`, so the flag decays unless re-asserted above.
            if self.last_event_time.elapsed().as_millis() > 500 {
                if let Some(app) = &self.app_handle {
                    #[derive(Clone, serde::Serialize)]
                    struct VadEvent {
                        is_speech: bool,
                        probability: f32,
                    }
                    let _ = app.emit("vad-event", VadEvent {
                        probability,
                        is_speech: self.is_speech_active,
                    });
                }
                self.last_event_time = std::time::Instant::now();
                self.is_speech_active = false;
            }
        }
        // 5. Auto-start gate: while waiting, nothing is written until a
        // mode-specific trigger fires; the pre-roll buffer is then flushed.
        if self.waiting_for_speech {
            // TRIGGER CONDITION:
            // "voice"   -> VAD detected speech (someone is talking).
            // "meeting" -> system audio energy above threshold (call started).
            // Threshold 0.005 (~-46 dB) covers ringtones/speech while
            // ignoring silence and hiss. (Comment previously said 0.01,
            // which did not match the code.)
            let system_active = max_system_energy > 0.005;
            // Periodically log energy to help debug why meeting mode might not start.
            // NOTE(review): `last_event_time` is reset every ~500 ms by the VAD
            // emit above, so this 2000 ms gate may rarely fire — consider a
            // dedicated timer field if this log is needed.
            if self.last_event_time.elapsed().as_millis() > 2000 && self.recording_mode == "meeting" {
                if let Some(app) = &self.app_handle {
                    emit_log(app, "DEBUG", &format!("Waiting for Meeting... Current System Energy: {:.4} (Threshold: 0.005)", max_system_energy));
                }
            }
            let trigger = if self.recording_mode == "voice" {
                self.is_speech_active
            } else {
                system_active
            };
            if trigger {
                println!("Auto-Start: Call detected (SysEnergy: {}). Flushing pre-roll...", max_system_energy);
                self.waiting_for_speech = false;
                // Flush the ring buffer in chronological order: oldest part
                // (ring_pos..end) first, then the wrapped part (0..ring_pos).
                let mut guard = self.writer.lock().unwrap();
                let amplitude = i16::MAX as f32;
                for i in (self.ring_pos..self.ring_size).chain(0..self.ring_pos) {
                    guard.write_sample((self.ring_buffer[i] * amplitude) as i16).ok();
                }
                // Notify the frontend that "real" recording started.
                if let Some(app) = &self.app_handle {
                    let _ = app.emit("auto-recording-triggered", ());
                }
            } else {
                // Still waiting; do not write to file.
                return;
            }
        }
        // 6. Standard recording: write while speech or system audio is active,
        // or while still inside the hangover window after the last activity.
        let time_since_speech = self.total_processed_samples.saturating_sub(self.last_speech_time);
        let system_is_active = max_system_energy > 0.005;
        if self.is_speech_active || system_is_active || time_since_speech < self.hangover_samples {
            let mut guard = self.writer.lock().unwrap();
            // Hoisted out of the loop: f32 -> 16-bit PCM conversion scale.
            let amplitude = i16::MAX as f32;
            for &sample in data {
                guard.write_sample((sample * amplitude) as i16).ok();
            }
        }
        self.total_processed_samples += data.len() as u64;
    }
}