feat: improve audio exclusion with smart matching and reliability fixes

This commit is contained in:
michael.borak
2026-01-25 16:37:58 +01:00
parent 69dc6b8fac
commit de504fbcb4
15 changed files with 2720 additions and 306 deletions

32
src-tauri/Cargo.lock generated
View File

@@ -629,6 +629,35 @@ dependencies = [
"stacker",
]
[[package]]
name = "cocoa"
version = "0.26.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad36507aeb7e16159dfe68db81ccc27571c3ccd4b76fb2fb72fc59e7a4b1b64c"
dependencies = [
"bitflags 2.10.0",
"block",
"cocoa-foundation",
"core-foundation 0.10.1",
"core-graphics",
"foreign-types 0.5.0",
"libc",
"objc",
]
[[package]]
name = "cocoa-foundation"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81411967c50ee9a1fc11365f8c585f863a22a9697c89239c452292c40ba79b0d"
dependencies = [
"bitflags 2.10.0",
"block",
"core-foundation 0.10.1",
"core-graphics-types",
"objc",
]
[[package]]
name = "combine"
version = "4.6.7"
@@ -1749,10 +1778,13 @@ version = "1.2.1"
dependencies = [
"base64 0.22.1",
"chrono",
"cocoa",
"core-foundation 0.10.1",
"cpal",
"hound",
"lettre",
"oauth2",
"objc",
"reqwest 0.11.27",
"rubato",
"screencapturekit",

View File

@@ -40,3 +40,6 @@ tauri-plugin-shell = "2.3.4"
base64 = "0.22"
screencapturekit = "0.2.0"
screencapturekit-sys = "0.2.8"
cocoa = "0.26.1"
objc = "0.2.7"
core-foundation = "0.10.1"

65
src-tauri/src/apps.rs Normal file
View File

@@ -0,0 +1,65 @@
use cocoa::base::{id, nil};
use cocoa::foundation::{NSArray, NSAutoreleasePool, NSString};
use objc::{msg_send, sel, sel_impl};
use serde::Serialize;
/// A running application, serialized for the frontend.
#[derive(Serialize, Debug)]
pub struct RunningApp {
/// Display name, taken from the application's `localizedName`.
pub name: String,
/// Bundle identifier, taken from the application's `bundleIdentifier`.
pub bundle_id: String,
}
/// Lists running applications whose activation policy is "regular" (value 0).
///
/// Queries `[[NSWorkspace sharedWorkspace] runningApplications]` and keeps only
/// entries that report both a bundle identifier and a localized name. The result
/// is sorted case-insensitively by name.
///
/// Returns an empty list when the `NSWorkspace` class cannot be resolved instead
/// of panicking.
pub fn get_running_applications() -> Vec<RunningApp> {
    let mut apps = Vec::new();

    // Resolve the class before creating the autorelease pool so that the early
    // return below cannot skip the pool release.
    let workspace_class = match objc::runtime::Class::get("NSWorkspace") {
        Some(class) => class,
        None => return apps,
    };

    // SAFETY: every receiver below is either an object handed back by AppKit or
    // nil; the strings are copied into owned `String`s before the pool is released.
    unsafe {
        let pool = NSAutoreleasePool::new(nil);

        // [NSWorkspace sharedWorkspace]
        let shared_workspace: id = msg_send![workspace_class, sharedWorkspace];
        // [sharedWorkspace runningApplications]
        let running_apps: id = msg_send![shared_workspace, runningApplications];
        let count: usize = msg_send![running_apps, count];

        for i in 0..count {
            let app: id = msg_send![running_apps, objectAtIndex: i];

            // activationPolicy: 0 = regular, 1 = accessory, 2 = prohibited.
            // Only regular GUI applications are reported.
            let policy: isize = msg_send![app, activationPolicy];
            if policy != 0 {
                continue;
            }

            let bundle_id_ns: id = msg_send![app, bundleIdentifier];
            let name_ns: id = msg_send![app, localizedName];
            if bundle_id_ns != nil && name_ns != nil {
                let bundle_id = nsstring_to_string(bundle_id_ns);
                let name = nsstring_to_string(name_ns);
                apps.push(RunningApp { name, bundle_id });
            }
        }

        // Release the pool created above so objects autoreleased during this
        // call do not accumulate.
        let _: () = msg_send![pool, release];
    }

    // Sort by name for a better UI experience; the lowercase key is computed
    // once per element rather than on every comparison.
    apps.sort_by_cached_key(|app| app.name.to_lowercase());
    apps
}
/// Copies the UTF-8 contents of an Objective-C string object into an owned `String`.
///
/// Returns an empty string when `UTF8String` yields a null pointer. Bytes that
/// are not valid UTF-8 are replaced lossily.
///
/// # Safety
/// `ns_string` must be an object that responds to `UTF8String` with a
/// NUL-terminated C string (or null).
unsafe fn nsstring_to_string(ns_string: id) -> String {
    let utf8: *const std::os::raw::c_char = msg_send![ns_string, UTF8String];
    if utf8.is_null() {
        String::new()
    } else {
        let c_str = std::ffi::CStr::from_ptr(utf8);
        c_str.to_string_lossy().into_owned()
    }
}

File diff suppressed because it is too large Load Diff

984
src-tauri/src/lib.rs.bak Normal file
View File

@@ -0,0 +1,984 @@
use tauri::{
AppHandle, Manager, State, Emitter,
menu::{Menu, MenuItem},
tray::{TrayIconBuilder, TrayIconEvent},
WindowEvent
};
use std::sync::{Arc, Mutex};
use std::process::Command;
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
use std::time::Duration;
use tokio::time::sleep;
use base64::Engine;
mod audio_processor;
use audio_processor::AudioProcessor;
mod auth;
mod email;
mod sc_audio;
/// Shared state for the recording commands, registered with Tauri via `.manage(...)`.
struct AppState {
/// The active cpal input stream; `None` when nothing is being recorded.
recording_stream: Mutex<Option<cpal::Stream>>,
/// Path of the WAV file for the current recording, set by `start_recording`
/// and taken by `stop_recording`.
recording_file_path: Mutex<Option<String>>,
/// The system-audio capture started alongside the input stream, if any.
system_capture: Mutex<Option<sc_audio::SystemAudioCapture>>,
}
/// An audio input device as returned by `get_input_devices`.
#[derive(serde::Serialize)]
struct AudioDevice {
/// Device identifier; `get_input_devices` fills this with the device name.
id: String,
/// Device name as reported by cpal.
name: String,
}
/// Payload of the `log-event` event emitted by `emit_log`.
#[derive(serde::Serialize, Clone)]
struct LogEvent {
/// Severity label supplied by the caller (e.g. "INFO", "ERROR").
level: String,
/// Human-readable log text.
message: String,
/// Local time of emission, formatted as `HH:MM:SS`.
timestamp: String,
}
/// Emits a `log-event` carrying `level`, `message` and the current local time
/// (`HH:MM:SS`) through the given app handle.
///
/// A failure to emit is ignored.
pub(crate) fn emit_log(app: &AppHandle, level: &str, message: &str) {
    let timestamp = chrono::Local::now().format("%H:%M:%S").to_string();
    let payload = LogEvent {
        level: level.to_owned(),
        message: message.to_owned(),
        timestamp,
    };
    // Logging is best-effort: the result of `emit` is deliberately discarded.
    let _ = app.emit("log-event", payload);
}
/// Returns a greeting that embeds `name`.
#[tauri::command]
fn greet(name: &str) -> String {
    let greeting = format!("Hello, {}! You've been greeted from Rust!", name);
    greeting
}
/// Lists the input devices of the default cpal host.
///
/// Devices whose name cannot be read are skipped. The device name is used as
/// both `id` and `name`.
///
/// # Errors
/// Returns the stringified cpal error when the devices cannot be enumerated.
#[tauri::command]
fn get_input_devices() -> Result<Vec<AudioDevice>, String> {
    let host = cpal::default_host();
    let devices = host.input_devices().map_err(|e| e.to_string())?;

    // The name doubles as the ID for simplicity in this MVP.
    #[allow(deprecated)]
    let listed: Vec<AudioDevice> = devices
        .filter_map(|device| device.name().ok())
        .map(|name| AudioDevice {
            id: name.clone(),
            name,
        })
        .collect();

    Ok(listed)
}
/// Starts recording from an input device into a 16-bit integer WAV file.
///
/// * `device_id` – device name to record from; the default input device is used
///   when no device with that name exists.
/// * `save_path` – target directory; the system temp directory is used when it
///   is `None` or blank.
/// * `custom_filename` – base file name (sanitized, `.wav` appended); a
///   timestamped name is generated when `None`.
/// * `wait_for_speech` – forwarded to `AudioProcessor::new`; defaults to `false`.
/// * `mode` – forwarded to `AudioProcessor::new`; `"voice"` is reported as the
///   "Speech" trigger in the log, anything else as "System Audio".
///
/// Unless the device name contains "Hearbit" or "BlackHole", a system-audio
/// capture is started as well and its samples are pushed into the processor's
/// `system_queue`. The stream, capture and file path are stored in `AppState`.
///
/// # Errors
/// Returns a message when no device or configuration is available, the WAV file
/// or `AudioProcessor` cannot be created, the sample format is unsupported, or
/// the stream cannot be built or started.
#[tauri::command]
async fn start_recording(
    app: AppHandle,
    state: State<'_, AppState>,
    device_id: String,
    save_path: Option<String>,
    custom_filename: Option<String>,
    wait_for_speech: Option<bool>,
    mode: String,
) -> Result<(), String> {
    emit_log(&app, "INFO", &format!("Starting recording [Mode: {}] on device: {}", mode, device_id));

    let host = cpal::default_host();

    // Find device by name (the name is used as the ID); fall back to the default input.
    #[allow(deprecated)]
    let device = host.input_devices()
        .map_err(|e| e.to_string())?
        .find(|d| d.name().map(|n| n == device_id).unwrap_or(false))
        .or_else(|| host.default_input_device())
        .ok_or("No input device found")?;

    // Select the audio configuration.
    // 48 kHz is preferred because System Audio (ScreenCaptureKit) works best at 48k.
    let supported_configs: Vec<_> = device.supported_input_configs().map_err(|e| e.to_string())?.collect();

    let config = supported_configs.iter()
        .find(|c| c.min_sample_rate() <= 48000 && c.max_sample_rate() >= 48000)
        .map(|c| c.with_sample_rate(48000))
        .or_else(|| {
            // Fallback: the configuration with the most channels, at its maximum
            // sample rate (aggregate devices may expose more channels than the default).
            supported_configs.iter()
                .max_by_key(|c| c.channels())
                .map(|c| c.with_max_sample_rate())
        })
        .ok_or("No supported input configurations found")?;

    emit_log(&app, "INFO", &format!("Selected Audio Config: {} Channels, {} Hz", config.channels(), config.sample_rate()));

    let spec = hound::WavSpec {
        channels: config.channels(),
        sample_rate: config.sample_rate(),
        bits_per_sample: 16,
        sample_format: hound::SampleFormat::Int,
    };

    // Determine the file name: user provided (sanitized) or timestamped.
    let filename = if let Some(name) = custom_filename {
        let safe_name: String = name.chars().map(|x| if x.is_alphanumeric() || x == ' ' || x == '-' || x == '_' { x } else { '_' }).collect();
        format!("{}.wav", safe_name)
    } else {
        format!("recording_{}.wav", std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH).unwrap().as_secs())
    };

    // Determine the directory: user provided, or the temp dir when absent/blank.
    let file_path = if let Some(path) = save_path {
        if path.trim().is_empty() {
            std::env::temp_dir().join(&filename)
        } else {
            std::path::PathBuf::from(path).join(&filename)
        }
    } else {
        std::env::temp_dir().join(&filename)
    };

    let file_path_str = file_path.to_string_lossy().to_string();
    emit_log(&app, "INFO", &format!("Saving recording to: {}", file_path_str));

    let writer = hound::WavWriter::create(&file_path, spec).map_err(|e| e.to_string())?;
    let writer = Arc::new(Mutex::new(writer));

    // Initialize the AudioProcessor; it receives the shared WAV writer.
    let should_wait = wait_for_speech.unwrap_or(false);
    if should_wait {
        emit_log(&app, "INFO", &format!("Recording started in WAITING mode (Trigger: {}).", if mode == "voice" { "Speech" } else { "System Audio" }));
    }

    let processor = AudioProcessor::new(config.sample_rate(), config.channels(), writer, app.clone(), should_wait, mode)
        .map_err(|e| format!("Failed to create AudioProcessor: {}", e))?;

    // The stream callback is invoked repeatedly and needs mutable access to the
    // processor, so it is shared behind Arc<Mutex<..>>.
    let processor = Arc::new(Mutex::new(processor));
    let processor_clone = processor.clone();

    // --- SYSTEM AUDIO CAPTURE START ---
    // Prevent doubling: an aggregate device (Hearbit Audio/BlackHole) ALREADY
    // contains system audio, so the internal capture is skipped for it.
    let is_aggregate = device_id.contains("Hearbit") || device_id.contains("BlackHole");

    if is_aggregate {
        emit_log(&app, "INFO", "Aggregate device detected. Disabling internal System Audio Capture to prevent doubling.");
    } else {
        let mut sys_capture = sc_audio::SystemAudioCapture::new(config.sample_rate());

        // Queue shared between the capture callback and the processor.
        let queue_clone = {
            let p = processor.lock().unwrap();
            p.system_queue.clone()
        };

        // Cap the queue at five seconds' worth of samples at the negotiated rate
        // so it cannot grow without bound if the consumer is slow.
        let max_queued_samples = config.sample_rate() as usize * 5;

        let sys_callback = move |data: &[f32]| {
            if let Ok(mut q) = queue_clone.lock() {
                q.extend(data.iter());
                while q.len() > max_queued_samples {
                    q.pop_front();
                }
            }
        };

        match sys_capture.start(sys_callback).await {
            Ok(_) => emit_log(&app, "INFO", "System Audio Capture started."),
            Err(e) => emit_log(&app, "WARN", &format!("System Audio Capture failed (Permissions?): {}", e)),
        }

        *state.system_capture.lock().unwrap() = Some(sys_capture);
    }
    // --- SYSTEM AUDIO CAPTURE END ---

    let app_handle = app.clone();
    let err_fn = move |err| {
        eprintln!("an error occurred on stream: {}", err);
        emit_log(&app_handle, "ERROR", &format!("Stream error: {}", err));
    };

    // The processor consumes f32 samples; integer formats are converted first.
    let stream = match config.sample_format() {
        cpal::SampleFormat::F32 => device.build_input_stream(
            &config.into(),
            move |data: &[f32], _: &_| {
                if let Ok(mut p) = processor_clone.lock() {
                    p.process(data);
                }
            },
            err_fn,
            None
        ),
        cpal::SampleFormat::I16 => device.build_input_stream(
            &config.into(),
            move |data: &[i16], _: &_| {
                // Convert i16 to f32
                let f32_data: Vec<f32> = data.iter().map(|&s| s as f32 / i16::MAX as f32).collect();
                if let Ok(mut p) = processor_clone.lock() {
                    p.process(&f32_data);
                }
            },
            err_fn,
            None
        ),
        cpal::SampleFormat::U16 => device.build_input_stream(
            &config.into(),
            move |data: &[u16], _: &_| {
                // Convert u16 to f32 (re-centre around zero first)
                let f32_data: Vec<f32> = data.iter().map(|&s| (s as i32 - 32768) as f32 / 32768.0).collect();
                if let Ok(mut p) = processor_clone.lock() {
                    p.process(&f32_data);
                }
            },
            err_fn,
            None
        ),
        _ => return Err("Unsupported sample format".to_string()),
    }.map_err(|e| e.to_string())?;

    stream.play().map_err(|e| e.to_string())?;

    // Store state so the stop/pause/resume commands can find the stream.
    *state.recording_stream.lock().unwrap() = Some(stream);
    *state.recording_file_path.lock().unwrap() = Some(file_path_str.clone());

    emit_log(&app, "SUCCESS", &format!("Recording started. File: {}", file_path_str));
    Ok(())
}
/// Stops the current recording and returns the path of the recorded file.
///
/// Any system-audio capture is stopped and cleared first, then the input
/// stream is dropped (which ends the recording).
///
/// # Errors
/// Returns `"Not recording"` when no stream is active and
/// `"No recording path found"` when no file path was stored.
#[tauri::command]
fn stop_recording(app: AppHandle, state: State<'_, AppState>) -> Result<String, String> {
    emit_log(&app, "INFO", "Stopping recording...");

    {
        let mut stream_slot = state.recording_stream.lock().unwrap();
        let mut capture_slot = state.system_capture.lock().unwrap();

        // The system capture is torn down even when no input stream is active.
        if let Some(capture) = capture_slot.as_mut() {
            capture.stop();
        }
        *capture_slot = None;

        match stream_slot.take() {
            // Dropping the stream stops the recording.
            Some(active_stream) => drop(active_stream),
            None => return Err("Not recording".to_string()),
        }
    }

    let recorded_path = state
        .recording_file_path
        .lock()
        .unwrap()
        .take()
        .ok_or_else(|| "No recording path found".to_string())?;

    emit_log(&app, "SUCCESS", &format!("Recording stopped. Saved to: {}", recorded_path));
    Ok(recorded_path)
}
/// Pauses the active input stream.
///
/// # Errors
/// Returns `"Not recording"` when no stream is active, or the stringified
/// stream error when pausing fails.
#[tauri::command]
fn pause_recording(app: AppHandle, state: State<'_, AppState>) -> Result<(), String> {
    emit_log(&app, "INFO", "Pausing recording...");

    let guard = state.recording_stream.lock().unwrap();
    match guard.as_ref() {
        None => Err("Not recording".to_string()),
        Some(active) => {
            active.pause().map_err(|e| e.to_string())?;
            emit_log(&app, "SUCCESS", "Recording paused.");
            Ok(())
        }
    }
}
/// Resumes the active input stream.
///
/// # Errors
/// Returns `"Not recording"` when no stream is active, or the stringified
/// stream error when resuming fails.
#[tauri::command]
fn resume_recording(app: AppHandle, state: State<'_, AppState>) -> Result<(), String> {
    emit_log(&app, "INFO", "Resuming recording...");

    let guard = state.recording_stream.lock().unwrap();
    match guard.as_ref() {
        None => Err("Not recording".to_string()),
        Some(active) => {
            active.play().map_err(|e| e.to_string())?;
            emit_log(&app, "SUCCESS", "Recording resumed.");
            Ok(())
        }
    }
}
/// JSON body of the model-listing endpoint, as parsed by `get_available_models`.
#[derive(serde::Deserialize)]
struct ModelListResponse {
/// The listed models.
data: Vec<ModelData>,
}
/// One entry of `ModelListResponse::data`.
#[derive(serde::Deserialize)]
struct ModelData {
/// Model identifier; also used as the display name by `get_available_models`.
id: String,
// Deserialized but not read anywhere in the visible code, hence the allow.
#[allow(dead_code)]
owned_by: Option<String>,
}
// Structs for Infomaniak API responses

/// Transcription response carrying either a direct text or a batch identifier.
// NOTE(review): not referenced in the visible part of this file — confirm it is still needed.
#[derive(serde::Deserialize)]
struct WhisperResponse {
/// Transcribed text, when present.
text: Option<String>,
/// Batch identifier, when present.
batch_id: Option<String>,
}
/// JSON body of the chat-completions endpoint, as parsed by `summarize_text`.
#[derive(serde::Deserialize)]
struct ChatCompletionResponse {
/// Returned completions; `summarize_text` uses only the first one.
choices: Vec<Choice>,
}
/// One entry of `ChatCompletionResponse::choices`.
#[derive(serde::Deserialize)]
struct Choice {
/// The message produced for this choice.
message: Message,
}
/// The message carried by a `Choice`.
#[derive(serde::Deserialize)]
struct Message {
// NOTE(review): `content` is read by `summarize_text`, so this allow looks unnecessary — confirm.
#[allow(dead_code)]
/// Text content of the message.
content: String,
}
/// A model entry returned to the frontend by `get_available_models`.
#[derive(serde::Serialize)]
struct ModelInfo {
/// Model identifier.
id: String,
/// Display name; currently filled with the same value as `id`.
name: String,
}
/// Fetches the model list for `product_id` and returns the entries that pass
/// the name filter.
///
/// Models whose lowercased id contains `mini_lm`, `bert` or `embedding` are
/// dropped. The model id is used for both `id` and `name`.
///
/// # Errors
/// Returns a message on network failure, a non-success HTTP status, or a body
/// that cannot be parsed as a model list.
#[tauri::command]
async fn get_available_models(app: AppHandle, api_key: String, product_id: String) -> Result<Vec<ModelInfo>, String> {
    // Substrings that disqualify a model from the returned list.
    const EXCLUDED_MARKERS: [&str; 3] = ["mini_lm", "bert", "embedding"];

    emit_log(&app, "INFO", "Fetching available models from Infomaniak...");
    let client = reqwest::Client::new();

    // Use the v2/openai compliant endpoint as per docs
    let url = format!("https://api.infomaniak.com/2/ai/{}/openai/v1/models", product_id);
    emit_log(&app, "DEBUG", &format!("GET {}", url));

    let res = client
        .get(&url)
        .header("Authorization", format!("Bearer {}", api_key))
        .send()
        .await
        .map_err(|e| {
            let msg = format!("Network error fetching models: {}", e);
            emit_log(&app, "ERROR", &msg);
            msg
        })?;

    // Handle the failure case first so the happy path reads straight down.
    if !res.status().is_success() {
        let err = res.text().await.unwrap_or_default();
        emit_log(&app, "ERROR", &format!("Failed to fetch models: {}", err));
        return Err(format!("Failed to fetch models: {}", err));
    }

    let raw_body = res.text().await.map_err(|e| e.to_string())?;
    let list: ModelListResponse = serde_json::from_str(&raw_body)
        .map_err(|e| format!("Failed to parse models: {}. Body: {}", e, raw_body))?;

    let mut models: Vec<ModelInfo> = Vec::new();
    for entry in list.data {
        let lowered = entry.id.to_lowercase();
        if EXCLUDED_MARKERS.iter().any(|marker| lowered.contains(*marker)) {
            continue;
        }
        models.push(ModelInfo {
            id: entry.id.clone(),
            name: entry.id, // The id doubles as the display name for now.
        });
    }

    emit_log(&app, "SUCCESS", &format!("Loaded {} models.", models.len()));
    Ok(models)
}
/// Transcription result parsed by `transcribe_audio` and `poll_transcription`.
#[derive(serde::Deserialize)]
struct WhisperVerboseResponse {
/// Full transcript text; used as a fallback when no usable segments exist.
text: Option<String>,
/// Timestamped segments, when the response includes them.
segments: Option<Vec<Segment>>,
}
/// One timestamped segment of a transcription.
#[derive(serde::Deserialize)]
struct Segment {
/// Segment start; callers divide by 60 to format it as `[MM:SS]`, i.e. treat it as seconds.
start: f64,
/// Segment end. Not read in the visible code.
end: f64,
/// Text of the segment.
text: String,
}
#[tauri::command]
async fn transcribe_audio(app: AppHandle, file_path: String, api_key: String, product_id: String) -> Result<String, String> {
emit_log(&app, "INFO", "Starting transcription with timestamps...");
let client = reqwest::Client::new();
// Prepare file part
let file_bytes = std::fs::read(&file_path).map_err(|e| e.to_string())?;
// We must use a known file name for the part, Infomaniak might care, or not.
let file_part = reqwest::multipart::Part::bytes(file_bytes)
.file_name("recording.wav")
.mime_str("audio/wav")
.map_err(|e| e.to_string())?;
let form = reqwest::multipart::Form::new()
.part("file", file_part)
.text("model", "whisper")
.text("response_format", "verbose_json")
.text("timestamp_granularities[]", "segment"); // Crucial for accurate segments
let url = format!("https://api.infomaniak.com/1/ai/{}/openai/audio/transcriptions", product_id);
emit_log(&app, "DEBUG", &format!("POST {}", url));
let res = client.post(&url)
.header("Authorization", format!("Bearer {}", api_key))
.multipart(form)
.send()
.await
.map_err(|e| {
let msg = format!("Network error during transcription: {}", e);
emit_log(&app, "ERROR", &msg);
msg
})?;
if res.status().is_success() {
let raw_body = res.text().await.map_err(|e| e.to_string())?;
// Check if we got a batch ID
#[derive(serde::Deserialize)]
struct BatchResponse {
batch_id: Option<String>,
}
// Try parsing as batch response first (Infomaniak specific behavior)
if let Ok(batch_res) = serde_json::from_str::<BatchResponse>(&raw_body) {
if let Some(batch_id) = batch_res.batch_id {
emit_log(&app, "INFO", &format!("Transcription queued. Batch ID: {}", batch_id));
return poll_transcription(&app, &client, &api_key, &product_id, &batch_id).await;
}
}
// If not batch, try parsing verbose response directly
// Log the raw body so we can see why it fails
emit_log(&app, "DEBUG", &format!("Direct Response (first 500 chars): {:.500}", raw_body));
let response: WhisperVerboseResponse = serde_json::from_str(&raw_body)
.map_err(|e| format!("Failed to decode JSON: {}. Body: {}", e, raw_body))?;
if let Some(segments) = response.segments {
emit_log(&app, "INFO", &format!("Found {} segments (Direct).", segments.len()));
for (i, seg) in segments.iter().take(3).enumerate() {
emit_log(&app, "DEBUG", &format!("Seg {}: start={}", i, seg.start));
}
// Format timestamps: [MM:SS] Text
let mut formatted_transcript = String::new();
for segment in segments {
let start_mins = (segment.start / 60.0).floor() as u64;
let start_secs = (segment.start % 60.0).floor() as u64;
formatted_transcript.push_str(&format!("[{:02}:{:02}] {}\n", start_mins, start_secs, segment.text.trim()));
}
// Fallback to raw text if segments empty
if formatted_transcript.trim().is_empty() {
if let Some(text) = response.text {
emit_log(&app, "SUCCESS", "Segments missing, using raw text.");
return Ok(text);
}
} else {
emit_log(&app, "SUCCESS", "Transcription received with timestamps.");
return Ok(formatted_transcript);
}
} else if let Some(text) = response.text {
emit_log(&app, "SUCCESS", "Segments missing, using raw text.");
return Ok(text);
}
emit_log(&app, "ERROR", "Response contained no recognized content.");
Err(format!("Response contained no recognized content. Body: {}", raw_body))
} else {
let error_text = res.text().await.unwrap_or_default();
emit_log(&app, "ERROR", &format!("Transcription failed: {}", error_text));
Err(format!("Transcription failed: {}", error_text))
}
}
async fn poll_transcription(app: &AppHandle, client: &reqwest::Client, api_key: &str, product_id: &str, batch_id: &str) -> Result<String, String> {
let status_url = format!("https://api.infomaniak.com/1/ai/{}/results/{}", product_id, batch_id);
let mut attempts = 0;
while attempts < 40 { // 40 * 2s = 80s timeout
attempts += 1;
sleep(Duration::from_secs(2)).await;
emit_log(app, "DEBUG", &format!("Polling status... Attempt {}", attempts));
let res = client.get(&status_url)
.header("Authorization", format!("Bearer {}", api_key))
.send()
.await
.map_err(|e| format!("Polling error: {}", e))?;
if res.status().is_success() {
let json: serde_json::Value = res.json().await.map_err(|e| e.to_string())?;
// Check 'status'
if let Some(status) = json.get("status").and_then(|s| s.as_str()) {
if status == "success" {
// Download the result
let download_url = format!("https://api.infomaniak.com/1/ai/{}/results/{}/download", product_id, batch_id);
let dl_res = client.get(&download_url)
.header("Authorization", format!("Bearer {}", api_key))
.send()
.await
.map_err(|e| e.to_string())?;
if dl_res.status().is_success() {
let content = dl_res.text().await.map_err(|e| e.to_string())?;
emit_log(app, "DEBUG", &format!("Poll Raw Content (first 500 chars): {:.500}", content));
// Try to parse as Verbose JSON to get timestamps
if let Ok(response) = serde_json::from_str::<WhisperVerboseResponse>(&content) {
if let Some(segments) = response.segments {
emit_log(app, "INFO", &format!("Found {} segments.", segments.len()));
// Log first 3 segments start times
for (i, seg) in segments.iter().take(3).enumerate() {
emit_log(app, "DEBUG", &format!("Seg {}: start={}", i, seg.start));
}
let mut formatted_transcript = String::new();
for segment in segments {
let start_mins = (segment.start / 60.0).floor() as u64;
let start_secs = (segment.start % 60.0).floor() as u64;
formatted_transcript.push_str(&format!("[{:02}:{:02}] {}\n", start_mins, start_secs, segment.text.trim()));
}
if !formatted_transcript.trim().is_empty() {
emit_log(app, "SUCCESS", "Transcription completed (async) with timestamps.");
return Ok(formatted_transcript);
} else {
emit_log(app, "WARN", "Segments found but empty content.");
}
} else {
emit_log(app, "WARN", "Verbose parsed but no segments found.");
}
if let Some(text) = response.text {
emit_log(app, "SUCCESS", "Transcription completed (async) - raw text (segments missing).");
return Ok(text);
}
} else {
emit_log(app, "WARN", "Failed to parse poll content as WhisperVerboseResponse");
}
emit_log(app, "SUCCESS", "Transcription completed - returning raw content.");
// If not JSON or no text field, return raw content
return Ok(content);
} else {
emit_log(app, "ERROR", "Failed to download transcription results.");
return Err(format!("Download failed: {}", dl_res.status()));
}
} else if status == "failed" || status == "error" {
let err_msg = format!("Batch processing failed [Status: {}]. Full Response: {:?}", status, json);
emit_log(app, "ERROR", &err_msg);
return Err(err_msg);
}
// If 'processing' or 'pending', continue loop
}
}
}
emit_log(app, "ERROR", "Transcription timed out after 80s.");
Err("Transcription timed out".to_string())
}
/// Sends `text` to the chat-completions endpoint with `prompt` as the system
/// message and returns the content of the first choice.
///
/// An empty `model` falls back to `"mixtral"`.
///
/// # Errors
/// Returns a message on network failure, a non-success status, an undecodable
/// body, or a response without any choice.
#[tauri::command]
async fn summarize_text(app: AppHandle, text: String, api_key: String, product_id: String, prompt: String, model: String) -> Result<String, String> {
    emit_log(&app, "INFO", "Starting summarization...");
    let client = reqwest::Client::new();
    let url = format!("https://api.infomaniak.com/2/ai/{}/openai/v1/chat/completions", product_id);

    let model_to_use = if model.is_empty() { "mixtral".to_string() } else { model };

    let body = serde_json::json!({
        "model": model_to_use,
        "messages": [
            { "role": "system", "content": prompt },
            { "role": "user", "content": text }
        ]
    });

    emit_log(&app, "DEBUG", &format!("POST {}", url));

    let res = client
        .post(&url)
        .header("Authorization", format!("Bearer {}", api_key))
        .header("Content-Type", "application/json")
        .json(&body)
        .send()
        .await
        .map_err(|e| {
            let msg = format!("Network error during summarization: {}", e);
            emit_log(&app, "ERROR", &msg);
            msg
        })?;

    if !res.status().is_success() {
        let error_text = res.text().await.unwrap_or_default();
        emit_log(&app, "ERROR", &format!("Summarization failed: {}", error_text));
        return Err(format!("Summarization failed: {}", error_text));
    }

    let raw_body = res.text().await.map_err(|e| e.to_string())?;
    let response_body: ChatCompletionResponse = serde_json::from_str(&raw_body)
        .map_err(|e| format!("Failed to decode JSON: {}. Body: {}", e, raw_body))?;

    // Only the first choice is used.
    match response_body.choices.first() {
        Some(choice) => {
            emit_log(&app, "SUCCESS", "Summarization received.");
            Ok(choice.message.content.clone())
        }
        None => {
            emit_log(&app, "WARN", "No summary generated in response.");
            Err("No summary generated".to_string())
        }
    }
}
/// File facts returned by `get_audio_metadata`.
#[derive(serde::Serialize)]
struct AudioMetadata {
/// Duration as printed by ffprobe (`format=duration`); 0.0 when it could not be parsed.
duration: f64,
/// File size in bytes.
size: u64,
/// File extension, or "unknown" when there is none.
format: String,
}
/// Resolves the path of an external binary such as ffmpeg or ffprobe.
///
/// Checks `/opt/homebrew/bin`, `/usr/local/bin` and `/usr/bin` in that order
/// and returns the first existing candidate. When none exists, the bare
/// `binary_name` is returned so the lookup is left to `PATH`.
fn resolve_binary_path(binary_name: &str) -> String {
    ["/opt/homebrew/bin", "/usr/local/bin", "/usr/bin"]
        .iter()
        .map(|dir| format!("{}/{}", dir, binary_name))
        .find(|candidate| std::path::Path::new(candidate).exists())
        .unwrap_or_else(|| binary_name.to_string())
}
/// Returns the size, extension and ffprobe-reported duration of an audio file.
///
/// The duration is 0.0 when ffprobe's output cannot be parsed as a number.
///
/// # Errors
/// Returns a message when the file does not exist, its metadata cannot be
/// read, or ffprobe cannot be executed.
#[tauri::command]
fn get_audio_metadata(app: AppHandle, file_path: String) -> Result<AudioMetadata, String> {
    emit_log(&app, "INFO", &format!("Getting metadata for: {}", file_path));

    let path = std::path::Path::new(&file_path);
    if !path.exists() {
        return Err(format!("File not found: {}", file_path));
    }

    let size = std::fs::metadata(&file_path).map_err(|e| e.to_string())?.len();

    // The extension stands in for the format.
    let format = match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => ext.to_string(),
        None => "unknown".to_string(),
    };

    // Ask ffprobe for the duration, preferring a well-known install location.
    let ffprobe_cmd = resolve_binary_path("ffprobe");
    let probe = Command::new(&ffprobe_cmd)
        .arg("-v")
        .arg("error")
        .arg("-show_entries")
        .arg("format=duration")
        .arg("-of")
        .arg("default=noprint_wrappers=1:nokey=1")
        .arg(&file_path)
        .output()
        .map_err(|e| format!("Failed to execute ffprobe at '{}': {}", ffprobe_cmd, e))?;

    let duration = String::from_utf8_lossy(&probe.stdout)
        .trim()
        .parse::<f64>()
        .unwrap_or(0.0);

    Ok(AudioMetadata { duration, size, format })
}
#[tauri::command]
fn convert_to_mp3(app: AppHandle, wav_path: String) -> Result<String, String> {
emit_log(&app, "INFO", &format!("Converting to MP3: {}", wav_path));
let mp3_path = wav_path.replace(".wav", ".mp3");
let ffmpeg_cmd = resolve_binary_path("ffmpeg");
let output = Command::new(&ffmpeg_cmd)
.args([
"-i", &wav_path,
"-codec:a", "libmp3lame",
"-b:a", "64k",
"-y", // overwrite
&mp3_path
])
.output()
.map_err(|e| format!("Failed to execute ffmpeg at '{}': {}", ffmpeg_cmd, e))?;
if output.status.success() {
emit_log(&app, "SUCCESS", &format!("MP3 created: {}", mp3_path));
Ok(mp3_path)
} else {
let error = String::from_utf8_lossy(&output.stderr);
emit_log(&app, "ERROR", &format!("MP3 conversion failed: {}", error));
Err(format!("MP3 conversion failed: {}", error))
}
}
#[tauri::command]
fn chunk_audio(app: AppHandle, file_path: String, chunk_minutes: u32) -> Result<Vec<String>, String> {
emit_log(&app, "INFO", &format!("Chunking audio: {} ({}min chunks)", file_path, chunk_minutes));
let chunk_seconds = chunk_minutes * 60;
let ffprobe_cmd = resolve_binary_path("ffprobe");
let ffmpeg_cmd = resolve_binary_path("ffmpeg");
// Get total duration using ffprobe
let duration_output = Command::new(&ffprobe_cmd)
.args([
"-v", "error",
"-show_entries", "format=duration",
"-of", "default=noprint_wrappers=1:nokey=1",
&file_path
])
.output()
.map_err(|e| format!("Failed to get duration with '{}': {}", ffprobe_cmd, e))?;
let duration_str = String::from_utf8_lossy(&duration_output.stdout);
let duration: f64 = duration_str.trim().parse()
.map_err(|_| "Failed to parse duration".to_string())?;
let num_chunks = (duration / chunk_seconds as f64).ceil() as usize;
emit_log(&app, "INFO", &format!("Total duration: {}s, creating {} chunks", duration, num_chunks));
let mut chunk_paths = Vec::new();
let base_path = file_path.replace(".mp3", "");
for i in 0..num_chunks {
let start_time = i as u32 * chunk_seconds;
let chunk_path = format!("{}_chunk_{}.mp3", base_path, i);
let output = Command::new(&ffmpeg_cmd)
.args([
"-i", &file_path,
"-ss", &start_time.to_string(),
"-t", &chunk_seconds.to_string(),
"-c", "copy",
"-y",
&chunk_path
])
.output()
.map_err(|e| format!("Failed to create chunk {} with '{}': {}", i, ffmpeg_cmd, e))?;
if !output.status.success() {
let error = String::from_utf8_lossy(&output.stderr);
return Err(format!("Chunk {} failed: {}", i, error));
}
chunk_paths.push(chunk_path);
}
emit_log(&app, "SUCCESS", &format!("Created {} chunks", chunk_paths.len()));
Ok(chunk_paths)
}
/// Reads an image file and returns it as a `data:` URL with base64 payload.
///
/// The MIME type is chosen from the lowercased file extension (jpg/jpeg, png,
/// svg, gif); a missing or unrecognized extension is treated as PNG.
///
/// # Errors
/// Returns a message when the file cannot be read.
#[tauri::command]
fn read_image_as_base64(app: AppHandle, file_path: String) -> Result<String, String> {
    emit_log(&app, "INFO", &format!("Reading image as base64: {}", file_path));

    let bytes = std::fs::read(&file_path).map_err(|e| format!("Failed to read file: {}", e))?;

    // Detect the image type from the extension.
    let extension = match std::path::Path::new(&file_path).extension().and_then(|ext| ext.to_str()) {
        Some(ext) => ext.to_lowercase(),
        None => "png".to_lowercase(),
    };

    let mime_type = if extension == "jpg" || extension == "jpeg" {
        "image/jpeg"
    } else if extension == "svg" {
        "image/svg+xml"
    } else if extension == "gif" {
        "image/gif"
    } else {
        // Covers "png" as well as every unrecognized extension.
        "image/png"
    };

    let base64_str = base64::prelude::BASE64_STANDARD.encode(&bytes);
    let data_url = format!("data:{};base64,{}", mime_type, base64_str);

    // The reported size is the length of the encoded string.
    emit_log(&app, "SUCCESS", &format!("Image converted to base64 ({} bytes)", base64_str.len()));
    Ok(data_url)
}
/// Launches the "Audio MIDI Setup" application via `open -a`.
///
/// # Errors
/// Returns the stringified I/O error when the `open` process cannot be spawned.
#[tauri::command]
fn open_audio_midi_setup() -> Result<(), String> {
    let launch = Command::new("open").args(["-a", "Audio MIDI Setup"]).spawn();
    match launch {
        Ok(_child) => Ok(()),
        Err(e) => Err(e.to_string()),
    }
}
/// Runs the bundled `resources/create_hearbit_audio.swift` script with `swift`
/// to create the Hearbit Audio device.
///
/// The script's stdout is logged at DEBUG level and any stderr at WARN level.
///
/// # Errors
/// Returns a message when the resource directory cannot be resolved, `swift`
/// cannot be executed, or the script exits with a failure status.
#[tauri::command]
fn create_hearbit_audio_device(app: AppHandle) -> Result<String, String> {
    emit_log(&app, "INFO", "Attempting to create Hearbit Audio device...");

    // Resolve resource path
    let resource_path = app.path().resource_dir()
        .map_err(|e| e.to_string())?
        .join("resources/create_hearbit_audio.swift");

    if !resource_path.exists() {
        // No alternative location is tried; the log must not claim otherwise.
        // The swift invocation below still runs and reports its own failure.
        emit_log(&app, "WARN", &format!("Resource script not found at {:?}. Running swift with this path anyway.", resource_path));
    }

    let output = Command::new("swift")
        .arg(&resource_path)
        .output()
        .map_err(|e| e.to_string())?;

    let stdout = String::from_utf8_lossy(&output.stdout).to_string();
    let stderr = String::from_utf8_lossy(&output.stderr).to_string();

    emit_log(&app, "DEBUG", &format!("Script Output: {}", stdout));
    if !stderr.is_empty() {
        emit_log(&app, "WARN", &format!("Script Stderr: {}", stderr));
    }

    if output.status.success() {
        emit_log(&app, "SUCCESS", "Hearbit Audio device created successfully.");
        Ok("Device created successfully".to_string())
    } else {
        emit_log(&app, "ERROR", "Failed to create device.");
        Err(format!("Failed to create device: {} {}", stdout, stderr))
    }
}
/// Tauri command that returns the result of `sc_audio::check_permissions()`.
// NOTE(review): presumably `true` means screen-recording permission is granted — confirm in `sc_audio`.
#[tauri::command]
async fn check_screen_recording_permission() -> bool {
sc_audio::check_permissions().await
}
/// Writes `content` to the file at `path`, replacing any existing contents.
///
/// # Errors
/// Returns the stringified I/O error when the write fails.
#[tauri::command]
async fn save_text_file(app: AppHandle, path: String, content: String) -> Result<(), String> {
    emit_log(&app, "INFO", &format!("Saving text file to: {}", path));

    std::fs::write(&path, content)
        .map(|_| emit_log(&app, "SUCCESS", "File saved successfully."))
        .map_err(|e| {
            emit_log(&app, "ERROR", &format!("Failed to save file: {}", e));
            e.to_string()
        })
}
/// Returns the contents of `hearbit-ai.log` from the app log directory.
///
/// When the file does not exist yet, a placeholder message is returned instead.
///
/// # Errors
/// Returns a message when the log directory cannot be resolved or the file
/// cannot be read.
#[tauri::command]
async fn read_log_file(app: AppHandle) -> Result<String, String> {
    let log_dir = app.path().app_log_dir().map_err(|e| e.to_string())?;
    let log_path = log_dir.join("hearbit-ai.log");

    if !log_path.exists() {
        return Ok("No log file found yet.".to_string());
    }

    std::fs::read_to_string(&log_path).map_err(|e| e.to_string())
}
#[cfg_attr(mobile, tauri::mobile_entry_point)]
pub fn run() {
tauri::Builder::default()
.setup(|app| {
// Setup Tray Icon
let quit_i = MenuItem::with_id(app, "quit", "Quit Hearbit AI", true, None::<&str>).unwrap();
let show_i = MenuItem::with_id(app, "show", "Show Window", true, None::<&str>).unwrap();
let menu = Menu::with_items(app, &[&show_i, &quit_i]).unwrap();
let _tray = TrayIconBuilder::new()
.icon(app.default_window_icon().unwrap().clone())
.menu(&menu)
.show_menu_on_left_click(true)
.on_menu_event(|app, event| {
match event.id.as_ref() {
"quit" => app.exit(0),
"show" => {
if let Some(window) = app.get_webview_window("main") {
let _ = window.show();
let _ = window.set_focus();
}
}
_ => {}
}
})
.on_tray_icon_event(|tray, event| {
if let TrayIconEvent::Click { .. } = event {
let app = tray.app_handle();
if let Some(window) = app.get_webview_window("main") {
let _ = window.show();
let _ = window.set_focus();
}
}
})
.build(app)?;
Ok(())
})
.on_window_event(|window, event| {
if let WindowEvent::CloseRequested { api, .. } = event {
// Prevent window from closing, just hide it
window.hide().unwrap();
api.prevent_close();
}
})
.plugin(tauri_plugin_shell::init())
.plugin(tauri_plugin_log::Builder::default()
.targets([
tauri_plugin_log::Target::new(tauri_plugin_log::TargetKind::Stdout),
tauri_plugin_log::Target::new(tauri_plugin_log::TargetKind::LogDir { file_name: Some("hearbit-ai.log".to_string()) }),
])
.build())
.plugin(tauri_plugin_opener::init())
.plugin(tauri_plugin_dialog::init())
.plugin(tauri_plugin_fs::init())
.plugin(tauri_plugin_oauth::init())
.manage(AppState {
recording_stream: Mutex::new(None),
recording_file_path: Mutex::new(None),
system_capture: Mutex::new(None),
})
.invoke_handler(tauri::generate_handler![
greet,
get_input_devices,
start_recording,
stop_recording,
pause_recording,
resume_recording,
transcribe_audio,
summarize_text,
get_available_models,
open_audio_midi_setup,
create_hearbit_audio_device,
check_screen_recording_permission,
auth::start_auth_flow,
auth::get_calendar_events,
save_text_file,
read_log_file,
get_audio_metadata,
convert_to_mp3,
chunk_audio,
read_image_as_base64,
email::send_smtp_email
])
.run(tauri::generate_context!())
.expect("error while running tauri application");
}

View File

@@ -1,17 +1,20 @@
use cocoa::base::nil;
use objc::{msg_send, sel, sel_impl};
use screencapturekit_sys::{
os_types::rc::Id,
shareable_content::UnsafeSCShareableContent,
cm_sample_buffer_ref::CMSampleBufferRef,
content_filter::{UnsafeContentFilter, UnsafeInitParams},
stream_configuration::UnsafeStreamConfiguration,
os_types::rc::Id,
shareable_content::{UnsafeSCRunningApplication, UnsafeSCShareableContent},
stream::UnsafeSCStream,
stream_configuration::UnsafeStreamConfiguration,
stream_error_handler::UnsafeSCStreamError,
stream_output_handler::UnsafeSCStreamOutput,
cm_sample_buffer_ref::CMSampleBufferRef,
};
pub struct SystemAudioCapture {
stream: Option<Id<UnsafeSCStream>>,
sample_rate: u32,
excluded_apps: Vec<String>,
}
struct AudioOutputWrapper {
@@ -20,20 +23,15 @@ struct AudioOutputWrapper {
impl UnsafeSCStreamOutput for AudioOutputWrapper {
fn did_output_sample_buffer(&self, sample: Id<CMSampleBufferRef>, of_type: u8) {
if of_type == 1 { // Audio
if of_type == 1 {
// Audio
let buffers = sample.get_av_audio_buffer_list();
for buffer in buffers {
// Buffer data is u8, we usually get F32 from SCK if configured.
// Assuming f32 (Floating Point) based on our config.
// We need to convert [u8] to [f32].
let data_u8 = buffer.data;
let data_f32: &[f32] = unsafe {
std::slice::from_raw_parts(
data_u8.as_ptr() as *const f32,
data_u8.len() / 4,
)
std::slice::from_raw_parts(data_u8.as_ptr() as *const f32, data_u8.len() / 4)
};
(self.callback)(data_f32);
}
}
@@ -52,52 +50,122 @@ pub async fn check_permissions() -> bool {
}
impl SystemAudioCapture {
pub fn new(sample_rate: u32) -> Self {
Self { stream: None, sample_rate }
pub fn new(sample_rate: u32, excluded_apps: Vec<String>) -> Self {
Self {
stream: None,
sample_rate,
excluded_apps,
}
}
pub async fn start<F>(&mut self, callback: F) -> Result<(), String>
where F: Fn(&[f32]) + Send + Sync + 'static {
let content = UnsafeSCShareableContent::get().map_err(|e| format!("Failed to get content"))?;
/// Builds the content filter for the first shareable display.
///
/// Running applications whose bundle identifier matches an entry of `self.excluded_apps`
/// are excluded from the capture. Matching is case-insensitive; an entry matches a bundle
/// identifier when both are equal or when the identifier is a dot-separated child of the
/// entry (e.g. `com.apple.Safari.WebContent` for the entry `com.apple.Safari`). Blank
/// entries are ignored. With no usable entries a plain display filter is returned.
///
/// # Errors
///
/// Returns an error string when the shareable content cannot be queried or when no display
/// is available.
async fn build_filter(&self) -> Result<Id<UnsafeContentFilter>, String> {
    /// `true` when `bundle_id` equals `excluded` or is a dot-separated child of it.
    /// Both arguments are expected to be lowercase already.
    fn matches_bundle_id(bundle_id: &str, excluded: &str) -> bool {
        bundle_id == excluded
            || bundle_id
                .strip_prefix(excluded)
                .map_or(false, |rest| rest.starts_with('.'))
    }

    let content =
        UnsafeSCShareableContent::get().map_err(|_| "Failed to get content".to_string())?;
    let displays = content.displays();
    let display = displays.first().ok_or("No display found")?;

    // Normalise the exclusion list once: lowercase for case-insensitive matching, and drop
    // blank entries because an empty prefix would otherwise match every application.
    let excluded_lower: Vec<String> = self
        .excluded_apps
        .iter()
        .map(|entry| entry.trim().to_lowercase())
        .filter(|entry| !entry.is_empty())
        .collect();

    if excluded_lower.is_empty() {
        return Ok(UnsafeContentFilter::init(UnsafeInitParams::Display(
            display.clone(),
        )));
    }

    // A bare `starts_with` would also exclude unrelated apps that merely share a textual
    // prefix (e.g. `com.apple.SafariTechnologyPreview` for `com.apple.Safari`), hence the
    // component-aware comparison.
    let apps_to_exclude: Vec<_> = content
        .applications()
        .into_iter()
        .filter(|app| {
            app.get_bundle_identifier().map_or(false, |bid| {
                let bid_lower = bid.to_lowercase();
                excluded_lower
                    .iter()
                    .any(|excluded| matches_bundle_id(&bid_lower, excluded))
            })
        })
        .collect();

    // NOTE: In screencapturekit-sys 0.2.8, DisplayIncludingApplicationsExceptingWindows
    // actually calls initWithDisplay:excludingApplications: (it's a bug in the crate)
    let filter_init = UnsafeInitParams::DisplayIncludingApplicationsExceptingWindows(
        display.clone(),
        apps_to_exclude,
        Vec::new(),
    );
    Ok(UnsafeContentFilter::init(filter_init))
}
pub async fn start<F>(&mut self, callback: F) -> Result<(), String>
where
F: Fn(&[f32]) + Send + Sync + 'static,
{
let filter = self.build_filter().await?;
// Wait, 'pixel_format' is OSType. b"BGRA" is &[u8;4].
// FourCharCode::from_chars exists in crate::os_types::four_char_code but we didn't import it.
// Actually, we can just use the Default and overwrite fields.
// But better: use Default and only set what we need.
let mut config = UnsafeStreamConfiguration::default();
config.width = 100;
config.height = 100;
config.captures_audio = 1;
config.sample_rate = self.sample_rate;
config.channel_count = 2;
config.excludes_current_process_audio = 0;
config.excludes_current_process_audio = 1;
let output_wrapper = AudioOutputWrapper {
callback: Box::new(callback),
callback: Box::new(callback),
};
// Convert config to Id<UnsafeStreamConfigurationRef> using Into
let stream = UnsafeSCStream::init(filter, config.into(), ErrorHandler);
stream.add_stream_output(output_wrapper, 1); // 1 = Audio
stream.start_capture().map_err(|e| "Failed to start capture".to_string())?;
stream
.start_capture()
.map_err(|_| "Failed to start capture".to_string())?;
self.stream = Some(stream);
Ok(())
}
pub fn stop(&mut self) {
if let Some(stream) = &self.stream {
stream.stop_capture();
let _ = stream.stop_capture();
}
self.stream = None;
}
/// Rebuilds the content filter from the current exclusion list and applies it to the
/// running stream. Does nothing (and returns `Ok`) when no stream is active.
///
/// # Errors
///
/// Propagates the error string from `build_filter` when the filter cannot be built.
pub async fn refresh_filter(&mut self) -> Result<(), String> {
    let Some(stream) = &self.stream else {
        return Ok(());
    };

    let filter = self.build_filter().await?;

    // screencapturekit-sys 0.2.8 does not expose `updateContentFilter:completionHandler:`
    // on UnsafeSCStream, so the message is sent directly to the underlying object.
    //
    // The object pointers are obtained by dereferencing the `Id<T>` wrappers rather than by
    // reinterpreting the wrapper's own memory, so this does not depend on `Id<T>` having a
    // pointer-compatible layout.
    let stream_ref: &UnsafeSCStream = stream;
    let filter_ref: &UnsafeContentFilter = &filter;
    let stream_ptr = stream_ref as *const UnsafeSCStream as *mut objc::runtime::Object;
    let filter_ptr = filter_ref as *const UnsafeContentFilter as *mut objc::runtime::Object;

    // SAFETY: both pointers come from live `Id` handles (`self.stream` and the local
    // `filter`), which keep the Objective-C objects alive for the duration of the call.
    // NOTE(review): passing `nil` assumes the completion handler is nullable — confirm
    // against the SCStream headers.
    unsafe {
        let _: () =
            msg_send![stream_ptr, updateContentFilter: filter_ptr completionHandler: nil];
    }

    Ok(())
}
}

View File

@@ -0,0 +1,103 @@
use screencapturekit_sys::{
os_types::rc::Id,
shareable_content::UnsafeSCShareableContent,
content_filter::{UnsafeContentFilter, UnsafeInitParams},
stream_configuration::UnsafeStreamConfiguration,
stream::UnsafeSCStream,
stream_error_handler::UnsafeSCStreamError,
stream_output_handler::UnsafeSCStreamOutput,
cm_sample_buffer_ref::CMSampleBufferRef,
};
/// Captures audio through a ScreenCaptureKit stream and hands the samples to a callback.
pub struct SystemAudioCapture {
/// The active stream; `None` until `start` succeeds and again after `stop`.
stream: Option<Id<UnsafeSCStream>>,
/// Sample rate written into the stream configuration when capture starts.
sample_rate: u32,
}
/// Stream output handler that forwards captured audio samples to a user-supplied callback.
struct AudioOutputWrapper {
/// Invoked once per audio buffer with that buffer's bytes viewed as `f32` samples.
callback: Box<dyn Fn(&[f32]) + Send + Sync>,
}
impl UnsafeSCStreamOutput for AudioOutputWrapper {
    /// Forwards every audio buffer of `sample` to the callback as a slice of `f32` samples.
    ///
    /// Buffers of any other output type are ignored. Trailing bytes that do not form a
    /// complete 4-byte sample are dropped.
    fn did_output_sample_buffer(&self, sample: Id<CMSampleBufferRef>, of_type: u8) {
        // 1 = Audio
        if of_type != 1 {
            return;
        }

        for buffer in sample.get_av_audio_buffer_list() {
            // The buffer payload is raw bytes.
            // NOTE(review): interpreting them as native-endian f32 assumes the stream
            // delivers 32-bit float PCM — confirm against the stream configuration.
            let bytes: &[u8] = &buffer.data;

            // SAFETY: every bit pattern is a valid `f32`, and `align_to` only places
            // correctly aligned, fully covered elements in the middle slice.
            let (head, aligned, tail) = unsafe { bytes.align_to::<f32>() };

            if head.is_empty() && tail.len() < std::mem::size_of::<f32>() {
                // Fast path: the byte buffer is already suitably aligned, no copy needed.
                (self.callback)(aligned);
            } else {
                // A byte buffer is only guaranteed 1-byte alignment; reinterpreting a
                // misaligned pointer as `&[f32]` would be undefined behaviour, so decode
                // the samples into an owned, aligned buffer instead.
                let decoded: Vec<f32> = bytes
                    .chunks_exact(std::mem::size_of::<f32>())
                    .map(|chunk| f32::from_ne_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]))
                    .collect();
                (self.callback)(&decoded);
            }
        }
    }
}
/// Error handler attached to the capture stream.
struct ErrorHandler;

impl UnsafeSCStreamError for ErrorHandler {
    /// Reports a stream error on stderr.
    ///
    /// The callback receives no error details, so only the fact that an error occurred can
    /// be reported. Without this, a failing stream would go completely unnoticed.
    fn handle_error(&self) {
        eprintln!("ScreenCaptureKit stream reported an error");
    }
}
/// Returns `true` when `UnsafeSCShareableContent::get()` succeeds.
// NOTE(review): presumably the query only succeeds once the Screen Recording permission has
// been granted, which is why it serves as the permission probe — confirm.
pub async fn check_permissions() -> bool {
UnsafeSCShareableContent::get().is_ok()
}
impl SystemAudioCapture {
    /// Creates an idle capture that will request `sample_rate` when started.
    pub fn new(sample_rate: u32) -> Self {
        Self {
            stream: None,
            sample_rate,
        }
    }

    /// Starts capturing audio from the first shareable display and passes every audio
    /// buffer to `callback`.
    ///
    /// # Errors
    ///
    /// Returns an error string when the shareable content cannot be queried, when no
    /// display is available, or when the stream fails to start.
    pub async fn start<F>(&mut self, callback: F) -> Result<(), String>
    where
        F: Fn(&[f32]) + Send + Sync + 'static,
    {
        let content =
            UnsafeSCShareableContent::get().map_err(|_| "Failed to get content".to_string())?;
        let displays = content.displays();
        let display = displays.first().ok_or("No display found")?;
        let filter = UnsafeContentFilter::init(UnsafeInitParams::Display(display.clone()));

        // Start from the defaults and override only the fields this capture cares about.
        let mut config = UnsafeStreamConfiguration::default();
        // NOTE(review): presumably a token video size because only audio is consumed —
        // confirm that 100x100 is acceptable to ScreenCaptureKit.
        config.width = 100;
        config.height = 100;
        config.captures_audio = 1;
        config.sample_rate = self.sample_rate;
        config.channel_count = 2;
        config.excludes_current_process_audio = 0;

        let output_wrapper = AudioOutputWrapper {
            callback: Box::new(callback),
        };

        // `config.into()` converts the configuration into the handle type the stream expects.
        let stream = UnsafeSCStream::init(filter, config.into(), ErrorHandler);
        stream.add_stream_output(output_wrapper, 1); // 1 = Audio
        stream
            .start_capture()
            .map_err(|_| "Failed to start capture".to_string())?;

        self.stream = Some(stream);
        Ok(())
    }

    /// Stops the active stream, if any, and releases it. Safe to call when idle.
    pub fn stop(&mut self) {
        if let Some(stream) = self.stream.take() {
            // Best effort: there is nothing useful to do if stopping fails, and the handle
            // is released either way.
            let _ = stream.stop_capture();
        }
    }
}

View File

@@ -1,7 +1,7 @@
{
"$schema": "https://schema.tauri.app/config/2",
"productName": "Hearbit AI",
"version": "1.2.1",
"version": "1.2.3",
"identifier": "com.hearbit-ai.desktop",
"build": {
"beforeDevCommand": "npm run dev",