"""
Advanced Speech Recognition and Synthesis Engine
Real-time speech-to-text and text-to-speech processing.
"""

import asyncio
import io
import json
import logging
import os
import threading
import time
import wave
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union

import numpy as np

# Audio processing
try:
    import pyaudio
    import librosa
    AUDIO_AVAILABLE = True
except ImportError:
    AUDIO_AVAILABLE = False
    print("Audio libraries not available. Speech processing will be limited.")

# Speech recognition
try:
    import speech_recognition as sr
    SPEECH_RECOGNITION_AVAILABLE = True
except ImportError:
    SPEECH_RECOGNITION_AVAILABLE = False
    print("speech_recognition library not available. STT features will be limited.")

# Text-to-speech
try:
    import pyttsx3
    PYTTSX3_AVAILABLE = True
except ImportError:
    PYTTSX3_AVAILABLE = False
    print("pyttsx3 not available. TTS features will be limited.")

# Advanced TTS
try:
    from gtts import gTTS
    GTTS_AVAILABLE = True
except ImportError:
    GTTS_AVAILABLE = False
    print("gTTS not available. Advanced TTS features will be limited.")

# Deep learning for speech
try:
    import torch
    import torchaudio
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False
    print("PyTorch not available. Deep learning speech features will be limited.")

# Configure root logging at INFO level when this module is imported, then
# create the module-level logger used by every class below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class AudioConfig:
    """Capture parameters shared by the audio components of this module.

    Attributes:
        sample_rate: Sampling frequency in Hz.
        channels: Number of audio channels.
        chunk_size: Number of frames requested per buffer read.
        format: Sample width in bits (divided by 8 wherever a byte width
            is needed).
        device_index: Index of the input device; ``None`` by default.
    """

    sample_rate: int = 16000
    channels: int = 1
    chunk_size: int = 1024
    format: int = 16  # bits per sample
    device_index: Optional[int] = None

@dataclass
class SpeechRecognitionResult:
    """Outcome of a single speech-to-text request.

    Attributes:
        text: Recognized transcript, or a human-readable error description
            when recognition did not succeed.
        confidence: Confidence value attached by the producer of the result
            (0.0 is used for failures).
        language: Language code the request was made with.
        processing_time: Wall-clock seconds spent on the request.
        audio_duration: Length of the processed audio in seconds.
        alternatives: Optional list of alternative hypotheses; ``None`` when
            none were collected.
    """

    text: str
    confidence: float
    language: str
    processing_time: float
    audio_duration: float
    # The default stays ``None`` (not an empty list) so existing
    # ``alternatives is None`` checks keep working; only the annotation is
    # corrected to admit ``None``.
    alternatives: Optional[List[Dict[str, Any]]] = None

@dataclass
class SpeechSynthesisResult:
    """Outcome of a single text-to-speech request.

    Attributes:
        audio_file: Path the audio was written to (empty string when the
            synthesis failed).
        text: The text that was synthesized.
        language: Language code reported for the synthesis.
        voice_id: Identifier of the voice that was requested.
        synthesis_time: Wall-clock seconds spent on the request.
    """

    audio_file: str
    text: str
    language: str
    voice_id: str
    synthesis_time: float

class AudioCapture:
    """Real-time audio capture backed by PyAudio.

    Typical use: ``start_recording()``, call ``record_chunk()`` repeatedly,
    then ``stop_recording()`` to obtain everything captured so far.  Call
    ``close()`` when the object is no longer needed; the finalizer does the
    same on a best-effort basis.
    """

    def __init__(self, config: AudioConfig):
        """Store *config* and create the PyAudio handle when it is available."""
        self.config = config
        self.audio = None
        self.stream = None
        self.is_recording = False
        self.audio_buffer = []

        if AUDIO_AVAILABLE:
            self.audio = pyaudio.PyAudio()

    def start_recording(self) -> bool:
        """Open the input stream and reset the capture buffer.

        A stream left open by an earlier call is released first so that it
        is not leaked.

        Returns:
            True when the stream was opened, False when audio support is
            missing or the stream could not be opened.
        """
        if not AUDIO_AVAILABLE or self.audio is None:
            logger.error("PyAudio not available for recording")
            return False

        self._release_stream()

        try:
            self.stream = self.audio.open(
                format=self.audio.get_format_from_width(self.config.format // 8),
                channels=self.config.channels,
                rate=self.config.sample_rate,
                input=True,
                input_device_index=self.config.device_index,
                frames_per_buffer=self.config.chunk_size
            )
        except Exception as e:
            logger.error("Error starting audio recording: %s", e)
            return False

        self.is_recording = True
        self.audio_buffer = []
        logger.info("Audio recording started")
        return True

    def stop_recording(self) -> bytes:
        """Stop recording and return all buffered audio as one byte string.

        Returns:
            The concatenated chunks, or ``b''`` when no recording is active.
        """
        if not self.is_recording or not self.stream:
            return b''

        self._release_stream()

        # Combine audio chunks
        audio_data = b''.join(self.audio_buffer)
        self.audio_buffer = []

        logger.info("Audio recording stopped")
        return audio_data

    def record_chunk(self) -> bytes:
        """Read one chunk from the stream, buffer it and return it.

        Returns:
            The chunk, or ``b''`` when not recording or when the read fails.
        """
        if not self.is_recording or not self.stream:
            return b''

        try:
            data = self.stream.read(self.config.chunk_size, exception_on_overflow=False)
        except Exception as e:
            logger.error("Error recording audio chunk: %s", e)
            return b''

        self.audio_buffer.append(data)
        return data

    def save_audio(self, audio_data: bytes, filename: str) -> bool:
        """Write *audio_data* to *filename* as a WAV file using the config.

        Returns:
            True on success, False when the file could not be written.
        """
        try:
            with wave.open(filename, 'wb') as wf:
                wf.setnchannels(self.config.channels)
                wf.setsampwidth(self.config.format // 8)
                wf.setframerate(self.config.sample_rate)
                wf.writeframes(audio_data)
        except Exception as e:
            logger.error("Error saving audio: %s", e)
            return False

        logger.info("Audio saved to %s", filename)
        return True

    def close(self) -> None:
        """Release the stream and the PyAudio handle; safe to call repeatedly."""
        self._release_stream()
        audio, self.audio = getattr(self, 'audio', None), None
        if audio is not None:
            audio.terminate()

    def _release_stream(self) -> None:
        """Stop and close the current stream, if any, and forget it."""
        # Drop the reference first so the stream can never be closed twice,
        # even if stopping or closing raises.
        stream, self.stream = getattr(self, 'stream', None), None
        self.is_recording = False
        if stream is None:
            return

        try:
            try:
                stream.stop_stream()
            finally:
                stream.close()
        except Exception as e:
            logger.warning("Error closing audio stream: %s", e)

    def __del__(self):
        """Best-effort cleanup of audio resources."""
        try:
            self.close()
        except Exception:
            # A finalizer may run on a partially constructed instance or
            # while the interpreter is shutting down; there is nothing
            # useful left to do with an error at that point.
            pass

class SpeechToTextEngine:
    """Speech-to-text recognition engine built on ``speech_recognition``.

    Two backends are supported, ``'google'`` and ``'sphinx'``.  Any other
    engine name falls back to ``'google'`` in every method.
    """

    # Fixed placeholder confidences per backend; these are not measured
    # scores, the recognition calls used here return plain text only.
    _ENGINE_CONFIDENCE = {'google': 0.9, 'sphinx': 0.7}
    _DEFAULT_ENGINE = 'google'

    def __init__(self):
        """Create the recognizer objects for the supported backends."""
        self.recognizers = {}
        # One threading.Event per running continuous-recognition session.
        self._stop_events = []
        self._initialize_recognizers()

    def _initialize_recognizers(self):
        """Initialize speech recognition engines."""
        if SPEECH_RECOGNITION_AVAILABLE:
            self.recognizers['google'] = sr.Recognizer()
            self.recognizers['sphinx'] = sr.Recognizer()

    def _transcribe(self, recognizer, audio, engine: str, language: str):
        """Run the backend selected by *engine*; return ``(text, confidence)``.

        Exceptions raised by the backend propagate to the caller.
        """
        name = engine if engine in self._ENGINE_CONFIDENCE else self._DEFAULT_ENGINE
        if name == 'sphinx':
            text = recognizer.recognize_sphinx(audio, language=language)
        else:
            text = recognizer.recognize_google(audio, language=language)
        return text, self._ENGINE_CONFIDENCE[name]

    @staticmethod
    def _audio_duration(audio) -> float:
        """Return the length of *audio* in seconds from its raw frame data."""
        return len(audio.frame_data) / (audio.sample_rate * audio.sample_width)

    def recognize_from_file(self, audio_file: Union[str, Path],
                          engine: str = 'google',
                          language: str = 'en-US') -> SpeechRecognitionResult:
        """Recognize speech from an audio file.

        Args:
            audio_file: Path of the file to transcribe.
            engine: ``'google'`` or ``'sphinx'``; anything else uses Google.
            language: Language code passed to the backend.

        Returns:
            A result whose ``text`` is the transcript, or an explanatory
            message with confidence 0.0 when the audio was unintelligible
            or the recognition service failed.  ``audio_duration`` is the
            measured length of the file whenever it could be read.

        Raises:
            Errors raised while opening or reading the file are not caught
            here and propagate to the caller.
        """
        start_time = time.time()

        if not SPEECH_RECOGNITION_AVAILABLE:
            return SpeechRecognitionResult(
                text="Speech recognition not available",
                confidence=0.0,
                language=language,
                processing_time=0.0,
                audio_duration=0.0
            )

        audio_duration = 0.0
        try:
            recognizer = self.recognizers.get(engine, self.recognizers['google'])
            with sr.AudioFile(str(audio_file)) as source:
                audio = recognizer.record(source)
            # Measured before recognition so that failure results below can
            # still report how much audio was processed.
            audio_duration = self._audio_duration(audio)
            text, confidence = self._transcribe(recognizer, audio, engine, language)
        except sr.UnknownValueError:
            text, confidence = "Could not understand audio", 0.0
        except sr.RequestError as e:
            text, confidence = f"Error with speech recognition service: {str(e)}", 0.0

        return SpeechRecognitionResult(
            text=text,
            confidence=confidence,
            language=language,
            processing_time=time.time() - start_time,
            audio_duration=audio_duration
        )

    def recognize_from_microphone(self, duration: float = 5.0,
                                engine: str = 'google',
                                language: str = 'en-US') -> SpeechRecognitionResult:
        """Recognize speech from the microphone.

        Args:
            duration: Used both as the time to wait for speech to start and
                as the maximum phrase length, in seconds.
            engine: ``'google'`` or ``'sphinx'``; anything else uses Google.
            language: Language code passed to the backend.

        Returns:
            A result whose ``text`` is the transcript or an error message
            (confidence 0.0).  This method does not raise; every failure is
            reported through the result.  ``audio_duration`` is the length
            of the captured audio when capture succeeded, otherwise the
            requested *duration*.
        """
        start_time = time.time()

        if not SPEECH_RECOGNITION_AVAILABLE:
            return SpeechRecognitionResult(
                text="Speech recognition not available",
                confidence=0.0,
                language=language,
                processing_time=0.0,
                audio_duration=0.0
            )

        audio_duration = duration
        try:
            recognizer = self.recognizers.get(engine, self.recognizers['google'])
            microphone = sr.Microphone()

            # Adjust for ambient noise
            with microphone as source:
                logger.info("Adjusting for ambient noise...")
                recognizer.adjust_for_ambient_noise(source)

            # Record audio
            logger.info(f"Recording for {duration} seconds...")
            with microphone as source:
                audio = recognizer.listen(source, timeout=duration, phrase_time_limit=duration)

            # Report what was actually captured, as the file path does.
            audio_duration = self._audio_duration(audio)
            text, confidence = self._transcribe(recognizer, audio, engine, language)
        except sr.WaitTimeoutError:
            text, confidence = "No speech detected", 0.0
        except Exception as e:
            text, confidence = f"Error: {str(e)}", 0.0

        return SpeechRecognitionResult(
            text=text,
            confidence=confidence,
            language=language,
            processing_time=time.time() - start_time,
            audio_duration=audio_duration
        )

    def continuous_recognition(self, callback: Callable[[str], None],
                             engine: str = 'google',
                             language: str = 'en-US',
                             phrase_time_limit: float = 2.0) -> None:
        """Start continuous recognition in a background daemon thread.

        Each captured phrase is recognized in its own daemon thread so that
        listening is not blocked; *callback* is invoked from that thread
        with the recognized text.  Phrases that cannot be recognized are
        skipped.  Use ``stop_continuous_recognition()`` to end the session.

        Args:
            callback: Called with the text of every recognized phrase.
            engine: ``'google'`` or ``'sphinx'``; anything else uses Google.
            language: Language code passed to the backend.
            phrase_time_limit: Maximum length of one phrase in seconds.
        """
        if not SPEECH_RECOGNITION_AVAILABLE:
            logger.error("Speech recognition not available for continuous recognition")
            return

        recognizer = self.recognizers.get(engine, self.recognizers['google'])
        microphone = sr.Microphone()

        # Adjust for ambient noise
        with microphone as source:
            recognizer.adjust_for_ambient_noise(source)

        stop_event = threading.Event()
        self._stop_events.append(stop_event)

        def process_audio(audio):
            """Recognize one captured phrase and hand the text to *callback*."""
            try:
                text, _ = self._transcribe(recognizer, audio, engine, language)
            except (sr.UnknownValueError, sr.RequestError) as e:
                # Recognition errors are expected in continuous mode.
                logger.debug("Continuous recognition skipped a phrase: %s", e)
                return
            callback(text)

        def recognize_worker():
            """Capture phrases until the session's stop event is set."""
            while not stop_event.is_set():
                try:
                    with microphone as source:
                        audio = recognizer.listen(source, phrase_time_limit=phrase_time_limit)

                    if stop_event.is_set():
                        break

                    # The phrase is passed as an argument rather than read
                    # from the enclosing scope, so a later loop iteration
                    # can never swap it out under the processing thread.
                    threading.Thread(target=process_audio, args=(audio,), daemon=True).start()

                except Exception as e:
                    logger.error(f"Error in continuous recognition: {str(e)}")
                    # Back off for a second, but wake up early on stop.
                    stop_event.wait(1)

        # Start recognition in background thread
        threading.Thread(target=recognize_worker, daemon=True).start()

    def stop_continuous_recognition(self) -> None:
        """Ask every running continuous-recognition session to stop.

        A worker checks the request between phrases, so it exits once its
        current ``listen`` call has returned.
        """
        events, self._stop_events = self._stop_events, []
        for event in events:
            event.set()

class TextToSpeechEngine:
    """Text-to-speech synthesis engine.

    Uses a single shared pyttsx3 engine when it can be initialized and gTTS
    when that package is importable.
    """

    def __init__(self):
        """Initialize the available TTS backends."""
        self.engines = {}
        # Voice active right after initialization; restored whenever a call
        # without ``voice_id`` follows a call that selected another voice.
        self._default_voice = None
        self._voice_overridden = False
        self._initialize_engines()

    def _initialize_engines(self):
        """Initialize TTS engines."""
        if not PYTTSX3_AVAILABLE:
            return

        try:
            self.engines['pyttsx3'] = pyttsx3.init()
            self._configure_pyttsx3()
        except Exception as e:
            logger.warning(f"Could not initialize pyttsx3: {str(e)}")

    def _configure_pyttsx3(self):
        """Slow the pyttsx3 speech rate and remember the startup voice."""
        engine = self.engines.get('pyttsx3')
        if engine is None:
            return

        # Set speech rate
        rate = engine.getProperty('rate')
        engine.setProperty('rate', rate - 50)  # Slow down speech

        try:
            self._default_voice = engine.getProperty('voice')
        except Exception:
            # Without a known startup voice the reset in _apply_voice is
            # simply skipped.
            self._default_voice = None

    def _apply_voice(self, engine_obj, voice_id: Optional[str]) -> None:
        """Select the voice for the next utterance on *engine_obj*.

        With a *voice_id*, the first installed voice whose id contains it is
        activated.  Without one, the startup voice is restored if an earlier
        call changed it, because the engine is shared and a selection would
        otherwise leak into later "default" requests.
        """
        if not voice_id:
            if self._voice_overridden and self._default_voice:
                engine_obj.setProperty('voice', self._default_voice)
                self._voice_overridden = False
            return

        for voice in engine_obj.getProperty('voices'):
            if voice_id in voice.id:
                engine_obj.setProperty('voice', voice.id)
                self._voice_overridden = True
                return

        logger.warning("Voice '%s' not found; keeping the current voice", voice_id)

    def synthesize_to_file(self, text: str, output_file: str,
                         engine: str = 'pyttsx3',
                         language: str = 'en',
                         voice_id: Optional[str] = None) -> SpeechSynthesisResult:
        """Synthesize *text* and save the audio to *output_file*.

        The requested *engine* is used when it is available; otherwise the
        call falls back to pyttsx3 and then to gTTS.

        Returns:
            The synthesis result.  This method does not raise: on any
            failure the error is logged and a result with an empty
            ``audio_file`` is returned.
        """
        start_time = time.time()

        try:
            if engine == 'gtts' and GTTS_AVAILABLE:
                return self._synthesize_gtts(text, output_file, language, start_time)
            if 'pyttsx3' in self.engines:
                return self._synthesize_pyttsx3(text, output_file, voice_id, start_time)
            if GTTS_AVAILABLE:
                return self._synthesize_gtts(text, output_file, language, start_time)
            raise RuntimeError("No TTS engine available")

        except Exception as e:
            logger.error(f"Error in speech synthesis: {str(e)}")
            return SpeechSynthesisResult(
                audio_file="",
                text=text,
                language=language,
                voice_id=voice_id or "default",
                synthesis_time=time.time() - start_time
            )

    def _synthesize_pyttsx3(self, text: str, output_file: str,
                          voice_id: Optional[str], start_time: float) -> SpeechSynthesisResult:
        """Synthesize using pyttsx3; blocks until the file has been written."""
        engine = self.engines['pyttsx3']
        self._apply_voice(engine, voice_id)

        # Save to file
        engine.save_to_file(text, output_file)
        engine.runAndWait()

        return SpeechSynthesisResult(
            audio_file=output_file,
            text=text,
            language="en",
            voice_id=voice_id or "default",
            synthesis_time=time.time() - start_time
        )

    def _synthesize_gtts(self, text: str, output_file: str,
                        language: str, start_time: float) -> SpeechSynthesisResult:
        """Synthesize using gTTS."""
        # Only the first two characters of the language code are passed on.
        tts = gTTS(text=text, lang=language[:2], slow=False)
        tts.save(output_file)

        return SpeechSynthesisResult(
            audio_file=output_file,
            text=text,
            language=language,
            voice_id="gtts",
            synthesis_time=time.time() - start_time
        )

    def speak(self, text: str, engine: str = 'pyttsx3',
             voice_id: Optional[str] = None) -> bool:
        """Speak *text* through pyttsx3 and block until playback finishes.

        Returns:
            True when the text was spoken, False when pyttsx3 is not
            usable, another engine was requested, or speaking failed.
        """
        try:
            engine_obj = self.engines.get('pyttsx3') if engine == 'pyttsx3' else None
            if engine_obj is None:
                logger.warning("No suitable TTS engine available for direct speech")
                return False

            self._apply_voice(engine_obj, voice_id)
            engine_obj.say(text)
            engine_obj.runAndWait()
            return True

        except Exception as e:
            logger.error(f"Error in direct speech: {str(e)}")
            return False

    def get_available_voices(self, engine: str = 'pyttsx3') -> List[Dict[str, Any]]:
        """Describe the voices installed for *engine*.

        Returns:
            One dict per voice with the keys ``id``, ``name``, ``languages``
            and ``gender``; an empty list for any engine other than a
            usable pyttsx3.
        """
        if engine != 'pyttsx3' or 'pyttsx3' not in self.engines:
            return []

        voices = self.engines['pyttsx3'].getProperty('voices')
        return [
            {
                'id': voice.id,
                'name': voice.name,
                'languages': getattr(voice, 'languages', []),
                'gender': getattr(voice, 'gender', 'unknown')
            }
            for voice in voices
        ]

class SpeechRecognitionEngine:
    """
    Comprehensive Speech Recognition and Synthesis Engine.
    Supports real-time speech-to-text and text-to-speech processing.

    Combines audio capture, the speech-to-text engine and the text-to-speech
    engine, and keeps running totals and averages of processing times.
    """

    def __init__(self, audio_config: Optional[AudioConfig] = None):
        """Create the sub-engines; *audio_config* defaults to ``AudioConfig()``."""
        self.audio_config = audio_config or AudioConfig()
        self.audio_capture = AudioCapture(self.audio_config)
        self.stt_engine = SpeechToTextEngine()
        self.tts_engine = TextToSpeechEngine()

        self.processing_stats = {
            'stt_requests': 0,
            'tts_requests': 0,
            'total_stt_time': 0.0,
            'total_tts_time': 0.0,
            'average_stt_time': 0.0,
            'average_tts_time': 0.0
        }

    def _record_stt(self, elapsed: float) -> None:
        """Add one speech-to-text request taking *elapsed* seconds to the stats."""
        stats = self.processing_stats
        stats['stt_requests'] += 1
        stats['total_stt_time'] += elapsed
        stats['average_stt_time'] = stats['total_stt_time'] / stats['stt_requests']

    def _record_tts(self, elapsed: float) -> None:
        """Add one text-to-speech request taking *elapsed* seconds to the stats."""
        stats = self.processing_stats
        stats['tts_requests'] += 1
        stats['total_tts_time'] += elapsed
        stats['average_tts_time'] = stats['total_tts_time'] / stats['tts_requests']

    async def transcribe_audio_file(self, audio_file: Union[str, Path],
                                  engine: str = 'google',
                                  language: str = 'en-US') -> SpeechRecognitionResult:
        """Transcribe an audio file to text.

        The blocking recognition call runs in the event loop's default
        executor so other coroutines keep running meanwhile.
        """
        logger.info(f"Transcribing audio file: {audio_file}")

        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(
            None, self.stt_engine.recognize_from_file, audio_file, engine, language
        )

        self._record_stt(result.processing_time)
        return result

    async def transcribe_microphone(self, duration: float = 5.0,
                                  engine: str = 'google',
                                  language: str = 'en-US') -> SpeechRecognitionResult:
        """Transcribe speech from the microphone.

        Listening and recognition run in the event loop's default executor
        so the loop is not blocked for the whole *duration*.
        """
        logger.info(f"Transcribing microphone input for {duration} seconds")

        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(
            None, self.stt_engine.recognize_from_microphone, duration, engine, language
        )

        self._record_stt(result.processing_time)
        return result

    async def synthesize_speech(self, text: str, output_file: str,
                              engine: str = 'pyttsx3',
                              language: str = 'en',
                              voice_id: Optional[str] = None) -> SpeechSynthesisResult:
        """Synthesize *text* to *output_file* and record the time it took."""
        logger.info(f"Synthesizing speech: '{text[:50]}...'")

        # NOTE(review): kept on the calling thread because the TTS engine
        # object is created in __init__ on that thread; confirm it is safe
        # to use from a worker thread before offloading this call.
        result = self.tts_engine.synthesize_to_file(text, output_file, engine, language, voice_id)

        self._record_tts(result.synthesis_time)
        return result

    async def speak_text(self, text: str, engine: str = 'pyttsx3',
                        voice_id: Optional[str] = None) -> bool:
        """Speak *text* directly; returns True when it was spoken.

        Successful calls are timed and included in the TTS statistics, so
        ``average_tts_time`` stays consistent with ``tts_requests``.
        """
        logger.info(f"Speaking text: '{text[:50]}...'")

        # NOTE(review): runs on the calling thread for the same reason as
        # synthesize_speech.
        started = time.time()
        success = self.tts_engine.speak(text, engine, voice_id)

        if success:
            self._record_tts(time.time() - started)

        return success

    def start_continuous_recognition(self, callback: Callable[[str], None],
                                   engine: str = 'google',
                                   language: str = 'en-US') -> None:
        """Start continuous speech recognition; *callback* receives each text."""
        logger.info("Starting continuous speech recognition")
        self.stt_engine.continuous_recognition(callback, engine, language)

    async def batch_transcribe(self, audio_files: List[Union[str, Path]],
                             engine: str = 'google',
                             language: str = 'en-US',
                             max_workers: int = 4) -> List[SpeechRecognitionResult]:
        """Transcribe multiple audio files in parallel.

        Args:
            audio_files: Files to transcribe.
            engine: Recognition backend name.
            language: Language code passed to the backend.
            max_workers: Size of the worker thread pool (values below 1 are
                treated as 1).

        Returns:
            One result per input file, in input order.  A file whose
            transcription raised is represented by a result whose text
            starts with ``"Error: "``; such entries are not counted in the
            statistics.
        """
        logger.info(f"Starting batch transcription of {len(audio_files)} files")

        loop = asyncio.get_running_loop()
        with ThreadPoolExecutor(max_workers=max(1, max_workers)) as executor:
            pending = [
                loop.run_in_executor(
                    executor,
                    self.stt_engine.recognize_from_file,
                    audio_file, engine, language
                )
                for audio_file in audio_files
            ]
            # Awaiting (instead of blocking on Future.result()) keeps the
            # event loop responsive while the workers run.
            outcomes = await asyncio.gather(*pending, return_exceptions=True)

        results = []
        for outcome in outcomes:
            if isinstance(outcome, Exception):
                logger.error(f"Error in batch transcription: {str(outcome)}")
                results.append(SpeechRecognitionResult(
                    text=f"Error: {str(outcome)}",
                    confidence=0.0,
                    language=language,
                    processing_time=0.0,
                    audio_duration=0.0
                ))
            elif isinstance(outcome, BaseException):
                # Not an ordinary error (e.g. a cancellation); do not hide it.
                raise outcome
            else:
                results.append(outcome)
                self._record_stt(outcome.processing_time)

        logger.info(f"Batch transcription complete. Processed {len(results)} files")
        return results

    def get_available_voices(self, engine: str = 'pyttsx3') -> List[Dict[str, Any]]:
        """Get available voices from the text-to-speech engine."""
        return self.tts_engine.get_available_voices(engine)

    def get_processing_stats(self) -> Dict[str, Any]:
        """Return a copy of the processing statistics."""
        return self.processing_stats.copy()

# Example usage and testing
async def main():
    """Run a console demo of the Speech Recognition Engine.

    Prints which optional features are available, lists a few voices,
    synthesizes a test sentence to a temporary file, speaks a sentence
    directly and finally prints the collected processing statistics.
    """
    print("=== Speech Recognition Engine Demo ===")

    # Initialize speech engine
    speech_engine = SpeechRecognitionEngine()

    # Test 1: Available features
    print("\n1. Available features:")
    print(f"Audio processing: {AUDIO_AVAILABLE}")
    print(f"Speech recognition: {SPEECH_RECOGNITION_AVAILABLE}")
    print(f"pyttsx3 TTS: {PYTTSX3_AVAILABLE}")
    print(f"Google TTS: {GTTS_AVAILABLE}")

    # Test 2: Available voices
    print("\n2. Available voices:")
    voices = speech_engine.get_available_voices()
    for voice in voices[:3]:  # Show first 3 voices
        print(f"- {voice['name']} ({voice['id']})")

    # Test 3: Text-to-speech synthesis
    print("\n3. Testing text-to-speech...")
    test_text = "Hello, this is a test of the speech synthesis engine."
    output_path = "test_speech.wav"

    try:
        if PYTTSX3_AVAILABLE or GTTS_AVAILABLE:
            try:
                result = await speech_engine.synthesize_speech(
                    test_text,
                    output_path,
                    engine='pyttsx3' if PYTTSX3_AVAILABLE else 'gtts'
                )

                # An empty audio_file is how the engine reports a failure.
                if result.audio_file:
                    print(f"Speech synthesized: {result.audio_file}")
                    print(f"Synthesis time: {result.synthesis_time:.2f}s")
                else:
                    print("Speech synthesis failed; see the log output for details")
            finally:
                # Clean up test file
                try:
                    os.remove(output_path)
                except OSError:
                    # Nothing was written, or the file is already gone.
                    pass
        else:
            print("No TTS engines available for testing")
    except Exception as e:
        print(f"TTS test failed: {str(e)}")

    # Test 4: Direct speech (if available)
    print("\n4. Testing direct speech...")
    if PYTTSX3_AVAILABLE:
        try:
            success = await speech_engine.speak_text("Testing direct speech output.")
            print(f"Direct speech test: {'Success' if success else 'Failed'}")
        except Exception as e:
            print(f"Direct speech test failed: {str(e)}")
    else:
        print("Direct speech not available")

    # Test 5: Microphone transcription (if available)
    if SPEECH_RECOGNITION_AVAILABLE and AUDIO_AVAILABLE:
        print("\n5. Testing microphone transcription...")
        print("Note: This would require microphone access in a real environment")
        # result = await speech_engine.transcribe_microphone(duration=3.0)
        # print(f"Transcribed: {result.text}")
        # print(f"Confidence: {result.confidence}")

    # Test 6: Processing statistics
    print("\n6. Processing statistics:")
    stats = speech_engine.get_processing_stats()
    for key, value in stats.items():
        print(f"{key}: {value}")

    print("\n=== Speech Recognition Engine Demo Complete ===")

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main()) 