#!/usr/bin/env python3
"""
Advanced Data Processing for TuskLang Python SDK
================================================
High-performance data processing, ETL, and transformation

This module provides advanced data processing capabilities for the TuskLang Python SDK,
including ETL operations, data transformation, streaming processing, and high-performance
data manipulation.
"""

import asyncio
import csv
import inspect
import json
import logging
import multiprocessing as mp
import queue
import threading
import time
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from dataclasses import dataclass, asdict
from datetime import datetime, timedelta
from enum import Enum
from functools import partial
from typing import Any, Dict, List, Optional, Callable, Union, Tuple, Iterator

import aiofiles
import dask.dataframe as dd
import numpy as np
import pandas as pd
import polars as pl
import vaex


class DataFormat(Enum):
    """Data format enumeration.

    Each member's value is the lowercase string that ``load_data`` and
    ``save_data`` accept as ``format_type`` (they call
    ``DataFormat(format_type.lower())``).
    """
    # Formats with a loader/saver in AdvancedDataProcessing.
    CSV = "csv"
    JSON = "json"
    PARQUET = "parquet"
    # Declared, but no loader/saver branch in this module handles them.
    AVRO = "avro"
    XML = "xml"
    YAML = "yaml"
    # Handled by the Excel loader/saver (.xlsx / .xls in "auto" mode).
    EXCEL = "excel"
    # Declared, but no loader/saver branch in this module handles it.
    CUSTOM = "custom"


class ProcessingMode(Enum):
    """Processing mode enumeration.

    NOTE(review): no code in the visible module reads this enum; the members
    presumably label how a caller intends data to be processed -- confirm
    against external users.
    """
    BATCH = "batch"
    STREAMING = "streaming"
    REAL_TIME = "real_time"
    PARALLEL = "parallel"


@dataclass
class DataSchema:
    """Data schema structure.

    Describes the columns of a dataset so loaded frames can be coerced to
    the declared types.
    """
    # Schema identifier; used as the key when the schema is registered.
    name: str
    # One dict per column. The schema-application code reads the "name" and
    # "type" keys; recognised types are "int", "float", "datetime", "string".
    fields: List[Dict[str, Any]]
    # Not read by the code in this module.
    primary_key: Optional[str] = None
    # Defaults to None (not an empty list); not read by the code in this module.
    indexes: Optional[List[str]] = None
    # Defaults to None (not an empty dict); not read by the code in this module.
    constraints: Optional[Dict[str, Any]] = None


@dataclass
class ProcessingJob:
    """Processing job structure.

    Records what a load -> transform -> save job should do and how far it
    has progressed.
    """
    # Unique identifier, generated when the job is created.
    job_id: str
    # Human-readable label.
    name: str
    # File the job reads its input from.
    source_path: str
    # File the job writes its output to.
    target_path: str
    # Transformation specs; each dict carries a "type" and optional "params".
    transformations: List[Dict[str, Any]]
    # Lifecycle state; this module sets "pending", "running", "completed"
    # and "failed".
    status: str
    # Fraction of work done; this module sets 0.0, 0.2, 0.8 and 1.0.
    progress: float
    # Timestamp taken when the job object is created.
    created_at: datetime
    # Set when processing begins; None until then.
    started_at: Optional[datetime] = None
    # Set when processing finishes successfully; None otherwise.
    completed_at: Optional[datetime] = None


class AdvancedDataProcessing:
    """Advanced data processing system for TuskLang.

    Provides file loading/saving for CSV, JSON, Parquet and Excel, a
    declarative transformation pipeline, schema registration, a background
    job queue, and entry points to the stream and parallel processors.

    Constructing an instance starts two daemon threads: one that runs queued
    jobs and one that evicts expired cache entries.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Create the processor and start its background threads.

        Args:
            config: Optional settings dictionary, stored as ``self.config``
                (an empty dict when omitted).
        """
        self.config = config or {}
        self.logger = logging.getLogger('tusklang.dataprocessing')

        # Registries and caches
        self.data_processors = {}
        self.processing_jobs = {}
        self.data_schemas = {}
        # Maps key -> dict; entries with an "expires_at" epoch timestamp are
        # evicted by the cleanup thread once that time has passed.
        self.cache = {}

        # Processing components
        self.etl_engine = ETLEngine()
        self.stream_processor = StreamProcessor()
        self.data_transformer = DataTransformer()
        self.parallel_processor = ParallelProcessor()

        self.processing_active = True
        # Jobs are produced by arbitrary caller threads and consumed by the
        # job-processor thread, so a thread-safe queue is required; an
        # asyncio.Queue would need a running event loop on the producer side.
        self.job_queue = queue.Queue()

        self._start_background_processes()

    def _start_background_processes(self):
        """Start the job-processor and cache-cleanup daemon threads."""
        self.job_processor_thread = threading.Thread(target=self._job_processor_loop, daemon=True)
        self.job_processor_thread.start()

        self.cache_cleanup_thread = threading.Thread(target=self._cache_cleanup_loop, daemon=True)
        self.cache_cleanup_thread.start()

    def _resolve_format(self, file_path: str, format_type: str) -> Optional[DataFormat]:
        """Work out which ``DataFormat`` applies to a load/save request.

        Args:
            file_path: Path whose extension is inspected in "auto" mode.
            format_type: "auto" or a ``DataFormat`` value (case-insensitive).

        Returns:
            The resolved format, or ``None`` when "auto" was requested and
            the extension is not recognised.

        Raises:
            ValueError: If ``format_type`` is not "auto" and is not a valid
                ``DataFormat`` value.
        """
        requested = format_type.lower()
        if requested != "auto":
            return DataFormat(requested)

        extension_formats = {
            '.csv': DataFormat.CSV,
            '.json': DataFormat.JSON,
            '.parquet': DataFormat.PARQUET,
            '.xlsx': DataFormat.EXCEL,
            '.xls': DataFormat.EXCEL,
        }
        # Compare case-insensitively so "DATA.CSV" is detected as well.
        lowered_path = file_path.lower()
        for extension, data_format in extension_formats.items():
            if lowered_path.endswith(extension):
                return data_format
        return None

    def load_data(self, file_path: str, format_type: str = "auto", 
                  schema: Optional[DataSchema] = None) -> Union[pd.DataFrame, dd.DataFrame, pl.DataFrame]:
        """Load data from file.

        Args:
            file_path: Path of the file to read.
            format_type: "auto" (detect from the extension) or one of
                "csv", "json", "parquet", "excel".
            schema: Optional schema whose field types are applied to the
                loaded frame.

        Returns:
            The loaded pandas DataFrame.

        Raises:
            ValueError: If the format is unknown or has no loader.
            Exception: Any error raised by the underlying reader is logged
                and re-raised.
        """
        try:
            loaders = {
                DataFormat.CSV: self._load_csv,
                DataFormat.JSON: self._load_json,
                DataFormat.PARQUET: self._load_parquet,
                DataFormat.EXCEL: self._load_excel,
            }
            loader = loaders.get(self._resolve_format(file_path, format_type))
            if loader is None:
                raise ValueError(f"Unsupported file format: {file_path}")
            return loader(file_path, schema)

        except Exception as e:
            self.logger.error(f"Failed to load data from {file_path}: {e}")
            raise

    def _with_schema(self, df: pd.DataFrame, schema: Optional[DataSchema]) -> pd.DataFrame:
        """Return ``df`` after applying ``schema`` when one is given."""
        return self._apply_schema(df, schema) if schema else df

    def _load_csv(self, file_path: str, schema: Optional[DataSchema] = None) -> pd.DataFrame:
        """Load CSV data with pandas and optionally apply ``schema``."""
        return self._with_schema(pd.read_csv(file_path), schema)

    def _load_json(self, file_path: str, schema: Optional[DataSchema] = None) -> pd.DataFrame:
        """Load JSON data.

        A top-level list becomes one row per element; any other top-level
        value becomes a single-row frame.
        """
        # JSON text is UTF-8 by specification; do not rely on the platform
        # default encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        records = data if isinstance(data, list) else [data]
        return self._with_schema(pd.DataFrame(records), schema)

    def _load_parquet(self, file_path: str, schema: Optional[DataSchema] = None) -> pd.DataFrame:
        """Load Parquet data with pandas and optionally apply ``schema``."""
        return self._with_schema(pd.read_parquet(file_path), schema)

    def _load_excel(self, file_path: str, schema: Optional[DataSchema] = None) -> pd.DataFrame:
        """Load Excel data with pandas and optionally apply ``schema``."""
        return self._with_schema(pd.read_excel(file_path), schema)

    def _apply_schema(self, df: pd.DataFrame, schema: DataSchema) -> pd.DataFrame:
        """Coerce the columns of ``df`` to the types declared in ``schema``.

        Fields naming a column that is absent from ``df`` and fields with an
        unrecognised type are skipped. Values that cannot be converted to
        int/float/datetime become missing values. The frame is modified in
        place and also returned.
        """
        converters = {
            "int": lambda column: pd.to_numeric(column, errors='coerce').astype('Int64'),
            "float": lambda column: pd.to_numeric(column, errors='coerce'),
            "datetime": lambda column: pd.to_datetime(column, errors='coerce'),
            "string": lambda column: column.astype(str),
        }

        for field in schema.fields:
            field_name = field["name"]
            convert = converters.get(field["type"])
            if convert is not None and field_name in df.columns:
                df[field_name] = convert(df[field_name])

        return df

    def save_data(self, data: Union[pd.DataFrame, dd.DataFrame, pl.DataFrame], 
                  file_path: str, format_type: str = "auto") -> bool:
        """Save data to file.

        Args:
            data: pandas, dask or polars DataFrame to write.
            file_path: Destination path.
            format_type: "auto" (detect from the extension) or one of
                "csv", "json", "parquet", "excel".

        Returns:
            True on success; False if the format or data type is
            unsupported or the write failed (the error is logged).
        """
        try:
            savers = {
                DataFormat.CSV: self._save_csv,
                DataFormat.JSON: self._save_json,
                DataFormat.PARQUET: self._save_parquet,
                DataFormat.EXCEL: self._save_excel,
            }
            saver = savers.get(self._resolve_format(file_path, format_type))
            if saver is None:
                raise ValueError(f"Unsupported file format: {file_path}")
            return saver(data, file_path)

        except Exception as e:
            self.logger.error(f"Failed to save data to {file_path}: {e}")
            return False

    @staticmethod
    def _unsupported_frame(data: Any) -> TypeError:
        """Build the error raised when ``data`` is not a known frame type."""
        return TypeError(f"Unsupported data type for saving: {type(data).__name__}")

    def _save_csv(self, data: Union[pd.DataFrame, dd.DataFrame, pl.DataFrame], file_path: str) -> bool:
        """Save data as CSV.

        Raises:
            TypeError: If ``data`` is not a pandas, dask or polars frame.
        """
        if isinstance(data, pd.DataFrame):
            data.to_csv(file_path, index=False)
        elif isinstance(data, dd.DataFrame):
            data.to_csv(file_path, single_file=True, index=False)
        elif isinstance(data, pl.DataFrame):
            data.write_csv(file_path)
        else:
            # Previously fell through and reported success without writing.
            raise self._unsupported_frame(data)
        return True

    def _save_json(self, data: Union[pd.DataFrame, dd.DataFrame, pl.DataFrame], file_path: str) -> bool:
        """Save data as JSON (records orientation for pandas and dask).

        Raises:
            TypeError: If ``data`` is not a pandas, dask or polars frame.
        """
        if isinstance(data, pd.DataFrame):
            data.to_json(file_path, orient='records', indent=2)
        elif isinstance(data, dd.DataFrame):
            data.to_json(file_path, orient='records')
        elif isinstance(data, pl.DataFrame):
            data.write_json(file_path)
        else:
            raise self._unsupported_frame(data)
        return True

    def _save_parquet(self, data: Union[pd.DataFrame, dd.DataFrame, pl.DataFrame], file_path: str) -> bool:
        """Save data as Parquet.

        Raises:
            TypeError: If ``data`` is not a pandas, dask or polars frame.
        """
        if isinstance(data, pd.DataFrame):
            data.to_parquet(file_path, index=False)
        elif isinstance(data, dd.DataFrame):
            data.to_parquet(file_path)
        elif isinstance(data, pl.DataFrame):
            data.write_parquet(file_path)
        else:
            raise self._unsupported_frame(data)
        return True

    def _save_excel(self, data: Union[pd.DataFrame, dd.DataFrame, pl.DataFrame], file_path: str) -> bool:
        """Save data as Excel.

        Raises:
            TypeError: If ``data`` is not a pandas, dask or polars frame.
        """
        if isinstance(data, pd.DataFrame):
            data.to_excel(file_path, index=False)
        elif isinstance(data, dd.DataFrame):
            # dask frames have no to_excel(); materialise to pandas first.
            data.compute().to_excel(file_path, index=False)
        elif isinstance(data, pl.DataFrame):
            data.write_excel(file_path)
        else:
            raise self._unsupported_frame(data)
        return True

    def transform_data(self, data: Union[pd.DataFrame, dd.DataFrame, pl.DataFrame],
                      transformations: List[Dict[str, Any]]) -> Union[pd.DataFrame, dd.DataFrame, pl.DataFrame]:
        """Apply transformations to data.

        Args:
            data: Frame to transform; it is copied first when it offers a
                ``copy`` method, so the caller's object is left untouched.
            transformations: Ordered specs, each with a "type" ("filter",
                "select", "groupby", "aggregate", "join", "sort", "custom")
                and an optional "params" dict passed to the handler.

        Returns:
            The result of the last transformation. Unknown types are logged
            as warnings and skipped.

        Raises:
            Exception: Any handler error is logged and re-raised.
        """
        # Transformation type -> DataTransformer method name; resolved lazily
        # so only handlers that are actually requested are looked up.
        handler_names = {
            "filter": "filter_data",
            "select": "select_columns",
            "groupby": "group_data",
            "aggregate": "aggregate_data",
            "join": "join_data",
            "sort": "sort_data",
            "custom": "apply_custom_transform",
        }
        try:
            result = data.copy() if hasattr(data, 'copy') else data

            for transformation in transformations:
                transform_type = transformation.get("type")
                params = transformation.get("params", {})

                handler_name = handler_names.get(transform_type)
                if handler_name is None:
                    self.logger.warning(f"Unknown transformation type: {transform_type}")
                    continue
                result = getattr(self.data_transformer, handler_name)(result, params)

            return result

        except Exception as e:
            self.logger.error(f"Data transformation error: {e}")
            raise

    def create_processing_job(self, name: str, source_path: str, target_path: str,
                             transformations: List[Dict[str, Any]]) -> str:
        """Create a new data processing job and queue it for execution.

        Safe to call from synchronous code: the job is handed to the
        background thread through a thread-safe queue.

        Returns:
            The new job's id, ``job_<epoch-seconds>``; a numeric suffix is
            appended if that id is already taken.
        """
        base_id = f"job_{int(time.time())}"
        job_id = base_id
        suffix = 1
        # Two jobs created within the same second must not overwrite each other.
        while job_id in self.processing_jobs:
            job_id = f"{base_id}_{suffix}"
            suffix += 1

        job = ProcessingJob(
            job_id=job_id,
            name=name,
            source_path=source_path,
            target_path=target_path,
            transformations=transformations,
            status="pending",
            progress=0.0,
            created_at=datetime.now()
        )

        self.processing_jobs[job_id] = job
        self.job_queue.put(job)

        self.logger.info(f"Created processing job: {job_id}")
        return job_id

    def get_job_status(self, job_id: str) -> Optional[Dict[str, Any]]:
        """Get job status.

        Returns:
            A dict with id, name, status, progress and ISO-formatted
            timestamps, or None when ``job_id`` is unknown.
        """
        job = self.processing_jobs.get(job_id)
        if job is None:
            return None

        return {
            "job_id": job.job_id,
            "name": job.name,
            "status": job.status,
            "progress": job.progress,
            "created_at": job.created_at.isoformat(),
            "started_at": job.started_at.isoformat() if job.started_at else None,
            "completed_at": job.completed_at.isoformat() if job.completed_at else None
        }

    def list_jobs(self) -> List[Dict[str, Any]]:
        """List the status dict of every known processing job."""
        # Snapshot the keys so jobs added concurrently cannot break iteration.
        return [self.get_job_status(job_id) for job_id in list(self.processing_jobs)]

    async def process_stream(self, data_stream: Iterator[Any], 
                           transformations: List[Dict[str, Any]],
                           output_stream: Optional[Callable] = None) -> bool:
        """Process streaming data via the stream processor.

        Returns:
            The stream processor's result, or False if it raised (the error
            is logged).
        """
        try:
            return await self.stream_processor.process_stream(
                data_stream, transformations, output_stream
            )
        except Exception as e:
            self.logger.error(f"Stream processing error: {e}")
            return False

    def process_parallel(self, data: Union[pd.DataFrame, List[Any]], 
                        transformations: List[Dict[str, Any]],
                        num_workers: Optional[int] = None) -> Union[pd.DataFrame, List[Any]]:
        """Process data in parallel via the parallel processor.

        Args:
            data: DataFrame or list to process.
            transformations: Transformation specs forwarded to the workers.
            num_workers: Worker count; defaults to the CPU count.

        Raises:
            Exception: Any processing error is logged and re-raised.
        """
        try:
            if num_workers is None:
                num_workers = mp.cpu_count()

            return self.parallel_processor.process_parallel(
                data, transformations, num_workers
            )
        except Exception as e:
            self.logger.error(f"Parallel processing error: {e}")
            raise

    def register_schema(self, schema: DataSchema) -> bool:
        """Register a data schema under its name, replacing any previous one.

        Returns:
            True on success, False if registration raised (the error is logged).
        """
        try:
            self.data_schemas[schema.name] = schema
            self.logger.info(f"Registered schema: {schema.name}")
            return True
        except Exception as e:
            self.logger.error(f"Failed to register schema {schema.name}: {e}")
            return False

    def get_schema(self, schema_name: str) -> Optional[DataSchema]:
        """Return the registered schema named ``schema_name``, or None."""
        return self.data_schemas.get(schema_name)

    def list_schemas(self) -> List[str]:
        """List the names of all registered schemas."""
        return list(self.data_schemas.keys())

    def _job_processor_loop(self):
        """Background loop: take jobs off the queue and run them one by one.

        Runs until ``processing_active`` becomes false. Unexpected errors
        are logged and followed by a five-second pause.
        """
        while self.processing_active:
            try:
                try:
                    # Block briefly so the active flag is re-checked about
                    # once per second even when no jobs arrive.
                    job = self.job_queue.get(timeout=1)
                except queue.Empty:
                    continue

                try:
                    # _process_job is a coroutine; run it to completion on a
                    # fresh event loop owned by this thread.
                    asyncio.run(self._process_job(job))
                finally:
                    self.job_queue.task_done()

            except Exception as e:
                self.logger.error(f"Job processor error: {e}")
                time.sleep(5)

    async def _process_job(self, job: ProcessingJob):
        """Process a single job: load, transform, save.

        Updates ``job.status``/``job.progress`` as it goes. Any error,
        including a failed save, marks the job "failed" and is logged
        rather than raised.
        """
        try:
            job.status = "running"
            job.started_at = datetime.now()

            data = self.load_data(job.source_path)
            job.progress = 0.2

            transformed_data = self.transform_data(data, job.transformations)
            job.progress = 0.8

            # save_data reports failure through its return value instead of
            # raising; without this check a failed write looked like success.
            if not self.save_data(transformed_data, job.target_path):
                raise RuntimeError(f"could not save output to {job.target_path}")
            job.progress = 1.0

            job.status = "completed"
            job.completed_at = datetime.now()

            self.logger.info(f"Completed job: {job.job_id}")

        except Exception as e:
            job.status = "failed"
            self.logger.error(f"Job {job.job_id} failed: {e}")

    def _evict_expired_cache(self) -> int:
        """Remove cache entries whose "expires_at" time has passed.

        Entries without an "expires_at" key are kept.

        Returns:
            The number of entries removed.
        """
        current_time = time.time()
        # Iterate over a snapshot: other threads may add or remove entries.
        expired_keys = [
            key for key, item in list(self.cache.items())
            if current_time > item.get("expires_at", float("inf"))
        ]

        for key in expired_keys:
            # pop() tolerates a key that was removed concurrently.
            self.cache.pop(key, None)

        return len(expired_keys)

    def _cache_cleanup_loop(self):
        """Background loop: evict expired cache entries every five minutes.

        Runs until ``processing_active`` becomes false. Errors are logged
        and followed by a ten-minute pause.
        """
        while self.processing_active:
            try:
                self._evict_expired_cache()
                time.sleep(300)  # Cleanup every 5 minutes

            except Exception as e:
                self.logger.error(f"Cache cleanup error: {e}")
                time.sleep(600)


class ETLEngine:
    """Placeholder ETL (Extract, Transform, Load) engine.

    None of the three stages is implemented yet: extraction yields an empty
    frame, transformation is the identity, and loading always reports
    success.
    """

    def __init__(self):
        self.logger = logging.getLogger('tusklang.dataprocessing.etl')

    def extract(self, source: str, format_type: str = "auto") -> pd.DataFrame:
        """Extract data from ``source``; currently returns an empty DataFrame."""
        # Extraction logic is still to be implemented.
        extracted = pd.DataFrame()
        return extracted

    def transform(self, data: pd.DataFrame, transformations: List[Dict[str, Any]]) -> pd.DataFrame:
        """Transform ``data``; currently returns the same object unchanged."""
        # Transformation logic is still to be implemented.
        untouched = data
        return untouched

    def load(self, data: pd.DataFrame, target: str, format_type: str = "auto") -> bool:
        """Load ``data`` into ``target``; currently writes nothing and returns True."""
        # Loading logic is still to be implemented.
        succeeded = True
        return succeeded


class StreamProcessor:
    """Stream data processor.

    Pulls chunks from an iterator, passes each through the (currently
    no-op) transformation step and forwards it to an optional sink.
    """

    def __init__(self):
        self.logger = logging.getLogger('tusklang.dataprocessing.stream')

    async def process_stream(self, data_stream: Iterator[Any], 
                           transformations: List[Dict[str, Any]],
                           output_stream: Optional[Callable] = None) -> bool:
        """Process streaming data.

        Args:
            data_stream: Iterable yielding data chunks.
            transformations: Transformation specs for each chunk.
            output_stream: Optional sink called with every transformed
                chunk. It may be a plain function or a coroutine function;
                an awaitable result is awaited.

        Returns:
            True when the whole stream was consumed, False if any step
            raised (the error is logged).
        """
        try:
            for data_chunk in data_stream:
                transformed_chunk = self._apply_transformations(data_chunk, transformations)

                if output_stream:
                    outcome = output_stream(transformed_chunk)
                    # Only await when the sink returned an awaitable;
                    # awaiting a plain function's return value raises TypeError.
                    if inspect.isawaitable(outcome):
                        await outcome

            return True

        except Exception as e:
            self.logger.error(f"Stream processing error: {e}")
            return False

    def _apply_transformations(self, data_chunk: Any, transformations: List[Dict[str, Any]]) -> Any:
        """Apply transformations to a chunk; currently returns it unchanged."""
        # Simplified transformation application
        return data_chunk


class DataTransformer:
    """Data transformation utilities.

    Every method takes the data plus a ``params`` dict and returns either
    the transformed result or, when the required parameter is missing or the
    data object lacks the needed method, the input unchanged.
    """

    def __init__(self):
        self.logger = logging.getLogger('tusklang.dataprocessing.transformer')

    def filter_data(self, data: Union[pd.DataFrame, dd.DataFrame, pl.DataFrame], 
                   params: Dict[str, Any]) -> Union[pd.DataFrame, dd.DataFrame, pl.DataFrame]:
        """Keep the rows matching ``params["condition"]`` via ``data.query``."""
        expression = params.get("condition")
        if not expression or not hasattr(data, 'query'):
            return data
        return data.query(expression)

    def select_columns(self, data: Union[pd.DataFrame, dd.DataFrame, pl.DataFrame], 
                      params: Dict[str, Any]) -> Union[pd.DataFrame, dd.DataFrame, pl.DataFrame]:
        """Restrict the data to ``params["columns"]`` via ``data.loc``."""
        wanted = params.get("columns", [])
        if not wanted or not hasattr(data, 'loc'):
            return data
        return data.loc[:, wanted]

    def group_data(self, data: Union[pd.DataFrame, dd.DataFrame, pl.DataFrame], 
                  params: Dict[str, Any]) -> Union[pd.DataFrame, dd.DataFrame, pl.DataFrame]:
        """Group by ``params["group_by"]``; returns the ``groupby`` result."""
        keys = params.get("group_by", [])
        if not keys or not hasattr(data, 'groupby'):
            return data
        return data.groupby(keys)

    def aggregate_data(self, data: Union[pd.DataFrame, dd.DataFrame, pl.DataFrame], 
                      params: Dict[str, Any]) -> Union[pd.DataFrame, dd.DataFrame, pl.DataFrame]:
        """Aggregate using ``params["aggregations"]`` via ``data.agg``."""
        spec = params.get("aggregations", {})
        if not spec or not hasattr(data, 'agg'):
            return data
        return data.agg(spec)

    def join_data(self, data: Union[pd.DataFrame, dd.DataFrame, pl.DataFrame], 
                 params: Dict[str, Any]) -> Union[pd.DataFrame, dd.DataFrame, pl.DataFrame]:
        """Merge with ``params["other_data"]`` on ``params["on"]``.

        ``params["join_type"]`` selects the merge kind (default "inner").
        """
        right = params.get("other_data")
        how = params.get("join_type", "inner")
        key = params.get("on")

        if right is None or not hasattr(data, 'merge'):
            return data
        return data.merge(right, on=key, how=how)

    def sort_data(self, data: Union[pd.DataFrame, dd.DataFrame, pl.DataFrame], 
                 params: Dict[str, Any]) -> Union[pd.DataFrame, dd.DataFrame, pl.DataFrame]:
        """Sort by ``params["sort_by"]``; ``params["ascending"]`` defaults to True."""
        order_by = params.get("sort_by", [])
        rising = params.get("ascending", True)

        if not order_by or not hasattr(data, 'sort_values'):
            return data
        return data.sort_values(by=order_by, ascending=rising)

    def apply_custom_transform(self, data: Union[pd.DataFrame, dd.DataFrame, pl.DataFrame], 
                             params: Dict[str, Any]) -> Union[pd.DataFrame, dd.DataFrame, pl.DataFrame]:
        """Return ``params["function"](data)`` when that value is callable."""
        transform = params.get("function")
        if not transform or not callable(transform):
            return data
        return transform(data)


class ParallelProcessor:
    """Parallel data processor.

    Splits a DataFrame or list into chunks, hands each chunk to a worker
    process and stitches the per-chunk results back together.
    """

    def __init__(self):
        self.logger = logging.getLogger('tusklang.dataprocessing.parallel')

    def process_parallel(self, data: Union[pd.DataFrame, List[Any]], 
                        transformations: List[Dict[str, Any]],
                        num_workers: int) -> Union[pd.DataFrame, List[Any]]:
        """Process data in parallel.

        Args:
            data: DataFrame or list to process; other objects are handled
                as a single chunk.
            transformations: Transformation specs passed to each worker.
            num_workers: Size of the process pool and the maximum number of
                chunks.

        Returns:
            The combined per-chunk results (see ``_combine_results``).

        Raises:
            Exception: Any splitting, worker or combining error is logged
                and re-raised.
        """
        try:
            # Split before creating the pool so a bad input fails without
            # spawning worker processes.
            chunks = self._split_data(data, num_workers)

            with ProcessPoolExecutor(max_workers=num_workers) as executor:
                futures = [
                    executor.submit(self._process_chunk, chunk, transformations)
                    for chunk in chunks
                ]
                results = [future.result() for future in futures]

            return self._combine_results(results)

        except Exception as e:
            self.logger.error(f"Parallel processing error: {e}")
            raise

    def _split_data(self, data: Union[pd.DataFrame, List[Any]], num_chunks: int) -> List[Any]:
        """Split data into at most ``num_chunks`` contiguous chunks.

        Empty inputs and objects that are neither a DataFrame nor a list
        come back as a single chunk, so the result is never empty.
        """
        if not isinstance(data, (pd.DataFrame, list)):
            return [data]

        total = len(data)
        if total == 0:
            return [data]

        # Ceiling division keeps the chunk count <= num_chunks and the chunk
        # size >= 1; floor division produced a zero step (ValueError from
        # range) whenever there were fewer items than chunks.
        chunk_count = max(1, num_chunks)
        chunk_size = (total + chunk_count - 1) // chunk_count
        return [data[start:start + chunk_size] for start in range(0, total, chunk_size)]

    def _process_chunk(self, chunk: Any, transformations: List[Dict[str, Any]]) -> Any:
        """Process a single data chunk.

        Transformation handling is still a placeholder: the chunk is
        returned unchanged regardless of ``transformations``.
        """
        result = chunk
        for transformation in transformations:
            # Simplified transformation application
            pass
        return result

    def _combine_results(self, results: List[Any]) -> Union[pd.DataFrame, List[Any]]:
        """Combine parallel processing results.

        DataFrame chunks are concatenated with a fresh index, list chunks
        are flattened, and anything else is returned as the list of results.
        An empty ``results`` yields an empty list.
        """
        if not results:
            # No first element exists to inspect for the result type.
            return []

        first = results[0]
        if isinstance(first, pd.DataFrame):
            return pd.concat(results, ignore_index=True)
        if isinstance(first, list):
            return [item for sublist in results for item in sublist]
        return results


# Global data processing instance shared by the module-level helper functions
# below. NOTE: it is constructed at import time, and the constructor starts
# the job-processor and cache-cleanup daemon threads.
advanced_data_processing = AdvancedDataProcessing()


def load_data(file_path: str, format_type: str = "auto", 
              schema: Optional[DataSchema] = None) -> Union[pd.DataFrame, dd.DataFrame, pl.DataFrame]:
    """Load a data file through the shared module-level processor."""
    loaded = advanced_data_processing.load_data(file_path, format_type, schema)
    return loaded


def save_data(data: Union[pd.DataFrame, dd.DataFrame, pl.DataFrame], 
              file_path: str, format_type: str = "auto") -> bool:
    """Write ``data`` to ``file_path`` through the shared module-level processor."""
    saved = advanced_data_processing.save_data(data, file_path, format_type)
    return saved


def transform_data(data: Union[pd.DataFrame, dd.DataFrame, pl.DataFrame],
                  transformations: List[Dict[str, Any]]) -> Union[pd.DataFrame, dd.DataFrame, pl.DataFrame]:
    """Run ``transformations`` over ``data`` using the shared module-level processor."""
    transformed = advanced_data_processing.transform_data(data, transformations)
    return transformed


def create_processing_job(name: str, source_path: str, target_path: str,
                         transformations: List[Dict[str, Any]]) -> str:
    """Register a new processing job with the shared processor and return its id."""
    job_id = advanced_data_processing.create_processing_job(
        name, source_path, target_path, transformations
    )
    return job_id


def get_job_status(job_id: str) -> Optional[Dict[str, Any]]:
    """Look up the status of ``job_id`` on the shared module-level processor."""
    status = advanced_data_processing.get_job_status(job_id)
    return status


def list_processing_jobs() -> List[Dict[str, Any]]:
    """Return the shared processor's job listing."""
    jobs = advanced_data_processing.list_jobs()
    return jobs


async def process_stream(data_stream: Iterator[Any], 
                        transformations: List[Dict[str, Any]],
                        output_stream: Optional[Callable] = None) -> bool:
    """Feed ``data_stream`` through the shared processor's stream pipeline."""
    outcome = await advanced_data_processing.process_stream(
        data_stream, transformations, output_stream
    )
    return outcome


def process_parallel(data: Union[pd.DataFrame, List[Any]], 
                    transformations: List[Dict[str, Any]],
                    num_workers: int = None) -> Union[pd.DataFrame, List[Any]]:
    """Run ``transformations`` over ``data`` in parallel via the shared processor."""
    combined = advanced_data_processing.process_parallel(
        data, transformations, num_workers
    )
    return combined


def register_schema(schema: DataSchema) -> bool:
    """Register ``schema`` with the shared module-level processor."""
    registered = advanced_data_processing.register_schema(schema)
    return registered


def get_schema(schema_name: str) -> Optional[DataSchema]:
    """Fetch the schema called ``schema_name`` from the shared processor."""
    found = advanced_data_processing.get_schema(schema_name)
    return found


def list_schemas() -> List[str]:
    """Return the names of the schemas held by the shared processor."""
    names = advanced_data_processing.list_schemas()
    return names


# Manual smoke test: exercises saving/loading, transformations, job creation
# and schema registration through the module-level helpers, printing the
# outcome of each step.
if __name__ == "__main__":
    print("Advanced Data Processing for TuskLang Python SDK")
    print("=" * 50)
    
    # Section 1: round-trip a small frame through CSV
    print("\n1. Testing Data Loading:")
    
    # Five-row in-memory sample used by every step below
    sample_data = pd.DataFrame({
        'id': [1, 2, 3, 4, 5],
        'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
        'age': [25, 30, 35, 40, 45],
        'salary': [50000, 60000, 70000, 80000, 90000]
    })
    
    # Relative path, so the file is written to the current working directory
    save_data(sample_data, "sample_data.csv")
    print("  Created sample data file")
    
    # Read the file back; the format is detected from the .csv extension
    loaded_data = load_data("sample_data.csv")
    print(f"  Loaded data shape: {loaded_data.shape}")
    
    # Section 2: filter -> select -> sort pipeline
    print("\n2. Testing Data Transformations:")
    
    transformations = [
        {"type": "filter", "params": {"condition": "age > 30"}},
        {"type": "select", "params": {"columns": ["name", "age", "salary"]}},
        {"type": "sort", "params": {"sort_by": ["salary"], "ascending": False}}
    ]
    
    transformed_data = transform_data(loaded_data, transformations)
    print(f"  Transformed data shape: {transformed_data.shape}")
    print(f"  Transformed data:\n{transformed_data}")
    
    # Section 3: queue the same pipeline as a background job
    print("\n3. Testing Processing Job:")
    
    job_id = create_processing_job(
        "Sample Job",
        "sample_data.csv",
        "processed_data.csv",
        transformations
    )
    
    print(f"  Created job: {job_id}")
    
    # The status is read immediately after creation, so the job may not have
    # been picked up by the background thread yet
    status = get_job_status(job_id)
    print(f"  Job status: {status}")
    
    # Count every job known to the shared processor
    jobs = list_processing_jobs()
    print(f"  Total jobs: {len(jobs)}")
    
    # Section 4: register a schema describing the sample columns
    print("\n4. Testing Schema Registration:")
    
    schema = DataSchema(
        name="employee_schema",
        fields=[
            {"name": "id", "type": "int"},
            {"name": "name", "type": "string"},
            {"name": "age", "type": "int"},
            {"name": "salary", "type": "float"}
        ],
        primary_key="id"
    )
    
    registered = register_schema(schema)
    print(f"  Schema registered: {registered}")
    
    # Show the names of all registered schemas
    schemas = list_schemas()
    print(f"  Available schemas: {schemas}")
    
    print("\nAdvanced data processing testing completed!") 