"""
TuskLang Python SDK - Data Governance & Privacy Engine
Production-quality data governance with lineage tracking, privacy controls, and regulatory compliance
"""

import asyncio
import json
import logging
import uuid
import hashlib
import secrets
from datetime import datetime, timedelta, timezone
from typing import Dict, List, Optional, Any, Union, Tuple, Set
from dataclasses import dataclass, field
from enum import Enum
import threading
import re
from pathlib import Path

try:
    import networkx as nx
    NETWORKX_AVAILABLE = True
except ImportError:
    NETWORKX_AVAILABLE = False

try:
    import sqlite3
    SQLITE_AVAILABLE = True
except ImportError:
    SQLITE_AVAILABLE = False


class DataClassification(Enum):
    """Classification label stored on a data asset (``DataAsset.classification``).

    Members are declared from PUBLIC through TOP_SECRET. The enum itself
    defines no ordering or comparison between levels; only equality and
    membership tests are meaningful.
    """
    PUBLIC = "public"
    INTERNAL = "internal"  # default used by register_data_asset
    CONFIDENTIAL = "confidential"
    RESTRICTED = "restricted"
    TOP_SECRET = "top_secret"


class PrivacyRegulation(Enum):
    """Privacy regulations that assets, consents, retention policies and
    impact assessments in this module can be tagged with.

    Each value is the lowercase form of the member name.
    """
    GDPR = "gdpr"  # default regulation for ConsentRecord and for new PIAs
    CCPA = "ccpa"
    PIPEDA = "pipeda"
    LGPD = "lgpd"
    PDPA = "pdpa"
    HIPAA = "hipaa"


class DataCategory(Enum):
    """Kinds of data an asset, consent record, retention policy or impact
    assessment can refer to.

    Unlike the other enums in this module, the values are descriptive
    strings rather than the lowercase member name.
    """
    PII = "personal_identifiable_information"
    PHI = "protected_health_information"
    FINANCIAL = "financial_data"
    BIOMETRIC = "biometric_data"
    BEHAVIORAL = "behavioral_data"
    LOCATION = "location_data"
    COMMUNICATION = "communication_data"
    TECHNICAL = "technical_data"  # fallback category when nothing else matches


class ConsentStatus(Enum):
    """State of a ``ConsentRecord``.

    NOTE(review): in the code visible in this section only GRANTED (on
    creation) and WITHDRAWN (on withdrawal) are ever assigned; confirm where
    PENDING, EXPIRED and NOT_REQUIRED are produced.
    """
    GRANTED = "granted"
    WITHDRAWN = "withdrawn"
    PENDING = "pending"
    EXPIRED = "expired"
    NOT_REQUIRED = "not_required"


class RetentionAction(Enum):
    """Action a ``RetentionPolicy`` names for assets it applies to.

    The built-in default policies use DELETE and ARCHIVE.
    """
    DELETE = "delete"
    ANONYMIZE = "anonymize"
    ARCHIVE = "archive"
    REVIEW = "review"
    EXTEND = "extend"


@dataclass
class DataAsset:
    """Catalog entry describing one data asset and its governance metadata.

    The first ten fields (``asset_id`` through ``steward``) are required and
    positional; every later field has a default. Mutable defaults are built
    per instance via ``default_factory``.
    """
    asset_id: str
    name: str
    description: str
    data_type: str
    classification: DataClassification
    categories: List[DataCategory]
    
    # Location and ownership
    location: str
    system: str
    owner: str
    steward: str
    
    # Privacy and compliance
    contains_pii: bool = False
    applicable_regulations: List[PrivacyRegulation] = field(default_factory=list)
    retention_period_days: int = 2555  # 7 * 365 days, i.e. a 7-year default
    
    # Lineage: ids of neighbouring assets, appended to by track_data_lineage
    upstream_assets: List[str] = field(default_factory=list)
    downstream_assets: List[str] = field(default_factory=list)
    
    # Quality and usage
    quality_score: float = 0.0
    last_accessed: Optional[datetime] = None
    access_frequency: int = 0
    
    # Metadata
    schema: Dict[str, Any] = field(default_factory=dict)
    tags: List[str] = field(default_factory=list)
    custom_attributes: Dict[str, Any] = field(default_factory=dict)
    
    # Audit: datetime.utcnow() yields naive datetimes expressed in UTC
    created_at: datetime = field(default_factory=datetime.utcnow)
    updated_at: datetime = field(default_factory=datetime.utcnow)


@dataclass
class DataLineage:
    """One directed lineage link: ``source_asset_id`` feeds ``target_asset_id``.

    ``lineage_id``, both asset ids and ``transformation_type`` are required;
    the remaining fields are optional descriptive metadata.
    """
    lineage_id: str
    source_asset_id: str
    target_asset_id: str
    transformation_type: str
    transformation_logic: str = ""
    
    # Processing details
    process_name: str = ""
    process_version: str = ""
    execution_time: Optional[datetime] = None  # set to the current UTC time by track_data_lineage
    
    # Impact analysis
    impact_score: float = 0.5  # NOTE(review): scale/range is not defined in this section
    business_context: str = ""
    
    # Naive datetime expressed in UTC (datetime.utcnow)
    created_at: datetime = field(default_factory=datetime.utcnow)


@dataclass
class ConsentRecord:
    """A data subject's consent for one processing purpose.

    ``consent_id``, ``data_subject_id``, ``purpose``, ``legal_basis`` and
    ``status`` are required; everything else has a default.
    """
    consent_id: str
    data_subject_id: str
    purpose: str
    legal_basis: str
    status: ConsentStatus
    
    # Consent details (naive UTC datetimes when set by the engine)
    granted_at: Optional[datetime] = None
    withdrawn_at: Optional[datetime] = None
    expires_at: Optional[datetime] = None  # None means no expiry was recorded
    
    # Scope
    data_categories: List[DataCategory] = field(default_factory=list)
    processing_activities: List[str] = field(default_factory=list)
    
    # Compliance
    regulation: PrivacyRegulation = PrivacyRegulation.GDPR
    consent_method: str = "explicit"
    proof_of_consent: str = ""
    
    # Audit: naive datetimes expressed in UTC (datetime.utcnow)
    created_at: datetime = field(default_factory=datetime.utcnow)
    updated_at: datetime = field(default_factory=datetime.utcnow)


@dataclass
class RetentionPolicy:
    """Rule describing how long matching assets are kept and what happens next.

    Required fields run from ``policy_id`` through
    ``applies_to_classifications``; the engine's default policies construct
    instances with keyword arguments.
    """
    policy_id: str
    name: str
    description: str
    
    # Retention rules
    retention_period_days: int
    action: RetentionAction
    
    # Scope: an empty list for any of these means "do not filter on it"
    # when the engine selects assets for the policy
    applies_to_categories: List[DataCategory]
    applies_to_classifications: List[DataClassification]
    asset_patterns: List[str] = field(default_factory=list)  # matched as substrings of asset.location
    
    # NOTE(review): only the default "age" appears in this section; confirm other accepted values
    trigger_condition: str = "age"
    
    # Regulatory basis
    regulation: Optional[PrivacyRegulation] = None
    legal_basis: str = ""
    
    # Execution: last_executed/next_execution are updated by the retention processor
    enabled: bool = True
    last_executed: Optional[datetime] = None
    next_execution: Optional[datetime] = None  # None means the policy is due immediately
    
    # Audit: naive datetimes expressed in UTC (datetime.utcnow)
    created_at: datetime = field(default_factory=datetime.utcnow)
    updated_at: datetime = field(default_factory=datetime.utcnow)


@dataclass
class PrivacyImpactAssessment:
    """Privacy Impact Assessment (PIA) record.

    Required fields run from ``pia_id`` through ``affected_systems``. Note
    that ``status`` is required and has no default here.
    """
    pia_id: str
    name: str
    description: str
    status: str  # draft, in_review, approved, rejected
    
    # Scope
    data_categories: List[DataCategory]
    processing_activities: List[str]
    affected_systems: List[str]
    
    # Assessment
    risk_level: str = "medium"  # low, medium, high, critical
    privacy_risks: List[str] = field(default_factory=list)
    mitigation_measures: List[str] = field(default_factory=list)
    
    # Compliance
    applicable_regulations: List[PrivacyRegulation] = field(default_factory=list)
    legal_basis: str = ""
    
    # Review
    reviewer: str = ""
    review_date: Optional[datetime] = None
    approval_date: Optional[datetime] = None
    
    # Audit: naive datetimes expressed in UTC (datetime.utcnow)
    created_at: datetime = field(default_factory=datetime.utcnow)
    updated_at: datetime = field(default_factory=datetime.utcnow)


class DataGovernanceEngine:
    """Production-quality data governance with privacy and compliance automation"""
    
    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Create the engine, its in-memory registries and its background worker.

        Args:
            config: Optional settings dict. Only ``database_path`` is read
                here (default ``'governance.db'``).

        Side effects:
            Creates the SQLite schema (when ``sqlite3`` is importable), loads
            the built-in retention policies and starts the daemon thread that
            runs the retention processor.
        """
        self.config = config or {}
        self.logger = logging.getLogger(__name__)
        self.db_path = self.config.get('database_path', 'governance.db')
        
        # Data registries, keyed by the record's own id
        self.data_assets: Dict[str, DataAsset] = {}
        self.lineage_records: Dict[str, DataLineage] = {}
        self.consent_records: Dict[str, ConsentRecord] = {}
        self.retention_policies: Dict[str, RetentionPolicy] = {}
        self.privacy_assessments: Dict[str, PrivacyImpactAssessment] = {}
        
        # Lineage graph (stays None when networkx is not installed)
        self.lineage_graph = None
        if NETWORKX_AVAILABLE:
            self.lineage_graph = nx.DiGraph()
        
        # Threading: the flag must be set before the worker is started below
        self.governance_lock = threading.RLock()
        self.retention_executor = threading.Thread(target=self._retention_processor, daemon=True)
        self.retention_active = True
        
        # PII detection patterns
        self.pii_patterns = {
            # TLD class is [A-Za-z]; the former [A-Z|a-z] also accepted a literal '|'
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            # No \b before '(': a word boundary there would require a word
            # character directly in front of the parenthesis, so "(555) 123-4567"
            # at the start of a value or after a space could never match.
            'phone': r'\b\d{3}-\d{3}-\d{4}\b|\(\d{3}\)\s*\d{3}-\d{4}\b',
            'ssn': r'\b\d{3}-\d{2}-\d{4}\b',
            'credit_card': r'\b\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b',
            'ip_address': r'\b(?:\d{1,3}\.){3}\d{1,3}\b',
            'passport': r'\b[A-Z]{1,2}\d{6,9}\b'
        }
        
        # Initialize
        self._initialize_database()
        self._load_default_policies()
        self._start_retention_processor()
        
        self.logger.info("Data Governance Engine initialized successfully")

    def _initialize_database(self):
        """Create the governance tables and indexes if they do not exist yet.

        Does nothing when ``sqlite3`` could not be imported. Initialization is
        best-effort: any failure is logged and swallowed so the engine can
        still run with its in-memory registries. The connection is always
        closed, including when the schema script fails.
        """
        if not SQLITE_AVAILABLE:
            return
        
        schema_sql = """
                CREATE TABLE IF NOT EXISTS data_assets (
                    asset_id TEXT PRIMARY KEY,
                    name TEXT NOT NULL,
                    description TEXT,
                    data_type TEXT,
                    classification TEXT,
                    categories TEXT,
                    location TEXT,
                    system TEXT,
                    owner TEXT,
                    steward TEXT,
                    contains_pii INTEGER DEFAULT 0,
                    applicable_regulations TEXT,
                    retention_period_days INTEGER DEFAULT 2555,
                    quality_score REAL DEFAULT 0.0,
                    last_accessed TIMESTAMP,
                    access_frequency INTEGER DEFAULT 0,
                    schema_json TEXT,
                    tags TEXT,
                    custom_attributes TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                );
                
                CREATE TABLE IF NOT EXISTS data_lineage (
                    lineage_id TEXT PRIMARY KEY,
                    source_asset_id TEXT,
                    target_asset_id TEXT,
                    transformation_type TEXT,
                    transformation_logic TEXT,
                    process_name TEXT,
                    process_version TEXT,
                    execution_time TIMESTAMP,
                    impact_score REAL DEFAULT 0.5,
                    business_context TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                );
                
                CREATE TABLE IF NOT EXISTS consent_records (
                    consent_id TEXT PRIMARY KEY,
                    data_subject_id TEXT NOT NULL,
                    purpose TEXT NOT NULL,
                    legal_basis TEXT,
                    status TEXT NOT NULL,
                    granted_at TIMESTAMP,
                    withdrawn_at TIMESTAMP,
                    expires_at TIMESTAMP,
                    data_categories TEXT,
                    processing_activities TEXT,
                    regulation TEXT,
                    consent_method TEXT,
                    proof_of_consent TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                );
                
                CREATE TABLE IF NOT EXISTS retention_policies (
                    policy_id TEXT PRIMARY KEY,
                    name TEXT NOT NULL,
                    description TEXT,
                    applies_to_categories TEXT,
                    applies_to_classifications TEXT,
                    asset_patterns TEXT,
                    retention_period_days INTEGER,
                    action TEXT,
                    trigger_condition TEXT DEFAULT 'age',
                    regulation TEXT,
                    legal_basis TEXT,
                    enabled INTEGER DEFAULT 1,
                    last_executed TIMESTAMP,
                    next_execution TIMESTAMP,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                );
                
                CREATE TABLE IF NOT EXISTS privacy_assessments (
                    pia_id TEXT PRIMARY KEY,
                    name TEXT NOT NULL,
                    description TEXT,
                    status TEXT DEFAULT 'draft',
                    data_categories TEXT,
                    processing_activities TEXT,
                    affected_systems TEXT,
                    risk_level TEXT DEFAULT 'medium',
                    privacy_risks TEXT,
                    mitigation_measures TEXT,
                    applicable_regulations TEXT,
                    legal_basis TEXT,
                    reviewer TEXT,
                    review_date TIMESTAMP,
                    approval_date TIMESTAMP,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                );
                
                CREATE INDEX IF NOT EXISTS idx_assets_classification ON data_assets(classification);
                CREATE INDEX IF NOT EXISTS idx_assets_owner ON data_assets(owner);
                CREATE INDEX IF NOT EXISTS idx_lineage_source ON data_lineage(source_asset_id);
                CREATE INDEX IF NOT EXISTS idx_lineage_target ON data_lineage(target_asset_id);
                CREATE INDEX IF NOT EXISTS idx_consent_subject ON consent_records(data_subject_id);
                CREATE INDEX IF NOT EXISTS idx_consent_status ON consent_records(status);
            """
        
        try:
            conn = sqlite3.connect(self.db_path)
            try:
                cursor = conn.cursor()
                cursor.executescript(schema_sql)
                conn.commit()
            finally:
                # Release the file handle even when the script or commit fails
                conn.close()
            
        except Exception as e:
            # Deliberately non-fatal: persistence is optional for the engine
            self.logger.error(f"Database initialization failed: {e}")

    def _load_default_policies(self):
        """Seed the policy registry with the built-in retention policies.

        Three policies are registered under their ``policy_id``: a GDPR
        policy for PII, a HIPAA policy for PHI and a financial-records
        policy. An existing entry with the same id is replaced.
        """
        built_in_policies = (
            # GDPR: delete PII after 7 years
            RetentionPolicy(
                policy_id="gdpr_default",
                name="GDPR Default Retention",
                description="Default GDPR data retention policy",
                applies_to_categories=[DataCategory.PII],
                applies_to_classifications=[DataClassification.CONFIDENTIAL, DataClassification.RESTRICTED],
                retention_period_days=2555,
                action=RetentionAction.DELETE,
                regulation=PrivacyRegulation.GDPR,
                legal_basis="Article 5(1)(e) - storage limitation principle"
            ),
            # HIPAA: archive PHI after 6 years
            RetentionPolicy(
                policy_id="hipaa_default",
                name="HIPAA PHI Retention",
                description="HIPAA protected health information retention",
                applies_to_categories=[DataCategory.PHI],
                applies_to_classifications=[DataClassification.RESTRICTED],
                retention_period_days=2190,
                action=RetentionAction.ARCHIVE,
                regulation=PrivacyRegulation.HIPAA,
                legal_basis="45 CFR 164.530(j)(2) - minimum 6 years retention"
            ),
            # Financial records: archive after 7 years, no specific regulation
            RetentionPolicy(
                policy_id="financial_default",
                name="Financial Data Retention",
                description="Financial records retention policy",
                applies_to_categories=[DataCategory.FINANCIAL],
                applies_to_classifications=[DataClassification.CONFIDENTIAL],
                retention_period_days=2555,
                action=RetentionAction.ARCHIVE,
                legal_basis="Generally accepted accounting principles"
            ),
        )
        
        for default_policy in built_in_policies:
            self.retention_policies[default_policy.policy_id] = default_policy

    def _start_retention_processor(self):
        """Start retention policy processor"""
        if not self.retention_executor.is_alive():
            self.retention_executor.start()

    def _retention_processor(self):
        """Background retention policy processor"""
        while self.retention_active:
            try:
                asyncio.run(self._process_retention_policies())
                # Run every hour
                threading.Event().wait(3600)
            except Exception as e:
                self.logger.error(f"Retention processor error: {e}")
                threading.Event().wait(300)  # 5 minute retry

    async def register_data_asset(self, name: str, data_type: str, location: str,
                                owner: str, classification: DataClassification = DataClassification.INTERNAL,
                                description: str = "", steward: str = "",
                                schema: Optional[Dict[str, Any]] = None) -> str:
        """Register a new data asset and derive its governance metadata.

        PII presence and data categories are inferred from ``data_type`` and
        ``schema``; applicable regulations are derived from those results.

        Args:
            name: Display name of the asset.
            data_type: Free-form type description, used for categorization.
            location: Where the data lives; also used to derive ``system``.
            owner: Owner of the asset.
            classification: Classification label (defaults to INTERNAL).
            description: Optional description.
            steward: Data steward; falls back to ``owner`` when empty.
            schema: Optional schema dict; ``None`` is treated as ``{}``.

        Returns:
            The generated asset id (a UUID4 string).
        """
        
        asset_id = str(uuid.uuid4())
        schema = schema or {}
        
        # Detect PII and categorize data
        contains_pii = await self._detect_pii_in_schema(schema)
        categories = await self._categorize_data(data_type, schema)
        
        # Determine applicable regulations
        regulations = []
        if contains_pii:
            regulations.extend([PrivacyRegulation.GDPR, PrivacyRegulation.CCPA])
        if DataCategory.PHI in categories:
            regulations.append(PrivacyRegulation.HIPAA)
        if DataCategory.FINANCIAL in categories:
            regulations.extend([PrivacyRegulation.GDPR, PrivacyRegulation.CCPA])
        
        asset = DataAsset(
            asset_id=asset_id,
            name=name,
            description=description,
            data_type=data_type,
            classification=classification,
            categories=categories,
            location=location,
            system=self._extract_system_from_location(location),
            owner=owner,
            steward=steward or owner,
            contains_pii=contains_pii,
            # Order-preserving dedupe: a set's iteration order depends on the
            # per-process hash seed, which made the stored list unstable.
            applicable_regulations=list(dict.fromkeys(regulations)),
            schema=schema
        )
        
        with self.governance_lock:
            self.data_assets[asset_id] = asset
        
        # Persist to database
        await self._persist_data_asset(asset)
        
        self.logger.info(f"Registered data asset {name} ({asset_id})")
        return asset_id

    async def _detect_pii_in_schema(self, schema: Dict[str, Any]) -> bool:
        """Detect PII in data schema"""
        
        schema_text = json.dumps(schema).lower()
        
        # Check for PII patterns
        for pii_type, pattern in self.pii_patterns.items():
            if re.search(pattern, schema_text, re.IGNORECASE):
                return True
        
        # Check for PII field names
        pii_field_names = {
            'email', 'phone', 'ssn', 'social_security', 'passport', 'license',
            'first_name', 'last_name', 'full_name', 'address', 'zip', 'postal',
            'birth_date', 'dob', 'age', 'gender', 'race', 'ethnicity'
        }
        
        for field_name in schema.keys():
            if field_name.lower() in pii_field_names:
                return True
        
        return False

    async def _categorize_data(self, data_type: str, schema: Dict[str, Any]) -> List[DataCategory]:
        """Infer data categories from the type description and schema field names.

        A category is added when any of its keywords occurs as a substring of
        the lowercased ``data_type`` or of a lowercased top-level schema key.
        Categories are evaluated in the order PHI, FINANCIAL, LOCATION,
        BEHAVIORAL. If none match, PII is used when the schema looks like it
        contains PII; otherwise the result is ``[TECHNICAL]``.
        """
        
        type_text = data_type.lower()
        
        def mentions(keywords) -> bool:
            # Substring test against the data type first, then the field names
            for keyword in keywords:
                if keyword in type_text:
                    return True
                if any(keyword in column.lower() for column in schema.keys()):
                    return True
            return False
        
        keyword_rules = (
            ({'medical', 'health', 'diagnosis', 'treatment', 'patient', 'doctor'},
             DataCategory.PHI),
            ({'payment', 'transaction', 'account', 'balance', 'credit', 'bank'},
             DataCategory.FINANCIAL),
            ({'location', 'address', 'coordinates', 'gps', 'lat', 'lon', 'country'},
             DataCategory.LOCATION),
            ({'click', 'view', 'session', 'activity', 'behavior', 'preference'},
             DataCategory.BEHAVIORAL),
        )
        
        detected = [category for keywords, category in keyword_rules if mentions(keywords)]
        
        # PII is only a fallback: it is not added when a keyword rule matched
        if not detected and await self._detect_pii_in_schema(schema):
            detected.append(DataCategory.PII)
        
        # Nothing recognised at all
        if not detected:
            detected.append(DataCategory.TECHNICAL)
        
        return detected

    def _extract_system_from_location(self, location: str) -> str:
        """Extract system name from data location"""
        # Simple extraction - enhance based on location formats
        if location.startswith('db://'):
            return location.split('//')[1].split('/')[0]
        elif location.startswith('s3://'):
            return 's3'
        elif location.startswith('hdfs://'):
            return 'hadoop'
        else:
            return 'unknown'

    async def track_data_lineage(self, source_asset_id: str, target_asset_id: str,
                               transformation_type: str, transformation_logic: str = "",
                               process_name: str = "") -> str:
        """Record that ``source_asset_id`` feeds ``target_asset_id``.

        Stores a ``DataLineage`` record, links the two assets through their
        ``downstream_assets`` / ``upstream_assets`` lists (when the assets are
        registered), adds an edge to the lineage graph (when networkx is
        available) and persists the record.

        Args:
            source_asset_id: Id of the upstream asset.
            target_asset_id: Id of the downstream asset.
            transformation_type: Label for the transformation.
            transformation_logic: Optional description of the logic applied.
            process_name: Optional name of the producing process.

        Returns:
            The generated lineage id (a UUID4 string).
        """
        
        lineage_id = str(uuid.uuid4())
        
        lineage = DataLineage(
            lineage_id=lineage_id,
            source_asset_id=source_asset_id,
            target_asset_id=target_asset_id,
            transformation_type=transformation_type,
            transformation_logic=transformation_logic,
            process_name=process_name,
            execution_time=datetime.utcnow()
        )
        
        with self.governance_lock:
            self.lineage_records[lineage_id] = lineage
            
            # Update asset relationships; skip ids that are already linked so
            # repeated tracking of the same pair does not pile up duplicates
            source_asset = self.data_assets.get(source_asset_id)
            target_asset = self.data_assets.get(target_asset_id)
            
            if source_asset and target_asset_id not in source_asset.downstream_assets:
                source_asset.downstream_assets.append(target_asset_id)
            if target_asset and source_asset_id not in target_asset.upstream_assets:
                target_asset.upstream_assets.append(source_asset_id)
            
            # Update lineage graph. Compare against None explicitly: an empty
            # DiGraph is falsy, so a truthiness test would skip the very first
            # edge and the graph would stay empty forever.
            if NETWORKX_AVAILABLE and self.lineage_graph is not None:
                self.lineage_graph.add_edge(source_asset_id, target_asset_id, 
                                          lineage_id=lineage_id,
                                          transformation_type=transformation_type)
        
        await self._persist_lineage(lineage)
        
        self.logger.info(f"Tracked lineage {source_asset_id} -> {target_asset_id}")
        return lineage_id

    async def create_consent_record(self, data_subject_id: str, purpose: str,
                                  legal_basis: str, data_categories: List[DataCategory],
                                  regulation: PrivacyRegulation = PrivacyRegulation.GDPR,
                                  expires_days: Optional[int] = None) -> str:
        """Create, register and persist a granted consent record.

        Args:
            data_subject_id: Identifier of the person giving consent.
            purpose: Processing purpose the consent covers.
            legal_basis: Legal basis recorded with the consent.
            data_categories: Data categories the consent covers.
            regulation: Regulation the consent is recorded under.
            expires_days: Days until expiry, counted from the grant time.
                ``None`` means no expiry. ``0`` yields a consent that expires
                at the grant time (it is no longer treated as "no expiry").

        Returns:
            The generated consent id (a UUID4 string).
        """
        
        consent_id = str(uuid.uuid4())
        
        # One clock reading so expires_at is exactly granted_at + expires_days
        granted_at = datetime.utcnow()
        expires_at = None
        if expires_days is not None:
            expires_at = granted_at + timedelta(days=expires_days)
        
        consent = ConsentRecord(
            consent_id=consent_id,
            data_subject_id=data_subject_id,
            purpose=purpose,
            legal_basis=legal_basis,
            status=ConsentStatus.GRANTED,
            granted_at=granted_at,
            expires_at=expires_at,
            data_categories=data_categories,
            regulation=regulation
        )
        
        with self.governance_lock:
            self.consent_records[consent_id] = consent
        
        await self._persist_consent(consent)
        
        self.logger.info(f"Created consent record for {data_subject_id} ({consent_id})")
        return consent_id

    async def withdraw_consent(self, consent_id: str, withdrawal_reason: str = "") -> bool:
        """Withdraw privacy consent"""
        
        consent = self.consent_records.get(consent_id)
        if not consent:
            return False
        
        consent.status = ConsentStatus.WITHDRAWN
        consent.withdrawn_at = datetime.utcnow()
        consent.updated_at = datetime.utcnow()
        
        await self._persist_consent(consent)
        
        # Trigger data processing review for withdrawn consent
        await self._handle_consent_withdrawal(consent)
        
        self.logger.info(f"Withdrew consent {consent_id}: {withdrawal_reason}")
        return True

    async def _handle_consent_withdrawal(self, consent: ConsentRecord):
        """Handle consent withdrawal - identify and flag affected data"""
        
        # Find assets that may be affected by consent withdrawal
        affected_assets = []
        
        for asset in self.data_assets.values():
            # Check if asset contains relevant data categories
            if any(cat in asset.categories for cat in consent.data_categories):
                # Check if asset is subject to the same regulation
                if consent.regulation in asset.applicable_regulations:
                    affected_assets.append(asset.asset_id)
        
        if affected_assets:
            self.logger.warning(f"Consent withdrawal {consent.consent_id} affects {len(affected_assets)} assets")
            # In production, trigger data processing review workflow

    async def conduct_privacy_impact_assessment(self, name: str, description: str,
                                              data_categories: List[DataCategory],
                                              processing_activities: List[str],
                                              affected_systems: List[str]) -> str:
        """Create, register and persist an automatically assessed PIA.

        The risk level is "high" when PII, PHI or BIOMETRIC data is involved
        and "medium" otherwise. Privacy risks and mitigation measures are
        generated from the categories and activities. The assessment is
        always tagged with GDPR.

        Returns:
            The generated PIA id (a UUID4 string).
        """
        
        pia_id = str(uuid.uuid4())
        
        # Any sensitive category raises the level from the "medium" baseline
        sensitive_categories = {DataCategory.PII, DataCategory.PHI, DataCategory.BIOMETRIC}
        involves_sensitive = any(category in sensitive_categories for category in data_categories)
        risk_level = "high" if involves_sensitive else "medium"
        
        # Risks first: the mitigation generator takes them as input
        identified_risks = await self._assess_privacy_risks(data_categories, processing_activities)
        proposed_measures = await self._generate_mitigation_measures(data_categories, identified_risks)
        
        assessment = PrivacyImpactAssessment(
            pia_id=pia_id,
            name=name,
            description=description,
            data_categories=data_categories,
            processing_activities=processing_activities,
            affected_systems=affected_systems,
            risk_level=risk_level,
            privacy_risks=identified_risks,
            mitigation_measures=proposed_measures,
            applicable_regulations=[PrivacyRegulation.GDPR]  # Default to GDPR
        )
        
        with self.governance_lock:
            self.privacy_assessments[pia_id] = assessment
        
        await self._persist_pia(assessment)
        
        self.logger.info(f"Conducted PIA {name} ({pia_id}) - Risk Level: {risk_level}")
        return pia_id

    async def _assess_privacy_risks(self, data_categories: List[DataCategory], 
                                  processing_activities: List[str]) -> List[str]:
        """Build the list of privacy risks for the given categories and activities.

        Args:
            data_categories: Categories involved; each recognised category
                contributes two fixed risk statements.
            processing_activities: Activity names; an activity contributes a
                risk when its lowercased name contains one of the high-risk
                markers.

        Returns:
            De-duplicated risk statements in a stable order: category risks
            first (in the order of the table below), then activity risks in
            input order.
        """
        
        # Risk assessment based on data categories
        category_risks = (
            (DataCategory.PII, ("Identity theft risk from PII exposure",
                                "Profiling and discrimination risk")),
            (DataCategory.PHI, ("Medical identity theft",
                                "Health insurance discrimination")),
            (DataCategory.FINANCIAL, ("Financial fraud and identity theft",
                                      "Credit score manipulation")),
            (DataCategory.BIOMETRIC, ("Permanent identity compromise",
                                      "Unauthorized biometric authentication")),
            (DataCategory.LOCATION, ("Physical safety and stalking risks",
                                     "Movement pattern analysis")),
        )
        
        risks = []
        for category, statements in category_risks:
            if category in data_categories:
                risks.extend(statements)
        
        # Risk assessment based on processing activities
        high_risk_activities = (
            'profiling', 'automated_decision_making', 'cross_border_transfer',
            'large_scale_processing', 'behavioral_analysis'
        )
        
        for activity in processing_activities:
            lowered = activity.lower()
            if any(marker in lowered for marker in high_risk_activities):
                risks.append(f"High privacy risk from {activity}")
        
        # dict.fromkeys removes duplicates while keeping first-seen order;
        # a set here made the result order vary between processes.
        return list(dict.fromkeys(risks))

    async def _generate_mitigation_measures(self, data_categories: List[DataCategory], 
                                          risks: List[str]) -> List[str]:
        """Build the list of mitigation measures for an assessment.

        Args:
            data_categories: Categories involved; PII, PHI and FINANCIAL each
                add three category-specific measures.
            risks: Risk statements; statements mentioning "automated_decision"
                or "cross_border" (case-insensitive) add one measure each.

        Returns:
            De-duplicated measures in a stable order: the six standard
            measures, then category-specific ones, then risk-specific ones.
        """
        
        # Standard mitigation measures
        measures = [
            "Implement data encryption at rest and in transit",
            "Apply principle of data minimization",
            "Establish clear data retention policies",
            "Implement access controls and role-based permissions",
            "Conduct regular security assessments",
            "Provide privacy training to staff"
        ]
        
        # Category-specific measures
        category_measures = (
            (DataCategory.PII, (
                "Implement data pseudonymization where possible",
                "Provide clear consent mechanisms",
                "Enable data subject rights (access, rectification, deletion)"
            )),
            (DataCategory.PHI, (
                "Implement HIPAA-compliant security controls",
                "Establish business associate agreements",
                "Conduct regular HIPAA risk assessments"
            )),
            (DataCategory.FINANCIAL, (
                "Implement PCI-DSS compliance controls",
                "Use tokenization for payment data",
                "Establish fraud monitoring systems"
            )),
        )
        for category, extra_measures in category_measures:
            if category in data_categories:
                measures.extend(extra_measures)
        
        # Risk-specific measures
        lowered_risks = [risk.lower() for risk in risks]
        
        if any("automated_decision" in risk for risk in lowered_risks):
            measures.append("Implement human review for automated decisions")
        
        if any("cross_border" in risk for risk in lowered_risks):
            measures.append("Implement appropriate cross-border transfer safeguards")
        
        # Order-preserving dedupe; a set made the result order vary between processes
        return list(dict.fromkeys(measures))

    async def get_data_lineage_impact(self, asset_id: str, direction: str = "downstream") -> Dict[str, Any]:
        """Report which registered assets are reachable from ``asset_id`` in the lineage graph.

        Args:
            asset_id: Asset to start from.
            direction: ``"downstream"`` walks to all descendants; any other
                value walks to all ancestors (upstream).

        Returns:
            ``{"error": ...}`` when networkx is unavailable. Otherwise a dict
            with ``source_asset_id``, ``direction``, ``impact_count`` and
            ``impacted_assets``. Graph nodes that have no entry in
            ``self.data_assets`` are left out. An asset that is not in the
            graph yields an empty result.
        """
        
        # Compare against None explicitly: an empty DiGraph is falsy, so a
        # truthiness test would wrongly report "not available".
        if not NETWORKX_AVAILABLE or self.lineage_graph is None:
            return {"error": "Lineage analysis not available"}
        
        impact_assets = set()
        
        if asset_id in self.lineage_graph:
            if direction == "downstream":
                # Find all assets that depend on this asset
                impact_assets = set(nx.descendants(self.lineage_graph, asset_id))
            else:
                # Find all assets this asset depends on
                impact_assets = set(nx.ancestors(self.lineage_graph, asset_id))
        
        # Get asset details
        impacted_assets = []
        for imp_asset_id in impact_assets:
            asset = self.data_assets.get(imp_asset_id)
            if asset:
                impacted_assets.append({
                    'asset_id': imp_asset_id,
                    'name': asset.name,
                    'classification': asset.classification.value,
                    'owner': asset.owner,
                    'system': asset.system
                })
        
        return {
            'source_asset_id': asset_id,
            'direction': direction,
            'impact_count': len(impacted_assets),
            'impacted_assets': impacted_assets
        }

    async def _process_retention_policies(self):
        """Run every enabled retention policy that is currently due.

        A policy is due when it has no ``next_execution`` or that moment has
        been reached. For each due policy the matching assets are looked up,
        the retention action is applied to the ones that qualify, and the
        policy is rescheduled one day after the start of this run. A log line
        is written only when at least one asset was successfully processed.
        """
        run_started = datetime.utcnow()
        rescheduled_for = run_started + timedelta(days=1)  # Daily check

        for policy in self.retention_policies.values():
            # Skip disabled policies and those scheduled for a later time
            if not policy.enabled or (policy.next_execution and run_started < policy.next_execution):
                continue

            processed = 0
            for candidate in await self._find_assets_for_policy(policy):
                if not await self._should_apply_retention_action(candidate, policy):
                    continue
                if await self._apply_retention_action(candidate, policy):
                    processed += 1

            # Record this run and schedule the next one
            policy.last_executed = run_started
            policy.next_execution = rescheduled_for

            if processed:
                self.logger.info(f"Retention policy {policy.name} processed {processed} assets")

    async def _find_assets_for_policy(self, policy: RetentionPolicy) -> List[DataAsset]:
        """Select the registered assets that fall under ``policy``.

        Each of the policy's three filters is applied only when it is
        non-empty: the asset must share at least one category with
        ``applies_to_categories``, its classification must be listed in
        ``applies_to_classifications``, and at least one entry of
        ``asset_patterns`` must occur as a substring of the asset location.
        """

        def in_scope(candidate) -> bool:
            # Category filter: at least one category in common
            if policy.applies_to_categories and not any(
                category in candidate.categories for category in policy.applies_to_categories
            ):
                return False

            # Classification filter: exact membership
            if policy.applies_to_classifications and (
                candidate.classification not in policy.applies_to_classifications
            ):
                return False

            # Location filter: plain substring containment
            if policy.asset_patterns and not any(
                fragment in candidate.location for fragment in policy.asset_patterns
            ):
                return False

            return True

        return [candidate for candidate in self.data_assets.values() if in_scope(candidate)]

    async def _should_apply_retention_action(self, asset: DataAsset, policy: RetentionPolicy) -> bool:
        """Decide whether ``policy`` is due for ``asset``.

        Only the ``"age"`` trigger is evaluated: the asset qualifies once the
        whole days elapsed since ``asset.created_at`` reach
        ``policy.retention_period_days``. Every other trigger condition
        yields False.
        """
        if policy.trigger_condition != "age":
            return False

        elapsed = datetime.utcnow() - asset.created_at
        return elapsed.days >= policy.retention_period_days

    async def _apply_retention_action(self, asset: DataAsset, policy: RetentionPolicy) -> bool:
        """Flag ``asset`` for the retention action required by ``policy``.

        DELETE, ANONYMIZE and ARCHIVE are recorded by adding a ``RETENTION_*``
        marker to ``asset.tags``; nothing is deleted, anonymized or archived
        here. Any other action has no marker and leaves the asset untouched.

        The marker is added at most once. Policies are re-evaluated on every
        run and an asset that is past its retention age keeps qualifying, so
        appending unconditionally would grow ``asset.tags`` with duplicates
        and bump ``updated_at`` on every run without any real change.

        Returns:
            True when the policy was evaluated without error (including the
            cases where the asset was already flagged or the action has no
            marker); False when an exception was raised, which is logged.
        """
        # action -> (tag stored on the asset, wording used in the log message)
        markers = {
            RetentionAction.DELETE: ("RETENTION_DELETE", "deletion"),
            RetentionAction.ANONYMIZE: ("RETENTION_ANONYMIZE", "anonymization"),
            RetentionAction.ARCHIVE: ("RETENTION_ARCHIVE", "archival"),
        }

        try:
            marker = markers.get(policy.action)
            if marker is None:
                # No automated marker for this action; nothing to record
                return True

            tag, wording = marker
            if tag in asset.tags:
                # Already flagged by an earlier run; keep the asset unchanged
                return True

            asset.tags.append(tag)
            asset.updated_at = datetime.utcnow()
            self.logger.info(f"Asset {asset.name} marked for {wording} per policy {policy.name}")
            return True

        except Exception as e:
            self.logger.error(f"Failed to apply retention action for {asset.name}: {e}")
            return False

    async def _persist_data_asset(self, asset: DataAsset):
        """Insert or replace the row for ``asset`` in the ``data_assets`` table.

        Does nothing when sqlite3 could not be imported. Categories,
        regulations, schema, tags and custom attributes are stored as JSON
        text and ``contains_pii`` as an integer. Persistence is best-effort:
        any failure is logged instead of being propagated to the caller.
        """
        if not SQLITE_AVAILABLE:
            return

        try:
            # Build the row first so a serialization error never opens a connection.
            # NOTE(review): datetime values are bound as-is, which relies on
            # sqlite3's default datetime adapter (deprecated since Python 3.12).
            row = (
                asset.asset_id, asset.name, asset.description, asset.data_type,
                asset.classification.value, json.dumps([cat.value for cat in asset.categories]),
                asset.location, asset.system, asset.owner, asset.steward,
                int(asset.contains_pii), json.dumps([reg.value for reg in asset.applicable_regulations]),
                asset.retention_period_days, asset.quality_score, asset.last_accessed,
                asset.access_frequency, json.dumps(asset.schema), json.dumps(asset.tags),
                json.dumps(asset.custom_attributes), asset.created_at, asset.updated_at
            )

            conn = sqlite3.connect(self.db_path)
            try:
                conn.execute("""
                INSERT OR REPLACE INTO data_assets (
                    asset_id, name, description, data_type, classification,
                    categories, location, system, owner, steward, contains_pii,
                    applicable_regulations, retention_period_days, quality_score,
                    last_accessed, access_frequency, schema_json, tags,
                    custom_attributes, created_at, updated_at
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, row)
                conn.commit()
            finally:
                # Always release the connection, even when the write fails
                conn.close()

        except Exception as e:
            self.logger.error(f"Failed to persist asset {asset.asset_id}: {e}")

    async def _persist_lineage(self, lineage: DataLineage):
        """Insert or replace the row for ``lineage`` in the ``data_lineage`` table.

        Does nothing when sqlite3 could not be imported. Persistence is
        best-effort: any failure is logged instead of being propagated to the
        caller.
        """
        if not SQLITE_AVAILABLE:
            return

        try:
            row = (
                lineage.lineage_id, lineage.source_asset_id, lineage.target_asset_id,
                lineage.transformation_type, lineage.transformation_logic,
                lineage.process_name, lineage.process_version, lineage.execution_time,
                lineage.impact_score, lineage.business_context, lineage.created_at
            )

            conn = sqlite3.connect(self.db_path)
            try:
                conn.execute("""
                INSERT OR REPLACE INTO data_lineage (
                    lineage_id, source_asset_id, target_asset_id, transformation_type,
                    transformation_logic, process_name, process_version, execution_time,
                    impact_score, business_context, created_at
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, row)
                conn.commit()
            finally:
                # Always release the connection, even when the write fails
                conn.close()

        except Exception as e:
            self.logger.error(f"Failed to persist lineage {lineage.lineage_id}: {e}")

    async def _persist_consent(self, consent: ConsentRecord):
        """Insert or replace the row for ``consent`` in the ``consent_records`` table.

        Does nothing when sqlite3 could not be imported. Data categories and
        processing activities are stored as JSON text; status and regulation
        are stored by their enum value. Persistence is best-effort: any
        failure is logged instead of being propagated to the caller.
        """
        if not SQLITE_AVAILABLE:
            return

        try:
            row = (
                consent.consent_id, consent.data_subject_id, consent.purpose,
                consent.legal_basis, consent.status.value, consent.granted_at,
                consent.withdrawn_at, consent.expires_at,
                json.dumps([cat.value for cat in consent.data_categories]),
                json.dumps(consent.processing_activities), consent.regulation.value,
                consent.consent_method, consent.proof_of_consent,
                consent.created_at, consent.updated_at
            )

            conn = sqlite3.connect(self.db_path)
            try:
                conn.execute("""
                INSERT OR REPLACE INTO consent_records (
                    consent_id, data_subject_id, purpose, legal_basis, status,
                    granted_at, withdrawn_at, expires_at, data_categories,
                    processing_activities, regulation, consent_method,
                    proof_of_consent, created_at, updated_at
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, row)
                conn.commit()
            finally:
                # Always release the connection, even when the write fails
                conn.close()

        except Exception as e:
            self.logger.error(f"Failed to persist consent {consent.consent_id}: {e}")

    async def _persist_pia(self, pia: PrivacyImpactAssessment):
        """Insert or replace the row for ``pia`` in the ``privacy_assessments`` table.

        Does nothing when sqlite3 could not be imported. Data categories,
        processing activities, affected systems, risks, mitigation measures
        and regulations are stored as JSON text. Persistence is best-effort:
        any failure is logged instead of being propagated to the caller.
        """
        if not SQLITE_AVAILABLE:
            return

        try:
            row = (
                pia.pia_id, pia.name, pia.description, pia.status,
                json.dumps([cat.value for cat in pia.data_categories]),
                json.dumps(pia.processing_activities), json.dumps(pia.affected_systems),
                pia.risk_level, json.dumps(pia.privacy_risks),
                json.dumps(pia.mitigation_measures),
                json.dumps([reg.value for reg in pia.applicable_regulations]),
                pia.legal_basis, pia.reviewer, pia.review_date, pia.approval_date,
                pia.created_at, pia.updated_at
            )

            conn = sqlite3.connect(self.db_path)
            try:
                conn.execute("""
                INSERT OR REPLACE INTO privacy_assessments (
                    pia_id, name, description, status, data_categories,
                    processing_activities, affected_systems, risk_level,
                    privacy_risks, mitigation_measures, applicable_regulations,
                    legal_basis, reviewer, review_date, approval_date,
                    created_at, updated_at
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, row)
                conn.commit()
            finally:
                # Always release the connection, even when the write fails
                conn.close()

        except Exception as e:
            self.logger.error(f"Failed to persist PIA {pia.pia_id}: {e}")

    def get_governance_dashboard_data(self) -> Dict[str, Any]:
        """Build a point-in-time summary of the governance state.

        Returns:
            A dict with five entries:

            * ``timestamp``: ISO-formatted ``datetime.utcnow()`` of the call.
            * ``data_assets``: total count, count with ``contains_pii``,
              per-classification counts and the number of assets carrying a
              ``RETENTION_*`` tag.
            * ``privacy_compliance``: total consents, consents with status
              GRANTED, consents with status WITHDRAWN and the number of
              privacy assessments.
            * ``data_lineage``: number of lineage records and of distinct
              asset ids appearing as a source or target.
            * ``governance_policies``: number of retention policies and how
              many of them are enabled.
        """
        assets = list(self.data_assets.values())
        consents = list(self.consent_records.values())
        lineage_records = list(self.lineage_records.values())
        policies = list(self.retention_policies.values())

        # Classification breakdown keyed by the enum's string value
        classification_counts: Dict[str, int] = {}
        for asset in assets:
            label = asset.classification.value
            classification_counts[label] = classification_counts.get(label, 0) + 1

        # Count withdrawn consents by their own status. Deriving the figure as
        # "total - granted" would also report pending, expired and
        # not-required consents as withdrawn.
        active_consents = sum(1 for consent in consents if consent.status == ConsentStatus.GRANTED)
        withdrawn_consents = sum(1 for consent in consents if consent.status == ConsentStatus.WITHDRAWN)

        # Distinct assets that take part in any lineage edge
        lineage_asset_ids = (
            {record.source_asset_id for record in lineage_records}
            | {record.target_asset_id for record in lineage_records}
        )

        return {
            'timestamp': datetime.utcnow().isoformat(),
            'data_assets': {
                'total': len(assets),
                'contains_pii': sum(1 for asset in assets if asset.contains_pii),
                'classification_breakdown': classification_counts,
                'pending_retention_action': sum(
                    1 for asset in assets
                    if any(tag.startswith("RETENTION_") for tag in asset.tags)
                )
            },
            'privacy_compliance': {
                'total_consents': len(consents),
                'active_consents': active_consents,
                'withdrawn_consents': withdrawn_consents,
                'privacy_assessments': len(self.privacy_assessments)
            },
            'data_lineage': {
                'total_lineage_records': len(lineage_records),
                'assets_with_lineage': len(lineage_asset_ids)
            },
            'governance_policies': {
                'retention_policies': len(policies),
                'active_policies': sum(1 for policy in policies if policy.enabled)
            }
        }

    def __del__(self):
        """Signal the retention worker to stop and wait briefly for it to exit.

        Clears ``retention_active`` and, when a live ``retention_executor``
        exists, joins it for at most five seconds. The finalizer tolerates a
        missing or ``None`` executor and never joins the thread it is running
        on, because joining the current thread raises ``RuntimeError``.
        """
        self.retention_active = False

        worker = getattr(self, 'retention_executor', None)
        if worker is None or worker is threading.current_thread():
            return

        if worker.is_alive():
            worker.join(timeout=5)