"""
HeadlessX Scraper Module (Production Version with Large Response Support)
==========================================================================
Optimized for handling large HTML responses
"""

import requests
from bs4 import BeautifulSoup
import subprocess
import time
from pathlib import Path
from os import getenv
from dotenv import load_dotenv

# Configuration
# Load settings from the local "variables.env" file (python-dotenv) so the
# getenv() calls below can see them. This must run before those lookups.
load_dotenv("variables.env")

# Base URL of the HeadlessX server; endpoints such as /api/health and
# /api/html are appended to it. None when the variable is not set.
HEADLESSX_URL = getenv("HEADLESSX_URL")
# Directory in which `npm start` is run to launch HeadlessX.
# NOTE: getenv() returns None for an unset variable and Path(None) raises
# TypeError, so importing this module fails if HEADLESSX_PROJECT_PATH is missing.
HEADLESSX_PROJECT_PATH = Path(getenv("HEADLESSX_PROJECT_PATH"))
# Token sent as the `token` query parameter on /api/html requests.
HEADLESSX_TOKEN = getenv("HEADLESSX_TOKEN")
# Seconds between health-check polls while the server starts; also the base
# delay for the exponential retry backoff when fetching pages.
REQUEST_SLEEP_SECONDS = 6

def is_headlessx_up() -> bool:
    """Report whether the HeadlessX health endpoint answers with HTTP 200.

    Sends a GET to ``{HEADLESSX_URL}/api/health`` with a 3 second timeout.
    Any ``requests.RequestException`` is treated as "not up" and yields
    ``False`` instead of propagating.
    """
    health_url = f"{HEADLESSX_URL}/api/health"
    try:
        health_response = requests.get(health_url, timeout=3)
    except requests.RequestException:
        return False
    return health_response.status_code == 200


def ensure_headlessx_running(max_wait_seconds: int = 40) -> None:
    """Make sure the HeadlessX server is reachable, starting it if necessary.

    If the health check already passes, return immediately. Otherwise launch
    ``npm start`` inside ``HEADLESSX_PROJECT_PATH`` (its output is discarded)
    and poll the health endpoint every ``REQUEST_SLEEP_SECONDS`` seconds until
    it responds or ``max_wait_seconds`` have elapsed. The health endpoint is
    probed one final time when the deadline is reached.

    Args:
        max_wait_seconds: Maximum time to wait for the server to become ready
            after ``npm start`` has been launched.

    Raises:
        RuntimeError: If the server is still not responding at the deadline.
    """
    if is_headlessx_up():
        return

    # The server process is deliberately left running in the background; its
    # stdout/stderr are dropped so they do not clutter the caller's output.
    subprocess.Popen(
        ["npm", "start"],
        cwd=str(HEADLESSX_PROJECT_PATH),
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

    # Use a monotonic clock so system clock adjustments cannot stretch or cut
    # the wait short.
    deadline = time.monotonic() + max_wait_seconds
    while True:
        if is_headlessx_up():
            return

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline: the caller's budget is honoured and
        # the loop gets one last probe exactly when the time is up.
        time.sleep(min(REQUEST_SLEEP_SECONDS, remaining))

    raise RuntimeError(
        "HeadlessX did not become ready within the given timeout. "
        "Check the logs by running `npm start` manually in a terminal."
    )


def _extract_html(response: "requests.Response", verbose: bool) -> str:
    """Return the HTML carried by a successful ``/api/html`` response.

    A JSON response must be either ``{"html": "..."}`` or a bare JSON string;
    any other content type is returned as the decoded response text.

    Raises:
        ValueError: If the body is declared as JSON but cannot be decoded.
        RuntimeError: If the decoded JSON has neither expected shape.
    """
    content_type = response.headers.get("content-type", "")

    # Case 1: the endpoint returned the page as raw text (e.g. text/html).
    if "application/json" not in content_type:
        return response.text

    # Case 2: the endpoint wrapped the page in JSON.
    try:
        data = response.json()
    except ValueError:
        print("[HEADLESS] Could not decode JSON")
        if verbose:
            print(f"Raw response (first 1000 chars): {response.text[:1000]}")
        raise

    # Expected format: {"html": "<!doctype html>..."}
    if isinstance(data, dict) and "html" in data:
        return data["html"]

    # In case it returns a string directly
    if isinstance(data, str):
        return data

    print("[HEADLESS] Unexpected JSON structure")
    raise RuntimeError("Unexpected JSON structure from HeadlessX /api/html")


def fetch_html_with_headlessx(
    url: str,
    human_behavior: bool = True,
    timeout_ms: int = 30000,
    max_retries: int = 3,
    verbose: bool = False
) -> str:
    """Fetch rendered HTML for *url* from the HeadlessX ``/api/html`` endpoint.

    The request is streamed and given an HTTP timeout 60 seconds longer than
    the render timeout so large bodies have time to transfer. Retry policy:

    * HTTP 500: retried with exponential backoff based on
      ``REQUEST_SLEEP_SECONDS``. ``human_behavior`` is switched off after the
      second failed attempt, and ``timeout_ms`` grows by 50% from the third
      failed attempt on (only reachable when ``max_retries >= 4``).
    * Request timeout: retried with a 5 s base exponential backoff and
      ``human_behavior`` switched off.
    * Connection errors and HTTP errors other than 500: never retried.
    * Anything else (including an undecodable or unexpected JSON body):
      retried with backoff, re-raised after the last attempt.

    Args:
        url: Page to render.
        human_behavior: Forwarded as ``humanBehavior`` in the request body.
        timeout_ms: Render timeout forwarded to HeadlessX, in milliseconds.
        max_retries: Total number of attempts.
        verbose: Print request/response diagnostics.

    Returns:
        The HTML returned by HeadlessX.

    Raises:
        RuntimeError: After repeated HTTP 500 responses or timeouts, on an
            unexpected JSON structure, or if ``max_retries`` allows no attempt.
        requests.exceptions.ConnectionError: If the server cannot be reached.
        requests.exceptions.HTTPError: On a non-2xx status other than 500.
        ValueError: If a JSON response cannot be decoded on the last attempt.
    """
    endpoint = f"{HEADLESSX_URL}/api/html"

    for attempt in range(max_retries):
        is_last_attempt = attempt >= max_retries - 1
        try:
            if verbose or attempt > 0:
                print(f"[HEADLESS] Fetching URL (attempt {attempt + 1}/{max_retries}): {url}")
                print(f"[HEADLESS] Settings: timeout={timeout_ms}ms, human_behavior={human_behavior}")

            # The render alone may take up to timeout_ms; allow 60 s on top of
            # that for transferring a large response body.
            http_timeout = (timeout_ms / 1000) + 60

            # With stream=True the connection is only released once the body
            # is consumed or the response is closed, so scope it with `with`.
            with requests.post(
                endpoint,
                params={"token": HEADLESSX_TOKEN},
                json={
                    "url": url,
                    "timeout": timeout_ms,
                    "humanBehavior": human_behavior,
                },
                timeout=http_timeout,
                stream=True,
            ) as response:
                if verbose:
                    print(f"[HEADLESS] Response status: {response.status_code}")
                    print(f"[HEADLESS] Content-Type: {response.headers.get('content-type')}")
                    content_length = response.headers.get('content-length', 'unknown')
                    print(f"[HEADLESS] Content-Length: {content_length}")

                if response.status_code != 500:
                    # Raise error if not 2xx status
                    response.raise_for_status()
                    return _extract_html(response, verbose)

                print(f"[HEADLESS] Server returned 500 error (attempt {attempt + 1}/{max_retries})")

                # Best-effort: show the start of the error body for debugging.
                # A failure to read it must not mask the 500 handling below.
                try:
                    error_body = response.text[:500]
                    print(f"[HEADLESS] Error response: {error_body}")
                except Exception as read_error:
                    print(f"[HEADLESS] Could not read error response body: {read_error}")

            # Only an HTTP 500 reaches this point, and the response is already
            # closed, so no connection is held open during the backoff sleep.
            if is_last_attempt:
                print("[HEADLESS] All retries exhausted")
                raise RuntimeError(
                    f"HeadlessX server error 500 after {max_retries} attempts. "
                    f"This may be due to a very large response or page complexity. "
                    f"URL: {url}"
                )

            # Exponential backoff: the base delay doubles with every attempt.
            wait_time = REQUEST_SLEEP_SECONDS * (2 ** attempt)
            time.sleep(wait_time)

            # From the second failure on, retry without human behavior (faster).
            if attempt > 0 and human_behavior:
                human_behavior = False

            # From the third failure on, give the render more time.
            if attempt > 1:
                timeout_ms = int(timeout_ms * 1.5)

            continue

        except requests.exceptions.Timeout:
            print(f"[HEADLESS] Request timed out (attempt {attempt + 1}/{max_retries})")
            if is_last_attempt:
                raise RuntimeError(f"Request timed out after {max_retries} attempts")

            wait_time = 5 * (2 ** attempt)
            print(f"[HEADLESS] Waiting {wait_time}s before retry...")
            time.sleep(wait_time)

            # Disable human behavior to speed up the next attempt.
            if human_behavior:
                human_behavior = False

            continue

        except requests.exceptions.ConnectionError as e:
            print(f"[HEADLESS] Connection error: {e}")
            print("[HEADLESS] Make sure HeadlessX is running on port 3000")
            raise

        except requests.exceptions.HTTPError as e:
            # HTTP errors other than 500 are never retried; client errors
            # (4xx) are additionally logged before being re-raised.
            status = e.response.status_code if e.response is not None else None
            if status is not None and 400 <= status < 500:
                print(f"[HEADLESS] Client error {status}")
            raise

        except Exception as e:
            print(f"[HEADLESS] Unexpected error: {e}")
            if is_last_attempt:
                raise
            wait_time = REQUEST_SLEEP_SECONDS * (2 ** attempt)
            time.sleep(wait_time)
            continue

    # Only reachable when max_retries allows no attempt at all (<= 0).
    raise RuntimeError(f"Failed to fetch {url} after {max_retries} attempts")


def get_soup(
    url: str,
    human_behavior: bool = True,
    timeout_ms: int = 60000,  # Increased default to 60s
    ensure_server: bool = True,
    max_retries: int = 3,
    verbose: bool = False
) -> BeautifulSoup:
    """Fetch *url* through HeadlessX and parse the result with BeautifulSoup.

    Args:
        url: Page to fetch.
        human_behavior: Passed through to ``fetch_html_with_headlessx``.
        timeout_ms: Render timeout in milliseconds, passed through.
        ensure_server: When true, call ``ensure_headlessx_running()`` first.
        max_retries: Number of fetch attempts, passed through.
        verbose: Print diagnostics, including the parsed page title.

    Returns:
        The document parsed with the built-in ``html.parser``.
    """
    if ensure_server:
        ensure_headlessx_running()

    page_html = fetch_html_with_headlessx(
        url,
        human_behavior,
        timeout_ms,
        max_retries,
        verbose,
    )
    parsed = BeautifulSoup(page_html, "html.parser")

    if not verbose:
        return parsed

    # Resolve the title before printing anything, falling back to a
    # placeholder when the document has no <title> element.
    title_tag = parsed.title
    if title_tag:
        page_title = title_tag.string
    else:
        page_title = 'No title'
    print("[HEADLESS] Successfully created BeautifulSoup object")
    print(f"[HEADLESS] Page title: {page_title}")

    return parsed