"""
Functionality for generating and extracting IDEAS code bundles.
"""

import fnmatch
import hashlib
import io
import json
import logging
import os
import pathlib
import struct
import tarfile
import typing
from contextlib import contextmanager

from ideas_schemas import schemas
from ideas_schemas.exceptions import SchemaValidationError

from ideas import containers
from ideas.containers import split_full_name
from ideas.exceptions import (
    ContainerNotPublishedError,
    ToolBundleBuildError,
    ToolBundleExtractError,
    ToolInputsFormatError,
    ToolMisconfiguredError,
    ToolNotFoundError,
)
from ideas.session import Session
from ideas.tools.executor import Tool

# Layout constants for the binary header at the start of every code bundle file.
BUNDLE_MAGIC = b"CBUNDLE\x00"  # File signature: b"CBUNDLE" plus one NUL, 8 bytes in total
BUNDLE_VERSION = 1  # Format version written to the header and required when extracting
BUNDLE_HEADER_SIZE = 256  # Fixed header size in bytes; the metadata JSON starts at this offset
BUNDLE_METADATA_SCHEMA_VERSION = "1.0.0"  # Stored as "schema_version" in the bundle metadata

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


class MD5HashingWriter(io.BufferedWriter):
    """
    Buffered writer that keeps a running MD5 digest of every chunk handed to ``write``.

    The digest is updated from the bytes passed in by the caller, so it reflects the data as
    soon as ``write`` is called, independently of when the buffer is flushed to ``raw``.
    """

    def __init__(self, raw):
        """Wrap *raw* in a buffered writer and start from an empty MD5 state."""
        super().__init__(raw)
        self._hasher = hashlib.md5()

    def write(self, b):
        """Add *b* to the running digest, then buffer it; returns what the base writer returns."""
        hasher = self._hasher
        hasher.update(b)
        written = super().write(b)
        return written

    @property
    def checksum(self):
        """Raw MD5 digest (``bytes``) of everything passed to ``write`` so far."""
        digest = self._hasher.digest()
        return digest


class CodeBundle:
    """
    Bundles the config/metadata about multiple tools and their code into a single bundle, to be
    uploaded to IDEAS.

    File format
    ===========

    +---------------------+--------------------------------+
    | Offset (bytes)      | Description                    |
    +---------------------+--------------------------------+
    | 0:7                 | Magic bytes: b'CBUNDLE\\0'     |
    | 8:15                | Version (uint64, LE)           |
    | 16:23               | Metadata length (uint64, LE)   |
    | 24:39               | MD5 checksum (16 raw bytes)    |
    | 40                  | Gzipped flag (uint8)           |
    | 41:255              | Reserved (zero-filled)         |
    +---------------------+--------------------------------+
    | 256:(256 + N - 1)   | Metadata JSON (UTF-8; N bytes) |
    +---------------------+--------------------------------+
    | ...                 | Code archive (tar or tar.gz)   |
    +---------------------+--------------------------------+

    The MD5 checksum covers the code archive only (not the header or the metadata JSON).

    Metadata JSON
    =============

    JSON-encoded metadata about all tools configured in the metadata directory, plus the
    version of the metadata schema.

    {
        "schema_version": "1.0.0",
        "tool_key": {
            "tool_spec": {
            }
        },
        ...
    }

    Code archive
    ============

    Resulting tar or tar.gz file has the cwd at the root, for example:

        $ tar -tf .ideas/downsample/bundle.tar
            downsample/
            downsample/Dockerfile
            downsample/analysis/
            downsample/analysis/__init__.py
            downsample/analysis/downsample.py
            ...

    It can be extracted to an arbitrary location:

        $ tar -xf .ideas/downsample/bundle.tar -C /tmp
        $ ls /tmp/downsample/
        analysis/  Dockerfile  pyproject.toml  resources/  scripts/  setup.py

    """

    def __init__(
        self,
        code_dir: pathlib.Path,
        metadata_dir: pathlib.Path,
        version: int = BUNDLE_VERSION,
        included_tools: typing.Tuple[str, ...] = (),
    ):
        """
        :param code_dir: directory whose contents are archived into the bundle
        :param metadata_dir: directory containing one sub-directory per configured tool
        :param version: bundle format version written to the header
        :param included_tools: tool keys to bundle; empty means every tool in ``metadata_dir``
        """
        self.code_dir = code_dir
        self.metadata_dir = metadata_dir
        self.version = version
        self.included_tools = included_tools

    def get_metadata(
        self,
        tenant_id: int,
        skip_container_check: bool = False,
        skip_validation: bool = False,
        session: typing.Optional[Session] = None,
    ) -> dict:
        """
        Build the bundle metadata: the schema version plus one ``{"tool_spec": ...}`` entry per
        tool directory found in ``metadata_dir`` (restricted to ``included_tools`` when set).

        :param tenant_id: tenant used when looking up the published container
        :param skip_container_check: when true, do not check the container against the IDEAS
            container registry and keep the container name from the tool config
        :param skip_validation: when true, a schema validation failure is logged but not raised
        :param session: session forwarded to the container registry lookups
        :raises ToolNotFoundError: ``metadata_dir`` does not exist or yields no tools
        :raises ToolMisconfiguredError: a tool has no config data or no spec data
        :raises ContainerNotPublishedError: a tool's container has no match in the registry
        :raises ToolBundleBuildError: schema validation failed and ``skip_validation`` is false
        """
        metadata = {"schema_version": BUNDLE_METADATA_SCHEMA_VERSION}
        config_folder = self.metadata_dir
        if not config_folder.exists():
            raise ToolNotFoundError(f"Tool config not found in {config_folder}")

        for item in config_folder.iterdir():
            if not item.is_dir():
                continue

            key = item.name
            if self.included_tools and key not in self.included_tools:
                # If tools are explicitly specified, skip over any tools in the config folder
                # that haven't been specified by the user.
                continue

            tool = Tool(key=key, code_dir=self.code_dir)
            try:
                tool.load()  # load in config and spec data
            except ToolInputsFormatError:
                # An inputs-format problem does not stop bundling
                pass
            finally:
                # configure() runs whether or not load() raised
                tool.configure()

            if not tool.config.data or not tool.spec.data:
                raise ToolMisconfiguredError(key=key)

            config = tool.config.data

            if not skip_container_check:
                container = config["container"]
                logger.debug(
                    f"Checking if local container {container} exists in IDEAS container registry..."
                )
                ideas_container = None
                repo_digests = containers.get_repo_digests_for_registry(
                    container, session
                )
                if repo_digests:
                    ideas_container = containers.get_container_from_repo_digest(
                        repo_digests=repo_digests,
                        tenant_id=tenant_id,
                        session=session,
                    )
                if not ideas_container:
                    raise ContainerNotPublishedError(
                        f"Local container {container} not published to IDEAS container registry",
                        container=container,
                    )
                logger.debug(
                    f"Found matching container {ideas_container['full_name']} in IDEAS container registry\n"
                )
                config["container"] = ideas_container["full_name"]

            tool_spec = tool.spec.data

            # update container image name with name of container on ideas
            repository, label = split_full_name(config["container"])
            tool_spec["container_image"]["repository"] = repository
            tool_spec["container_image"]["label"] = label

            metadata[key] = {
                "tool_spec": tool_spec,
            }

        # no tool keys, only schema version, nothing to bundle here
        if len(metadata) == 1:
            raise ToolNotFoundError(f"Not tools configured in {config_folder}")

        # validate the bundle metadata with the IDEAS v3 bundle metadata schema
        try:
            schemas.validate_v3_bundle_metadata(metadata)
        except SchemaValidationError as e:
            logger.error(
                "Unable to validate bundle metadata: %s",
                e.message,
                extra={"metadata": metadata},
            )
            if not skip_validation:
                raise ToolBundleBuildError(
                    "Unable to validate bundle metadata, please contact support or use --skip-validation"
                ) from e
        return metadata

    @classmethod
    @contextmanager
    def _write(
        cls,
        path: pathlib.Path,
        metadata: dict,
        version: int = BUNDLE_VERSION,
    ):
        """
        Context-manager that writes the code bundle file. Automatically generates and writes header
        at end of writing code file bundle.

        Yields an ``MD5HashingWriter``; everything written through it is stored after the
        metadata JSON and is what the header checksum covers.

        :param path: destination of the bundle file (created or truncated)
        :param metadata: JSON-serialisable metadata, written with sorted keys
        :param version: bundle format version stored in the header
        :raises ToolBundleBuildError: the header fields do not fit in ``BUNDLE_HEADER_SIZE``
        """
        metadata_bytes = json.dumps(metadata, sort_keys=True).encode("utf-8")
        metadata_length = len(metadata_bytes)
        with open(path, "wb") as f:
            # Leave room for the header; it needs the archive checksum, so it is written last
            f.seek(BUNDLE_HEADER_SIZE)

            # Metadata goes straight to the file, so it is not part of the checksum
            f.write(metadata_bytes)

            hashed_f = MD5HashingWriter(f)
            yield hashed_f

            # Push anything still buffered in the hashing writer into the file
            hashed_f.flush()

            header = b"".join(
                (
                    BUNDLE_MAGIC,  # magic bytes
                    version.to_bytes(8, "little"),  # bundle version
                    metadata_length.to_bytes(8, "little"),  # bundle metadata length
                    hashed_f.checksum,  # checksum of what was written in the context manager
                    b"\x00",  # gzipped flag (uint8): not gzipped for now
                )
            )
            if len(header) > BUNDLE_HEADER_SIZE:
                raise ToolBundleBuildError("Unable to fit bundle header")

            # Zero-fill the reserved remainder and write the header at the start of the file
            f.seek(0)
            f.write(header.ljust(BUNDLE_HEADER_SIZE, b"\x00"))

    def write(
        self,
        path: pathlib.Path,
        tenant_id: int,
        skip_container_check: bool = False,
        skip_validation: bool = False,
        excludes: typing.Tuple[str, ...] = (),
        session: typing.Optional[Session] = None,
    ):
        """
        Collect the tool metadata and write the bundle (header, metadata JSON, uncompressed tar
        of ``code_dir``) to ``path``, using this instance's ``version`` for the header.

        :param path: destination of the bundle file
        :param tenant_id: forwarded to :meth:`get_metadata`
        :param skip_container_check: forwarded to :meth:`get_metadata`
        :param skip_validation: forwarded to :meth:`get_metadata`
        :param excludes: extra excludes. Entries that are existing directories exclude the
            top-level directory of that name; all other entries are ``fnmatch`` patterns matched
            against each archive member's path
        :param session: forwarded to :meth:`get_metadata`
        """
        metadata = self.get_metadata(
            tenant_id=tenant_id,
            skip_container_check=skip_container_check,
            skip_validation=skip_validation,
            session=session,
        )

        # determine which directories in code folder to exclude from bundle; normalising the
        # path lets "outputs/" or "./outputs" match the plain directory name compared below
        # NOTE(review): isdir() resolves relative entries against the current working
        # directory, and only top-level entries of code_dir are compared against these names
        exclude_dirs = {os.path.normpath(e) for e in excludes if os.path.isdir(e)}
        exclude_dirs.update(["outputs", ".ideas", "__pycache__", ".git", ".github"])

        # determine which file patterns to apply for filtering files that go in bundle
        exclude_patterns = [e for e in excludes if not os.path.isdir(e)]
        exclude_patterns.extend(
            [
                ".DS_Store",
                "*/.DS_Store",  # the bare name only matches at the archive root
                "*/__pycache__",  # nested bytecode caches (the top-level one is skipped by name)
                "*.pyc",
                "*.pyo",
                "*.cbundle",
            ]
        )

        def bundle_exclude_filter(
            tarinfo: tarfile.TarInfo,
        ) -> typing.Optional[tarfile.TarInfo]:
            """
            Excludes files matching patterns from the code bundle.

            Patterns are matched against the member's full path inside the archive.

            TODO consider supporting .gitignore patterns: pathspec is a library that could help
            """
            name = tarinfo.name
            if any(fnmatch.fnmatch(name, pattern) for pattern in exclude_patterns):
                return None
            return tarinfo

        with self._write(path, metadata, version=self.version) as f:
            with tarfile.open(fileobj=f, mode="w|") as tar:
                # sorted so the order of top-level entries does not depend on the filesystem
                for item in sorted(self.code_dir.iterdir()):
                    if item.is_dir() and item.name in exclude_dirs:
                        continue
                    tar.add(item, arcname=item.name, filter=bundle_exclude_filter)


def extract_code_bundle(
    cbundle: pathlib.Path, output_tar_file: typing.Optional[pathlib.Path] = None
) -> dict:
    """
    Extract metadata (and optionally, the source code tar file) for a specified code bundle.

    :param cbundle: path of the code bundle file to read
    :param output_tar_file: when given, the code archive that follows the metadata is copied to
        this path and its MD5 is checked against the header checksum; when omitted the archive
        is neither copied nor verified
    :return: the decoded metadata JSON
    :raises ToolBundleExtractError: wrong magic bytes, invalid header, unsupported version,
        metadata that is truncated or cannot be decoded, or an archive checksum mismatch
    """
    with cbundle.open("rb") as f:
        header = f.read(BUNDLE_HEADER_SIZE)
        if header[:8] != BUNDLE_MAGIC:
            raise ToolBundleExtractError("Not an IDEAS code bundle file")

        try:
            # uint64 version, uint64 metadata length, 16-byte MD5 digest, all after the magic
            version, metadata_len, checksum = struct.unpack_from("<QQ16s", header, 8)
        except struct.error as err:
            raise ToolBundleExtractError("Code bundle header invalid") from err

        if version != BUNDLE_VERSION:
            raise ToolBundleExtractError(f"Code bundle version {version} not supported")

        # The length is read from the file itself; check it against the real file size so a
        # corrupt header cannot request an arbitrarily large read
        archive_offset = BUNDLE_HEADER_SIZE + metadata_len
        if archive_offset > os.fstat(f.fileno()).st_size:
            raise ToolBundleExtractError("Metadata malformed")

        metadata_bytes = f.read(metadata_len)
        try:
            metadata = json.loads(metadata_bytes)
        except (json.JSONDecodeError, UnicodeDecodeError) as err:
            # json.loads decodes bytes input first, so undecodable bytes surface as
            # UnicodeDecodeError rather than JSONDecodeError
            raise ToolBundleExtractError("Metadata malformed") from err

        if output_tar_file is not None:
            md5 = hashlib.md5()
            f.seek(archive_offset)
            with output_tar_file.open("wb") as out:
                # Copy in chunks while hashing, so the archive is never held in memory whole
                while chunk := f.read(8192):
                    out.write(chunk)
                    md5.update(chunk)

            if checksum != md5.digest():
                raise ToolBundleExtractError(
                    f"Header checksum {checksum.hex()} does't match file checksum {md5.digest().hex()}"
                )

    return metadata
