import json
from pathlib import Path
import random
from typing import Any, Dict, List, Optional, Tuple

import networkx as nx
import graphviz
from jsonschema import RefResolver, Draft202012Validator
import yaml

import sophios
from sophios import ast, compiler, utils_cwl
from sophios.cli import get_args
from sophios.utils_yaml import wic_loader
from sophios.wic_types import GraphData, GraphReps, NodeData, StepId, Yaml, YamlTree
from ..wic_types import Json, Tools


# Optional hand-written schemas for the `config` input of individual tools,
# keyed by tool name. They are read once, at import time, from a path that is
# relative to the current working directory; when the file is absent the
# mapping simply stays empty.
config_schemas = {}
path = Path('../mm-workflows/autogenerated/schemas/config_schemas.json')
if path.exists():
    with path.open(mode='r', encoding='utf-8') as f:
        config_schemas = json.loads(f.read())


def default_schema(url: bool = False) -> Json:
    """Builds the strict object schema that most other schemas start from.

    The result only matches objects, forbids properties which are not
    explicitly listed, and carries a custom error message for that case.

    Args:
        url (bool, optional): Determines whether to include the $schema url. Defaults to False.

    Returns:
        Json: A new (unshared) basic default schema
    """
    base: Json = {
        'type': 'object',
        'additionalProperties': False,
        'errorMessage': 'Property is not allowed. IS YOUR INDENTATION CORRECT?',
    }
    if not url:
        return base
    base['$schema'] = 'https://json-schema.org/draft/2020-12/schema'
    return base


def named_schema(name: str, schema: Json) -> Json:
    """Wraps the given schema in a default object schema under a single property.

    Args:
        name (str): The name of the only allowed property
        schema (Json): The schema which the value of that property must match

    Returns:
        Json: A default schema whose sole property `name` is described by `schema`
    """
    return {**default_schema(), 'properties': {name: schema}}


def named_empty_schema(name: str) -> Json:
    """Creates a schema with a single property `name` whose value is unrestricted.

    Args:
        name (str): The name of the only allowed property

    Returns:
        Json: A schema which matches the property `name` with any value
    """
    # NOTE: The empty schema {} accepts every value, i.e. it is a wildcard.
    wildcard: Json = {}
    return named_schema(name, wildcard)


def named_null_schema(name: str) -> Json:
    """Creates a schema with a single property `name` whose value must be null.

    Args:
        name (str): The name of the only allowed property

    Returns:
        Json: A schema which matches the property `name` with a null value
    """
    # NOTE: Use this together with anyOf to allow no explicit arguments
    null_only: Json = {'type': 'null'}
    return named_schema(name, null_only)


def named_id_schema(name: str) -> Json:
    """Creates a schema with a single property 'id' whose value must equal name.

    Args:
        name (str): The exact string which the 'id' property must contain

    Returns:
        Json: A schema which consists of an 'id' with the given name and nothing else.
    """
    # NOTE: Use this together with anyOf to allow no explicit arguments
    id_matcher: Json = {'type': 'string', 'const': name}
    return named_schema('id', id_matcher)


def cwl_type_to_jsonschema_type_schema(type_obj: Json) -> Json:
    """Converts a canonicalized CWL type into a jsonschema schema, if possible.

    This delegates to cwl_type_to_jsonschema_type() and, when that returns a
    bare type name or a list of type names, wraps it as {'type': ...}. Any
    other result (including None) is passed through unchanged.

    Args:
        type_obj (Json): A canonical CWL type object

    Returns:
        Json: A JSON type schema corresponding to type_obj if valid else None
    """
    converted = cwl_type_to_jsonschema_type(type_obj)
    if isinstance(converted, (list, str)):
        return {'type': converted}
    return converted


def cwl_type_to_jsonschema_type(type_obj: Json) -> Json:
    """Converts a canonicalized CWL type into the equivalent jsonschema type, if possible.

    The result is one of:
      * a string naming a jsonschema type (for primitive CWL types),
      * a list of such names (for CWL unions written as lists),
      * a dict (for arrays, for `Any` which maps to the wildcard {}, and for
        any dict which is not an array, which is returned unchanged),
      * None when the type cannot be expressed (empty, File, Directory, ...).

    Args:
        type_obj (Json): A canonical CWL type object

    Returns:
        Json: The jsonschema type corresponding to type_obj if valid else None
    """
    if isinstance(type_obj, str):
        # Ignore empty types; json does not have any File or Directory types.
        if type_obj in ('', 'File', 'Directory', 'WritableFile', 'WritableDirectory'):
            return None
        # CWL supports an Any type; map this to the empty wildcard {} schema
        if type_obj == 'Any':
            return {}
        # There are only two numeric types in json; also rename common abbreviations.
        renames = {'int': 'integer', 'long': 'integer',
                   'float': 'number', 'double': 'number',
                   'bool': 'boolean', 'str': 'string'}
        # Everything else (including 'null', which is valid) is passed through.
        return renames.get(type_obj, type_obj)

    if isinstance(type_obj, dict):
        if type_obj.get('type') == 'array' and 'items' in type_obj:
            items = cwl_type_to_jsonschema_type(type_obj['items'])
            if items is None:
                return None  # Propagate any type failures
            if isinstance(items, (str, list)):
                # `items` must itself be a schema, so wrap a bare type name or
                # a union of type names in {'type': ...}. Dicts (nested arrays
                # and the `Any` wildcard {}) already are schemas.
                items = {'type': items}
            return {**type_obj, 'items': items}
        # TODO: Other cases? (e.g. CWL records)
        return type_obj

    if isinstance(type_obj, list):
        items = [cwl_type_to_jsonschema_type(item) for item in type_obj]
        if any(item is None for item in items):
            return None  # Propagate any type failures
        # See https://cswr.github.io/JsonSchema/spec/multiple_types/
        # In a list, if some of the types are themselves arrays or objects,
        # we need to replace them with "array" and "object". This loses
        # information, but that's the specification, so...
        return [item['type'] if isinstance(item, dict) and item.get('type') in ('array', 'object') else item
                for item in items]

    return type_obj


def cwl_schema(name: str, cwl: Json, id_prefix: str) -> Json:
    """Generates a call-site schema (including documentation) for a CWL CommandLineTool or Workflow.

    The schema describes how a step which calls the tool / workflow may be
    written in a yml file: its `in:` tag (one property per CWL input), its
    `out:` tag (a list of output names, optionally with a wic_anchor), the
    optional scatter tags, and a few unrestricted CWL tags.

    Args:
        name (str): The name of the CWL CommandLineTool or Workflow
        cwl (Json): The CWL CommandLineTool or Workflow. Its 'inputs' and 'outputs' \
entries are read as mappings from parameter name to parameter definition.
        id_prefix (str): Either the string 'tools' or 'workflows'

    Returns:
        Json: An autogenerated, documented schema based on the inputs and outputs of a CWL CommandLineTool or Workflow.
    """
    # Undocumented building block. Per-parameter copies (with title and
    # description) are created below under a different name so that parameter
    # metadata never leaks into the alias / anchor schemas.
    str_nonempty = {'type': 'string', 'minLength': 1}
    anytype: Dict[Any, Any] = {}

    # See utils_yaml.py
    alias = default_schema()
    alias['properties'] = {'wic_alias': str_nonempty}  # !*

    ii = default_schema()
    ii['properties'] = {'wic_inline_input': anytype}  # !ii

    inputs_props: Json = {}
    for key, val in cwl['inputs'].items():
        metadata = {'title': val.get('label', ''), 'description': val.get('doc', '')}
        str_documented = {**str_nonempty, **metadata}

        # Handle special cases
        if key == 'config':
            if name in config_schemas:
                inputs_props[key] = {'anyOf': [str_documented, alias, ii,
                                               {**config_schemas[name], **metadata}]}
                continue

            if name == 'cwl_subinterpreter':
                # This may cause problems with hypothesis, but since the
                # cwl_subinterpreter config tag takes an arbitrary Json-encoded string
                # as input, we cannot restrict this particular sub-schema.
                inputs_props[key] = {}
                continue

            if name == 'config_tag_mdp':
                ii_mdp = default_schema()
                ii_mdp['properties'] = {'wic_inline_input': config_schemas.get('grompp', {})}  # !ii
                inputs_props[key] = ii_mdp
                continue

        # Every input may be given as a non-empty string, an alias or an inline input.
        # Do not mark properties which are required for CWL as required for yml,
        # because the whole point of inference is that we shouldn't have to!
        alternatives = [str_documented, alias, ii]

        # Add type information, with exceptions
        cwltype = utils_cwl.canonicalize_type(val.get('type', ''))
        jsontype = cwl_type_to_jsonschema_type_schema(cwltype)
        if jsontype:
            if jsontype == {'type': 'string'}:
                jsontype = str_documented
            type_names = jsontype.get('type')
            if isinstance(type_names, list) and 'string' in type_names:
                # Strings are already covered (with minLength) by str_documented,
                # so drop them from the union. Build a new list instead of
                # mutating the converted type in place.
                jsontype = {**jsontype, 'type': [t for t in type_names if t != 'string']}
            if jsontype.get('type') != []:
                # An empty 'type' list is not a valid schema; in that case the
                # string alternative above is all that is left.
                alternatives.append({**jsontype, **metadata})
        inputs_props[key] = {'anyOf': alternatives}

    inputs = default_schema()
    inputs['properties'] = inputs_props

    scatter: Json = {}
    if inputs_props:
        # NOTE: The CWL specification defines what happens when a scattered input array is empty at runtime.
        # https://www.commonwl.org/v1.2/Workflow.html#Scatter/gather
        # However, it does not define, in the case that there are no input parameters, if it is
        # permissible to specify the scatter tag with an empty list of parameters.
        # In anticipation of different CWL implementations interpreting this ambiguity differently,
        # let's just banish it here, i.e. only allow the scatter tag when there are inputs.
        consts = [{**val, 'const': key} for key, val in inputs_props.items()]
        scatter_props = {'type': 'array', 'items': {'anyOf': consts}}
        scattermethods = ['dotproduct', 'flat_crossproduct', 'nested_crossproduct']
        scattermethod_props: Json = {'type': 'string', 'enum': scattermethods}
        scatter = {'scatter': scatter_props,
                   'scatterMethod': scattermethod_props}  # NOTE: capital M

    # NOTE: This function generates schemas compatible with call sites in a
    # workflow. Specifically, the types of `inputs:` and `in:` are the same (Json)
    # but at a call site we want `out:`, NOT `outputs:`. The out: tag is a list,
    # so no per-output typed schema is generated here.
    output_names = list(cwl['outputs'].keys())
    outputs_keys = {'type': 'string', 'enum': output_names}

    # Moreover, we want to support the wic_anchor !& syntax in the out: tag,
    # i.e. list items of the form {output_name: {'wic_anchor': ...}}.
    # See utils_yaml.py
    anchor = default_schema()
    anchor['properties'] = {'wic_anchor': str_nonempty}  # !&
    keys_anchors = [named_schema(key, anchor) for key in output_names]

    out = {'type': 'array', 'items': {'anyOf': [outputs_keys, *keys_anchors]}}

    step_name = name + '.wic' if id_prefix == 'workflows' else name

    step_props = default_schema()
    step_props['title'] = cwl.get('label', '')
    step_props['description'] = cwl.get('doc', '')
    step_props['properties'] = {'in': inputs,
                                'out': out,  # NOT outputs! See comment above!
                                **scatter,
                                # The run: tag can be either a string path to another file,
                                # or the file contents can be inlined. For now, just use {}
                                'run': {},  # TODO
                                'id': {'type': 'string', 'const': step_name},
                                'hints': {},  # TODO
                                'requirements': {},  # TODO
                                }

    # NOTE: See comment in get_validator(). Nonetheless, the vscode YAML extension
    # appears to be resolving ids w.r.t. relative local paths. jsonschema
    # (correctly) treats f'tools/{name}.json' as an uninterpreted string,
    # so instead of using name let's just use fake relative paths in ids.
    step_props['$id'] = f'{id_prefix}/{name}.json'
    return step_props


def wic_tag_schema(hypothesis: bool = False) -> Json:
    """The schema of the (recursive) wic: metadata annotation tag.

    Args:
        hypothesis (bool): Determines whether we should restrict the search space. \
When True, the wildcard sub-schemas (in, out, scatter, implementations) are omitted.

    Returns:
        Json: The schema of the (recursive) wic: metadata annotation tag.
    """
    # NOTE: This schema needs to be recursive. Use dynamic anchors / references.
    # See https://json-schema.org/draft/2020-12/json-schema-core.html#dynamic-ref
    # and https://stackoverflow.com/questions/69728686/explanation-of-dynamicref-dynamicanchor-in-json-schema-as-opposed-to-ref-and

    def str_nonempty() -> Dict[str, Any]:
        # Return a fresh dict on every call so that the properties below do not
        # alias one shared object (restricting one must not restrict the others).
        return {'type': 'string', 'minLength': 1}

    # TODO: restrict the str to the enum of all valid step keys
    pat_int_str = "\\([0-9]+, [A-Za-z0-9_\\.]+\\)"

    graphviz_props: Json = {}
    # Is it useful to have an empty label? Let's bar it for now.
    graphviz_props['label'] = str_nonempty()
    pat_gv_style = '((,\\s*)*(dashed|dotted|solid|invis|bold|tapered|filled|striped|wedged|diagonals|rounded))+'
    graphviz_props['style'] = {'type': 'string', 'pattern': pat_gv_style}
    graphviz_props['ranksame'] = {'type': 'array',
                                  'items': {'type': 'string', 'pattern': pat_int_str}}

    graphviz_schema = default_schema()
    graphviz_schema['properties'] = graphviz_props

    # Call recursive reference
    recursive_ref = {'$dynamicRef': '#wic'}
    scattermethod_props: Json = {'type': 'string', 'enum': ['dotproduct', 'flat_crossproduct', 'nested_crossproduct']}

    choices_props: Json = {'wic': recursive_ref, 'scatterMethod': scattermethod_props}  # NOTE: capital M
    if not hypothesis:
        # Empty wildcard {} schemas can cause problems with hypothesis.
        choices_props['in'] = {}  # TODO: Add yml specific properties
        choices_props['out'] = {}  # TODO: Add yml specific properties
        choices_props['scatter'] = {}  # TODO: Add yml specific properties
    choices = default_schema()
    choices['properties'] = choices_props

    # See https://json-schema.org/understanding-json-schema/reference/object.html#patternproperties
    # NOTE: This recursive schema is correct, as determined by jsonschema.validate()
    # However, it seems that the vscode YAML extension does not support recursive
    # schema. (IntelliSense works fine until the first instance of recursion.)
    # TODO: A workaround would be to autogenerate a specific schema for each
    # yml file. We should probably do this anyway for the in: tag.
    steps = default_schema()
    # additionalProperties = False still works with patternProperties FYI
    steps['patternProperties'] = {pat_int_str: choices}

    # TODO: Restrict the implementation properties and make default_implementation an enum
    implementations: Dict[Any, Any] = {'type': 'object', 'additionalProperties': True}

    # TODO: Restrict the namespace properties to only those in search_paths_wic
    namespace: Dict[Any, Any] = str_nonempty()

    # The version must be a semantic version string, e.g. "1.2.3".
    pat_semver = "^[0-9]+\\.[0-9]+\\.[0-9]+$"
    version = {'type': 'string', 'pattern': pat_semver}
    driver = {'type': 'string', 'enum': ['slurm', 'argo']}

    schema = default_schema(url=True)
    schema['$id'] = 'wic_tag'
    # Create recursive anchor
    schema['$dynamicAnchor'] = 'wic'
    schema['title'] = 'Metadata annotations'
    schema['description'] = 'Use steps: to recursively overload / pass parameters.\nUse graphviz: to modify the DAGs.'

    schema_props: Json = {'graphviz': graphviz_schema, 'steps': steps,
                          'implementation': str_nonempty(),
                          'default_implementation': str_nonempty(),
                          'version': version, 'driver': driver,
                          'namespace': namespace, 'inlineable': {'type': 'boolean'}}
    if not hypothesis:
        # {'additionalProperties': True} can cause problems with hypothesis.
        schema_props['implementations'] = implementations
    schema['properties'] = schema_props
    return schema


def _call_site_schemas(id_prefix: str, stems: List[str],
                       schema_store: Dict[str, Json]) -> Tuple[List[Json], Dict[str, Json]]:
    """Builds the list-style and dict-style step schemas for the given step names."""
    as_list: List[Json] = []
    as_dict: Dict[str, Json] = {}
    for stem in stems:
        schema_id = f'{id_prefix}/{stem}.json'
        # Inline the schema when it is already known, else refer to it by id.
        schema = schema_store.get(schema_id, {'$ref': schema_id})
        as_list.append({'anyOf': [schema, named_id_schema(stem)]})
        as_dict[stem] = {'anyOf': [schema, {}]}
    return as_list, as_dict


def wic_main_schema(tools_cwl: Tools, yml_stems: List[str], schema_store: Dict[str, Json], hypothesis: bool = False) -> Json:
    """The main schema which is used to validate yml files.

    Args:
        tools_cwl (Tools): The CWL CommandLineTool definitions found using get_tools_cwl()
        yml_stems (List[str]): The names of the yml workflow definitions found using get_yml_paths()
        schema_store (Dict[str, Json]): A global mapping between ids and schemas
        hypothesis (bool): Determines whether we should restrict the search space.

    Returns:
        Json: The main schema which is used to validate yml files.
    """
    wildcard_schema: Json = {'type': 'object', 'additionalProperties': True}

    # NOTE: As mentioned below, using $ref's with external schema files
    # (coincidentally?) works with the VSCode YAML extension, and for the
    # jsonschema library we can supply an explicit schemastore. The API of the
    # hypothesis-jsonschema library, however, only takes a schema. So we either
    # need to bundle the external file contents into wic.json (using $def's),
    # or (since there is only one call site per file) simply inline the contents.
    # NOTE: See comment in get_validator(). Nonetheless, the vscode YAML extension
    # appears to be resolving ids w.r.t. relative local paths. jsonschema
    # (correctly) treats f'tools/{name}.json' as an uninterpreted string,
    # so instead of using stem let's just use fake relative paths in ids.
    tool_stems = [step_id.stem for step_id in tools_cwl
                  if not step_id.stem.startswith('python_script')]
    schemas_tools_list, schemas_tools_dict = _call_site_schemas('tools', tool_stems, schema_store)

    # NOTE: We could/should re-validate after every AST modification. This will
    # require substantial code changes, so let's not worry about it for now.
    schemas_yml_list, schemas_yml_dict = _call_site_schemas('workflows', yml_stems, schema_store)

    steps_list: Json = {'type': 'array', 'description': 'A list of workflow steps'}

    steps_dict: Json = {'type': 'object', 'description': 'An ordered dict of workflow steps'}
    steps_dict['properties'] = {**schemas_tools_dict, **schemas_yml_dict}  # **wildcard_schema ??

    if hypothesis:
        # For performance reasons, limit the size of the schema. The first time
        # you call .example(), hypothesis will compile the schema and cache the
        # results. Subsequent .example() calls are nearly instantaneous.
        # The time increases fairly rapidly with k, i.e.
        k = 1  # 1-5 minutes...
        # Choose a random subset so we're not testing the same files.
        # Bounding k by the length avoids an error when a list is empty.
        schemas_tools_list = random.sample(schemas_tools_list, min(k, len(schemas_tools_list)))
        schemas_yml_list = random.sample(schemas_yml_list, min(k, len(schemas_yml_list)))

    steps_schemas_list = schemas_tools_list + schemas_yml_list

    str_nonempty = {'type': 'string', 'minLength': 1}

    if not hypothesis:
        in_schema: Json = {'type': 'object', 'additionalProperties': True,
                           'properties': {'script': str_nonempty}}

        # NOTE(review): the value of python_script is validated with the
        # permissive in_schema only; a stricter {'in': ..., 'out': ...} schema
        # is not wired in here. Confirm whether that is intended.
        python_script_schema = default_schema()
        python_script_schema['properties'] = {'python_script': in_schema}
        steps_schemas_list.append(python_script_schema)

        # In the tool and workflow schemas above, we are able to use the step name
        # to uniquely identify the associated schema of the child node.
        # We want to support raw CWL, where the step name is not necessarily
        # the name of the CommandLineTool / Subworkflow.
        # We still know which schema we should use by looking at the run tag.
        # This shouldn't be terribly difficult to implement, but for now
        # let's just use a wildcard schema with arbitrary keys and values.
        # (So for now intellisense code completion is unavailable for these steps.)
        # Crucially, code completion *is preserved* for the tool and workflow schemas!
        steps_schemas_list.append(wildcard_schema)

    steps_list['items'] = {'anyOf': steps_schemas_list, 'title': 'Valid workflow steps'}
    # minItems / maxItems constrain the array itself, so they belong next to
    # 'items' (inside the 'items' sub-schema they would apply to each element).
    steps_list['minItems'] = 1
    if hypothesis:
        # For performance reasons, limit the number of steps.
        # This should (hopefully) avoid hypothesis.errors.DeadlineExceeded
        steps_list['maxItems'] = 5

    # TODO: Use the real CWL inputs schema
    inputs: Dict[Any, Any] = {'type': 'object', 'additionalProperties': True}

    # TODO: Use the real CWL outputs schema
    outputs: Dict[Any, Any] = {'type': 'object', 'additionalProperties': True}

    schema = default_schema(url=True)
    schema['$id'] = 'wic_main'
    schema['title'] = 'Validating against the Workflow Inference Compiler schema'
    # NOTE: 'steps' is deliberately not required, e.g. npt.wic

    steps = {'anyOf': [steps_list, steps_dict]}

    schema_props: Json = {'steps': steps,
                          'class': str_nonempty,
                          'cwlVersion': str_nonempty,  # TODO enum https://www.commonwl.org/v1.2/Workflow.html#CWLVersion
                          # TODO https://www.commonwl.org/v1.2/SchemaSalad.html#Explicit_context
                          '$base': str_nonempty,
                          '$namespaces': {},  # TODO
                          '$schemas': {},  # TODO
                          '$graph': {},  # TODO
                          '$import': {},  # TODO https://www.commonwl.org/v1.2/SchemaSalad.html#Import
                          '$include': {},  # TODO https://www.commonwl.org/v1.2/SchemaSalad.html#Include
                          'id': str_nonempty,
                          'hints': {},  # TODO
                          'requirements': {},  # TODO
                          'label': str_nonempty,
                          'doc': str_nonempty}

    if not hypothesis:
        # NOTE: The wic tag schema technically 'works' with hypothesis,
        # but it still needs some work, so only include it otherwise.
        schema_props['wic'] = wic_tag_schema(hypothesis)
        # {'additionalProperties': True} can cause problems with hypothesis.
        schema_props['inputs'] = inputs
        schema_props['outputs'] = outputs

    schema['properties'] = schema_props

    # NOTE: Bundling schema_store into schema['$defs'] would require a deepcopy
    # (otherwise json.dumps raises "ValueError: Circular reference detected"). See
    # https://json-schema.org/understanding-json-schema/structuring.html#bundling

    return schema


def compile_workflow_generate_schema(homedir: str,
                                     yml_path_str: str, yml_path: Path,
                                     tools_cwl: Tools,
                                     yml_paths: Dict[str, Dict[str, Path]],
                                     validator: Draft202012Validator,
                                     ignore_validation_errors: bool,
                                     allow_raw_cwl: bool) -> Json:
    """Compiles a single yml workflow and derives its call-site schema from the compiled CWL.

    As a side effect this creates the autogenerated/ directory (relative to the
    current working directory) and writes the yml tree, as it is after the
    python_script step, to autogenerated/<stem>_tree_python_script.wic

    Args:
        homedir (str): The users home directory
        yml_path_str (str): The stem of the path to the yml file
        yml_path (Path): The path to the yml file
        tools_cwl (Tools): The CWL CommandLineTool definitions found using get_tools_cwl()
        yml_paths (Dict[str, Dict[str, Path]]): The yml workflow definitions found using get_yml_paths()
        validator (Draft202012Validator): Used to validate the yml files against the autogenerated schema.
        ignore_validation_errors (bool): Temporarily ignore validation errors. Do not use this permanently!
        allow_raw_cwl (bool): Do not check whether the input to a workflow step refers to the workflow inputs: tag

    Returns:
        Json: An autogenerated, documented schema based on the inputs and outputs of the Workflow.
    """
    # Step 1: load the high-level yaml workflow file (with the custom wic tags).
    with open(yml_path, mode='r', encoding='utf-8') as yml_file:
        root_yaml_tree: Yaml = yaml.load(yml_file.read(), Loader=wic_loader())
    Path('autogenerated/').mkdir(parents=True, exist_ok=True)

    # The plugin namespace comes from the wic: metadata tag, if there is one.
    plugin_ns = root_yaml_tree.get('wic', {}).get('namespace', 'global')
    step_id = StepId(yml_path_str, plugin_ns)

    # Step 2: build the AST; read subworkflows from disk, merge, expand python scripts.
    tree_raw = ast.read_ast_from_disk(homedir, YamlTree(step_id, root_yaml_tree), yml_paths,
                                      tools_cwl, validator, ignore_validation_errors)
    tree_merged = ast.merge_yml_trees(tree_raw, {}, tools_cwl)
    root_yml_dir_abs = Path(yml_path).parent.absolute()
    yaml_tree = ast.python_script_generate_cwl(tree_merged, root_yml_dir_abs, tools_cwl)

    dump_path = f'autogenerated/{Path(yml_path).stem}_tree_python_script.wic'
    with open(dump_path, mode='w', encoding='utf-8') as dump_file:
        dump_file.write(yaml.dump(yaml_tree.yml))

    # Step 3: compile. The compiler wants graph representations to draw into.
    graph_gv = graphviz.Digraph(name=f'cluster_{yml_path}')
    graph_gv.attr(newrank='True')
    graph = GraphReps(graph_gv, nx.DiGraph(), GraphData(str(yml_path)))

    extra_args = ['--allow_raw_cwl'] if allow_raw_cwl else []
    args = get_args(str(yml_path), extra_args)
    compiler_info = compiler.compile_workflow(yaml_tree, args, [], [graph], {}, {}, {}, {},
                                              tools_cwl, True, relative_run_path=True, testing=True)

    # Step 4: the root of the compiled tree holds the CWL of the whole workflow.
    root_node_data: NodeData = compiler_info.rose.data
    return cwl_schema(step_id.stem, root_node_data.compiled_cwl, 'workflows')


def get_validator(tools_cwl: Tools, yml_stems: List[str], schema_store: Optional[Dict[str, Json]] = None,
                  write_to_disk: bool = False, hypothesis: bool = False) -> Draft202012Validator:
    """Generates the main schema used to check the yml files for correctness and returns a validator.

    Args:
        tools_cwl (Tools): The CWL CommandLineTool definitions found using get_tools_cwl()
        yml_stems (List[str]): The names of the yml workflow definitions found using get_yml_paths()
        schema_store (Optional[Dict[str, Json]]): A global mapping between ids and schemas. \
A supplied store is updated in place; when omitted, a new empty store is created for this call.
        write_to_disk (bool): Controls whether to write the schemas to disk.
        hypothesis (bool): Determines whether we should restrict the search space.

    Returns:
        Draft202012Validator: A validator which is used to check the yml files for correctness.
    """
    # NOTE: Do not use a mutable default argument here. The store is mutated
    # below, so a shared default would leak schemas from one call into the next.
    if schema_store is None:
        schema_store = {}

    for step_id, tool in tools_cwl.items():
        schema_tool = cwl_schema(step_id.stem, tool.cwl, 'tools')
        schema_store[schema_tool['$id']] = schema_tool

    # Add temporary placeholders to the schema_store so we don't get
    # "jsonschema.exceptions.RefResolutionError: unknown url type: 'workflows/*.json'"
    for yml_stem in yml_stems:
        schema_store.setdefault(f'workflows/{yml_stem}.json', {})

    schema = wic_main_schema(tools_cwl, yml_stems, schema_store, hypothesis)
    schema_store[schema['$id']] = schema
    schema_store['wic_tag'] = wic_tag_schema(hypothesis)

    schema_path = Path('autogenerated/schemas/wic.json')
    if write_to_disk:
        # Make sure the target directory exists before writing.
        schema_path.parent.mkdir(parents=True, exist_ok=True)
        with open(schema_path, mode='w', encoding='utf-8') as f:
            f.write(json.dumps(schema, indent=2))

    # Load cached schema from disk
    # NOTE: This may or may not be the correct solution. We should double check
    # all of the call sites of get_validator and the write_to_disk parameters.
    # In particular, when write_to_disk is False and the file exists, the
    # schema on disk replaces the one which was just generated above.
    if schema_path.exists():
        with open(schema_path, mode='r', encoding='utf-8') as r:
            schema = json.loads(r.read())

    # See https://stackoverflow.com/questions/53968770/how-to-set-up-local-file-references-in-python-jsonschema-document
    # The $ref tag refers to URIs defined in $id tags, NOT relative paths on
    # the local filesystem! We need to create a global mapping between ids and schemas
    # i.e. schema_store.
    resolver = RefResolver.from_schema(schema, store=schema_store)

    # Use check_schema to 'first verify that the provided schema is
    # itself valid, since not doing so can lead to less obvious error
    # messages and fail in less obvious or consistent ways.'
    # i.e. This should match 'https://json-schema.org/draft/2020-12/schema'
    # NOTE: If you get nasty errors while developing the schema such as:
    # "jsonschema.exceptions.SchemaError: ... is not valid under any of the given schemas"
    # try temporarily commenting this line out to generate the schema anyway.
    # Then, in any yml file, the very first line should show a "schema stack trace"
    Draft202012Validator.check_schema(schema)
    return Draft202012Validator(schema, resolver=resolver)
