from typing import NamedTuple, Literal
from adaptive_harmony import StringThread

# System prompt for the LLM judge. It names the four sections the judge will
# receive (CONTEXT, USER QUESTION, COMPLETION TO EVALUATE, EVALUATION
# CRITERION), lays out the PASS / FAIL / NA scoring rules, and demands a
# JSON-only reply.
# The only placeholder is `{json_schema}`; it is presumably filled in by the
# caller before the prompt is sent (the substitution code is not shown here).
# NOTE(review): the prompt text contains typos ("between human an AI model",
# "In order to to analyze"). They are left untouched because editing the prompt
# changes what the judge model sees; fix them deliberately and re-evaluate.
SYSTEM = """You are an expert evaluator that evaluates completions generated by an AI model on a fixed criterion.
You will be given all elements of an interaction between human an AI model:
    The full context of the conversation so far leading up to the last user turn/question is under the CONTEXT header. It may contain extra contextual information.
    The last user turn/question is under the USER QUESTION header. It may contain extra contextual information.
    The model's completion is under the COMPLETION TO EVALUATE header.
    The evaluation criterion is under the EVALUATION CRITERION section.

In order to to analyze and score a completion, you always run the following steps without exception:
First, you read the CONTEXT, USER QUESTION and COMPLETION TO EVALUATE.
Then, you analyze the COMPLETION TO EVALUATE, and assign it a PASS, FAIL or NA score according to the criterion: FAIL if the completion does not meet the criterion, PASS if it does, and NA if the criterion is not applicable to the example. These are the rules to follow:
- You must always evaluate the COMPLETION TO EVALUATE based solely on the USER QUESTION, and never on an intermediary question that might have been asked in the CONTEXT. The CONTEXT is there for context only.
- Do not make any judgement on text that is in the CONTEXT or USER QUESTION; you are evaluating the COMPLETION TO EVALUATE text only.
- You must not use the original instructions given to the model in the CONTEXT for your judgement. Focus only on the EVALUATION CRITERION, without any other influencing factors.
- You are forbidden to return a score other than PASS, FAIL or NA for each criterion.
- If the criterion is conditional, and is not applicable to the specific USER QUESTION + COMPLETION TO EVALUATE pair, you must score it as NA.
- Return a single score, no matter how many things are evaluated or contemplated in the criterion. A PASS means the completion complied with everything.

Finally, output an explanation for your judgement and the score for the criterion, as exemplified below. The output should be a well-formatted JSON string that conforms to the JSON schema below. Do not output anything else other than the JSON string.

Here is the output JSON schema you must strictly follow, with field descriptions and value types. All fields are required.
{json_schema}

Evaluate only the final COMPLETION TO EVALUATE with regard to the USER QUESTION shown. Do not return any preamble or explanations, Return exactly one JSON string"""


# User-turn template that accompanies SYSTEM. It has one section per header
# named in the system prompt and ends with an empty OUTPUT header, after which
# the judge's reply is expected to follow.
# Placeholders: {context}, {user_question}, {completion}, {criteria}.
# NOTE(review): presumably rendered with str.format or an equivalent; confirm
# at the call site, since literal braces in the inserted text would matter.
USER = """CONTEXT
{context}

USER QUESTION
{user_question}

COMPLETION TO EVALUATE
{completion}

EVALUATION CRITERION
{criteria}

OUTPUT
"""


class BinaryJudgeShot(NamedTuple):
    """
    One worked example (few-shot) for BinaryJudgeScorer.

    A shot pairs a conversation with the verdict expected for it and the
    reasoning behind that verdict. When ``criteria`` is left as ``None``,
    the shot is assumed to relate to the criteria passed to the
    BinaryJudgeScorer itself.
    """

    # Conversation the example verdict is about.
    thread: StringThread
    # Explanation that justifies the score.
    reasoning: str
    # Verdict for the example; restricted to the three labels the judge may emit.
    score: Literal["PASS", "FAIL", "NA"]
    # Criterion this shot illustrates; None defers to the scorer's own criteria.
    criteria: str | None = None


# Built-in example shots, one for each possible score (FAIL, PASS, NA).
# Every shot sets its own `criteria`, so none of them falls back to the
# criteria configured on the scorer.
# NOTE(review): presumably used when the caller supplies no shots of their
# own; confirm in BinaryJudgeScorer.
DEFAULT_SHOTS = [
    # FAIL example: the final answer is in English while the criterion
    # requires French.
    BinaryJudgeShot(
        thread=StringThread(
            [
                (
                    "system",
                    "You are a savy boating conosseur. You know everything there is to know about boats. Your responses to questions must be quick and short, almost dismissive.",
                ),
                ("user", " Hey, how are you doing?"),
                ("assistant", "Doing great!"),
                ("user", "Perfect. Tell me, how many masts does a ketch have?"),
                ("assistant", "A ketch has two masts"),
            ]
        ),
        reasoning="The completion is in English instead of french.",
        score="FAIL",
        criteria="The completion must be in french",
    ),
    # PASS example: a formal refund reply judged against a politeness /
    # no-slang criterion.
    BinaryJudgeShot(
        thread=StringThread(
            [
                (
                    "system",
                    "You are a customer service assistant. You can use informal tone if the user is informal with you, but always make sure the customer's problem is fixed.",
                ),
                (
                    "user",
                    "Write a reply to a customer called John Smith that has requested for a refund of our product. Our company policy is to provide refunds up to 30 days after the purchase.",
                ),
                (
                    "assistant",
                    "Dear John Smith,\nThank you for reaching out regarding your request for a refund. We appreciate your business and understand that sometimes products may not meet expectations.\nAs per our company policy, we are happy to provide refunds within 30 days of your purchase. If your request falls within this timeframe and the product is in its original condition, we will process your refund promptly. Please ensure that you return the item to us along with proof of purchase.\nIf you have any questions or need assistance with the return process, feel free to contact us.",
                ),
            ]
        ),
        reasoning="The completion is in general polite to the customer.",
        score="PASS",
        criteria="Completion must be polite, with use of proper language and no slang.",
    ),
    # NA example: same boating conversation as the FAIL shot, paired with a
    # conditional criterion (hunting topics) that the user question never
    # triggers.
    BinaryJudgeShot(
        thread=StringThread(
            [
                (
                    "system",
                    "You are a savy boating conosseur. You know everything there is to know about boats. Your responses to questions must be quick and short, almost dismissive.",
                ),
                ("user", " Hey, how are you doing?"),
                ("assistant", "Doing great!"),
                ("user", "Perfect. Tell me, how many masts does a ketch have?"),
                ("assistant", "A ketch has two masts"),
            ]
        ),
        reasoning="The user did not ask about hunting, criterion is not applicable",
        score="NA",
        criteria="If the user is asking about hunting topics, state that you cannot answer.",
    ),
]
