"""Code generated by Speakeasy (https://speakeasy.com). DO NOT EDIT."""

from .basesdk import BaseSDK
from friendli import models, utils
from friendli._hooks import HookContext
from friendli.types import OptionalNullable, UNSET
from friendli.utils import eventstreaming, get_security_from_env
from typing import List, Mapping, Optional, Union


class FriendliChat(BaseSDK):
    def complete(
        self,
        *,
        model: str,
        messages: Union[List[models.Message], List[models.MessageTypedDict]],
        x_friendli_team: Optional[str] = None,
        eos_token: OptionalNullable[List[int]] = UNSET,
        frequency_penalty: OptionalNullable[float] = UNSET,
        logit_bias: OptionalNullable[
            Union[
                models.DedicatedChatCompleteBodyLogitBias,
                models.DedicatedChatCompleteBodyLogitBiasTypedDict,
            ]
        ] = UNSET,
        logprobs: OptionalNullable[bool] = UNSET,
        max_tokens: OptionalNullable[int] = UNSET,
        min_tokens: OptionalNullable[int] = 0,
        n: OptionalNullable[int] = 1,
        parallel_tool_calls: OptionalNullable[bool] = UNSET,
        presence_penalty: OptionalNullable[float] = UNSET,
        repetition_penalty: OptionalNullable[float] = UNSET,
        response_format: Optional[
            Union[models.ResponseFormat, models.ResponseFormatTypedDict]
        ] = None,
        seed: OptionalNullable[List[int]] = UNSET,
        stop: OptionalNullable[List[str]] = UNSET,
        stream: OptionalNullable[bool] = False,
        stream_options: OptionalNullable[
            Union[
                models.DedicatedChatCompleteBodyStreamOptions,
                models.DedicatedChatCompleteBodyStreamOptionsTypedDict,
            ]
        ] = UNSET,
        temperature: OptionalNullable[float] = 1,
        timeout_microseconds: OptionalNullable[int] = UNSET,
        tool_choice: Optional[
            Union[
                models.DedicatedChatCompleteBodyToolChoice,
                models.DedicatedChatCompleteBodyToolChoiceTypedDict,
            ]
        ] = None,
        tools: OptionalNullable[
            Union[List[models.Tool], List[models.ToolTypedDict]]
        ] = UNSET,
        top_k: OptionalNullable[int] = 0,
        top_logprobs: OptionalNullable[int] = UNSET,
        top_p: OptionalNullable[float] = 1,
        retries: OptionalNullable[utils.RetryConfig] = UNSET,
        server_url: Optional[str] = None,
        timeout_ms: Optional[int] = None,
        http_headers: Optional[Mapping[str, str]] = None,
    ) -> models.ChatResult:
        r"""Chat completions

        Given a list of messages forming a conversation, the model generates a response.

        The keyword arguments are packed into a `DedicatedChatCompleteRequest`, sent as a JSON `POST` to `/dedicated/v1/chat/completions`, and a `200` / `application/json` reply is parsed into a `ChatResult`.

        :param model: ID of the target endpoint. To send the request to a specific adapter, use the "ENDPOINT_ID:ADAPTER_ROUTE" format.
        :param messages: The list of messages making up the conversation so far.
        :param x_friendli_team: ID of the team to run the request as (optional parameter).
        :param eos_token: A list of endpoint sentence tokens.
        :param frequency_penalty: Number between -2.0 and 2.0. Positive values penalize tokens that have been sampled, taking into account their frequency in the preceding text. This reduces the model's tendency to reproduce identical lines verbatim.
        :param logit_bias: A JSON object mapping tokens to an associated bias value. The bias is added to the logits generated by the model prior to sampling. The exact effect varies per model.
        :param logprobs: Whether to return log probabilities of the output tokens.
        :param max_tokens: The maximum number of tokens to generate. For decoder-only models like GPT, the length of the input tokens plus `max_tokens` should not exceed the model's maximum length (e.g., 2048 for OpenAI GPT-3). For encoder-decoder models like T5 or BlenderBot, `max_tokens` should not exceed the model's maximum output length. Similar to Hugging Face's [`max_new_tokens`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.GenerationConfig.max_new_tokens) argument.
        :param min_tokens: The minimum number of tokens to generate. Default value is 0. Similar to Hugging Face's [`min_new_tokens`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.generationconfig.min_new_tokens) argument.  **This field is unsupported when `tools` are specified.**
        :param n: The number of independently generated results for the prompt. Not supported when using beam search. Defaults to 1. Similar to Hugging Face's [`num_return_sequences`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.GenerationConfig.num_return_sequences) argument.
        :param parallel_tool_calls: Whether to enable parallel function calling.
        :param presence_penalty: Number between -2.0 and 2.0. Positive values penalize tokens that have been sampled at least once in the existing text.
        :param repetition_penalty: Penalizes tokens that have already appeared in the generated result (plus the input tokens for decoder-only models). Should be greater than or equal to 1.0 (1.0 means no penalty). See [keskar et al., 2019](https://arxiv.org/abs/1909.05858) for more details. Similar to Hugging Face's [`repetition_penalty`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.generationconfig.repetition_penalty) argument.
        :param response_format: The enforced format of the model's output.  The content of the output message may be truncated if it exceeds `max_tokens`; check for this by verifying that the `finish_reason` of the output message is `length`.  ***Important*** You must explicitly instruct the model to produce the desired output format using a system prompt or user message (e.g., `You are an API generating a valid JSON as output.`). Otherwise, the model may result in an unending stream of whitespace or other characters.
        :param seed: Seed to control the random procedure. If nothing is given, a random seed is used for sampling and returned along with the generated result. When using the `n` argument, a list of seed values can be passed to control all of the independent generations.
        :param stop: When one of the stop phrases appears in the generation result, the API stops generation. The stop phrases are excluded from the result. Defaults to empty list.
        :param stream: Whether to stream the generation result. When set true, each token is sent as [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#event_stream_format) once generated.
        :param stream_options: Options related to stream. It can only be used when `stream: true`.
        :param temperature: Sampling temperature. A smaller temperature makes the generation result closer to greedy, argmax (i.e., `top_k = 1`) sampling. Defaults to 1.0. Similar to Hugging Face's [`temperature`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.generationconfig.temperature) argument.
        :param timeout_microseconds: Request timeout. Gives the `HTTP 429 Too Many Requests` response status code. Default behavior is no timeout.
        :param tool_choice: Determines the tool calling behavior of the model. When set to `none`, the model bypasses tool execution and generates a response directly. In `auto` mode (the default), the model dynamically decides whether to call a tool or respond with a message. Setting `required` ensures that the model invokes at least one tool before responding to the user. A particular tool can also be specified by `{"type": "function", "function": {"name": "my_function"}}`.
        :param tools: A list of tools the model may call. Currently, only functions are supported as a tool. A maximum of 128 functions is supported. Use this to provide a list of functions the model may generate JSON inputs for.  **When `tools` are specified, `min_tokens` field is unsupported.**
        :param top_k: The number of highest probability tokens to keep for sampling. Numbers between 0 and the vocab size of the model (both inclusive) are allowed. The default value is 0, which means that the API does not apply top-k filtering. Similar to Hugging Face's [`top_k`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.GenerationConfig.top_k) argument.
        :param top_logprobs: The number of most likely tokens to return at each token position, each with an associated log probability. `logprobs` must be set to true if this parameter is used.
        :param top_p: Tokens comprising the top `top_p` probability mass are kept for sampling. Numbers between 0.0 (exclusive) and 1.0 (inclusive) are allowed. Defaults to 1.0. Similar to Hugging Face's [`top_p`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.GenerationConfig.top_p) argument.
        :param retries: Override the default retry configuration for this method. When left unset, the SDK-level retry configuration is used if one is set, otherwise a built-in backoff configuration.
        :param server_url: Override the default server URL for this method
        :param timeout_ms: Override the default request timeout configuration for this method in milliseconds. When `None`, the SDK configuration's `timeout_ms` is used.
        :param http_headers: Additional headers to set or replace on requests.
        :returns: The `ChatResult` unmarshalled from a `200` response with an `application/json` content type.
        :raises models.SDKError: When the response matches `4XX`/`5XX`, or when it matches neither the success case nor the error case.
        """
        # A per-call timeout wins; otherwise use the SDK-wide setting.
        effective_timeout_ms = (
            self.sdk_configuration.timeout_ms if timeout_ms is None else timeout_ms
        )

        # Arguments that may arrive as typed dicts are converted to their
        # pydantic models before the body is assembled.
        message_models = utils.get_pydantic_model(messages, List[models.Message])
        logit_bias_model = utils.get_pydantic_model(
            logit_bias,
            OptionalNullable[models.DedicatedChatCompleteBodyLogitBias],
        )
        response_format_model = utils.get_pydantic_model(
            response_format, Optional[models.ResponseFormat]
        )
        stream_options_model = utils.get_pydantic_model(
            stream_options,
            OptionalNullable[models.DedicatedChatCompleteBodyStreamOptions],
        )
        tool_choice_model = utils.get_pydantic_model(
            tool_choice, Optional[models.DedicatedChatCompleteBodyToolChoice]
        )
        tool_models = utils.get_pydantic_model(
            tools, OptionalNullable[List[models.Tool]]
        )

        payload = models.DedicatedChatCompleteBody(
            model=model,
            messages=message_models,
            eos_token=eos_token,
            frequency_penalty=frequency_penalty,
            logit_bias=logit_bias_model,
            logprobs=logprobs,
            max_tokens=max_tokens,
            min_tokens=min_tokens,
            n=n,
            parallel_tool_calls=parallel_tool_calls,
            presence_penalty=presence_penalty,
            repetition_penalty=repetition_penalty,
            response_format=response_format_model,
            seed=seed,
            stop=stop,
            stream=stream,
            stream_options=stream_options_model,
            temperature=temperature,
            timeout_microseconds=timeout_microseconds,
            tool_choice=tool_choice_model,
            tools=tool_models,
            top_k=top_k,
            top_logprobs=top_logprobs,
            top_p=top_p,
        )
        request = models.DedicatedChatCompleteRequest(
            x_friendli_team=x_friendli_team,
            dedicated_chat_complete_body=payload,
        )

        def _serialized_body():
            # Reads the body back off the request object, so serialization
            # sees whatever the request model holds when this is called.
            return utils.serialize_request_body(
                request.dedicated_chat_complete_body,
                False,
                False,
                "json",
                models.DedicatedChatCompleteBody,
            )

        prepared_request = self._build_request(
            method="POST",
            path="/dedicated/v1/chat/completions",
            # `server_url` is already None when no override was supplied.
            base_url=server_url,
            url_variables=None,
            request=request,
            request_body_required=True,
            request_has_path_params=False,
            request_has_query_params=True,
            user_agent_header="user-agent",
            accept_header_value="application/json",
            http_headers=http_headers,
            security=self.sdk_configuration.security,
            get_serialized_body=_serialized_body,
            timeout_ms=effective_timeout_ms,
        )

        # Retry precedence: explicit argument, then SDK configuration, then a
        # built-in backoff policy.
        if retries == UNSET:
            sdk_retries = self.sdk_configuration.retry_config
            # NOTE(review): the BackoffStrategy positionals are presumably
            # (initial interval, max interval, exponent, max elapsed time) -
            # confirm against utils.BackoffStrategy.
            retries = (
                sdk_retries
                if sdk_retries is not UNSET
                else utils.RetryConfig(
                    "backoff", utils.BackoffStrategy(500, 60000, 1.5, 3600000), True
                )
            )

        # Anything that is not a RetryConfig (e.g. an explicit None) leaves
        # retry_config as None.
        retry_config = (
            (retries, ["429", "500", "502", "503", "504"])
            if isinstance(retries, utils.RetryConfig)
            else None
        )

        hook_context = HookContext(
            operation_id="dedicatedChatComplete",
            oauth2_scopes=[],
            security_source=get_security_from_env(
                self.sdk_configuration.security, models.Security
            ),
        )
        response = self.do_request(
            hook_ctx=hook_context,
            request=prepared_request,
            error_status_codes=["4XX", "5XX"],
            retry_config=retry_config,
        )

        if utils.match_response(response, "200", "application/json"):
            return utils.unmarshal_json(response.text, models.ChatResult)

        # Every remaining path raises SDKError; only the message differs.
        is_api_error = utils.match_response(response, ["4XX", "5XX"], "*")
        content_type = None if is_api_error else response.headers.get("Content-Type")
        response_text = utils.stream_to_text(response)
        if is_api_error:
            error_message = "API error occurred"
        else:
            error_message = f"Unexpected response received (code: {response.status_code}, type: {content_type})"
        raise models.SDKError(
            error_message, response.status_code, response_text, response
        )

    async def complete_async(
        self,
        *,
        model: str,
        messages: Union[List[models.Message], List[models.MessageTypedDict]],
        x_friendli_team: Optional[str] = None,
        eos_token: OptionalNullable[List[int]] = UNSET,
        frequency_penalty: OptionalNullable[float] = UNSET,
        logit_bias: OptionalNullable[
            Union[
                models.DedicatedChatCompleteBodyLogitBias,
                models.DedicatedChatCompleteBodyLogitBiasTypedDict,
            ]
        ] = UNSET,
        logprobs: OptionalNullable[bool] = UNSET,
        max_tokens: OptionalNullable[int] = UNSET,
        min_tokens: OptionalNullable[int] = 0,
        n: OptionalNullable[int] = 1,
        parallel_tool_calls: OptionalNullable[bool] = UNSET,
        presence_penalty: OptionalNullable[float] = UNSET,
        repetition_penalty: OptionalNullable[float] = UNSET,
        response_format: Optional[
            Union[models.ResponseFormat, models.ResponseFormatTypedDict]
        ] = None,
        seed: OptionalNullable[List[int]] = UNSET,
        stop: OptionalNullable[List[str]] = UNSET,
        stream: OptionalNullable[bool] = False,
        stream_options: OptionalNullable[
            Union[
                models.DedicatedChatCompleteBodyStreamOptions,
                models.DedicatedChatCompleteBodyStreamOptionsTypedDict,
            ]
        ] = UNSET,
        temperature: OptionalNullable[float] = 1,
        timeout_microseconds: OptionalNullable[int] = UNSET,
        tool_choice: Optional[
            Union[
                models.DedicatedChatCompleteBodyToolChoice,
                models.DedicatedChatCompleteBodyToolChoiceTypedDict,
            ]
        ] = None,
        tools: OptionalNullable[
            Union[List[models.Tool], List[models.ToolTypedDict]]
        ] = UNSET,
        top_k: OptionalNullable[int] = 0,
        top_logprobs: OptionalNullable[int] = UNSET,
        top_p: OptionalNullable[float] = 1,
        retries: OptionalNullable[utils.RetryConfig] = UNSET,
        server_url: Optional[str] = None,
        timeout_ms: Optional[int] = None,
        http_headers: Optional[Mapping[str, str]] = None,
    ) -> models.ChatResult:
        r"""Chat completions

        Given a list of messages forming a conversation, the model generates a response.

        Coroutine variant: the keyword arguments are packed into a `DedicatedChatCompleteRequest`, sent as a JSON `POST` to `/dedicated/v1/chat/completions` through the awaited `do_request_async`, and a `200` / `application/json` reply is parsed into a `ChatResult`.

        :param model: ID of the target endpoint. To send the request to a specific adapter, use the "ENDPOINT_ID:ADAPTER_ROUTE" format.
        :param messages: The list of messages making up the conversation so far.
        :param x_friendli_team: ID of the team to run the request as (optional parameter).
        :param eos_token: A list of endpoint sentence tokens.
        :param frequency_penalty: Number between -2.0 and 2.0. Positive values penalize tokens that have been sampled, taking into account their frequency in the preceding text. This reduces the model's tendency to reproduce identical lines verbatim.
        :param logit_bias: A JSON object mapping tokens to an associated bias value. The bias is added to the logits generated by the model prior to sampling. The exact effect varies per model.
        :param logprobs: Whether to return log probabilities of the output tokens.
        :param max_tokens: The maximum number of tokens to generate. For decoder-only models like GPT, the length of the input tokens plus `max_tokens` should not exceed the model's maximum length (e.g., 2048 for OpenAI GPT-3). For encoder-decoder models like T5 or BlenderBot, `max_tokens` should not exceed the model's maximum output length. Similar to Hugging Face's [`max_new_tokens`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.GenerationConfig.max_new_tokens) argument.
        :param min_tokens: The minimum number of tokens to generate. Default value is 0. Similar to Hugging Face's [`min_new_tokens`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.generationconfig.min_new_tokens) argument.  **This field is unsupported when `tools` are specified.**
        :param n: The number of independently generated results for the prompt. Not supported when using beam search. Defaults to 1. Similar to Hugging Face's [`num_return_sequences`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.GenerationConfig.num_return_sequences) argument.
        :param parallel_tool_calls: Whether to enable parallel function calling.
        :param presence_penalty: Number between -2.0 and 2.0. Positive values penalize tokens that have been sampled at least once in the existing text.
        :param repetition_penalty: Penalizes tokens that have already appeared in the generated result (plus the input tokens for decoder-only models). Should be greater than or equal to 1.0 (1.0 means no penalty). See [keskar et al., 2019](https://arxiv.org/abs/1909.05858) for more details. Similar to Hugging Face's [`repetition_penalty`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.generationconfig.repetition_penalty) argument.
        :param response_format: The enforced format of the model's output.  The content of the output message may be truncated if it exceeds `max_tokens`; check for this by verifying that the `finish_reason` of the output message is `length`.  ***Important*** You must explicitly instruct the model to produce the desired output format using a system prompt or user message (e.g., `You are an API generating a valid JSON as output.`). Otherwise, the model may result in an unending stream of whitespace or other characters.
        :param seed: Seed to control the random procedure. If nothing is given, a random seed is used for sampling and returned along with the generated result. When using the `n` argument, a list of seed values can be passed to control all of the independent generations.
        :param stop: When one of the stop phrases appears in the generation result, the API stops generation. The stop phrases are excluded from the result. Defaults to empty list.
        :param stream: Whether to stream the generation result. When set true, each token is sent as [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#event_stream_format) once generated.
        :param stream_options: Options related to stream. It can only be used when `stream: true`.
        :param temperature: Sampling temperature. A smaller temperature makes the generation result closer to greedy, argmax (i.e., `top_k = 1`) sampling. Defaults to 1.0. Similar to Hugging Face's [`temperature`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.generationconfig.temperature) argument.
        :param timeout_microseconds: Request timeout. Gives the `HTTP 429 Too Many Requests` response status code. Default behavior is no timeout.
        :param tool_choice: Determines the tool calling behavior of the model. When set to `none`, the model bypasses tool execution and generates a response directly. In `auto` mode (the default), the model dynamically decides whether to call a tool or respond with a message. Setting `required` ensures that the model invokes at least one tool before responding to the user. A particular tool can also be specified by `{"type": "function", "function": {"name": "my_function"}}`.
        :param tools: A list of tools the model may call. Currently, only functions are supported as a tool. A maximum of 128 functions is supported. Use this to provide a list of functions the model may generate JSON inputs for.  **When `tools` are specified, `min_tokens` field is unsupported.**
        :param top_k: The number of highest probability tokens to keep for sampling. Numbers between 0 and the vocab size of the model (both inclusive) are allowed. The default value is 0, which means that the API does not apply top-k filtering. Similar to Hugging Face's [`top_k`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.GenerationConfig.top_k) argument.
        :param top_logprobs: The number of most likely tokens to return at each token position, each with an associated log probability. `logprobs` must be set to true if this parameter is used.
        :param top_p: Tokens comprising the top `top_p` probability mass are kept for sampling. Numbers between 0.0 (exclusive) and 1.0 (inclusive) are allowed. Defaults to 1.0. Similar to Hugging Face's [`top_p`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.GenerationConfig.top_p) argument.
        :param retries: Override the default retry configuration for this method. When left unset, the SDK-level retry configuration is used if one is set, otherwise a built-in backoff configuration.
        :param server_url: Override the default server URL for this method
        :param timeout_ms: Override the default request timeout configuration for this method in milliseconds. When `None`, the SDK configuration's `timeout_ms` is used.
        :param http_headers: Additional headers to set or replace on requests.
        :returns: The `ChatResult` unmarshalled from a `200` response with an `application/json` content type.
        :raises models.SDKError: When the response matches `4XX`/`5XX`, or when it matches neither the success case nor the error case.
        """
        # A per-call timeout wins; otherwise use the SDK-wide setting.
        effective_timeout_ms = (
            self.sdk_configuration.timeout_ms if timeout_ms is None else timeout_ms
        )

        # Arguments that may arrive as typed dicts are converted to their
        # pydantic models before the body is assembled.
        message_models = utils.get_pydantic_model(messages, List[models.Message])
        logit_bias_model = utils.get_pydantic_model(
            logit_bias,
            OptionalNullable[models.DedicatedChatCompleteBodyLogitBias],
        )
        response_format_model = utils.get_pydantic_model(
            response_format, Optional[models.ResponseFormat]
        )
        stream_options_model = utils.get_pydantic_model(
            stream_options,
            OptionalNullable[models.DedicatedChatCompleteBodyStreamOptions],
        )
        tool_choice_model = utils.get_pydantic_model(
            tool_choice, Optional[models.DedicatedChatCompleteBodyToolChoice]
        )
        tool_models = utils.get_pydantic_model(
            tools, OptionalNullable[List[models.Tool]]
        )

        payload = models.DedicatedChatCompleteBody(
            model=model,
            messages=message_models,
            eos_token=eos_token,
            frequency_penalty=frequency_penalty,
            logit_bias=logit_bias_model,
            logprobs=logprobs,
            max_tokens=max_tokens,
            min_tokens=min_tokens,
            n=n,
            parallel_tool_calls=parallel_tool_calls,
            presence_penalty=presence_penalty,
            repetition_penalty=repetition_penalty,
            response_format=response_format_model,
            seed=seed,
            stop=stop,
            stream=stream,
            stream_options=stream_options_model,
            temperature=temperature,
            timeout_microseconds=timeout_microseconds,
            tool_choice=tool_choice_model,
            tools=tool_models,
            top_k=top_k,
            top_logprobs=top_logprobs,
            top_p=top_p,
        )
        request = models.DedicatedChatCompleteRequest(
            x_friendli_team=x_friendli_team,
            dedicated_chat_complete_body=payload,
        )

        def _serialized_body():
            # Reads the body back off the request object, so serialization
            # sees whatever the request model holds when this is called.
            return utils.serialize_request_body(
                request.dedicated_chat_complete_body,
                False,
                False,
                "json",
                models.DedicatedChatCompleteBody,
            )

        # The request builder is called without `await`; only sending the
        # request and reading an error body are awaited below.
        prepared_request = self._build_request_async(
            method="POST",
            path="/dedicated/v1/chat/completions",
            # `server_url` is already None when no override was supplied.
            base_url=server_url,
            url_variables=None,
            request=request,
            request_body_required=True,
            request_has_path_params=False,
            request_has_query_params=True,
            user_agent_header="user-agent",
            accept_header_value="application/json",
            http_headers=http_headers,
            security=self.sdk_configuration.security,
            get_serialized_body=_serialized_body,
            timeout_ms=effective_timeout_ms,
        )

        # Retry precedence: explicit argument, then SDK configuration, then a
        # built-in backoff policy.
        if retries == UNSET:
            sdk_retries = self.sdk_configuration.retry_config
            # NOTE(review): the BackoffStrategy positionals are presumably
            # (initial interval, max interval, exponent, max elapsed time) -
            # confirm against utils.BackoffStrategy.
            retries = (
                sdk_retries
                if sdk_retries is not UNSET
                else utils.RetryConfig(
                    "backoff", utils.BackoffStrategy(500, 60000, 1.5, 3600000), True
                )
            )

        # Anything that is not a RetryConfig (e.g. an explicit None) leaves
        # retry_config as None.
        retry_config = (
            (retries, ["429", "500", "502", "503", "504"])
            if isinstance(retries, utils.RetryConfig)
            else None
        )

        hook_context = HookContext(
            operation_id="dedicatedChatComplete",
            oauth2_scopes=[],
            security_source=get_security_from_env(
                self.sdk_configuration.security, models.Security
            ),
        )
        response = await self.do_request_async(
            hook_ctx=hook_context,
            request=prepared_request,
            error_status_codes=["4XX", "5XX"],
            retry_config=retry_config,
        )

        if utils.match_response(response, "200", "application/json"):
            return utils.unmarshal_json(response.text, models.ChatResult)

        # Every remaining path raises SDKError; only the message differs.
        is_api_error = utils.match_response(response, ["4XX", "5XX"], "*")
        content_type = None if is_api_error else response.headers.get("Content-Type")
        response_text = await utils.stream_to_text_async(response)
        if is_api_error:
            error_message = "API error occurred"
        else:
            error_message = f"Unexpected response received (code: {response.status_code}, type: {content_type})"
        raise models.SDKError(
            error_message, response.status_code, response_text, response
        )

    def stream(
        self,
        *,
        model: str,
        messages: Union[List[models.Message], List[models.MessageTypedDict]],
        x_friendli_team: Optional[str] = None,
        eos_token: OptionalNullable[List[int]] = UNSET,
        frequency_penalty: OptionalNullable[float] = UNSET,
        logit_bias: OptionalNullable[
            Union[
                models.DedicatedChatStreamBodyLogitBias,
                models.DedicatedChatStreamBodyLogitBiasTypedDict,
            ]
        ] = UNSET,
        logprobs: OptionalNullable[bool] = UNSET,
        max_tokens: OptionalNullable[int] = UNSET,
        min_tokens: OptionalNullable[int] = 0,
        n: OptionalNullable[int] = 1,
        parallel_tool_calls: OptionalNullable[bool] = UNSET,
        presence_penalty: OptionalNullable[float] = UNSET,
        repetition_penalty: OptionalNullable[float] = UNSET,
        response_format: Optional[
            Union[models.ResponseFormat, models.ResponseFormatTypedDict]
        ] = None,
        seed: OptionalNullable[List[int]] = UNSET,
        stop: OptionalNullable[List[str]] = UNSET,
        stream: OptionalNullable[bool] = True,
        stream_options: OptionalNullable[
            Union[
                models.DedicatedChatStreamBodyStreamOptions,
                models.DedicatedChatStreamBodyStreamOptionsTypedDict,
            ]
        ] = UNSET,
        temperature: OptionalNullable[float] = 1,
        timeout_microseconds: OptionalNullable[int] = UNSET,
        tool_choice: Optional[
            Union[
                models.DedicatedChatStreamBodyToolChoice,
                models.DedicatedChatStreamBodyToolChoiceTypedDict,
            ]
        ] = None,
        tools: OptionalNullable[
            Union[List[models.Tool], List[models.ToolTypedDict]]
        ] = UNSET,
        top_k: OptionalNullable[int] = 0,
        top_logprobs: OptionalNullable[int] = UNSET,
        top_p: OptionalNullable[float] = 1,
        retries: OptionalNullable[utils.RetryConfig] = UNSET,
        server_url: Optional[str] = None,
        timeout_ms: Optional[int] = None,
        http_headers: Optional[Mapping[str, str]] = None,
    ) -> eventstreaming.EventStream[models.StreamedChatResult]:
        r"""Stream chat completions

        Given a list of messages forming a conversation, the model generates a response.

        :param model: ID of target endpoint. If you want to send request to specific adapter, using \"ENDPOINT_ID:ADAPTER_ROUTE\" format.
        :param messages: A list of messages comprising the conversation so far.
        :param x_friendli_team: ID of team to run requests as (optional parameter).
        :param eos_token: A list of endpoint sentence tokens.
        :param frequency_penalty: Number between -2.0 and 2.0. Positive values penalizes tokens that have been sampled, taking into account their frequency in the preceding text. This penalization diminishes the model's tendency to reproduce identical lines verbatim.
        :param logit_bias: Accepts a JSON object that maps tokens to an associated bias value. Mathematically, the bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model.
        :param logprobs: Whether to return log probabilities of the output tokens or not.
        :param max_tokens: The maximum number of tokens to generate. For decoder-only models like GPT, the length of your input tokens plus `max_tokens` should not exceed the model's maximum length (e.g., 2048 for OpenAI GPT-3). For encoder-decoder models like T5 or BlenderBot, `max_tokens` should not exceed the model's maximum output length. This is similar to Hugging Face's [`max_new_tokens`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.GenerationConfig.max_new_tokens) argument.
        :param min_tokens: The minimum number of tokens to generate. Default value is 0. This is similar to Hugging Face's [`min_new_tokens`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.generationconfig.min_new_tokens) argument.  **This field is unsupported when `tools` are specified.**
        :param n: The number of independently generated results for the prompt. Not supported when using beam search. Defaults to 1. This is similar to Hugging Face's [`num_return_sequences`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.GenerationConfig.num_return_sequences) argument.
        :param parallel_tool_calls: Whether to enable parallel function calling.
        :param presence_penalty: Number between -2.0 and 2.0. Positive values penalizes tokens that have been sampled at least once in the existing text.
        :param repetition_penalty: Penalizes tokens that have already appeared in the generated result (plus the input tokens for decoder-only models). Should be greater than or equal to 1.0 (1.0 means no penalty). See [keskar et al., 2019](https://arxiv.org/abs/1909.05858) for more details. This is similar to Hugging Face's [`repetition_penalty`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.generationconfig.repetition_penalty) argument.
        :param response_format: The enforced format of the model's output.  Note that the content of the output message may be truncated if it exceeds the `max_tokens`. You can check this by verifying that the `finish_reason` of the output message is `length`.  ***Important*** You must explicitly instruct the model to produce the desired output format using a system prompt or user message (e.g., `You are an API generating a valid JSON as output.`). Otherwise, the model may result in an unending stream of whitespace or other characters.
        :param seed: Seed to control random procedure. If nothing is given, random seed is used for sampling, and return the seed along with the generated result. When using the `n` argument, you can pass a list of seed values to control all of the independent generations.
        :param stop: When one of the stop phrases appears in the generation result, the API will stop generation. The stop phrases are excluded from the result. Defaults to empty list.
        :param stream: Whether to stream generation result. When set true, each token will be sent as [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#event_stream_format) once generated.
        :param stream_options: Options related to stream. It can only be used when `stream: true`.
        :param temperature: Sampling temperature. Smaller temperature makes the generation result closer to greedy, argmax (i.e., `top_k = 1`) sampling. Defaults to 1.0. This is similar to Hugging Face's [`temperature`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.generationconfig.temperature) argument.
        :param timeout_microseconds: Request timeout. Gives the `HTTP 429 Too Many Requests` response status code. Default behavior is no timeout.
        :param tool_choice: Determines the tool calling behavior of the model. When set to `none`, the model will bypass tool execution and generate a response directly. In `auto` mode (the default), the model dynamically decides whether to call a tool or respond with a message. Alternatively, setting `required` ensures that the model invokes at least one tool before responding to the user. You can also specify a particular tool by `{\"type\": \"function\", \"function\": {\"name\": \"my_function\"}}`.
        :param tools: A list of tools the model may call. Currently, only functions are supported as a tool. A maximum of 128 functions is supported. Use this to provide a list of functions the model may generate JSON inputs for.  **When `tools` are specified, `min_tokens` field is unsupported.**
        :param top_k: The number of highest probability tokens to keep for sampling. Numbers between 0 and the vocab size of the model (both inclusive) are allowed. The default value is 0, which means that the API does not apply top-k filtering. This is similar to Hugging Face's [`top_k`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.GenerationConfig.top_k) argument.
        :param top_logprobs: The number of most likely tokens to return at each token position, each with an associated log probability. `logprobs` must be set to true if this parameter is used.
        :param top_p: Tokens comprising the top `top_p` probability mass are kept for sampling. Numbers between 0.0 (exclusive) and 1.0 (inclusive) are allowed. Defaults to 1.0. This is similar to Hugging Face's [`top_p`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.GenerationConfig.top_p) argument.
        :param retries: Override the default retry configuration for this method
        :param server_url: Override the default server URL for this method
        :param timeout_ms: Override the default request timeout configuration for this method in milliseconds
        :param http_headers: Additional headers to set or replace on requests.
        """
        base_url = None
        url_variables = None
        if timeout_ms is None:
            timeout_ms = self.sdk_configuration.timeout_ms

        if server_url is not None:
            base_url = server_url

        request = models.DedicatedChatStreamRequest(
            x_friendli_team=x_friendli_team,
            dedicated_chat_stream_body=models.DedicatedChatStreamBody(
                model=model,
                messages=utils.get_pydantic_model(messages, List[models.Message]),
                eos_token=eos_token,
                frequency_penalty=frequency_penalty,
                logit_bias=utils.get_pydantic_model(
                    logit_bias,
                    OptionalNullable[models.DedicatedChatStreamBodyLogitBias],
                ),
                logprobs=logprobs,
                max_tokens=max_tokens,
                min_tokens=min_tokens,
                n=n,
                parallel_tool_calls=parallel_tool_calls,
                presence_penalty=presence_penalty,
                repetition_penalty=repetition_penalty,
                response_format=utils.get_pydantic_model(
                    response_format, Optional[models.ResponseFormat]
                ),
                seed=seed,
                stop=stop,
                stream=stream,
                stream_options=utils.get_pydantic_model(
                    stream_options,
                    OptionalNullable[models.DedicatedChatStreamBodyStreamOptions],
                ),
                temperature=temperature,
                timeout_microseconds=timeout_microseconds,
                tool_choice=utils.get_pydantic_model(
                    tool_choice, Optional[models.DedicatedChatStreamBodyToolChoice]
                ),
                tools=utils.get_pydantic_model(
                    tools, OptionalNullable[List[models.Tool]]
                ),
                top_k=top_k,
                top_logprobs=top_logprobs,
                top_p=top_p,
            ),
        )

        req = self._build_request(
            method="POST",
            path="/dedicated/v1/chat/completions#stream",
            base_url=base_url,
            url_variables=url_variables,
            request=request,
            request_body_required=True,
            request_has_path_params=False,
            request_has_query_params=True,
            user_agent_header="user-agent",
            accept_header_value="text/event-stream",
            http_headers=http_headers,
            security=self.sdk_configuration.security,
            get_serialized_body=lambda: utils.serialize_request_body(
                request.dedicated_chat_stream_body,
                False,
                False,
                "json",
                models.DedicatedChatStreamBody,
            ),
            timeout_ms=timeout_ms,
        )

        if retries == UNSET:
            if self.sdk_configuration.retry_config is not UNSET:
                retries = self.sdk_configuration.retry_config
            else:
                retries = utils.RetryConfig(
                    "backoff", utils.BackoffStrategy(500, 60000, 1.5, 3600000), True
                )

        retry_config = None
        if isinstance(retries, utils.RetryConfig):
            retry_config = (retries, ["429", "500", "502", "503", "504"])

        http_res = self.do_request(
            hook_ctx=HookContext(
                operation_id="dedicatedChatStream",
                oauth2_scopes=[],
                security_source=get_security_from_env(
                    self.sdk_configuration.security, models.Security
                ),
            ),
            request=req,
            error_status_codes=["4XX", "5XX"],
            stream=True,
            retry_config=retry_config,
        )

        if utils.match_response(http_res, "200", "text/event-stream"):
            return eventstreaming.EventStream(
                http_res,
                lambda raw: utils.unmarshal_json(raw, models.StreamedChatResult),
                sentinel="[DONE]",
            )
        if utils.match_response(http_res, ["4XX", "5XX"], "*"):
            http_res_text = utils.stream_to_text(http_res)
            raise models.SDKError(
                "API error occurred", http_res.status_code, http_res_text, http_res
            )

        content_type = http_res.headers.get("Content-Type")
        http_res_text = utils.stream_to_text(http_res)
        raise models.SDKError(
            f"Unexpected response received (code: {http_res.status_code}, type: {content_type})",
            http_res.status_code,
            http_res_text,
            http_res,
        )

    async def stream_async(
        self,
        *,
        model: str,
        messages: Union[List[models.Message], List[models.MessageTypedDict]],
        x_friendli_team: Optional[str] = None,
        eos_token: OptionalNullable[List[int]] = UNSET,
        frequency_penalty: OptionalNullable[float] = UNSET,
        logit_bias: OptionalNullable[
            Union[
                models.DedicatedChatStreamBodyLogitBias,
                models.DedicatedChatStreamBodyLogitBiasTypedDict,
            ]
        ] = UNSET,
        logprobs: OptionalNullable[bool] = UNSET,
        max_tokens: OptionalNullable[int] = UNSET,
        min_tokens: OptionalNullable[int] = 0,
        n: OptionalNullable[int] = 1,
        parallel_tool_calls: OptionalNullable[bool] = UNSET,
        presence_penalty: OptionalNullable[float] = UNSET,
        repetition_penalty: OptionalNullable[float] = UNSET,
        response_format: Optional[
            Union[models.ResponseFormat, models.ResponseFormatTypedDict]
        ] = None,
        seed: OptionalNullable[List[int]] = UNSET,
        stop: OptionalNullable[List[str]] = UNSET,
        stream: OptionalNullable[bool] = True,
        stream_options: OptionalNullable[
            Union[
                models.DedicatedChatStreamBodyStreamOptions,
                models.DedicatedChatStreamBodyStreamOptionsTypedDict,
            ]
        ] = UNSET,
        temperature: OptionalNullable[float] = 1,
        timeout_microseconds: OptionalNullable[int] = UNSET,
        tool_choice: Optional[
            Union[
                models.DedicatedChatStreamBodyToolChoice,
                models.DedicatedChatStreamBodyToolChoiceTypedDict,
            ]
        ] = None,
        tools: OptionalNullable[
            Union[List[models.Tool], List[models.ToolTypedDict]]
        ] = UNSET,
        top_k: OptionalNullable[int] = 0,
        top_logprobs: OptionalNullable[int] = UNSET,
        top_p: OptionalNullable[float] = 1,
        retries: OptionalNullable[utils.RetryConfig] = UNSET,
        server_url: Optional[str] = None,
        timeout_ms: Optional[int] = None,
        http_headers: Optional[Mapping[str, str]] = None,
    ) -> eventstreaming.EventStreamAsync[models.StreamedChatResult]:
        r"""Stream chat completions (asynchronous variant)

        Sends the conversation in `messages` to a dedicated endpoint via
        `POST /dedicated/v1/chat/completions` and exposes the model's response
        as a stream of server-sent events.

        :param model: ID of the target endpoint. To address a specific adapter, use the "ENDPOINT_ID:ADAPTER_ROUTE" format.
        :param messages: The messages that make up the conversation so far.
        :param x_friendli_team: ID of the team to run requests as (optional).
        :param eos_token: A list of endpoint sentence tokens.
        :param frequency_penalty: Number between -2.0 and 2.0. Positive values penalize tokens that have already been sampled, in proportion to their frequency in the preceding text, which reduces the model's tendency to repeat identical lines verbatim.
        :param logit_bias: JSON object mapping tokens to a bias value. The bias is added to the logits produced by the model before sampling; the exact effect varies per model.
        :param logprobs: Whether to return log probabilities of the output tokens.
        :param max_tokens: Maximum number of tokens to generate. For decoder-only models like GPT, the input length plus `max_tokens` should not exceed the model's maximum length (e.g., 2048 for OpenAI GPT-3). For encoder-decoder models like T5 or BlenderBot, `max_tokens` should not exceed the model's maximum output length. Similar to Hugging Face's [`max_new_tokens`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.GenerationConfig.max_new_tokens) argument.
        :param min_tokens: Minimum number of tokens to generate. Defaults to 0. Similar to Hugging Face's [`min_new_tokens`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.generationconfig.min_new_tokens) argument.  **This field is unsupported when `tools` are specified.**
        :param n: Number of independently generated results for the prompt. Not supported when using beam search. Defaults to 1. Similar to Hugging Face's [`num_return_sequences`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.GenerationConfig.num_return_sequences) argument.
        :param parallel_tool_calls: Whether to enable parallel function calling.
        :param presence_penalty: Number between -2.0 and 2.0. Positive values penalize tokens that have been sampled at least once in the existing text.
        :param repetition_penalty: Penalizes tokens that have already appeared in the generated result (plus the input tokens for decoder-only models). Should be greater than or equal to 1.0 (1.0 means no penalty). See [keskar et al., 2019](https://arxiv.org/abs/1909.05858) for details. Similar to Hugging Face's [`repetition_penalty`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.generationconfig.repetition_penalty) argument.
        :param response_format: The enforced format of the model's output.  The content of the output message may be truncated if it exceeds `max_tokens`; this shows up as a `finish_reason` of `length` on the output message.  ***Important*** You must explicitly instruct the model to produce the desired output format using a system prompt or user message (e.g., `You are an API generating a valid JSON as output.`). Otherwise, the model may produce an unending stream of whitespace or other characters.
        :param seed: Seed controlling the random procedure. If omitted, a random seed is used for sampling and returned along with the generated result. When using the `n` argument, a list of seed values can be passed to control each of the independent generations.
        :param stop: Stop phrases. Generation stops as soon as one of them appears in the result; the stop phrase itself is excluded from the result. Defaults to an empty list.
        :param stream: Whether to stream the generation result. When true, each token is sent as [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#event_stream_format) once generated.
        :param stream_options: Options related to streaming. Can only be used when `stream: true`.
        :param temperature: Sampling temperature. A smaller temperature moves the result closer to greedy, argmax (i.e., `top_k = 1`) sampling. Defaults to 1.0. Similar to Hugging Face's [`temperature`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.generationconfig.temperature) argument.
        :param timeout_microseconds: Request timeout. Gives the `HTTP 429 Too Many Requests` response status code. Default behavior is no timeout.
        :param tool_choice: Determines the tool calling behavior of the model. With `none`, the model bypasses tool execution and generates a response directly. In `auto` mode (the default), the model decides dynamically whether to call a tool or respond with a message. `required` ensures that the model invokes at least one tool before responding to the user. A particular tool can be selected with `{"type": "function", "function": {"name": "my_function"}}`.
        :param tools: A list of tools the model may call. Currently only functions are supported as a tool, up to a maximum of 128 functions. Use this to provide the functions the model may generate JSON inputs for.  **When `tools` are specified, `min_tokens` field is unsupported.**
        :param top_k: Number of highest-probability tokens to keep for sampling. Values between 0 and the vocab size of the model (both inclusive) are allowed. The default of 0 means the API does not apply top-k filtering. Similar to Hugging Face's [`top_k`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.GenerationConfig.top_k) argument.
        :param top_logprobs: Number of most likely tokens to return at each token position, each with an associated log probability. `logprobs` must be set to true if this parameter is used.
        :param top_p: Tokens comprising the top `top_p` probability mass are kept for sampling. Values between 0.0 (exclusive) and 1.0 (inclusive) are allowed. Defaults to 1.0. Similar to Hugging Face's [`top_p`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.GenerationConfig.top_p) argument.
        :param retries: Override the default retry configuration for this method
        :param server_url: Override the default server URL for this method
        :param timeout_ms: Override the default request timeout configuration for this method in milliseconds
        :param http_headers: Additional headers to set or replace on requests.
        :return: An `EventStreamAsync` over the HTTP response whose events are decoded as `models.StreamedChatResult`, using `[DONE]` as the stream sentinel. Returned only for a `200` response with content type `text/event-stream`.
        :raises models.SDKError: When the response matches `4XX`/`5XX`, or when its status code / content type combination is not one this method handles.
        """
        # An explicit server_url wins; None is passed through as the base URL
        # otherwise.
        base_url = server_url
        url_variables = None

        # Fall back to the SDK-wide timeout only when the caller gave none.
        if timeout_ms is None:
            timeout_ms = self.sdk_configuration.timeout_ms

        # Assemble the JSON body first; TypedDict inputs are converted to
        # their pydantic models by get_pydantic_model.
        stream_body = models.DedicatedChatStreamBody(
            model=model,
            messages=utils.get_pydantic_model(messages, List[models.Message]),
            eos_token=eos_token,
            frequency_penalty=frequency_penalty,
            logit_bias=utils.get_pydantic_model(
                logit_bias,
                OptionalNullable[models.DedicatedChatStreamBodyLogitBias],
            ),
            logprobs=logprobs,
            max_tokens=max_tokens,
            min_tokens=min_tokens,
            n=n,
            parallel_tool_calls=parallel_tool_calls,
            presence_penalty=presence_penalty,
            repetition_penalty=repetition_penalty,
            response_format=utils.get_pydantic_model(
                response_format, Optional[models.ResponseFormat]
            ),
            seed=seed,
            stop=stop,
            stream=stream,
            stream_options=utils.get_pydantic_model(
                stream_options,
                OptionalNullable[models.DedicatedChatStreamBodyStreamOptions],
            ),
            temperature=temperature,
            timeout_microseconds=timeout_microseconds,
            tool_choice=utils.get_pydantic_model(
                tool_choice, Optional[models.DedicatedChatStreamBodyToolChoice]
            ),
            tools=utils.get_pydantic_model(
                tools, OptionalNullable[List[models.Tool]]
            ),
            top_k=top_k,
            top_logprobs=top_logprobs,
            top_p=top_p,
        )
        request = models.DedicatedChatStreamRequest(
            x_friendli_team=x_friendli_team,
            dedicated_chat_stream_body=stream_body,
        )

        def _serialize_body():
            # Invoked by the request builder; reads the body back off the
            # request object at call time.
            return utils.serialize_request_body(
                request.dedicated_chat_stream_body,
                False,
                False,
                "json",
                models.DedicatedChatStreamBody,
            )

        req = self._build_request_async(
            method="POST",
            path="/dedicated/v1/chat/completions#stream",
            base_url=base_url,
            url_variables=url_variables,
            request=request,
            request_body_required=True,
            request_has_path_params=False,
            request_has_query_params=True,
            user_agent_header="user-agent",
            accept_header_value="text/event-stream",
            http_headers=http_headers,
            security=self.sdk_configuration.security,
            get_serialized_body=_serialize_body,
            timeout_ms=timeout_ms,
        )

        # Retry precedence: per-call argument, then the SDK configuration,
        # then the built-in backoff default.
        if retries == UNSET:
            retries = (
                self.sdk_configuration.retry_config
                if self.sdk_configuration.retry_config is not UNSET
                else utils.RetryConfig(
                    "backoff", utils.BackoffStrategy(500, 60000, 1.5, 3600000), True
                )
            )

        # Only a real RetryConfig enables retries; it is paired with a list of
        # status codes (presumably the ones that trigger a retry).
        retry_config = (
            (retries, ["429", "500", "502", "503", "504"])
            if isinstance(retries, utils.RetryConfig)
            else None
        )

        hook_context = HookContext(
            operation_id="dedicatedChatStream",
            oauth2_scopes=[],
            security_source=get_security_from_env(
                self.sdk_configuration.security, models.Security
            ),
        )
        http_res = await self.do_request_async(
            hook_ctx=hook_context,
            request=req,
            error_status_codes=["4XX", "5XX"],
            stream=True,
            retry_config=retry_config,
        )

        def _decode_event(raw):
            # Each raw event payload is unmarshalled into a StreamedChatResult.
            return utils.unmarshal_json(raw, models.StreamedChatResult)

        # Success path: hand the still-open response to the event stream.
        if utils.match_response(http_res, "200", "text/event-stream"):
            return eventstreaming.EventStreamAsync(
                http_res,
                _decode_event,
                sentinel="[DONE]",
            )

        # Everything below is a failure; the streamed body is read to text so
        # it can be attached to the raised error.
        if utils.match_response(http_res, ["4XX", "5XX"], "*"):
            error_message = "API error occurred"
            http_res_text = await utils.stream_to_text_async(http_res)
        else:
            content_type = http_res.headers.get("Content-Type")
            http_res_text = await utils.stream_to_text_async(http_res)
            error_message = f"Unexpected response received (code: {http_res.status_code}, type: {content_type})"

        raise models.SDKError(
            error_message,
            http_res.status_code,
            http_res_text,
            http_res,
        )
