.clang-format
.dockerignore
.gitignore
.pre-commit-config.yaml
.readthedocs.yaml
.shellcheckrc
.yapfignore
CMakeLists.txt
CODE_OF_CONDUCT.md
CONTRIBUTING.md
DCO
Dockerfile
Dockerfile.arm
Dockerfile.cpu
Dockerfile.hpu
Dockerfile.neuron
Dockerfile.openvino
Dockerfile.ppc64le
Dockerfile.rocm
Dockerfile.rocm_base
Dockerfile.tpu
Dockerfile.xpu
LICENSE
MANIFEST.in
README.md
SECURITY.md
collect_env.py
find_cuda_init.py
format.sh
pyproject.toml
python_only_dev.py
requirements-build.txt
requirements-common.txt
requirements-cpu.txt
requirements-cuda.txt
requirements-dev.txt
requirements-hpu.txt
requirements-lint.txt
requirements-neuron.txt
requirements-openvino.txt
requirements-rocm-build.txt
requirements-rocm.txt
requirements-test.in
requirements-test.txt
requirements-tpu.txt
requirements-xpu.txt
setup.py
use_existing_torch.py
.buildkite/check-wheel-size.py
.buildkite/generate_index.py
.buildkite/release-pipeline.yaml
.buildkite/run-amd-test.sh
.buildkite/run-benchmarks.sh
.buildkite/run-cpu-test-ppc64le.sh
.buildkite/run-cpu-test.sh
.buildkite/run-gh200-test.sh
.buildkite/run-hpu-test.sh
.buildkite/run-multi-node-test.sh
.buildkite/run-neuron-test.sh
.buildkite/run-openvino-test.sh
.buildkite/run-tpu-test.sh
.buildkite/run-xpu-test.sh
.buildkite/test-pipeline.yaml
.buildkite/upload-wheels.sh
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
.buildkite/lm-eval-harness/run-tests.sh
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml
.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml
.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml
.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml
.buildkite/lm-eval-harness/configs/models-large.txt
.buildkite/lm-eval-harness/configs/models-small.txt
.buildkite/nightly-benchmarks/README.md
.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
.buildkite/nightly-benchmarks/nightly-annotation.md
.buildkite/nightly-benchmarks/nightly-descriptions.md
.buildkite/nightly-benchmarks/nightly-pipeline.yaml
.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
.buildkite/nightly-benchmarks/scripts/launch-server.sh
.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
.buildkite/nightly-benchmarks/tests/latency-tests.json
.buildkite/nightly-benchmarks/tests/nightly-tests.json
.buildkite/nightly-benchmarks/tests/serving-tests.json
.buildkite/nightly-benchmarks/tests/throughput-tests.json
.github/CODEOWNERS
.github/FUNDING.yml
.github/PULL_REQUEST_TEMPLATE.md
.github/dependabot.yml
.github/mergify.yml
.github/ISSUE_TEMPLATE/100-documentation.yml
.github/ISSUE_TEMPLATE/200-installation.yml
.github/ISSUE_TEMPLATE/300-usage.yml
.github/ISSUE_TEMPLATE/400-bug-report.yml
.github/ISSUE_TEMPLATE/500-feature-request.yml
.github/ISSUE_TEMPLATE/600-new-model.yml
.github/ISSUE_TEMPLATE/700-performance-discussion.yml
.github/ISSUE_TEMPLATE/750-RFC.yml
.github/ISSUE_TEMPLATE/800-misc-discussion.yml
.github/ISSUE_TEMPLATE/config.yml
.github/scripts/cleanup_pr_body.sh
.github/workflows/add_label_automerge.yml
.github/workflows/cleanup_pr_body.yml
.github/workflows/lint-and-deploy.yaml
.github/workflows/pre-commit.yml
.github/workflows/publish.yml
.github/workflows/reminder_comment.yml
.github/workflows/stale.yml
.github/workflows/matchers/actionlint.json
.github/workflows/matchers/mypy.json
.github/workflows/scripts/build.sh
.github/workflows/scripts/create_release.js
.github/workflows/scripts/cuda-install.sh
.github/workflows/scripts/env.sh
.github/workflows/scripts/pytorch-install.sh
benchmarks/README.md
benchmarks/backend_request_func.py
benchmarks/benchmark_guided.py
benchmarks/benchmark_latency.py
benchmarks/benchmark_long_document_qa_throughput.py
benchmarks/benchmark_prefix_caching.py
benchmarks/benchmark_prioritization.py
benchmarks/benchmark_serving.py
benchmarks/benchmark_serving_guided.py
benchmarks/benchmark_throughput.py
benchmarks/benchmark_utils.py
benchmarks/launch_tgi_server.sh
benchmarks/sonnet.txt
benchmarks/cutlass_benchmarks/sparse_benchmarks.py
benchmarks/cutlass_benchmarks/utils.py
benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
benchmarks/cutlass_benchmarks/weight_shapes.py
benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
benchmarks/disagg_benchmarks/round_robin_proxy.py
benchmarks/disagg_benchmarks/visualize_benchmark_results.py
benchmarks/fused_kernels/layernorm_rms_benchmarks.py
benchmarks/kernels/benchmark_aqlm.py
benchmarks/kernels/benchmark_layernorm.py
benchmarks/kernels/benchmark_lora.py
benchmarks/kernels/benchmark_machete.py
benchmarks/kernels/benchmark_marlin.py
benchmarks/kernels/benchmark_moe.py
benchmarks/kernels/benchmark_paged_attention.py
benchmarks/kernels/benchmark_quant.py
benchmarks/kernels/benchmark_rmsnorm.py
benchmarks/kernels/benchmark_rope.py
benchmarks/kernels/benchmark_shapes.py
benchmarks/kernels/graph_machete_bench.py
benchmarks/kernels/requirements.txt
benchmarks/kernels/utils.py
benchmarks/kernels/weight_shapes.py
benchmarks/overheads/benchmark_hashing.py
benchmarks/structured_schemas/structured_schema_1.json
cmake/cpu_extension.cmake
cmake/hipify.py
cmake/utils.cmake
csrc/activation_kernels.cu
csrc/cache.h
csrc/cache_kernels.cu
csrc/cuda_compat.h
csrc/cuda_utils.h
csrc/cuda_utils_kernels.cu
csrc/cumem_allocator.cpp
csrc/custom_all_reduce.cu
csrc/custom_all_reduce.cuh
csrc/custom_all_reduce_test.cu
csrc/dispatch_utils.h
csrc/layernorm_kernels.cu
csrc/layernorm_quant_kernels.cu
csrc/ops.h
csrc/permute_cols.cu
csrc/pos_encoding_kernels.cu
csrc/torch_bindings.cpp
csrc/type_convert.cuh
csrc/attention/attention_dtypes.h
csrc/attention/attention_generic.cuh
csrc/attention/attention_kernels.cuh
csrc/attention/attention_utils.cuh
csrc/attention/dtype_bfloat16.cuh
csrc/attention/dtype_float16.cuh
csrc/attention/dtype_float32.cuh
csrc/attention/dtype_fp8.cuh
csrc/attention/paged_attention_v1.cu
csrc/attention/paged_attention_v2.cu
csrc/core/exception.hpp
csrc/core/math.hpp
csrc/core/registration.h
csrc/core/scalar_type.hpp
csrc/cpu/activation.cpp
csrc/cpu/attention.cpp
csrc/cpu/cache.cpp
csrc/cpu/cpu_types.hpp
csrc/cpu/cpu_types_arm.hpp
csrc/cpu/cpu_types_vsx.hpp
csrc/cpu/cpu_types_x86.hpp
csrc/cpu/dnnl_helper.hpp
csrc/cpu/layernorm.cpp
csrc/cpu/pos_encoding.cpp
csrc/cpu/quant.cpp
csrc/cpu/torch_bindings.cpp
csrc/cpu/utils.cpp
csrc/cutlass_extensions/common.cpp
csrc/cutlass_extensions/common.hpp
csrc/cutlass_extensions/cute_utils.cuh
csrc/cutlass_extensions/torch_utils.hpp
csrc/cutlass_extensions/vllm_collective_builder.cuh
csrc/cutlass_extensions/vllm_custom_types.cuh
csrc/cutlass_extensions/vllm_cutlass_library_extension.py
csrc/cutlass_extensions/vllm_numeric_conversion.cuh
csrc/cutlass_extensions/vllm_type_utils.cuh
csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp
csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp
csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp
csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp
csrc/cutlass_extensions/gemm/dispatch_policy.hpp
csrc/cutlass_extensions/gemm/collective/collective_builder.hpp
csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp
csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
csrc/mamba/causal_conv1d/causal_conv1d.cu
csrc/mamba/causal_conv1d/causal_conv1d.h
csrc/mamba/causal_conv1d/static_switch.h
csrc/mamba/mamba_ssm/selective_scan.h
csrc/mamba/mamba_ssm/selective_scan_fwd.cu
csrc/mamba/mamba_ssm/static_switch.h
csrc/moe/marlin_moe_ops.cu
csrc/moe/moe_align_sum_kernels.cu
csrc/moe/moe_ops.h
csrc/moe/topk_softmax_kernels.cu
csrc/moe/torch_bindings.cpp
csrc/moe/marlin_kernels/marlin_moe_kernel.h
csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu
csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h
csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu
csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h
csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu
csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h
csrc/prepare_inputs/advance_step.cu
csrc/prepare_inputs/advance_step.cuh
csrc/quantization/vectorization.cuh
csrc/quantization/aqlm/gemm_kernels.cu
csrc/quantization/awq/dequantize.cuh
csrc/quantization/awq/gemm_kernels.cu
csrc/quantization/compressed_tensors/int8_quant_kernels.cu
csrc/quantization/cutlass_w8a8/Epilogues.md
csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh
csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm75_dispatch.cuh
csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm80_dispatch.cuh
csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_fp8_dispatch.cuh
csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_int8_dispatch.cuh
csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh
csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh
csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu
csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu
csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp
csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu
csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh
csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu
csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8_dispatch.cuh
csrc/quantization/fp4/nvfp4_quant_entry.cu
csrc/quantization/fp4/nvfp4_quant_kernels.cu
csrc/quantization/fp8/common.cu
csrc/quantization/fp8/common.cuh
csrc/quantization/fp8/fp8_marlin.cu
csrc/quantization/fp8/amd/hip_float8.h
csrc/quantization/fp8/amd/hip_float8_impl.h
csrc/quantization/fp8/amd/quant_utils.cuh
csrc/quantization/fp8/nvidia/quant_utils.cuh
csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu
csrc/quantization/fused_kernels/layernorm_utils.cuh
csrc/quantization/fused_kernels/quant_conversions.cuh
csrc/quantization/gguf/dequantize.cuh
csrc/quantization/gguf/ggml-common.h
csrc/quantization/gguf/gguf_kernel.cu
csrc/quantization/gguf/mmq.cuh
csrc/quantization/gguf/mmvq.cuh
csrc/quantization/gguf/vecdotq.cuh
csrc/quantization/gptq/compat.cuh
csrc/quantization/gptq/matrix_view.cuh
csrc/quantization/gptq/q_gemm.cu
csrc/quantization/gptq/qdq_2.cuh
csrc/quantization/gptq/qdq_3.cuh
csrc/quantization/gptq/qdq_4.cuh
csrc/quantization/gptq/qdq_8.cuh
csrc/quantization/gptq/qdq_util.cuh
csrc/quantization/gptq_marlin/awq_marlin_repack.cu
csrc/quantization/gptq_marlin/gptq_marlin.cu
csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
csrc/quantization/gptq_marlin/marlin.cuh
csrc/quantization/gptq_marlin/marlin_dtypes.cuh
csrc/quantization/machete/Readme.md
csrc/quantization/machete/generate.py
csrc/quantization/machete/machete_collective_builder.cuh
csrc/quantization/machete/machete_interleaving_utils.cuh
csrc/quantization/machete/machete_mainloop.cuh
csrc/quantization/machete/machete_mm_kernel.cuh
csrc/quantization/machete/machete_mm_launcher.cuh
csrc/quantization/machete/machete_prepack_kernel.cuh
csrc/quantization/machete/machete_prepack_launcher.cuh
csrc/quantization/machete/machete_prepacked_layout.cuh
csrc/quantization/machete/machete_pytorch.cu
csrc/quantization/marlin/dense/LICENSE
csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
csrc/quantization/marlin/dense/common/base.h
csrc/quantization/marlin/dense/common/mem.h
csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu
csrc/quantization/marlin/sparse/LICENSE
csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu
csrc/quantization/marlin/sparse/common/base.h
csrc/quantization/marlin/sparse/common/mem.h
csrc/quantization/marlin/sparse/common/mma.h
csrc/rocm/attention.cu
csrc/rocm/ops.h
csrc/rocm/torch_bindings.cpp
csrc/sparse/cutlass/sparse_compressor_c3x.cuh
csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu
csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh
csrc/sparse/cutlass/sparse_scaled_mm_entry.cu
docs/Makefile
docs/README.md
docs/make.bat
docs/requirements-docs.txt
docs/seed_parameter_behavior.md
docs/source/conf.py
docs/source/generate_examples.py
docs/source/index.md
docs/source/_static/custom.css
docs/source/_static/custom.js
docs/source/_templates/sections/header.html
docs/source/api/inference_params.md
docs/source/api/engine/async_llm_engine.md
docs/source/api/engine/index.md
docs/source/api/engine/llm_engine.md
docs/source/api/model/adapters.md
docs/source/api/model/index.md
docs/source/api/model/interfaces.md
docs/source/api/model/interfaces_base.md
docs/source/api/multimodal/index.md
docs/source/api/multimodal/inputs.md
docs/source/api/multimodal/parse.md
docs/source/api/multimodal/processing.md
docs/source/api/multimodal/profiling.md
docs/source/api/multimodal/registry.md
docs/source/api/offline_inference/index.md
docs/source/api/offline_inference/llm.md
docs/source/api/offline_inference/llm_inputs.md
docs/source/assets/contributing/dockerfile-stages-dependency.png
docs/source/assets/deployment/architecture_helm_deployment.png
docs/source/assets/design/hierarchy.png
docs/source/assets/design/arch_overview/entrypoints.excalidraw.png
docs/source/assets/design/arch_overview/llm_engine.excalidraw.png
docs/source/assets/design/v1/prefix_caching/example-time-1.png
docs/source/assets/design/v1/prefix_caching/example-time-3.png
docs/source/assets/design/v1/prefix_caching/example-time-4.png
docs/source/assets/design/v1/prefix_caching/example-time-5.png
docs/source/assets/design/v1/prefix_caching/example-time-6.png
docs/source/assets/design/v1/prefix_caching/example-time-7.png
docs/source/assets/design/v1/prefix_caching/free.png
docs/source/assets/design/v1/prefix_caching/overview.png
docs/source/assets/features/disagg_prefill/abstraction.jpg
docs/source/assets/features/disagg_prefill/overview.jpg
docs/source/assets/kernel/k_vecs.png
docs/source/assets/kernel/key.png
docs/source/assets/kernel/logits_vec.png
docs/source/assets/kernel/q_vecs.png
docs/source/assets/kernel/query.png
docs/source/assets/kernel/v_vec.png
docs/source/assets/kernel/value.png
docs/source/assets/logos/vllm-logo-only-light.ico
docs/source/assets/logos/vllm-logo-only-light.png
docs/source/assets/logos/vllm-logo-text-dark.png
docs/source/assets/logos/vllm-logo-text-light.png
docs/source/community/blog.md
docs/source/community/meetups.md
docs/source/community/sponsors.md
docs/source/contributing/overview.md
docs/source/contributing/vulnerability_management.md
docs/source/contributing/dockerfile/dockerfile.md
docs/source/contributing/model/basic.md
docs/source/contributing/model/index.md
docs/source/contributing/model/multimodal.md
docs/source/contributing/model/registration.md
docs/source/contributing/model/tests.md
docs/source/contributing/profiling/profiling_index.md
docs/source/deployment/docker.md
docs/source/deployment/k8s.md
docs/source/deployment/nginx.md
docs/source/deployment/frameworks/bentoml.md
docs/source/deployment/frameworks/cerebrium.md
docs/source/deployment/frameworks/dstack.md
docs/source/deployment/frameworks/helm.md
docs/source/deployment/frameworks/index.md
docs/source/deployment/frameworks/lws.md
docs/source/deployment/frameworks/modal.md
docs/source/deployment/frameworks/skypilot.md
docs/source/deployment/frameworks/triton.md
docs/source/deployment/integrations/index.md
docs/source/deployment/integrations/kserve.md
docs/source/deployment/integrations/kubeai.md
docs/source/deployment/integrations/llamastack.md
docs/source/design/arch_overview.md
docs/source/design/automatic_prefix_caching.md
docs/source/design/huggingface_integration.md
docs/source/design/mm_processing.md
docs/source/design/multiprocessing.md
docs/source/design/plugin_system.md
docs/source/design/kernel/paged_attention.md
docs/source/design/v1/prefix_caching.md
docs/source/features/automatic_prefix_caching.md
docs/source/features/compatibility_matrix.md
docs/source/features/disagg_prefill.md
docs/source/features/lora.md
docs/source/features/reasoning_outputs.md
docs/source/features/spec_decode.md
docs/source/features/structured_outputs.md
docs/source/features/tool_calling.md
docs/source/features/quantization/auto_awq.md
docs/source/features/quantization/bnb.md
docs/source/features/quantization/fp8.md
docs/source/features/quantization/gguf.md
docs/source/features/quantization/index.md
docs/source/features/quantization/int4.md
docs/source/features/quantization/int8.md
docs/source/features/quantization/quantized_kvcache.md
docs/source/features/quantization/supported_hardware.md
docs/source/getting_started/faq.md
docs/source/getting_started/quickstart.md
docs/source/getting_started/troubleshooting.md
docs/source/getting_started/installation/device.template.md
docs/source/getting_started/installation/index.md
docs/source/getting_started/installation/python_env_setup.inc.md
docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
docs/source/getting_started/installation/ai_accelerator/index.md
docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
docs/source/getting_started/installation/cpu/apple.inc.md
docs/source/getting_started/installation/cpu/arm.inc.md
docs/source/getting_started/installation/cpu/build.inc.md
docs/source/getting_started/installation/cpu/index.md
docs/source/getting_started/installation/cpu/x86.inc.md
docs/source/getting_started/installation/gpu/cuda.inc.md
docs/source/getting_started/installation/gpu/index.md
docs/source/getting_started/installation/gpu/rocm.inc.md
docs/source/getting_started/installation/gpu/xpu.inc.md
docs/source/models/generative_models.md
docs/source/models/pooling_models.md
docs/source/models/supported_models.md
docs/source/models/extensions/index.md
docs/source/models/extensions/runai_model_streamer.md
docs/source/models/extensions/tensorizer.md
docs/source/performance/benchmarks.md
docs/source/performance/optimization.md
docs/source/serving/distributed_serving.md
docs/source/serving/engine_args.md
docs/source/serving/env_vars.md
docs/source/serving/metrics.md
docs/source/serving/multimodal_inputs.md
docs/source/serving/offline_inference.md
docs/source/serving/openai_compatible_server.md
docs/source/serving/usage_stats.md
docs/source/serving/integrations/index.md
docs/source/serving/integrations/langchain.md
docs/source/serving/integrations/llamaindex.md
examples/template_alpaca.jinja
examples/template_baichuan.jinja
examples/template_blip2.jinja
examples/template_chatglm.jinja
examples/template_chatglm2.jinja
examples/template_chatml.jinja
examples/template_deepseek_vl2.jinja
examples/template_dse_qwen2_vl.jinja
examples/template_falcon.jinja
examples/template_falcon_180b.jinja
examples/template_inkbot.jinja
examples/template_llava.jinja
examples/template_vlm2vec.jinja
examples/tool_chat_template_granite.jinja
examples/tool_chat_template_granite_20b_fc.jinja
examples/tool_chat_template_hermes.jinja
examples/tool_chat_template_internlm2_tool.jinja
examples/tool_chat_template_llama3.1_json.jinja
examples/tool_chat_template_llama3.2_json.jinja
examples/tool_chat_template_llama3.2_pythonic.jinja
examples/tool_chat_template_mistral.jinja
examples/tool_chat_template_mistral_parallel.jinja
examples/tool_chat_template_toolace.jinja
examples/offline_inference/audio_language.py
examples/offline_inference/chat_with_tools.py
examples/offline_inference/disaggregated_prefill.py
examples/offline_inference/distributed.py
examples/offline_inference/encoder_decoder.py
examples/offline_inference/florence2_inference.py
examples/offline_inference/llm_engine_example.py
examples/offline_inference/lora_with_quantization_inference.py
examples/offline_inference/mlpspeculator.py
examples/offline_inference/multilora_inference.py
examples/offline_inference/neuron.py
examples/offline_inference/neuron_int8_quantization.py
examples/offline_inference/pixtral.py
examples/offline_inference/prefix_caching.py
examples/offline_inference/prithvi_geospatial_mae.py
examples/offline_inference/profiling.py
examples/offline_inference/rlhf.py
examples/offline_inference/rlhf_colocate.py
examples/offline_inference/save_sharded_state.py
examples/offline_inference/simple_profiling.py
examples/offline_inference/structured_outputs.py
examples/offline_inference/torchrun_example.py
examples/offline_inference/tpu.py
examples/offline_inference/vision_language.py
examples/offline_inference/vision_language_embedding.py
examples/offline_inference/vision_language_multi_image.py
examples/offline_inference/whisper.py
examples/offline_inference/basic/README.md
examples/offline_inference/basic/basic.py
examples/offline_inference/basic/chat.py
examples/offline_inference/basic/classify.py
examples/offline_inference/basic/embed.py
examples/offline_inference/basic/generate.py
examples/offline_inference/basic/score.py
examples/offline_inference/openai/openai_batch.md
examples/offline_inference/openai/openai_example_batch.jsonl
examples/offline_inference/profiling_tpu/README.md
examples/offline_inference/profiling_tpu/profiling.py
examples/online_serving/api_client.py
examples/online_serving/cohere_rerank_client.py
examples/online_serving/disaggregated_prefill.sh
examples/online_serving/gradio_openai_chatbot_webserver.py
examples/online_serving/gradio_webserver.py
examples/online_serving/jinaai_rerank_client.py
examples/online_serving/openai_chat_completion_client.py
examples/online_serving/openai_chat_completion_client_for_multimodal.py
examples/online_serving/openai_chat_completion_client_with_tools.py
examples/online_serving/openai_chat_completion_structured_outputs.py
examples/online_serving/openai_chat_completion_with_reasoning.py
examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
examples/online_serving/openai_chat_embedding_client_for_multimodal.py
examples/online_serving/openai_completion_client.py
examples/online_serving/openai_cross_encoder_score.py
examples/online_serving/openai_embedding_client.py
examples/online_serving/openai_pooling_client.py
examples/online_serving/openai_transcription_client.py
examples/online_serving/run_cluster.sh
examples/online_serving/sagemaker-entrypoint.sh
examples/online_serving/chart-helm/.helmignore
examples/online_serving/chart-helm/Chart.yaml
examples/online_serving/chart-helm/README.md
examples/online_serving/chart-helm/ct.yaml
examples/online_serving/chart-helm/lintconf.yaml
examples/online_serving/chart-helm/values.schema.json
examples/online_serving/chart-helm/values.yaml
examples/online_serving/chart-helm/templates/_helpers.tpl
examples/online_serving/chart-helm/templates/configmap.yaml
examples/online_serving/chart-helm/templates/custom-objects.yaml
examples/online_serving/chart-helm/templates/deployment.yaml
examples/online_serving/chart-helm/templates/hpa.yaml
examples/online_serving/chart-helm/templates/job.yaml
examples/online_serving/chart-helm/templates/poddisruptionbudget.yaml
examples/online_serving/chart-helm/templates/pvc.yaml
examples/online_serving/chart-helm/templates/secrets.yaml
examples/online_serving/chart-helm/templates/service.yaml
examples/online_serving/opentelemetry/Otel.md
examples/online_serving/opentelemetry/dummy_client.py
examples/online_serving/prometheus_grafana/README.md
examples/online_serving/prometheus_grafana/docker-compose.yaml
examples/online_serving/prometheus_grafana/grafana.json
examples/online_serving/prometheus_grafana/prometheus.yaml
examples/other/logging_configuration.md
examples/other/tensorize_vllm_model.py
tests/__init__.py
tests/conftest.py
tests/test_cache_block_hashing.py
tests/test_config.py
tests/test_embedded_commit.py
tests/test_inputs.py
tests/test_logger.py
tests/test_logits_processor.py
tests/test_regression.py
tests/test_sampling_params.py
tests/test_scalartype.py
tests/test_seed_behavior.py
tests/test_sequence.py
tests/test_sharded_state_loader.py
tests/test_utils.py
tests/utils.py
tests/async_engine/__init__.py
tests/async_engine/api_server_async_engine.py
tests/async_engine/test_api_server.py
tests/async_engine/test_async_llm_engine.py
tests/async_engine/test_request_tracker.py
tests/basic_correctness/__init__.py
tests/basic_correctness/test_basic_correctness.py
tests/basic_correctness/test_chunked_prefill.py
tests/basic_correctness/test_cpu_offload.py
tests/basic_correctness/test_cumem.py
tests/basic_correctness/test_preemption.py
tests/compile/__init__.py
tests/compile/backend.py
tests/compile/test_basic_correctness.py
tests/compile/test_full_graph.py
tests/compile/test_functionalization.py
tests/compile/test_fusion.py
tests/compile/test_pass_manager.py
tests/compile/test_wrapper.py
tests/compile/utils.py
tests/compile/piecewise/__init__.py
tests/compile/piecewise/test_simple.py
tests/compile/piecewise/test_toy_llama.py
tests/core/__init__.py
tests/core/test_chunked_prefill_scheduler.py
tests/core/test_num_computed_tokens_update.py
tests/core/test_scheduler.py
tests/core/test_scheduler_encoder_decoder.py
tests/core/test_serialization.py
tests/core/utils.py
tests/core/block/__init__.py
tests/core/block/conftest.py
tests/core/block/test_block_manager.py
tests/core/block/test_block_table.py
tests/core/block/test_common.py
tests/core/block/test_cpu_gpu_block_allocator.py
tests/core/block/test_naive_block.py
tests/core/block/test_prefix_caching_block.py
tests/core/block/e2e/__init__.py
tests/core/block/e2e/conftest.py
tests/core/block/e2e/test_correctness.py
tests/core/block/e2e/test_correctness_sliding_window.py
tests/data/test_config.yaml
tests/distributed/__init__.py
tests/distributed/test_ca_buffer_sharing.py
tests/distributed/test_comm_ops.py
tests/distributed/test_custom_all_reduce.py
tests/distributed/test_distributed_oot.py
tests/distributed/test_multi_node_assignment.py
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_partition.py
tests/distributed/test_pp_cudagraph.py
tests/distributed/test_pynccl.py
tests/distributed/test_same_node.py
tests/distributed/test_shm_broadcast.py
tests/distributed/test_torchrun_example.py
tests/distributed/test_utils.py
tests/encoder_decoder/__init__.py
tests/encoder_decoder/test_e2e_correctness.py
tests/engine/__init__.py
tests/engine/test_arg_utils.py
tests/engine/test_computed_prefix_blocks.py
tests/engine/test_detokenization.py
tests/engine/test_executor.py
tests/engine/test_multiproc_workers.py
tests/engine/test_short_mm_context.py
tests/engine/test_skip_tokenizer_init.py
tests/engine/test_stop_reason.py
tests/engine/test_stop_strings.py
tests/engine/output_processor/__init__.py
tests/engine/output_processor/test_multi_step.py
tests/engine/output_processor/test_stop_checker.py
tests/entrypoints/__init__.py
tests/entrypoints/conftest.py
tests/entrypoints/test_chat_utils.py
tests/entrypoints/llm/__init__.py
tests/entrypoints/llm/test_accuracy.py
tests/entrypoints/llm/test_chat.py
tests/entrypoints/llm/test_collective_rpc.py
tests/entrypoints/llm/test_encode.py
tests/entrypoints/llm/test_generate.py
tests/entrypoints/llm/test_generate_multiple_loras.py
tests/entrypoints/llm/test_gpu_utilization.py
tests/entrypoints/llm/test_guided_generate.py
tests/entrypoints/llm/test_init.py
tests/entrypoints/llm/test_lazy_outlines.py
tests/entrypoints/llm/test_prompt_validation.py
tests/entrypoints/offline_mode/__init__.py
tests/entrypoints/offline_mode/test_offline_mode.py
tests/entrypoints/openai/__init__.py
tests/entrypoints/openai/test_async_tokenization.py
tests/entrypoints/openai/test_audio.py
tests/entrypoints/openai/test_basic.py
tests/entrypoints/openai/test_chat.py
tests/entrypoints/openai/test_chat_echo.py
tests/entrypoints/openai/test_chat_template.py
tests/entrypoints/openai/test_chunked_prompt.py
tests/entrypoints/openai/test_cli_args.py
tests/entrypoints/openai/test_completion.py
tests/entrypoints/openai/test_embedding.py
tests/entrypoints/openai/test_encoder_decoder.py
tests/entrypoints/openai/test_lora_adapters.py
tests/entrypoints/openai/test_metrics.py
tests/entrypoints/openai/test_models.py
tests/entrypoints/openai/test_oot_registration.py
tests/entrypoints/openai/test_pooling.py
tests/entrypoints/openai/test_prompt_validation.py
tests/entrypoints/openai/test_rerank.py
tests/entrypoints/openai/test_return_tokens_as_ids.py
tests/entrypoints/openai/test_root_path.py
tests/entrypoints/openai/test_run_batch.py
tests/entrypoints/openai/test_score.py
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_models.py
tests/entrypoints/openai/test_shutdown.py
tests/entrypoints/openai/test_sleep.py
tests/entrypoints/openai/test_tokenization.py
tests/entrypoints/openai/test_transcription_validation.py
tests/entrypoints/openai/test_video.py
tests/entrypoints/openai/test_vision.py
tests/entrypoints/openai/test_vision_embedding.py
tests/entrypoints/openai/correctness/__init__.py
tests/entrypoints/openai/correctness/test_lmeval.py
tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
tests/entrypoints/openai/reasoning_parsers/__init__.py
tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
tests/entrypoints/openai/reasoning_parsers/utils.py
tests/entrypoints/openai/tool_parsers/__init__.py
tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
tests/entrypoints/openai/tool_parsers/utils.py
tests/kernels/__init__.py
tests/kernels/allclose_default.py
tests/kernels/conftest.py
tests/kernels/quant_utils.py
tests/kernels/test_activation.py
tests/kernels/test_aqlm.py
tests/kernels/test_attention.py
tests/kernels/test_attention_selector.py
tests/kernels/test_awq.py
tests/kernels/test_awq_marlin.py
tests/kernels/test_awq_triton.py
tests/kernels/test_block_fp8.py
tests/kernels/test_blocksparse_attention.py
tests/kernels/test_cache.py
tests/kernels/test_cascade_flash_attn.py
tests/kernels/test_causal_conv1d.py
tests/kernels/test_cutlass.py
tests/kernels/test_cutlass_2of4_sparse.py
tests/kernels/test_encoder_decoder_attn.py
tests/kernels/test_flash_attn.py
tests/kernels/test_flashinfer.py
tests/kernels/test_fp8_quant.py
tests/kernels/test_fused_quant_layernorm.py
tests/kernels/test_ggml.py
tests/kernels/test_gguf.py
tests/kernels/test_gptq.py
tests/kernels/test_int8_quant.py
tests/kernels/test_layernorm.py
tests/kernels/test_machete_mm.py
tests/kernels/test_mamba_mixer2.py
tests/kernels/test_mamba_ssm.py
tests/kernels/test_mamba_ssm_ssd.py
tests/kernels/test_marlin_gemm.py
tests/kernels/test_mha_attn.py
tests/kernels/test_moe.py
tests/kernels/test_nvfp4_quant.py
tests/kernels/test_permute_cols.py
tests/kernels/test_pos_encoding.py
tests/kernels/test_prefix_prefill.py
tests/kernels/test_rocm_attention_selector.py
tests/kernels/test_rotary_embedding.py
tests/kernels/test_triton_decode_attention.py
tests/kernels/test_triton_scaled_mm.py
tests/kernels/test_utils.py
tests/kernels/utils.py
tests/kv_transfer/disagg_test.py
tests/kv_transfer/module_test.py
tests/kv_transfer/test_lookup_buffer.py
tests/kv_transfer/test_lookup_buffer.sh
tests/kv_transfer/test_send_recv.py
tests/kv_transfer/test_send_recv.sh
tests/lora/__init__.py
tests/lora/conftest.py
tests/lora/test_add_lora.py
tests/lora/test_baichuan.py
tests/lora/test_chatglm3_tp.py
tests/lora/test_gemma.py
tests/lora/test_jamba.py
tests/lora/test_layers.py
tests/lora/test_llama_tp.py
tests/lora/test_long_context.py
tests/lora/test_lora_bias_e2e.py
tests/lora/test_lora_checkpoints.py
tests/lora/test_lora_huggingface.py
tests/lora/test_lora_manager.py
tests/lora/test_minicpmv_tp.py
tests/lora/test_mixtral.py
tests/lora/test_peft_helper.py
tests/lora/test_phi.py
tests/lora/test_punica_ops.py
tests/lora/test_quant_model.py
tests/lora/test_qwen2vl.py
tests/lora/test_tokenizer_group.py
tests/lora/test_ultravox.py
tests/lora/test_utils.py
tests/lora/test_worker.py
tests/lora/utils.py
tests/lora/data/__init__.py
tests/lora/data/long_context_test_data.py
tests/metrics/__init__.py
tests/metrics/test_metrics.py
tests/mistral_tool_use/__init__.py
tests/mistral_tool_use/conftest.py
tests/mistral_tool_use/test_mistral_tool_calls.py
tests/mistral_tool_use/utils.py
tests/model_executor/__init__.py
tests/model_executor/conftest.py
tests/model_executor/test_enabled_custom_ops.py
tests/model_executor/test_guided_processors.py
tests/model_executor/test_model_load_with_params.py
tests/model_executor/weight_utils.py
tests/models/__init__.py
tests/models/registry.py
tests/models/test_initialization.py
tests/models/test_oot_registration.py
tests/models/test_registry.py
tests/models/test_transformers.py
tests/models/test_vision.py
tests/models/utils.py
tests/models/decoder_only/__init__.py
tests/models/decoder_only/audio_language/__init__.py
tests/models/decoder_only/audio_language/test_ultravox.py
tests/models/decoder_only/language/__init__.py
tests/models/decoder_only/language/test_aqlm.py
tests/models/decoder_only/language/test_fp8.py
tests/models/decoder_only/language/test_gguf.py
tests/models/decoder_only/language/test_gptq_marlin.py
tests/models/decoder_only/language/test_gptq_marlin_24.py
tests/models/decoder_only/language/test_granite.py
tests/models/decoder_only/language/test_hybrid.py
tests/models/decoder_only/language/test_mamba.py
tests/models/decoder_only/language/test_mistral.py
tests/models/decoder_only/language/test_modelopt.py
tests/models/decoder_only/language/test_models.py
tests/models/decoder_only/language/test_phimoe.py
tests/models/decoder_only/vision_language/__init__.py
tests/models/decoder_only/vision_language/test_awq.py
tests/models/decoder_only/vision_language/test_intern_vit.py
tests/models/decoder_only/vision_language/test_models.py
tests/models/decoder_only/vision_language/test_phi3v.py
tests/models/decoder_only/vision_language/test_pixtral.py
tests/models/decoder_only/vision_language/test_qwen2_vl.py
tests/models/decoder_only/vision_language/vlm_utils/__init__.py
tests/models/decoder_only/vision_language/vlm_utils/builders.py
tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py
tests/models/decoder_only/vision_language/vlm_utils/core.py
tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
tests/models/decoder_only/vision_language/vlm_utils/runners.py
tests/models/decoder_only/vision_language/vlm_utils/types.py
tests/models/embedding/__init__.py
tests/models/embedding/utils.py
tests/models/embedding/language/__init__.py
tests/models/embedding/language/test_cls_models.py
tests/models/embedding/language/test_embedding.py
tests/models/embedding/language/test_gritlm.py
tests/models/embedding/language/test_scoring.py
tests/models/embedding/vision_language/__init__.py
tests/models/embedding/vision_language/test_dse_qwen2_vl.py
tests/models/embedding/vision_language/test_llava_next.py
tests/models/embedding/vision_language/test_phi3v.py
tests/models/encoder_decoder/__init__.py
tests/models/encoder_decoder/audio_language/__init__.py
tests/models/encoder_decoder/audio_language/test_whisper.py
tests/models/encoder_decoder/language/__init__.py
tests/models/encoder_decoder/language/test_bart.py
tests/models/encoder_decoder/vision_language/__init__.py
tests/models/encoder_decoder/vision_language/test_broadcast.py
tests/models/encoder_decoder/vision_language/test_florence2.py
tests/models/encoder_decoder/vision_language/test_mllama.py
tests/models/fixtures/pixtral_chat.json
tests/models/fixtures/pixtral_chat_engine.json
tests/models/multimodal/__init__.py
tests/models/multimodal/processing/__init__.py
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_h2ovl.py
tests/models/multimodal/processing/test_idefics3.py
tests/models/multimodal/processing/test_internvl.py
tests/models/multimodal/processing/test_llava_next.py
tests/models/multimodal/processing/test_llava_onevision.py
tests/models/multimodal/processing/test_phi3v.py
tests/models/multimodal/processing/test_qwen2_vl.py
tests/mq_llm_engine/__init__.py
tests/mq_llm_engine/test_abort.py
tests/mq_llm_engine/test_error_handling.py
tests/mq_llm_engine/test_load.py
tests/mq_llm_engine/utils.py
tests/multi_step/__init__.py
tests/multi_step/test_correctness_async_llm.py
tests/multi_step/test_correctness_llm.py
tests/multimodal/__init__.py
tests/multimodal/test_inputs.py
tests/multimodal/test_processing.py
tests/multimodal/test_utils.py
tests/multimodal/utils.py
tests/neuron/test_prefix_prefill.py
tests/plugins/vllm_add_dummy_model/setup.py
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
tests/plugins/vllm_add_dummy_platform/setup.py
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
tests/plugins_tests/test_platform_plugins.py
tests/plugins_tests/test_scheduler_plugins.py
tests/prefix_caching/__init__.py
tests/prefix_caching/test_disable_sliding_window.py
tests/prefix_caching/test_prefix_caching.py
tests/prompt_adapter/test_bloom.py
tests/prompt_adapter/test_multi_adapter_inference.py
tests/prompt_adapter/test_pa_lora.py
tests/prompts/example.txt
tests/prompts/summary.txt
tests/quantization/__init__.py
tests/quantization/test_bitsandbytes.py
tests/quantization/test_compressed_tensors.py
tests/quantization/test_configs.py
tests/quantization/test_cpu_offload.py
tests/quantization/test_experts_int8.py
tests/quantization/test_fp8.py
tests/quantization/test_gptq_dynamic.py
tests/quantization/test_ipex_quant.py
tests/quantization/test_lm_head.py
tests/quantization/test_ptpc_fp8.py
tests/quantization/test_quark.py
tests/quantization/test_register_quantization_config.py
tests/quantization/utils.py
tests/runai_model_streamer_test/__init__.py
tests/runai_model_streamer_test/test_runai_model_streamer_loader.py
tests/runai_model_streamer_test/test_weight_utils.py
tests/samplers/__init__.py
tests/samplers/test_beam_search.py
tests/samplers/test_ignore_eos.py
tests/samplers/test_logits_processor.py
tests/samplers/test_logprobs.py
tests/samplers/test_no_bad_words.py
tests/samplers/test_ranks.py
tests/samplers/test_rejection_sampler.py
tests/samplers/test_sampler.py
tests/samplers/test_seeded_generate.py
tests/samplers/test_typical_acceptance_sampler.py
tests/spec_decode/__init__.py
tests/spec_decode/test_batch_expansion.py
tests/spec_decode/test_dynamic_spec_decode.py
tests/spec_decode/test_metrics.py
tests/spec_decode/test_multi_step_worker.py
tests/spec_decode/test_ngram_worker.py
tests/spec_decode/test_scorer.py
tests/spec_decode/test_spec_decode_worker.py
tests/spec_decode/test_utils.py
tests/spec_decode/utils.py
tests/spec_decode/e2e/__init__.py
tests/spec_decode/e2e/conftest.py
tests/spec_decode/e2e/test_compatibility.py
tests/spec_decode/e2e/test_eagle_correctness.py
tests/spec_decode/e2e/test_integration.py
tests/spec_decode/e2e/test_integration_dist_tp2.py
tests/spec_decode/e2e/test_integration_dist_tp4.py
tests/spec_decode/e2e/test_logprobs.py
tests/spec_decode/e2e/test_medusa_correctness.py
tests/spec_decode/e2e/test_mlp_correctness.py
tests/spec_decode/e2e/test_mtp_correctness.py
tests/spec_decode/e2e/test_multistep_correctness.py
tests/spec_decode/e2e/test_ngram_correctness.py
tests/spec_decode/e2e/test_seed.py
tests/standalone_tests/lazy_imports.py
tests/standalone_tests/python_only_compile.sh
tests/system_messages/sonnet3.5_nov2024.txt
tests/tensorizer_loader/__init__.py
tests/tensorizer_loader/conftest.py
tests/tensorizer_loader/test_tensorizer.py
tests/tokenization/__init__.py
tests/tokenization/test_cached_tokenizer.py
tests/tokenization/test_detokenize.py
tests/tokenization/test_get_eos.py
tests/tokenization/test_mistral_tokenizer.py
tests/tokenization/test_tokenizer.py
tests/tokenization/test_tokenizer_group.py
tests/tokenization/test_tokenizer_registry.py
tests/tool_use/__init__.py
tests/tool_use/conftest.py
tests/tool_use/test_chat_completion_request_validations.py
tests/tool_use/test_chat_completions.py
tests/tool_use/test_jamba_tool_parser.py
tests/tool_use/test_parallel_tool_calls.py
tests/tool_use/test_tool_calls.py
tests/tool_use/utils.py
tests/tpu/__init__.py
tests/tpu/test_compilation.py
tests/tpu/test_custom_dispatcher.py
tests/tpu/test_quantization_accuracy.py
tests/tracing/__init__.py
tests/tracing/test_tracing.py
tests/v1/__init__.py
tests/v1/test_stats.py
tests/v1/test_utils.py
tests/v1/core/test_kv_cache_utils.py
tests/v1/core/test_prefix_caching.py
tests/v1/core/test_scheduler.py
tests/v1/e2e/__init__.py
tests/v1/e2e/test_cascade_attention.py
tests/v1/e2e/test_ngram_spec_decode.py
tests/v1/engine/__init__.py
tests/v1/engine/conftest.py
tests/v1/engine/test_async_llm.py
tests/v1/engine/test_engine_args.py
tests/v1/engine/test_engine_core.py
tests/v1/engine/test_engine_core_client.py
tests/v1/engine/test_llm_engine.py
tests/v1/engine/test_output_processor.py
tests/v1/engine/utils.py
tests/v1/entrypoints/__init__.py
tests/v1/entrypoints/conftest.py
tests/v1/entrypoints/openai/test_completion.py
tests/v1/sample/__init__.py
tests/v1/sample/test_logprobs.py
tests/v1/sample/test_logprobs_e2e.py
tests/v1/sample/test_rejection_sampler.py
tests/v1/sample/test_sampler.py
tests/v1/sample/utils.py
tests/v1/spec_decode/test_ngram.py
tests/v1/worker/__init__.py
tests/v1/worker/test_gpu_input_batch.py
tests/v1/worker/test_gpu_model_runner.py
tests/vllm_test_utils/setup.py
tests/vllm_test_utils/vllm_test_utils/__init__.py
tests/vllm_test_utils/vllm_test_utils/blame.py
tests/vllm_test_utils/vllm_test_utils/monitor.py
tests/weight_loading/models-large.txt
tests/weight_loading/models.txt
tests/weight_loading/run_model_weight_loading_test.sh
tests/weight_loading/test_weight_loading.py
tests/worker/__init__.py
tests/worker/test_encoder_decoder_model_runner.py
tests/worker/test_model_input.py
tests/worker/test_model_runner.py
tests/worker/test_profile.py
tests/worker/test_swap.py
tools/check_repo.sh
tools/check_spdx_header.py
tools/mypy.sh
tools/png-lint.sh
tools/report_build_time_ninja.py
tools/shellcheck.sh
tools/profiler/print_layerwise_table.py
tools/profiler/visualize_layerwise_profile.py
vllm/__init__.py
vllm/_custom_ops.py
vllm/_ipex_ops.py
vllm/_version.py
vllm/beam_search.py
vllm/config.py
vllm/connections.py
vllm/envs.py
vllm/forward_context.py
vllm/logger.py
vllm/logits_process.py
vllm/outputs.py
vllm/pooling_params.py
vllm/py.typed
vllm/sampling_params.py
vllm/scalar_type.py
vllm/scripts.py
vllm/sequence.py
vllm/tracing.py
vllm/utils.py
vllm/version.py
vllm.egg-info/PKG-INFO
vllm.egg-info/SOURCES.txt
vllm.egg-info/dependency_links.txt
vllm.egg-info/entry_points.txt
vllm.egg-info/requires.txt
vllm.egg-info/top_level.txt
vllm/adapter_commons/__init__.py
vllm/adapter_commons/layers.py
vllm/adapter_commons/models.py
vllm/adapter_commons/request.py
vllm/adapter_commons/utils.py
vllm/adapter_commons/worker_manager.py
vllm/assets/__init__.py
vllm/assets/audio.py
vllm/assets/base.py
vllm/assets/image.py
vllm/assets/video.py
vllm/attention/__init__.py
vllm/attention/layer.py
vllm/attention/selector.py
vllm/attention/backends/__init__.py
vllm/attention/backends/abstract.py
vllm/attention/backends/blocksparse_attn.py
vllm/attention/backends/flash_attn.py
vllm/attention/backends/flashinfer.py
vllm/attention/backends/hpu_attn.py
vllm/attention/backends/ipex_attn.py
vllm/attention/backends/openvino.py
vllm/attention/backends/pallas.py
vllm/attention/backends/placeholder_attn.py
vllm/attention/backends/rocm_flash_attn.py
vllm/attention/backends/torch_sdpa.py
vllm/attention/backends/triton_mla.py
vllm/attention/backends/utils.py
vllm/attention/backends/xformers.py
vllm/attention/backends/mla/__init__.py
vllm/attention/backends/mla/utils.py
vllm/attention/ops/__init__.py
vllm/attention/ops/hpu_paged_attn.py
vllm/attention/ops/ipex_attn.py
vllm/attention/ops/nki_flash_attn.py
vllm/attention/ops/paged_attn.py
vllm/attention/ops/prefix_prefill.py
vllm/attention/ops/triton_decode_attention.py
vllm/attention/ops/triton_flash_attention.py
vllm/attention/ops/blocksparse_attention/__init__.py
vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py
vllm/attention/ops/blocksparse_attention/interface.py
vllm/attention/ops/blocksparse_attention/utils.py
vllm/compilation/__init__.py
vllm/compilation/backends.py
vllm/compilation/compiler_interface.py
vllm/compilation/counter.py
vllm/compilation/decorators.py
vllm/compilation/fix_functionalization.py
vllm/compilation/fusion.py
vllm/compilation/fx_utils.py
vllm/compilation/inductor_pass.py
vllm/compilation/monitor.py
vllm/compilation/multi_output_match.py
vllm/compilation/pass_manager.py
vllm/compilation/reshapes.py
vllm/compilation/vllm_inductor_pass.py
vllm/compilation/wrapper.py
vllm/core/__init__.py
vllm/core/block_manager.py
vllm/core/evictor.py
vllm/core/interfaces.py
vllm/core/placeholder_block_space_manager.py
vllm/core/scheduler.py
vllm/core/block/__init__.py
vllm/core/block/block_table.py
vllm/core/block/common.py
vllm/core/block/cpu_gpu_block_allocator.py
vllm/core/block/interfaces.py
vllm/core/block/naive_block.py
vllm/core/block/prefix_caching_block.py
vllm/core/block/utils.py
vllm/device_allocator/__init__.py
vllm/device_allocator/cumem.py
vllm/distributed/__init__.py
vllm/distributed/communication_op.py
vllm/distributed/parallel_state.py
vllm/distributed/utils.py
vllm/distributed/device_communicators/__init__.py
vllm/distributed/device_communicators/base_device_communicator.py
vllm/distributed/device_communicators/cpu_communicator.py
vllm/distributed/device_communicators/cuda_communicator.py
vllm/distributed/device_communicators/cuda_wrapper.py
vllm/distributed/device_communicators/custom_all_reduce.py
vllm/distributed/device_communicators/custom_all_reduce_utils.py
vllm/distributed/device_communicators/hpu_communicator.py
vllm/distributed/device_communicators/pynccl.py
vllm/distributed/device_communicators/pynccl_wrapper.py
vllm/distributed/device_communicators/shm_broadcast.py
vllm/distributed/device_communicators/tpu_communicator.py
vllm/distributed/device_communicators/xpu_communicator.py
vllm/distributed/kv_transfer/README.md
vllm/distributed/kv_transfer/__init__.py
vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg
vllm/distributed/kv_transfer/kv_transfer_agent.py
vllm/distributed/kv_transfer/kv_connector/__init__.py
vllm/distributed/kv_transfer/kv_connector/base.py
vllm/distributed/kv_transfer/kv_connector/factory.py
vllm/distributed/kv_transfer/kv_connector/simple_connector.py
vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py
vllm/distributed/kv_transfer/kv_lookup_buffer/base.py
vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
vllm/distributed/kv_transfer/kv_pipe/__init__.py
vllm/distributed/kv_transfer/kv_pipe/base.py
vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
vllm/engine/__init__.py
vllm/engine/arg_utils.py
vllm/engine/async_llm_engine.py
vllm/engine/async_timeout.py
vllm/engine/llm_engine.py
vllm/engine/metrics.py
vllm/engine/metrics_types.py
vllm/engine/protocol.py
vllm/engine/multiprocessing/__init__.py
vllm/engine/multiprocessing/client.py
vllm/engine/multiprocessing/engine.py
vllm/engine/output_processor/__init__.py
vllm/engine/output_processor/interfaces.py
vllm/engine/output_processor/multi_step.py
vllm/engine/output_processor/single_step.py
vllm/engine/output_processor/stop_checker.py
vllm/engine/output_processor/util.py
vllm/entrypoints/__init__.py
vllm/entrypoints/api_server.py
vllm/entrypoints/chat_utils.py
vllm/entrypoints/launcher.py
vllm/entrypoints/llm.py
vllm/entrypoints/logger.py
vllm/entrypoints/utils.py
vllm/entrypoints/cli/__init__.py
vllm/entrypoints/cli/main.py
vllm/entrypoints/cli/openai.py
vllm/entrypoints/cli/serve.py
vllm/entrypoints/cli/types.py
vllm/entrypoints/openai/__init__.py
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/cli_args.py
vllm/entrypoints/openai/logits_processors.py
vllm/entrypoints/openai/protocol.py
vllm/entrypoints/openai/run_batch.py
vllm/entrypoints/openai/serving_chat.py
vllm/entrypoints/openai/serving_completion.py
vllm/entrypoints/openai/serving_embedding.py
vllm/entrypoints/openai/serving_engine.py
vllm/entrypoints/openai/serving_models.py
vllm/entrypoints/openai/serving_pooling.py
vllm/entrypoints/openai/serving_rerank.py
vllm/entrypoints/openai/serving_score.py
vllm/entrypoints/openai/serving_tokenization.py
vllm/entrypoints/openai/serving_transcription.py
vllm/entrypoints/openai/reasoning_parsers/__init__.py
vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py
vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py
vllm/entrypoints/openai/tool_parsers/__init__.py
vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
vllm/entrypoints/openai/tool_parsers/utils.py
vllm/executor/__init__.py
vllm/executor/executor_base.py
vllm/executor/mp_distributed_executor.py
vllm/executor/msgspec_utils.py
vllm/executor/multiproc_worker_utils.py
vllm/executor/ray_distributed_executor.py
vllm/executor/ray_utils.py
vllm/executor/uniproc_executor.py
vllm/inputs/__init__.py
vllm/inputs/data.py
vllm/inputs/parse.py
vllm/inputs/preprocess.py
vllm/inputs/registry.py
vllm/logging_utils/__init__.py
vllm/logging_utils/formatter.py
vllm/lora/__init__.py
vllm/lora/fully_sharded_layers.py
vllm/lora/layers.py
vllm/lora/lora.py
vllm/lora/models.py
vllm/lora/peft_helper.py
vllm/lora/request.py
vllm/lora/utils.py
vllm/lora/worker_manager.py
vllm/lora/ops/__init__.py
vllm/lora/ops/torch_ops/__init__.py
vllm/lora/ops/torch_ops/lora_ops.py
vllm/lora/ops/triton_ops/__init__.py
vllm/lora/ops/triton_ops/bgmv_expand.py
vllm/lora/ops/triton_ops/bgmv_expand_slice.py
vllm/lora/ops/triton_ops/bgmv_shrink.py
vllm/lora/ops/triton_ops/kernel_utils.py
vllm/lora/ops/triton_ops/sgmv_expand.py
vllm/lora/ops/triton_ops/sgmv_shrink.py
vllm/lora/ops/triton_ops/utils.py
vllm/lora/punica_wrapper/__init__.py
vllm/lora/punica_wrapper/punica_base.py
vllm/lora/punica_wrapper/punica_cpu.py
vllm/lora/punica_wrapper/punica_gpu.py
vllm/lora/punica_wrapper/punica_hpu.py
vllm/lora/punica_wrapper/punica_selector.py
vllm/lora/punica_wrapper/utils.py
vllm/model_executor/__init__.py
vllm/model_executor/custom_op.py
vllm/model_executor/parameter.py
vllm/model_executor/pooling_metadata.py
vllm/model_executor/sampling_metadata.py
vllm/model_executor/utils.py
vllm/model_executor/guided_decoding/__init__.py
vllm/model_executor/guided_decoding/guided_fields.py
vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py
vllm/model_executor/guided_decoding/outlines_decoding.py
vllm/model_executor/guided_decoding/outlines_logits_processors.py
vllm/model_executor/guided_decoding/utils.py
vllm/model_executor/guided_decoding/xgrammar_decoding.py
vllm/model_executor/layers/__init__.py
vllm/model_executor/layers/activation.py
vllm/model_executor/layers/layernorm.py
vllm/model_executor/layers/linear.py
vllm/model_executor/layers/logits_processor.py
vllm/model_executor/layers/pooler.py
vllm/model_executor/layers/rejection_sampler.py
vllm/model_executor/layers/resampler.py
vllm/model_executor/layers/rotary_embedding.py
vllm/model_executor/layers/sampler.py
vllm/model_executor/layers/spec_decode_base_sampler.py
vllm/model_executor/layers/typical_acceptance_sampler.py
vllm/model_executor/layers/utils.py
vllm/model_executor/layers/vocab_parallel_embedding.py
vllm/model_executor/layers/fused_moe/__init__.py
vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/layer.py
vllm/model_executor/layers/fused_moe/moe_pallas.py
vllm/model_executor/layers/fused_moe/moe_torch_iterative.py
vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/README
vllm/model_executor/layers/mamba/__init__.py
vllm/model_executor/layers/mamba/mamba_mixer.py
vllm/model_executor/layers/mamba/mamba_mixer2.py
vllm/model_executor/layers/mamba/ops/__init__.py
vllm/model_executor/layers/mamba/ops/causal_conv1d.py
vllm/model_executor/layers/mamba/ops/mamba_ssm.py
vllm/model_executor/layers/mamba/ops/ssd_bmm.py
vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py
vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py
vllm/model_executor/layers/mamba/ops/ssd_combined.py
vllm/model_executor/layers/mamba/ops/ssd_state_passing.py
vllm/model_executor/layers/quantization/__init__.py
vllm/model_executor/layers/quantization/aqlm.py
vllm/model_executor/layers/quantization/awq.py
vllm/model_executor/layers/quantization/awq_marlin.py
vllm/model_executor/layers/quantization/awq_triton.py
vllm/model_executor/layers/quantization/base_config.py
vllm/model_executor/layers/quantization/bitsandbytes.py
vllm/model_executor/layers/quantization/deepspeedfp.py
vllm/model_executor/layers/quantization/experts_int8.py
vllm/model_executor/layers/quantization/fbgemm_fp8.py
vllm/model_executor/layers/quantization/fp8.py
vllm/model_executor/layers/quantization/gguf.py
vllm/model_executor/layers/quantization/gptq.py
vllm/model_executor/layers/quantization/gptq_marlin.py
vllm/model_executor/layers/quantization/gptq_marlin_24.py
vllm/model_executor/layers/quantization/hqq_marlin.py
vllm/model_executor/layers/quantization/ipex_quant.py
vllm/model_executor/layers/quantization/kv_cache.py
vllm/model_executor/layers/quantization/marlin.py
vllm/model_executor/layers/quantization/modelopt.py
vllm/model_executor/layers/quantization/moe_wna16.py
vllm/model_executor/layers/quantization/neuron_quant.py
vllm/model_executor/layers/quantization/ptpc_fp8.py
vllm/model_executor/layers/quantization/qqq.py
vllm/model_executor/layers/quantization/schema.py
vllm/model_executor/layers/quantization/tpu_int8.py
vllm/model_executor/layers/quantization/compressed_tensors/__init__.py
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
vllm/model_executor/layers/quantization/compressed_tensors/utils.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
vllm/model_executor/layers/quantization/kernels/__init__.py
vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py
vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
vllm/model_executor/layers/quantization/quark/__init__.py
vllm/model_executor/layers/quantization/quark/quark.py
vllm/model_executor/layers/quantization/quark/quark_moe.py
vllm/model_executor/layers/quantization/quark/utils.py
vllm/model_executor/layers/quantization/quark/schemes/__init__.py
vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py
vllm/model_executor/layers/quantization/utils/__init__.py
vllm/model_executor/layers/quantization/utils/fp8_utils.py
vllm/model_executor/layers/quantization/utils/gptq_utils.py
vllm/model_executor/layers/quantization/utils/layer_utils.py
vllm/model_executor/layers/quantization/utils/machete_utils.py
vllm/model_executor/layers/quantization/utils/marlin_utils.py
vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py
vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py
vllm/model_executor/layers/quantization/utils/quant_utils.py
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/model_loader/__init__.py
vllm/model_executor/model_loader/loader.py
vllm/model_executor/model_loader/neuron.py
vllm/model_executor/model_loader/openvino.py
vllm/model_executor/model_loader/tensorizer.py
vllm/model_executor/model_loader/utils.py
vllm/model_executor/model_loader/weight_utils.py
vllm/model_executor/models/__init__.py
vllm/model_executor/models/adapters.py
vllm/model_executor/models/arctic.py
vllm/model_executor/models/aria.py
vllm/model_executor/models/baichuan.py
vllm/model_executor/models/bamba.py
vllm/model_executor/models/bart.py
vllm/model_executor/models/bert.py
vllm/model_executor/models/blip.py
vllm/model_executor/models/blip2.py
vllm/model_executor/models/bloom.py
vllm/model_executor/models/chameleon.py
vllm/model_executor/models/chatglm.py
vllm/model_executor/models/clip.py
vllm/model_executor/models/commandr.py
vllm/model_executor/models/dbrx.py
vllm/model_executor/models/decilm.py
vllm/model_executor/models/deepseek.py
vllm/model_executor/models/deepseek_mtp.py
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_vl2.py
vllm/model_executor/models/eagle.py
vllm/model_executor/models/exaone.py
vllm/model_executor/models/fairseq2_llama.py
vllm/model_executor/models/falcon.py
vllm/model_executor/models/florence2.py
vllm/model_executor/models/fuyu.py
vllm/model_executor/models/gemma.py
vllm/model_executor/models/gemma2.py
vllm/model_executor/models/glm.py
vllm/model_executor/models/glm4v.py
vllm/model_executor/models/gpt2.py
vllm/model_executor/models/gpt_bigcode.py
vllm/model_executor/models/gpt_j.py
vllm/model_executor/models/gpt_neox.py
vllm/model_executor/models/granite.py
vllm/model_executor/models/granitemoe.py
vllm/model_executor/models/gritlm.py
vllm/model_executor/models/h2ovl.py
vllm/model_executor/models/idefics2_vision_model.py
vllm/model_executor/models/idefics3.py
vllm/model_executor/models/interfaces.py
vllm/model_executor/models/interfaces_base.py
vllm/model_executor/models/intern_vit.py
vllm/model_executor/models/internlm2.py
vllm/model_executor/models/internlm2_ve.py
vllm/model_executor/models/internvl.py
vllm/model_executor/models/jais.py
vllm/model_executor/models/jamba.py
vllm/model_executor/models/llama.py
vllm/model_executor/models/llava.py
vllm/model_executor/models/llava_next.py
vllm/model_executor/models/llava_next_video.py
vllm/model_executor/models/llava_onevision.py
vllm/model_executor/models/mamba.py
vllm/model_executor/models/mamba2.py
vllm/model_executor/models/mamba_cache.py
vllm/model_executor/models/medusa.py
vllm/model_executor/models/minicpm.py
vllm/model_executor/models/minicpm3.py
vllm/model_executor/models/minicpmo.py
vllm/model_executor/models/minicpmv.py
vllm/model_executor/models/mixtral.py
vllm/model_executor/models/mixtral_quant.py
vllm/model_executor/models/mllama.py
vllm/model_executor/models/mlp_speculator.py
vllm/model_executor/models/module_mapping.py
vllm/model_executor/models/molmo.py
vllm/model_executor/models/mpt.py
vllm/model_executor/models/nemotron.py
vllm/model_executor/models/nvlm_d.py
vllm/model_executor/models/olmo.py
vllm/model_executor/models/olmo2.py
vllm/model_executor/models/olmoe.py
vllm/model_executor/models/opt.py
vllm/model_executor/models/orion.py
vllm/model_executor/models/paligemma.py
vllm/model_executor/models/persimmon.py
vllm/model_executor/models/phi.py
vllm/model_executor/models/phi3.py
vllm/model_executor/models/phi3_small.py
vllm/model_executor/models/phi3v.py
vllm/model_executor/models/phimoe.py
vllm/model_executor/models/pixtral.py
vllm/model_executor/models/prithvi_geospatial_mae.py
vllm/model_executor/models/qwen.py
vllm/model_executor/models/qwen2.py
vllm/model_executor/models/qwen2_5_vl.py
vllm/model_executor/models/qwen2_audio.py
vllm/model_executor/models/qwen2_moe.py
vllm/model_executor/models/qwen2_rm.py
vllm/model_executor/models/qwen2_vl.py
vllm/model_executor/models/qwen_vl.py
vllm/model_executor/models/registry.py
vllm/model_executor/models/roberta.py
vllm/model_executor/models/siglip.py
vllm/model_executor/models/solar.py
vllm/model_executor/models/stablelm.py
vllm/model_executor/models/starcoder2.py
vllm/model_executor/models/telechat2.py
vllm/model_executor/models/transformers.py
vllm/model_executor/models/ultravox.py
vllm/model_executor/models/utils.py
vllm/model_executor/models/vision.py
vllm/model_executor/models/whisper.py
vllm/multimodal/__init__.py
vllm/multimodal/audio.py
vllm/multimodal/base.py
vllm/multimodal/hasher.py
vllm/multimodal/image.py
vllm/multimodal/inputs.py
vllm/multimodal/parse.py
vllm/multimodal/processing.py
vllm/multimodal/profiling.py
vllm/multimodal/registry.py
vllm/multimodal/utils.py
vllm/multimodal/video.py
vllm/platforms/__init__.py
vllm/platforms/cpu.py
vllm/platforms/cuda.py
vllm/platforms/hpu.py
vllm/platforms/interface.py
vllm/platforms/neuron.py
vllm/platforms/openvino.py
vllm/platforms/rocm.py
vllm/platforms/tpu.py
vllm/platforms/xpu.py
vllm/plugins/__init__.py
vllm/profiler/__init__.py
vllm/profiler/layerwise_profile.py
vllm/profiler/utils.py
vllm/prompt_adapter/__init__.py
vllm/prompt_adapter/layers.py
vllm/prompt_adapter/models.py
vllm/prompt_adapter/request.py
vllm/prompt_adapter/utils.py
vllm/prompt_adapter/worker_manager.py
vllm/spec_decode/__init__.py
vllm/spec_decode/batch_expansion.py
vllm/spec_decode/draft_model_runner.py
vllm/spec_decode/interfaces.py
vllm/spec_decode/medusa_worker.py
vllm/spec_decode/metrics.py
vllm/spec_decode/mlp_speculator_worker.py
vllm/spec_decode/mqa_scorer.py
vllm/spec_decode/multi_step_worker.py
vllm/spec_decode/ngram_worker.py
vllm/spec_decode/proposer_worker_base.py
vllm/spec_decode/smaller_tp_proposer_worker.py
vllm/spec_decode/spec_decode_worker.py
vllm/spec_decode/target_model_runner.py
vllm/spec_decode/top1_proposer.py
vllm/spec_decode/util.py
vllm/third_party/__init__.py
vllm/third_party/pynvml.py
vllm/transformers_utils/__init__.py
vllm/transformers_utils/config.py
vllm/transformers_utils/detokenizer.py
vllm/transformers_utils/detokenizer_utils.py
vllm/transformers_utils/processor.py
vllm/transformers_utils/s3_utils.py
vllm/transformers_utils/tokenizer.py
vllm/transformers_utils/tokenizer_base.py
vllm/transformers_utils/utils.py
vllm/transformers_utils/configs/__init__.py
vllm/transformers_utils/configs/arctic.py
vllm/transformers_utils/configs/chatglm.py
vllm/transformers_utils/configs/cohere2.py
vllm/transformers_utils/configs/dbrx.py
vllm/transformers_utils/configs/deepseek_vl2.py
vllm/transformers_utils/configs/eagle.py
vllm/transformers_utils/configs/exaone.py
vllm/transformers_utils/configs/falcon.py
vllm/transformers_utils/configs/h2ovl.py
vllm/transformers_utils/configs/internvl.py
vllm/transformers_utils/configs/jais.py
vllm/transformers_utils/configs/medusa.py
vllm/transformers_utils/configs/mllama.py
vllm/transformers_utils/configs/mlp_speculator.py
vllm/transformers_utils/configs/mpt.py
vllm/transformers_utils/configs/nemotron.py
vllm/transformers_utils/configs/nvlm_d.py
vllm/transformers_utils/configs/olmo2.py
vllm/transformers_utils/configs/solar.py
vllm/transformers_utils/configs/telechat2.py
vllm/transformers_utils/configs/ultravox.py
vllm/transformers_utils/processors/__init__.py
vllm/transformers_utils/processors/deepseek_vl2.py
vllm/transformers_utils/tokenizer_group/__init__.py
vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py
vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
vllm/transformers_utils/tokenizer_group/tokenizer_group.py
vllm/transformers_utils/tokenizers/__init__.py
vllm/transformers_utils/tokenizers/mistral.py
vllm/triton_utils/__init__.py
vllm/triton_utils/custom_cache_manager.py
vllm/triton_utils/importing.py
vllm/usage/__init__.py
vllm/usage/usage_lib.py
vllm/v1/__init__.py
vllm/v1/kv_cache_interface.py
vllm/v1/outputs.py
vllm/v1/request.py
vllm/v1/serial_utils.py
vllm/v1/utils.py
vllm/v1/attention/__init__.py
vllm/v1/attention/backends/__init__.py
vllm/v1/attention/backends/flash_attn.py
vllm/v1/attention/backends/pallas.py
vllm/v1/attention/backends/rocm_attn.py
vllm/v1/core/__init__.py
vllm/v1/core/encoder_cache_manager.py
vllm/v1/core/kv_cache_manager.py
vllm/v1/core/kv_cache_utils.py
vllm/v1/core/scheduler.py
vllm/v1/core/scheduler_output.py
vllm/v1/engine/__init__.py
vllm/v1/engine/async_llm.py
vllm/v1/engine/core.py
vllm/v1/engine/core_client.py
vllm/v1/engine/detokenizer.py
vllm/v1/engine/llm_engine.py
vllm/v1/engine/logprobs.py
vllm/v1/engine/mm_input_cache.py
vllm/v1/engine/output_processor.py
vllm/v1/engine/processor.py
vllm/v1/executor/__init__.py
vllm/v1/executor/abstract.py
vllm/v1/executor/multiproc_executor.py
vllm/v1/executor/ray_distributed_executor.py
vllm/v1/metrics/__init__.py
vllm/v1/metrics/loggers.py
vllm/v1/metrics/stats.py
vllm/v1/sample/__init__.py
vllm/v1/sample/metadata.py
vllm/v1/sample/rejection_sampler.py
vllm/v1/sample/sampler.py
vllm/v1/sample/ops/__init__.py
vllm/v1/sample/ops/penalties.py
vllm/v1/sample/ops/topk_topp_sampler.py
vllm/v1/spec_decode/__init__.py
vllm/v1/spec_decode/ngram_proposer.py
vllm/v1/stats/__init__.py
vllm/v1/stats/common.py
vllm/v1/worker/__init__.py
vllm/v1/worker/block_table.py
vllm/v1/worker/gpu_input_batch.py
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_worker.py
vllm/v1/worker/lora_model_runner_mixin.py
vllm/v1/worker/tpu_model_runner.py
vllm/v1/worker/tpu_worker.py
vllm/v1/worker/worker_base.py
vllm/vllm_flash_attn/.gitkeep
vllm/worker/__init__.py
vllm/worker/cache_engine.py
vllm/worker/cpu_enc_dec_model_runner.py
vllm/worker/cpu_model_runner.py
vllm/worker/cpu_pooling_model_runner.py
vllm/worker/cpu_worker.py
vllm/worker/enc_dec_model_runner.py
vllm/worker/hpu_model_runner.py
vllm/worker/hpu_worker.py
vllm/worker/model_runner.py
vllm/worker/model_runner_base.py
vllm/worker/multi_step_model_runner.py
vllm/worker/multi_step_tpu_worker.py
vllm/worker/multi_step_worker.py
vllm/worker/neuron_model_runner.py
vllm/worker/neuron_worker.py
vllm/worker/openvino_model_runner.py
vllm/worker/openvino_worker.py
vllm/worker/pooling_model_runner.py
vllm/worker/tpu_model_runner.py
vllm/worker/tpu_worker.py
vllm/worker/utils.py
vllm/worker/worker.py
vllm/worker/worker_base.py
vllm/worker/xpu_model_runner.py
vllm/worker/xpu_worker.py