LICENSE.md
MANIFEST.in
README.md
pyproject.toml
evals/__init__.py
evals/api.py
evals/base.py
evals/data.py
evals/data_test.py
evals/eval.py
evals/formatting.py
evals/metrics.py
evals/record.py
evals/record_test.py
evals/registry.py
evals/registry_test.py
evals/task_state.py
evals.egg-info/PKG-INFO
evals.egg-info/SOURCES.txt
evals.egg-info/dependency_links.txt
evals.egg-info/entry_points.txt
evals.egg-info/requires.txt
evals.egg-info/top_level.txt
evals/cli/oaieval.py
evals/cli/oaievalset.py
evals/completion_fns/__init__.py
evals/completion_fns/cot.py
evals/completion_fns/langchain_llm.py
evals/completion_fns/langchain_math.py
evals/completion_fns/openai.py
evals/completion_fns/retrieval.py
evals/completion_fns/solver_completion_fn.py
evals/elsuite/lambada.py
evals/elsuite/multiple_choice.py
evals/elsuite/solver_tools_convo.py
evals/elsuite/translate.py
evals/elsuite/utils.py
evals/elsuite/utils_test.py
evals/elsuite/already_said_that/distractors.py
evals/elsuite/already_said_that/eval.py
evals/elsuite/already_said_that/prompts.py
evals/elsuite/already_said_that/solvers.py
evals/elsuite/already_said_that/test_distractors.py
evals/elsuite/already_said_that/utils.py
evals/elsuite/already_said_that/scripts/gen_data.py
evals/elsuite/already_said_that/scripts/make_plots.py
evals/elsuite/ballots/eval.py
evals/elsuite/ballots/prompts.py
evals/elsuite/ballots/utils.py
evals/elsuite/ballots/scripts/make_plots.py
evals/elsuite/basic/fuzzy_match.py
evals/elsuite/basic/fuzzy_match_test.py
evals/elsuite/basic/includes.py
evals/elsuite/basic/includes_test.py
evals/elsuite/basic/json_match.py
evals/elsuite/basic/json_match_test.py
evals/elsuite/basic/json_validator.py
evals/elsuite/basic/json_validator_test.py
evals/elsuite/basic/match.py
evals/elsuite/basic/match_test.py
evals/elsuite/basic/match_with_solvers.py
evals/elsuite/bluff/eval.py
evals/elsuite/bluff/prompts.py
evals/elsuite/bluff/solver_player.py
evals/elsuite/bluff/strategy_solver.py
evals/elsuite/bluff/bluff/__init__.py
evals/elsuite/bluff/bluff/cards.py
evals/elsuite/bluff/bluff/game.py
evals/elsuite/bluff/bluff/players.py
evals/elsuite/bluff/bluff/round.py
evals/elsuite/bluff/bluff/task_description.py
evals/elsuite/bluff/bluff/test_bluff_game.py
evals/elsuite/bluff/scripts/make_plots.py
evals/elsuite/bugged_tools/bugged_tools.py
evals/elsuite/bugged_tools/eval.py
evals/elsuite/bugged_tools/task_description.py
evals/elsuite/bugged_tools/tools.py
evals/elsuite/bugged_tools/utils.py
evals/elsuite/bugged_tools/scripts/plot_experiments.py
evals/elsuite/cant_do_that_anymore/defaults.py
evals/elsuite/cant_do_that_anymore/eval.py
evals/elsuite/cant_do_that_anymore/utils.py
evals/elsuite/cant_do_that_anymore/chess/board.py
evals/elsuite/cant_do_that_anymore/chess/board_test.py
evals/elsuite/cant_do_that_anymore/chess/move_variants.py
evals/elsuite/cant_do_that_anymore/chess/notation.py
evals/elsuite/cant_do_that_anymore/chess/pieces.py
evals/elsuite/cant_do_that_anymore/chess/utils.py
evals/elsuite/cant_do_that_anymore/scripts/dataset_creation.py
evals/elsuite/cant_do_that_anymore/scripts/diagonal_dataset_creation.py
evals/elsuite/cant_do_that_anymore/scripts/make_plots.py
evals/elsuite/error_recovery/defaults.py
evals/elsuite/error_recovery/eval.py
evals/elsuite/error_recovery/scripts/dataset_creation.py
evals/elsuite/error_recovery/scripts/make_plots.py
evals/elsuite/function_deduction/baselines.py
evals/elsuite/function_deduction/eval.py
evals/elsuite/function_deduction/prompts.py
evals/elsuite/function_deduction/solvers.py
evals/elsuite/function_deduction/solvers_test.py
evals/elsuite/function_deduction/scripts/make_plots.py
evals/elsuite/function_deduction/scripts/dataset/create_dataset.py
evals/elsuite/hr_ml_agent_bench/__init__.py
evals/elsuite/hr_ml_agent_bench/actions.py
evals/elsuite/hr_ml_agent_bench/auto_marking.py
evals/elsuite/hr_ml_agent_bench/autoeval.py
evals/elsuite/hr_ml_agent_bench/environment.py
evals/elsuite/hr_ml_agent_bench/eval.py
evals/elsuite/hr_ml_agent_bench/high_level_actions.py
evals/elsuite/hr_ml_agent_bench/low_level_actions.py
evals/elsuite/hr_ml_agent_bench/prepare_task.py
evals/elsuite/hr_ml_agent_bench/prompts.py
evals/elsuite/hr_ml_agent_bench/schema.py
evals/elsuite/hr_ml_agent_bench/utils.py
evals/elsuite/hr_ml_agent_bench/benchmarks/__init__.py
evals/elsuite/hr_ml_agent_bench/benchmarks/ant/baselines/human.py
evals/elsuite/hr_ml_agent_bench/benchmarks/ant/baselines/naive.py
evals/elsuite/hr_ml_agent_bench/benchmarks/ant/env/train.py
evals/elsuite/hr_ml_agent_bench/benchmarks/ant/scripts/grade.py
evals/elsuite/hr_ml_agent_bench/benchmarks/bipedal_walker/baselines/human.py
evals/elsuite/hr_ml_agent_bench/benchmarks/bipedal_walker/baselines/naive.py
evals/elsuite/hr_ml_agent_bench/benchmarks/bipedal_walker/env/train.py
evals/elsuite/hr_ml_agent_bench/benchmarks/bipedal_walker/scripts/grade.py
evals/elsuite/hr_ml_agent_bench/benchmarks/cartpole/baselines/human.py
evals/elsuite/hr_ml_agent_bench/benchmarks/cartpole/baselines/naive.py
evals/elsuite/hr_ml_agent_bench/benchmarks/cartpole/env/train.py
evals/elsuite/hr_ml_agent_bench/benchmarks/cartpole/scripts/grade.py
evals/elsuite/hr_ml_agent_bench/benchmarks/cifar10/env/train.py
evals/elsuite/hr_ml_agent_bench/benchmarks/cifar10/scripts/grade.py
evals/elsuite/hr_ml_agent_bench/benchmarks/cifar10/scripts/prepare.py
evals/elsuite/hr_ml_agent_bench/benchmarks/feedback/env/train.py
evals/elsuite/hr_ml_agent_bench/benchmarks/feedback/scripts/grade.py
evals/elsuite/hr_ml_agent_bench/benchmarks/feedback/scripts/prepare.py
evals/elsuite/hr_ml_agent_bench/benchmarks/house_price/env/train.py
evals/elsuite/hr_ml_agent_bench/benchmarks/house_price/scripts/grade.py
evals/elsuite/hr_ml_agent_bench/benchmarks/house_price/scripts/prepare.py
evals/elsuite/hr_ml_agent_bench/benchmarks/humanoid/baselines/human.py
evals/elsuite/hr_ml_agent_bench/benchmarks/humanoid/baselines/naive.py
evals/elsuite/hr_ml_agent_bench/benchmarks/humanoid/env/train.py
evals/elsuite/hr_ml_agent_bench/benchmarks/humanoid/scripts/grade.py
evals/elsuite/hr_ml_agent_bench/benchmarks/imdb/env/train.py
evals/elsuite/hr_ml_agent_bench/benchmarks/imdb/scripts/grade.py
evals/elsuite/hr_ml_agent_bench/benchmarks/inverted_pendulum/baselines/human.py
evals/elsuite/hr_ml_agent_bench/benchmarks/inverted_pendulum/baselines/naive.py
evals/elsuite/hr_ml_agent_bench/benchmarks/inverted_pendulum/env/train.py
evals/elsuite/hr_ml_agent_bench/benchmarks/inverted_pendulum/scripts/grade.py
evals/elsuite/hr_ml_agent_bench/benchmarks/ogbn_arxiv/env/train.py
evals/elsuite/hr_ml_agent_bench/benchmarks/ogbn_arxiv/scripts/grade.py
evals/elsuite/hr_ml_agent_bench/benchmarks/ogbn_arxiv/scripts/prepare.py
evals/elsuite/hr_ml_agent_bench/benchmarks/parkinsons_disease/env/train.py
evals/elsuite/hr_ml_agent_bench/benchmarks/parkinsons_disease/scripts/grade.py
evals/elsuite/hr_ml_agent_bench/benchmarks/parkinsons_disease/scripts/prepare.py
evals/elsuite/hr_ml_agent_bench/benchmarks/pong/baselines/human.py
evals/elsuite/hr_ml_agent_bench/benchmarks/pong/baselines/naive.py
evals/elsuite/hr_ml_agent_bench/benchmarks/pong/env/train.py
evals/elsuite/hr_ml_agent_bench/benchmarks/pong/scripts/grade.py
evals/elsuite/hr_ml_agent_bench/benchmarks/pusher/baselines/human.py
evals/elsuite/hr_ml_agent_bench/benchmarks/pusher/baselines/naive.py
evals/elsuite/hr_ml_agent_bench/benchmarks/pusher/env/train.py
evals/elsuite/hr_ml_agent_bench/benchmarks/pusher/scripts/grade.py
evals/elsuite/hr_ml_agent_bench/benchmarks/spaceship_titanic/env/train.py
evals/elsuite/hr_ml_agent_bench/benchmarks/spaceship_titanic/scripts/grade.py
evals/elsuite/hr_ml_agent_bench/benchmarks/spaceship_titanic/scripts/prepare.py
evals/elsuite/hr_ml_agent_bench/benchmarks/vectorization/env/train.py
evals/elsuite/hr_ml_agent_bench/benchmarks/vectorization/scripts/grade.py
evals/elsuite/hr_ml_agent_bench/benchmarks/vectorization/scripts/human_baseline.py
evals/elsuite/hr_ml_agent_bench/scripts/plot_experiments.py
evals/elsuite/hr_ml_agent_bench/scripts/run_experiments.py
evals/elsuite/hr_ml_agent_bench/solvers/baseline.py
evals/elsuite/hr_ml_agent_bench/tests/test_actions.py
evals/elsuite/identifying_variables/constants.py
evals/elsuite/identifying_variables/eval.py
evals/elsuite/identifying_variables/graph_utils.py
evals/elsuite/identifying_variables/latent_funcs.py
evals/elsuite/identifying_variables/metrics.py
evals/elsuite/identifying_variables/prompts.py
evals/elsuite/identifying_variables/solvers.py
evals/elsuite/identifying_variables/structs.py
evals/elsuite/identifying_variables/utils.py
evals/elsuite/identifying_variables/renderers/__init__.py
evals/elsuite/identifying_variables/renderers/base.py
evals/elsuite/identifying_variables/renderers/corrset.py
evals/elsuite/identifying_variables/renderers/tabular.py
evals/elsuite/identifying_variables/renderers/templates.py
evals/elsuite/identifying_variables/scripts/gen_data.py
evals/elsuite/identifying_variables/scripts/make_plots.py
evals/elsuite/identifying_variables/scripts/plotting_utils.py
evals/elsuite/identifying_variables/scripts/table_utils.py
evals/elsuite/incontext_rl/anti-cot_solver.py
evals/elsuite/incontext_rl/baselines.py
evals/elsuite/incontext_rl/defaults.py
evals/elsuite/incontext_rl/env_setup.py
evals/elsuite/incontext_rl/eval.py
evals/elsuite/incontext_rl/scripts/plot_experiments.py
evals/elsuite/make_me_pay/eval.py
evals/elsuite/make_me_pay/makemepay.py
evals/elsuite/make_me_pay/makemepay_test.py
evals/elsuite/make_me_pay/task_description.py
evals/elsuite/make_me_pay/utils.py
evals/elsuite/make_me_pay/scripts/make_plots.py
evals/elsuite/make_me_pay/solvers/lm_con_artist_solver.py
evals/elsuite/make_me_pay/solvers/prompts.py
evals/elsuite/make_me_say/autoeval.py
evals/elsuite/make_me_say/core.py
evals/elsuite/make_me_say/defaults.py
evals/elsuite/make_me_say/eval.py
evals/elsuite/make_me_say/makemesay_test.py
evals/elsuite/make_me_say/utils.py
evals/elsuite/mmmu/eval.py
evals/elsuite/modelgraded/base.py
evals/elsuite/modelgraded/classify.py
evals/elsuite/modelgraded/classify_utils.py
evals/elsuite/multistep_web_tasks/constants.py
evals/elsuite/multistep_web_tasks/eval.py
evals/elsuite/multistep_web_tasks/session.py
evals/elsuite/multistep_web_tasks/utils.py
evals/elsuite/multistep_web_tasks/docker/flask-playwright/app.py
evals/elsuite/multistep_web_tasks/docker/homepage/app.py
evals/elsuite/multistep_web_tasks/reproducibility/make_plots.py
evals/elsuite/multistep_web_tasks/reproducibility/make_task_jsonl.py
evals/elsuite/multistep_web_tasks/reproducibility/run_environments.py
evals/elsuite/multistep_web_tasks/solvers/strong_solver/strong_prompts.py
evals/elsuite/multistep_web_tasks/solvers/strong_solver/strong_solver.py
evals/elsuite/multistep_web_tasks/solvers/webarena_solvers/webarena_prompts.py
evals/elsuite/multistep_web_tasks/solvers/webarena_solvers/webarena_solvers.py
evals/elsuite/multistep_web_tasks/webarena/eval_run.py
evals/elsuite/multistep_web_tasks/webarena/task_description.py
evals/elsuite/multistep_web_tasks/webarena/bash_browser_env/bash_browser_env.py
evals/elsuite/multistep_web_tasks/webarena/bash_browser_env/bash_browser_utils.py
evals/elsuite/multistep_web_tasks/webarena/bash_env/__init__.py
evals/elsuite/multistep_web_tasks/webarena/bash_env/actions.py
evals/elsuite/multistep_web_tasks/webarena/bash_env/bash_utils.py
evals/elsuite/multistep_web_tasks/webarena/bash_env/basic_bash_env.py
evals/elsuite/multistep_web_tasks/webarena/browser_env/__init__.py
evals/elsuite/multistep_web_tasks/webarena/browser_env/actions.py
evals/elsuite/multistep_web_tasks/webarena/browser_env/auto_login.py
evals/elsuite/multistep_web_tasks/webarena/browser_env/basic_browser_env.py
evals/elsuite/multistep_web_tasks/webarena/browser_env/browser_utils.py
evals/elsuite/multistep_web_tasks/webarena/browser_env/constants.py
evals/elsuite/multistep_web_tasks/webarena/browser_env/env_config.py
evals/elsuite/multistep_web_tasks/webarena/browser_env/helper_functions.py
evals/elsuite/multistep_web_tasks/webarena/browser_env/processors.py
evals/elsuite/multistep_web_tasks/webarena/core/env.py
evals/elsuite/multistep_web_tasks/webarena/core/playwright_api.py
evals/elsuite/multistep_web_tasks/webarena/core/utils.py
evals/elsuite/multistep_web_tasks/webarena/evaluation_harness/__init__.py
evals/elsuite/multistep_web_tasks/webarena/evaluation_harness/evaluators.py
evals/elsuite/multistep_web_tasks/webarena/evaluation_harness/helper_functions.py
evals/elsuite/sandbagging/defaults.py
evals/elsuite/sandbagging/mmlu_eval.py
evals/elsuite/sandbagging/sandbagging_eval.py
evals/elsuite/sandbagging/solvers.py
evals/elsuite/sandbagging/utils.py
evals/elsuite/sandbagging/scripts/consistency_plots.py
evals/elsuite/sandbagging/scripts/sandbagging_all_plots.py
evals/elsuite/sandbagging/scripts/sandbagging_subset_plots.py
evals/elsuite/sandbagging/scripts/utils.py
evals/elsuite/schelling_point/eval.py
evals/elsuite/schelling_point/prompts.py
evals/elsuite/schelling_point/utils.py
evals/elsuite/self_prompting/eval.py
evals/elsuite/self_prompting/task_description.py
evals/elsuite/self_prompting/scripts/make_plots.py
evals/elsuite/self_prompting/scripts/dataset/compile_data.py
evals/elsuite/self_prompting/scripts/dataset/eval_list.py
evals/elsuite/self_prompting/solvers/baselines.py
evals/elsuite/self_prompting/solvers/custom_cot_solver.py
evals/elsuite/skill_acquisition/eval.py
evals/elsuite/skill_acquisition/solvers.py
evals/elsuite/skill_acquisition/task_description.py
evals/elsuite/skill_acquisition/test_skill_acquisition.py
evals/elsuite/skill_acquisition/utils.py
evals/elsuite/skill_acquisition/scraping/scrape_distractor_articles.py
evals/elsuite/skill_acquisition/scraping/scrape_miskito.py
evals/elsuite/skill_acquisition/scripts/make_plots.py
evals/elsuite/steganography/eval.py
evals/elsuite/steganography/monitor.py
evals/elsuite/steganography/prompts.py
evals/elsuite/steganography/reconstruction_metrics.py
evals/elsuite/steganography/steganography.py
evals/elsuite/steganography/scripts/make_plots.py
evals/elsuite/steganography/scripts/dataset/complexity_metrics.py
evals/elsuite/steganography/scripts/dataset/csv2jsonl.py
evals/elsuite/steganography/scripts/dataset/custom_datasets.py
evals/elsuite/steganography/scripts/dataset/dataset.py
evals/elsuite/steganography/scripts/dataset/utils.py
evals/elsuite/test/match.py
evals/elsuite/text_compression/compression.py
evals/elsuite/text_compression/eval.py
evals/elsuite/text_compression/prompts.py
evals/elsuite/text_compression/reconstruction_metrics.py
evals/elsuite/text_compression/scripts/make_plots.py
evals/elsuite/text_compression/scripts/dataset/complexity_metrics.py
evals/elsuite/text_compression/scripts/dataset/csv2jsonl.py
evals/elsuite/text_compression/scripts/dataset/custom_datasets.py
evals/elsuite/text_compression/scripts/dataset/dataset.py
evals/elsuite/text_compression/scripts/dataset/utils.py
evals/elsuite/theory_of_mind/scripts/data_generation.py
evals/elsuite/theory_of_mind/scripts/make_plots.py
evals/elsuite/track_the_stat/eval.py
evals/elsuite/track_the_stat/solvers.py
evals/elsuite/track_the_stat/utils.py
evals/elsuite/track_the_stat/prompts/__init__.py
evals/elsuite/track_the_stat/prompts/median.py
evals/elsuite/track_the_stat/prompts/mode.py
evals/elsuite/track_the_stat/scripts/make_plots.py
evals/elsuite/twenty_questions/eval.py
evals/elsuite/twenty_questions/test_utils.py
evals/elsuite/twenty_questions/utils.py
evals/elsuite/twenty_questions/scripts/make_plots.py
evals/prompt/base.py
evals/registry/completion_fns/cot.yaml
evals/registry/completion_fns/langchain_chains.yaml
evals/registry/completion_fns/langchain_llms.yaml
evals/registry/eval_sets/chinese-numbers.yaml
evals/registry/eval_sets/coqa-ex.yaml
evals/registry/eval_sets/css-selectors.yaml
evals/registry/eval_sets/exams-all.yaml
evals/registry/eval_sets/hr-ml-agent-bench.yaml
evals/registry/eval_sets/logiqa-logical-reasoning-plus.yaml
evals/registry/eval_sets/manga-translation.yaml
evals/registry/eval_sets/mazes.yaml
evals/registry/eval_sets/mmmu.yaml
evals/registry/eval_sets/pointer-value-retrieval.yaml
evals/registry/eval_sets/raven-matrices.yaml
evals/registry/eval_sets/schelling_point.yaml
evals/registry/eval_sets/stock-options.yaml
evals/registry/eval_sets/test-all.yaml
evals/registry/eval_sets/test-basic.yaml
evals/registry/eval_sets/test-modelgraded.yaml
evals/registry/eval_sets/ukraine-gec.yaml
evals/registry/eval_sets/word-associations.yaml
evals/registry/evals/2d_movement.yaml
evals/registry/evals/3d_globe_movement.yaml
evals/registry/evals/3d_object_manipulation.yaml
evals/registry/evals/Chinese_character_riddles.yaml
evals/registry/evals/GPT-model-text-detection.yaml
evals/registry/evals/Unfamiliar-Chinese-Character.yaml
evals/registry/evals/ab.yaml
evals/registry/evals/aba-mrpc-true-false.yaml
evals/registry/evals/abstract-causal-reasoning.yaml
evals/registry/evals/abstract2title.yaml
evals/registry/evals/accounting_audit.yaml
evals/registry/evals/actors-sequence.yaml
evals/registry/evals/adultery_state_laws.yaml
evals/registry/evals/afrikaans-lexicon.yaml
evals/registry/evals/aime_evaluation.yaml
evals/registry/evals/algebra-word-problems.yaml
evals/registry/evals/allergen-information.yaml
evals/registry/evals/already_said_that.yaml
evals/registry/evals/alternate-numeral-systems.yaml
evals/registry/evals/ambiguous-sentences.yaml
evals/registry/evals/anagrams.yaml
evals/registry/evals/arabic-literature-qa.yaml
evals/registry/evals/arc.yaml
evals/registry/evals/arithmetic-expression.yaml
evals/registry/evals/arithmetical_puzzles.yaml
evals/registry/evals/ascii-digit-recognition.yaml
evals/registry/evals/ascii-wordart.yaml
evals/registry/evals/asl-classifiers.yaml
evals/registry/evals/astro_eval.yaml
evals/registry/evals/atpl_exams.yaml
evals/registry/evals/automata-and-complexity.yaml
evals/registry/evals/backgammon.yaml
evals/registry/evals/balance-chemical-equation.yaml
evals/registry/evals/ballots.yaml
evals/registry/evals/banking77.yaml
evals/registry/evals/base64-decode.yaml
evals/registry/evals/beam-analysis.yaml
evals/registry/evals/belarusian-antonyms.yaml
evals/registry/evals/belarusian-grammar.yaml
evals/registry/evals/belarusian-lexicon.yaml
evals/registry/evals/belarusian-numerals.yaml
evals/registry/evals/belarusian-orthography.yaml
evals/registry/evals/belarusian-proverbs.yaml
evals/registry/evals/belarusian-rhyme.yaml
evals/registry/evals/belarusian-russian-translation.yaml
evals/registry/evals/belarusian-syllable-count.yaml
evals/registry/evals/belarusian-synonyms.yaml
evals/registry/evals/belarusian-word-analogy-inflection.yaml
evals/registry/evals/benjaminmoore_to_hex.yaml
evals/registry/evals/bias_detection.yaml
evals/registry/evals/bigrams.yaml
evals/registry/evals/bitwise.yaml
evals/registry/evals/blackfoot-numerals-modern.yaml
evals/registry/evals/bluff.yaml
evals/registry/evals/body-movement.yaml
evals/registry/evals/born-first.yaml
evals/registry/evals/brazilian-lexicon.yaml
evals/registry/evals/brazilian_laws.yaml
evals/registry/evals/bugged_tools.yaml
evals/registry/evals/building_floorplan.yaml
evals/registry/evals/bulgarian-lexicon.yaml
evals/registry/evals/cant_do_that_anymore.yaml
evals/registry/evals/canto_wu_pronunciation.yaml
evals/registry/evals/canto_wu_pronunciation_fewshot.yaml
evals/registry/evals/cardinal-directions.yaml
evals/registry/evals/categorize_with_distractors.yaml
evals/registry/evals/chess-piece-count.yaml
evals/registry/evals/chess.yaml
evals/registry/evals/chinese-lantern-riddles.yaml
evals/registry/evals/chinese-remainder-theorem.yaml
evals/registry/evals/chinese_ancient_masterpieces_dynasty.yaml
evals/registry/evals/chinese_ancient_poetry.yaml
evals/registry/evals/chinese_chu_ci.yaml
evals/registry/evals/chinese_famous_novel.yaml
evals/registry/evals/chinese_hard_translations.yaml
evals/registry/evals/chinese_homonym.yaml
evals/registry/evals/chinese_homophonic.yaml
evals/registry/evals/chinese_idioms.yaml
evals/registry/evals/chinese_modern_poem_identification.yaml
evals/registry/evals/chinese_poem.yaml
evals/registry/evals/chinese_shi_jing.yaml
evals/registry/evals/chinese_song_ci.yaml
evals/registry/evals/chinese_tang_poetries.yaml
evals/registry/evals/chinese_zodiac.yaml
evals/registry/evals/cissp-study-questions.yaml
evals/registry/evals/co-sql.yaml
evals/registry/evals/code_combination.yaml
evals/registry/evals/code_progress.yaml
evals/registry/evals/color_theory_complementary.yaml
evals/registry/evals/compare-countries-area.yaml
evals/registry/evals/complex-analogies-en-ru.yaml
evals/registry/evals/complex-replace-characters.yaml
evals/registry/evals/comprehensive-graph-reasoning.yaml
evals/registry/evals/confusing_korean.yaml
evals/registry/evals/connect-4.yaml
evals/registry/evals/consensus_summary.yaml
evals/registry/evals/context-free-grammar.yaml
evals/registry/evals/convert-hex-hsl-lightness.yaml
evals/registry/evals/convert_bwt_num_and_chinese_num.yaml
evals/registry/evals/coq-editing.yaml
evals/registry/evals/coq-proof-step.yaml
evals/registry/evals/coqa-ex.yaml
evals/registry/evals/corr2cause.yaml
evals/registry/evals/count_intersections_polynomial.yaml
evals/registry/evals/count_token_freq_dna.yaml
evals/registry/evals/counterfactual-reasoning.yaml
evals/registry/evals/countries.yaml
evals/registry/evals/crepe.yaml
evals/registry/evals/cricket_situations.yaml
evals/registry/evals/crontab.yaml
evals/registry/evals/csharp-linq.yaml
evals/registry/evals/css-selectors.yaml
evals/registry/evals/cube-pack.yaml
evals/registry/evals/cybersecurity-filepaths.yaml
evals/registry/evals/date-booking.yaml
evals/registry/evals/date-calculator.yaml
evals/registry/evals/day-of-week-from-date.yaml
evals/registry/evals/decrypt-caesar-cipher.yaml
evals/registry/evals/detect-hshd.yaml
evals/registry/evals/determinant.yaml
evals/registry/evals/dhammapada-reference.yaml
evals/registry/evals/diabetes.yaml
evals/registry/evals/diagrammatic_logic.yaml
evals/registry/evals/dice-rotation-sequence.yaml
evals/registry/evals/direct-speech-tag.yaml
evals/registry/evals/directions.yaml
evals/registry/evals/dna-melting-calculation.yaml
evals/registry/evals/dutch-lexicon.yaml
evals/registry/evals/dutch-rhymes.yaml
evals/registry/evals/emoji-riddle.yaml
evals/registry/evals/emotional-intelligence.yaml
evals/registry/evals/error_recovery.yaml
evals/registry/evals/escher-sentences.yaml
evals/registry/evals/euler_problems.yaml
evals/registry/evals/european-date-format-challenge.yaml
evals/registry/evals/event-categories.yaml
evals/registry/evals/exams.yaml
evals/registry/evals/fcc_amateur_extra.yaml
evals/registry/evals/finance.yaml
evals/registry/evals/finance_calc.yaml
evals/registry/evals/financial-derivatives.yaml
evals/registry/evals/find-letter.yaml
evals/registry/evals/find-thirukkural.yaml
evals/registry/evals/find_country_from_svg.yaml
evals/registry/evals/finger-tracking.yaml
evals/registry/evals/finnish-rhyme.yaml
evals/registry/evals/first-letters.yaml
evals/registry/evals/food.yaml
evals/registry/evals/formal-grammar-to-regex.yaml
evals/registry/evals/formal_logic.yaml
evals/registry/evals/forth-stack-sim.yaml
evals/registry/evals/french-lexicon.yaml
evals/registry/evals/french-part-of-speech.yaml
evals/registry/evals/french_homonym_and_homograph.yaml
evals/registry/evals/function-deduction.yaml
evals/registry/evals/game-theory.yaml
evals/registry/evals/gears_rotation.yaml
evals/registry/evals/geometry_puzzle.yaml
evals/registry/evals/german-part-of-speech.yaml
evals/registry/evals/gol.yaml
evals/registry/evals/gpt-protocol-buffers.yaml
evals/registry/evals/greek-nt-manuscripts.yaml
evals/registry/evals/greek-vocabulary.yaml
evals/registry/evals/gregorian-to-hebrew-date.yaml
evals/registry/evals/guess-the-singer.yaml
evals/registry/evals/gujarati_numerals.yaml
evals/registry/evals/hard_russian_computer_science_tasks.yaml
evals/registry/evals/heart-disease.yaml
evals/registry/evals/hebrew-bible.yaml
evals/registry/evals/hebrew-homophones.yaml
evals/registry/evals/hebrew-rhyme.yaml
evals/registry/evals/hebrew-same-noun-gender.yaml
evals/registry/evals/hebrew_grammar.yaml
evals/registry/evals/hebrew_plurals.yaml
evals/registry/evals/hebrew_talmud_suka.yaml
evals/registry/evals/hindi_shuddha.yaml
evals/registry/evals/hindi_upsc.yaml
evals/registry/evals/hindi_words.yaml
evals/registry/evals/historical-kana-orthography-reading.yaml
evals/registry/evals/hr-ml-agent-bench.yaml
evals/registry/evals/human-safety.yaml
evals/registry/evals/iambic-pentameter.yaml
evals/registry/evals/icelandic-inflection-easy.yaml
evals/registry/evals/icelandic-inflection-hard.yaml
evals/registry/evals/icelandic-inflection-medium.yaml
evals/registry/evals/icelandic-sentences-gec.yaml
evals/registry/evals/identifying_variables.yaml
evals/registry/evals/illinois-law.yaml
evals/registry/evals/imperial_date_to_string.yaml
evals/registry/evals/incontext_rl.yaml
evals/registry/evals/indonesian_numbers.yaml
evals/registry/evals/infiniteloop-match.yaml
evals/registry/evals/integer-sequence-predictions.yaml
evals/registry/evals/interlingual-homograph.yaml
evals/registry/evals/internal_representations.yaml
evals/registry/evals/invert_word_wise.yaml
evals/registry/evals/invoice_due_date_leap_day_adjustment.yaml
evals/registry/evals/invoices.yaml
evals/registry/evals/iqbal-poetry-translation.yaml
evals/registry/evals/irish-lexicon.yaml
evals/registry/evals/irish-plural-nouns.yaml
evals/registry/evals/irony.yaml
evals/registry/evals/irrelevant-negative-diversion.yaml
evals/registry/evals/islands.yaml
evals/registry/evals/isosceles-right-triangle.yaml
evals/registry/evals/italian-new-words.yaml
evals/registry/evals/italian-rhyme.yaml
evals/registry/evals/italian_big_math_expression.yaml
evals/registry/evals/japanese-decimal-units.yaml
evals/registry/evals/japanese-itpassport-exam01.yaml
evals/registry/evals/japanese-national-medical-exam01.yaml
evals/registry/evals/japanese-national-medical-exam02.yaml
evals/registry/evals/japanese-station.yaml
evals/registry/evals/japanese_approval.yaml
evals/registry/evals/japanese_city_name_pronuciation.yaml
evals/registry/evals/japanese_driving_license.yaml
evals/registry/evals/japanese_mahjong_discard_tile.yaml
evals/registry/evals/japanese_number_reading.yaml
evals/registry/evals/japanese_onomatopoeia.yaml
evals/registry/evals/japanese_populer_video_game_title_and_the_publisher.yaml
evals/registry/evals/japanese_prime_minister.yaml
evals/registry/evals/japanese_romantic_context.yaml
evals/registry/evals/jee-math.yaml
evals/registry/evals/job_listing_title_for_a_caregiver_in_japan.yaml
evals/registry/evals/json_patch_object.yaml
evals/registry/evals/kanji-idioms.yaml
evals/registry/evals/knot-theory.yaml
evals/registry/evals/korean-consonant-vowel-combination.yaml
evals/registry/evals/korean-honorific.yaml
evals/registry/evals/korean-phonetics.yaml
evals/registry/evals/korean-postposition.yaml
evals/registry/evals/korean_date_counting.yaml
evals/registry/evals/korean_dialects.yaml
evals/registry/evals/korean_foreign_words.yaml
evals/registry/evals/korean_romanization.yaml
evals/registry/evals/korean_spaces.yaml
evals/registry/evals/korean_spelling.yaml
evals/registry/evals/korean_yaminjeongeum.yaml
evals/registry/evals/language.yaml
evals/registry/evals/largest_country.yaml
evals/registry/evals/last-word-nth.yaml
evals/registry/evals/lat_long_identify.yaml
evals/registry/evals/latin_grammar.yaml
evals/registry/evals/linear-equations.yaml
evals/registry/evals/linear-regression.yaml
evals/registry/evals/list_comparison_missing_name.yaml
evals/registry/evals/logic-container.yaml
evals/registry/evals/logic-grid-eval.yaml
evals/registry/evals/logic-liar-paradox.yaml
evals/registry/evals/logic-riddles.yaml
evals/registry/evals/logic-statements.yaml
evals/registry/evals/logic.yaml
evals/registry/evals/logic_and_probability.yaml
evals/registry/evals/logical-black-scholes.yaml
evals/registry/evals/logical_counting.yaml
evals/registry/evals/logical_reasoning_letter_series_test.yaml
evals/registry/evals/logiqa-logical-reasoning-plus.yaml
evals/registry/evals/logiqa.yaml
evals/registry/evals/loss-logic.yaml
evals/registry/evals/lunar-calendar.yaml
evals/registry/evals/make-me-pay.yaml
evals/registry/evals/make-me-say.yaml
evals/registry/evals/mandaliof-table.yaml
evals/registry/evals/manga-translation.yaml
evals/registry/evals/map-electronic-component-part-to-fact.yaml
evals/registry/evals/mapping_to_matricies.yaml
evals/registry/evals/marxist_philosophy_exam.yaml
evals/registry/evals/mate-in-one.yaml
evals/registry/evals/math-derivatives.yaml
evals/registry/evals/math_equations.yaml
evals/registry/evals/math_for_5th-grader.yaml
evals/registry/evals/math_logic_operations.yaml
evals/registry/evals/math_polish.yaml
evals/registry/evals/matrix-mult-rows.yaml
evals/registry/evals/mazes.yaml
evals/registry/evals/medication_dose.yaml
evals/registry/evals/medmcqa.yaml
evals/registry/evals/mendelian_inheritance.yaml
evals/registry/evals/missing-operators.yaml
evals/registry/evals/mmlu.yaml
evals/registry/evals/mmmu.yaml
evals/registry/evals/monthly_metric_comparison.yaml
evals/registry/evals/moral_exceptQA.yaml
evals/registry/evals/multi-step-equations.yaml
evals/registry/evals/multistep-word-problems.yaml
evals/registry/evals/multistep_web_tasks.yaml
evals/registry/evals/music-theory-chord-names.yaml
evals/registry/evals/music-theory-chord-notes.yaml
evals/registry/evals/music-theory.yaml
evals/registry/evals/music_theory_scale_modes.yaml
evals/registry/evals/naughty_strings.yaml
evals/registry/evals/nepali-numerals.yaml
evals/registry/evals/nepali-song-singer.yaml
evals/registry/evals/ner_finance.yaml
evals/registry/evals/newsology.yaml
evals/registry/evals/next-val-series.yaml
evals/registry/evals/nfl-point-combinations.yaml
evals/registry/evals/non-compound-names.yaml
evals/registry/evals/norwegian-lexicon.yaml
evals/registry/evals/norwegian-rhymes.yaml
evals/registry/evals/number-pattern.yaml
evals/registry/evals/number-reading.yaml
evals/registry/evals/number_series_test.yaml
evals/registry/evals/numbers_game.yaml
evals/registry/evals/numeral-type-comparisons.yaml
evals/registry/evals/numerical-cabbala-casanova.yaml
evals/registry/evals/nutrition.yaml
evals/registry/evals/ordered-history-events.yaml
evals/registry/evals/ordering_randomised_versionlist.yaml
evals/registry/evals/osm_mapping_one_way.yaml
evals/registry/evals/override-system-instruction.yaml
evals/registry/evals/pantone_to_hex.yaml
evals/registry/evals/parable-to-moral-match.yaml
evals/registry/evals/pararule-plus-multi-step-deductive-reasoning.yaml
evals/registry/evals/partially_solved_crossword_clues.yaml
evals/registry/evals/passing-balls.yaml
evals/registry/evals/path_enclosed_area.yaml
evals/registry/evals/pattern_identification.yaml
evals/registry/evals/persian-kinship-riddles.yaml
evals/registry/evals/ph_calculation.yaml
evals/registry/evals/phonetics-identify-words-needing-missing-gpcs.yaml
evals/registry/evals/physics-interaction.yaml
evals/registry/evals/pointer-value-retrieval.yaml
evals/registry/evals/points-on-line.yaml
evals/registry/evals/poker_analysis.yaml
evals/registry/evals/poker_hand_ranks.yaml
evals/registry/evals/polish-lexicon.yaml
evals/registry/evals/polish-proverbs.yaml
evals/registry/evals/polish-syllable-count.yaml
evals/registry/evals/polish_rhymes_generation.yaml
evals/registry/evals/population_span_extraction.yaml
evals/registry/evals/portuguese-kinship-riddles.yaml
evals/registry/evals/portuguese-sarcasm.yaml
evals/registry/evals/portuguese-syllable-count.yaml
evals/registry/evals/positive-binary-operations.yaml
evals/registry/evals/premature-conclusions.yaml
evals/registry/evals/probabilities-word-problems.yaml
evals/registry/evals/probability_questions.yaml
evals/registry/evals/product-ie.yaml
evals/registry/evals/product-matching.yaml
evals/registry/evals/prompt-injection.yaml
evals/registry/evals/proofreader.yaml
evals/registry/evals/pure_korean.yaml
evals/registry/evals/python_list_comprehension.yaml
evals/registry/evals/qa.yaml
evals/registry/evals/quartz.yaml
evals/registry/evals/ral_to_hex.yaml
evals/registry/evals/rare-and-loanwords-dutch-lexicon.yaml
evals/registry/evals/raven-matrices.yaml
evals/registry/evals/reasoning_with_contradictory_statements.yaml
evals/registry/evals/rectangles.yaml
evals/registry/evals/recurrence-relation.yaml
evals/registry/evals/regex-match.yaml
evals/registry/evals/relative-orientations.yaml
evals/registry/evals/research-question-extraction.yaml
evals/registry/evals/resistor-ohm-calculator.yaml
evals/registry/evals/resource_id_extraction.yaml
evals/registry/evals/reverse-polish-notation.yaml
evals/registry/evals/reverse-shell.yaml
evals/registry/evals/reverse-sort-words-eng.yaml
evals/registry/evals/reverse-string.yaml
evals/registry/evals/rhetorical-devices.yaml
evals/registry/evals/rock-climbing.yaml
evals/registry/evals/romanian-logic.yaml
evals/registry/evals/romanian_homonyms.yaml
evals/registry/evals/rot13.yaml
evals/registry/evals/ru_rhymes.yaml
evals/registry/evals/rubiks-colors.yaml
evals/registry/evals/rucola.yaml
evals/registry/evals/russe.yaml
evals/registry/evals/russian-english-homonym-context-resolution.yaml
evals/registry/evals/russian-lexicon.yaml
evals/registry/evals/russian-nlp-tasks.yaml
evals/registry/evals/russian-rhyme.yaml
evals/registry/evals/russian-verse.yaml
evals/registry/evals/russian_medical.yaml
evals/registry/evals/russian_sarcasm.yaml
evals/registry/evals/sandbagging.yaml
evals/registry/evals/sarcasm.yaml
evals/registry/evals/schelling_point.yaml
evals/registry/evals/seating_arrangements.yaml
evals/registry/evals/security_guide.yaml
evals/registry/evals/self_prompting.yaml
evals/registry/evals/seo_keywords.yaml
evals/registry/evals/sexagenary-cycle-calculation.yaml
evals/registry/evals/shape-in-shape.yaml
evals/registry/evals/shared-borders.yaml
evals/registry/evals/shopping_discount_comparison.yaml
evals/registry/evals/simple-block-puzzles.yaml
evals/registry/evals/simple-charting.yaml
evals/registry/evals/simple-knowledge-mongolian.yaml
evals/registry/evals/simple-visual-understanding.yaml
evals/registry/evals/simple_math.yaml
evals/registry/evals/simple_physics_engine.yaml
evals/registry/evals/sindarin-fluency.yaml
evals/registry/evals/singapore_data_protection_decisions.yaml
evals/registry/evals/singlestore-vectorsearch.yaml
evals/registry/evals/skill_acquisition.yaml
evals/registry/evals/smiles_to_formula.yaml
evals/registry/evals/soc_codes.yaml
evals/registry/evals/solve-for-variable.yaml
evals/registry/evals/sort-numeric.yaml
evals/registry/evals/south-african-bands.yaml
evals/registry/evals/spanish-lexicon.yaml
evals/registry/evals/spanish_feminine_noun_masculine_article.yaml
evals/registry/evals/split_chinese_characters.yaml
evals/registry/evals/sql.yaml
evals/registry/evals/squares-gpt.yaml
evals/registry/evals/stats-tests.yaml
evals/registry/evals/steganography.yaml
evals/registry/evals/stock-options.yaml
evals/registry/evals/superficial-patterns.yaml
evals/registry/evals/svg_alphabet.yaml
evals/registry/evals/svg_to_text.yaml
evals/registry/evals/svg_understanding.yaml
evals/registry/evals/swap-words.yaml
evals/registry/evals/swedish-spelling.yaml
evals/registry/evals/swedish_sat.yaml
evals/registry/evals/syllables_long_words.yaml
evals/registry/evals/syntax-check.yaml
evals/registry/evals/taxes.yaml
evals/registry/evals/tempo_to_measure_count.yaml
evals/registry/evals/test-basic.yaml
evals/registry/evals/test-comp-sci.yaml
evals/registry/evals/test-modelgraded-battle.yaml
evals/registry/evals/test-modelgraded-generated.yaml
evals/registry/evals/test-modelgraded.yaml
evals/registry/evals/test_english_pronunciations.yaml
evals/registry/evals/test_japanese_english_numerals.yaml
evals/registry/evals/test_japanese_radical.yaml
evals/registry/evals/test_japanese_units.yaml
evals/registry/evals/tetris.yaml
evals/registry/evals/text_compression.yaml
evals/registry/evals/theory_of_mind.yaml
evals/registry/evals/thirty_six_stratagems.yaml
evals/registry/evals/three-pt-mapping.yaml
evals/registry/evals/time-zone-conversion.yaml
evals/registry/evals/tokyo-station-number.yaml
evals/registry/evals/track_objects.yaml
evals/registry/evals/track_the_stat.yaml
evals/registry/evals/tracking-shuffled-objects.yaml
evals/registry/evals/tricky-word-problems.yaml
evals/registry/evals/turkish_characters.yaml
evals/registry/evals/twenty_questions.yaml
evals/registry/evals/ukraine-eit.yaml
evals/registry/evals/ukraine-gec.yaml
evals/registry/evals/ukraine_electronic_petitions.yaml
evals/registry/evals/unified-patch.yaml
evals/registry/evals/unique_combinations.yaml
evals/registry/evals/unsolvable_questions.yaml
evals/registry/evals/unwanted-rhyming.yaml
evals/registry/evals/urdu-lexicon.yaml
evals/registry/evals/urdu-transliteration.yaml
evals/registry/evals/us-tort-law.yaml
evals/registry/evals/utah_real_estate.yaml
evals/registry/evals/utility_price_parsing.yaml
evals/registry/evals/viewport_to_grid_size.yaml
evals/registry/evals/vigenere.yaml
evals/registry/evals/vintage_phone_keyboard_decode.yaml
evals/registry/evals/which-is-heavier.yaml
evals/registry/evals/wkt_understanding.yaml
evals/registry/evals/word-association.yaml
evals/registry/evals/word_vector_over_reliance.yaml
evals/registry/modelgraded/arithmetic-expression.yaml
evals/registry/modelgraded/battle.yaml
evals/registry/modelgraded/best.yaml
evals/registry/modelgraded/closedqa.yaml
evals/registry/modelgraded/diversity.yaml
evals/registry/modelgraded/fact.yaml
evals/registry/modelgraded/humor.yaml
evals/registry/modelgraded/iambic_pentameter.yaml
evals/registry/modelgraded/keywords.yaml
evals/registry/modelgraded/onomatopoeia.yaml
evals/registry/modelgraded/possible.yaml
evals/registry/modelgraded/regression-equation.yaml
evals/registry/modelgraded/research-question-extraction.yaml
evals/registry/modelgraded/rhyming.yaml
evals/registry/modelgraded/security.yaml
evals/registry/modelgraded/singlestore.yaml
evals/registry/modelgraded/sql.yaml
evals/registry/modelgraded/translation.yaml
evals/registry/solvers/already_said_that.yaml
evals/registry/solvers/anthropic.yaml
evals/registry/solvers/bluff.yaml
evals/registry/solvers/cant_do_that_anymore.yaml
evals/registry/solvers/defaults.yaml
evals/registry/solvers/error_recovery.yaml
evals/registry/solvers/function_deduction.yaml
evals/registry/solvers/gemini.yaml
evals/registry/solvers/hr-ml-agent-bench.yaml
evals/registry/solvers/identifying_variables.yaml
evals/registry/solvers/incontext_rl.yaml
evals/registry/solvers/make-me-pay.yaml
evals/registry/solvers/multistep_web_tasks.yaml
evals/registry/solvers/sandbagging.yaml
evals/registry/solvers/self_prompting.yaml
evals/registry/solvers/skill_acquisition.yaml
evals/registry/solvers/theory_of_mind.yaml
evals/registry/solvers/together.yaml
evals/registry/solvers/track_the_stat.yaml
evals/registry/solvers/twenty_questions.yaml
evals/solvers/human_cli_solver.py
evals/solvers/memory.py
evals/solvers/solver.py
evals/solvers/solver_test.py
evals/solvers/utils.py
evals/solvers/nested/cot_solver.py
evals/solvers/nested/fewshot_solver.py
evals/solvers/nested/hhh_solver.py
evals/solvers/nested/self_consistency_solver.py
evals/solvers/postprocessors/base.py
evals/solvers/postprocessors/postprocessors.py
evals/solvers/postprocessors/postprocessors_test.py
evals/solvers/prompts/cot.py
evals/solvers/prompts/hhh.py
evals/solvers/prompts/hhh_test.py
evals/solvers/providers/anthropic/anthropic_solver.py
evals/solvers/providers/anthropic/anthropic_solver_test.py
evals/solvers/providers/google/gemini_solver.py
evals/solvers/providers/google/gemini_solver_test.py
evals/solvers/providers/openai/openai_assistants_solver.py
evals/solvers/providers/openai/openai_assistants_solver_test.py
evals/solvers/providers/openai/openai_solver.py
evals/solvers/providers/together/together_solver.py
evals/solvers/providers/together/together_solver_test.py
evals/utils/api_utils.py
evals/utils/log_utils.py
evals/utils/misc.py
evals/utils/snowflake.py
evals/utils/test.py