"""
Embedded Notebook Code - All flowlab functions
This file contains the complete code from all RL notebooks.
Auto-generated from Jupyter notebooks.
"""

# Maps each lab number (int) to that lab's complete notebook source, stored as one string
LAB_CODE = {
    1: 'import gymnasium as gym\nimport numpy as np\nimport torch\nimport torch.nn as nn\n\ndef features(state):\n  return np.array([1, state])\n\nw = np.zeros(2)\nalpha = 0.01\ngamma = 0.99\n\ndef update(w, s, r, s_next):\n  td_error = r + gamma * np.dot(w, features(s_next)) - np.dot(w, features(s))\n  w += alpha * td_error * features(s)\n  return w\n\nclass ValueNet(nn.Module):\n  def __init__(self):\n    super().__init__()\n    self.fc = nn.Sequential(\n    nn.Linear(1, 64),\n    nn.ReLU(),\n    nn.Linear(64, 1)\n    )\n\n  def forward(self, x):\n    return self.fc(x)\n\nmodel = ValueNet()\noptimizer = torch.optim.Adam(model.parameters(), lr=0.001)\nloss_fn = nn.MSELoss()\n\nenv = gym.make(\'CartPole-v1\')\nstate, _ = env.reset()\nfor episode in range(500):\n  state, _ = env.reset()\n  done = False\n  while not done:\n    action = env.action_space.sample() # fixed/random policy\n    next_state, reward, terminated, truncated, _ = env.step(action)\n    done = terminated or truncated\n    # use a single state dimension for simplicity\n    s = state[0]\n    s_next = next_state[0]\n    w = update(w, s, reward, s_next)\n    state = next_state\n\ndef features(state):\n  return np.array([1, state, state**2])\n\nw = np.zeros(3)\n\nmodel = ValueNet()\noptimizer = torch.optim.Adam(model.parameters(), lr=0.001)\nloss_fn = nn.MSELoss()\ngamma = 0.99\nenv = gym.make(\'CartPole-v1\')\nfor episode in range(500):\n  state, _ = env.reset()\n  done = False\n  while not done:\n    state_tensor = torch.tensor([state[0]], dtype=torch.float32)\n    value = model(state_tensor)\n    action = env.action_space.sample()\n    next_state, reward, terminated, truncated, _ = env.step(action)\n    done = terminated or truncated\n    # Calculate target without gradient tracking for the target value estimation\n    with torch.no_grad():\n      next_state_tensor = torch.tensor([next_state[0]], dtype=torch.float32)\n      next_value = model(next_state_tensor)\n      target = reward + gamma * 
next_value\n\n    # Calculate loss and perform backpropagation\n    loss = loss_fn(value, target)\n    optimizer.zero_grad()\n    loss.backward()\n    optimizer.step()\n    state = next_state\n\nimport gymnasium as gym\nimport numpy as np\n\ndef features(state):\n  return np.array([1, state, state**2])\n\ndef update(w_local, s, r, s_next):\n\n  td_error = r + gamma * np.dot(w_local, features(s_next)) - np.dot(w_local, features(s))\n  w_local += alpha * td_error * features(s)\n  return w_local\n\nrewards_per_episode_tabular = []\ntd_errors_per_episode_tabular = []\n\n\nw = np.zeros(3)\nalpha = 0.01\ngamma = 0.99\n\nenv = gym.make(\'CartPole-v1\')\n\nfor episode in range(500):\n  state, _ = env.reset()\n  done = False\n  episode_reward = 0\n  current_episode_td_errors = []\n\n  while not done:\n    s = state[0]\n    action = env.action_space.sample()\n    next_state, reward, terminated, truncated, _ = env.step(action)\n    done = terminated or truncated\n\n    s_next = next_state[0]\n\n\n    td_error = reward + gamma * np.dot(w, features(s_next)) - np.dot(w, features(s))\n    current_episode_td_errors.append(abs(td_error))\n\n\n    w = update(w, s, reward, s_next)\n\n    episode_reward += reward\n    state = next_state\n\n  rewards_per_episode_tabular.append(episode_reward)\n  if current_episode_td_errors:\n    td_errors_per_episode_tabular.append(np.mean(current_episode_td_errors))\n  else:\n    td_errors_per_episode_tabular.append(0)\n\n\nw_memory_bytes_tabular = w.nbytes\nprint(f"Memory usage of weight vector \'w\' (tabular method): {w_memory_bytes_tabular} bytes")\n\nimport torch\nimport torch.nn as nn\nimport gymnasium as gym\nimport numpy as np\n\n\nclass ValueNet(nn.Module):\n  def __init__(self):\n    super().__init__()\n    self.fc = nn.Sequential(\n    nn.Linear(1, 64),\n    nn.ReLU(),\n    nn.Linear(64, 1)\n    )\n\n  def forward(self, x):\n    return self.fc(x)\n\n\nmodel = ValueNet()\noptimizer = torch.optim.Adam(model.parameters(), lr=0.001)\nloss_fn = 
nn.MSELoss()\ngamma = 0.99\nenv = gym.make(\'CartPole-v1\')\n\nrewards_per_episode_nn = []\nlosses_per_episode_nn = []\n\nfor episode in range(500):\n  state, _ = env.reset()\n  done = False\n  episode_reward = 0\n  current_episode_losses = []\n\n  while not done:\n\n    state_tensor = torch.tensor([state[0]], dtype=torch.float32).unsqueeze(0)\n    value = model(state_tensor)\n\n\n    action = env.action_space.sample()\n    next_state, reward, terminated, truncated, _ = env.step(action)\n    done = terminated or truncated\n\n\n    with torch.no_grad():\n      next_state_tensor = torch.tensor([next_state[0]], dtype=torch.float32).unsqueeze(0)\n      next_value = model(next_state_tensor)\n      target = reward + gamma * next_value\n\n\n    loss = loss_fn(value, target)\n    optimizer.zero_grad()\n    loss.backward()\n    optimizer.step()\n\n    episode_reward += reward\n    current_episode_losses.append(loss.item())\n    state = next_state\n\n  rewards_per_episode_nn.append(episode_reward)\n  if current_episode_losses:\n    losses_per_episode_nn.append(np.mean(current_episode_losses))\n  else:\n    losses_per_episode_nn.append(0)\n\n\nmodel_memory_bytes_nn = sum(p.numel() * p.element_size() for p in model.parameters())\nprint(f"Memory usage of neural network model: {model_memory_bytes_nn} bytes")\n\nimport matplotlib.pyplot as plt\n\n\nplt.figure(figsize=(12, 6))\nplt.plot(rewards_per_episode_tabular, label=\'Tabular Method\')\nplt.plot(rewards_per_episode_nn, label=\'Neural Network Method\')\nplt.xlabel(\'Episode\')\nplt.ylabel(\'Total Reward\')\nplt.title(\'Rewards Per Episode: Tabular vs. Neural Network\')\nplt.legend()\nplt.grid(True)\nplt.show()\n\n\nplt.figure(figsize=(12, 6))\nplt.plot(td_errors_per_episode_tabular, label=\'Tabular Method (Mean TD Error)\')\nplt.plot(losses_per_episode_nn, label=\'Neural Network Method (Mean MSE Loss)\')\nplt.xlabel(\'Episode\')\nplt.ylabel(\'Mean Error/Loss\')\nplt.title(\'Mean Error/Loss Per Episode: Tabular vs. 
Neural Network\')\nplt.legend()\nplt.grid(True)\nplt.show()\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport torch\nimport torch.nn as nn\nimport gymnasium as gym\n\n\nclass ValueNet(nn.Module):\n  def __init__(self):\n    super().__init__()\n    self.fc = nn.Sequential(\n    nn.Linear(1, 64),\n    nn.ReLU(),\n    nn.Linear(64, 1)\n    )\n\n  def forward(self, x):\n    return self.fc(x)\n\n\nlearning_rates = [0.0001, 0.001, 0.01]\n\ndef train_nn(learning_rate):\n  model = ValueNet()\n  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)\n  loss_fn = nn.MSELoss()\n  gamma = 0.99\n  env = gym.make(\'CartPole-v1\')\n\n  rewards_per_episode_nn = []\n  losses_per_episode_nn = []\n\n\n  for episode in range(500):\n    state, _ = env.reset()\n    done = False\n    episode_reward = 0\n    current_episode_losses = []\n\n    while not done:\n\n      state_tensor = torch.tensor([state[0]], dtype=torch.float32).unsqueeze(0)\n      value = model(state_tensor)\n      action = env.action_space.sample()\n      next_state, reward, terminated, truncated, _ = env.step(action)\n      done = terminated or truncated\n      with torch.no_grad():\n\n        next_state_tensor = torch.tensor([next_state[0]], dtype=torch.float32).unsqueeze(0)\n        next_value = model(next_state_tensor)\n        target = reward + gamma * next_value\n\n      loss = loss_fn(value, target)\n      optimizer.zero_grad()\n      loss.backward()\n      optimizer.step()\n      state = next_state\n\n      episode_reward += reward\n      current_episode_losses.append(loss.item())\n\n    rewards_per_episode_nn.append(episode_reward)\n    if current_episode_losses:\n      losses_per_episode_nn.append(np.mean(current_episode_losses))\n    else:\n      losses_per_episode_nn.append(0)\n\n\n  model_memory_bytes = sum(p.numel() * p.element_size() for p in model.parameters())\n  print(f"Learning Rate: {learning_rate}, Memory usage of neural network model: {model_memory_bytes} bytes")\n\n  return 
rewards_per_episode_nn, losses_per_episode_nn\n\n\nresults = {}\nfor lr in learning_rates:\n  print(f"\\nRunning experiment with learning rate: {lr}")\n  rewards, losses = train_nn(lr)\n  results[lr] = {\n      \'rewards\': rewards,\n      \'losses\': losses\n  }\n\n\nplt.figure(figsize=(12, 7))\nfor lr, data in results.items():\n  plt.plot(data[\'rewards\'], label=f\'LR = {lr}\')\nplt.xlabel(\'Episode\')\nplt.ylabel(\'Total Reward\')\nplt.title(\'Neural Network: Rewards Per Episode for Different Learning Rates\')\nplt.legend()\nplt.grid(True)\nplt.show()\n\n\nplt.figure(figsize=(12, 7))\nfor lr, data in results.items():\n  plt.plot(data[\'losses\'], label=f\'LR = {lr}\')\nplt.xlabel(\'Episode\')\nplt.ylabel(\'Mean Loss\')\nplt.title(\'Neural Network: Mean Loss Per Episode for Different Learning Rates\')\nplt.legend()\nplt.grid(True)\nplt.show()',
    2: 'import numpy\n\n# Grid dimensions\nrows, cols = 3, 4\n# Define states (excluding wall)\nstates = [(i, j) for i in range(rows) for j in range(cols)]\nwall = (1, 2)\ngoal = (0, 3)\ndanger = (2, 3)\nstates.remove(wall) # wall is not a valid state\n# Define actions\nactions = ["UP", "DOWN", "LEFT", "RIGHT"]\n# Rewards\ndef reward(state):\n  if state == goal:\n    return 1.0\n  elif state == danger:\n    return -1.0\n  else:\n    return -0.04\n\ndef next_state(state, action):\n  i, j = state\n  if action == "UP":\n    i = max(i - 1, 0)\n  elif action == "DOWN":\n    i = min(i + 1, rows - 1)\n  elif action == "LEFT":\n    j = max(j - 1, 0)\n  elif action == "RIGHT":\n    j = min(j + 1, cols - 1)\n  # If move hits wall → stay in same state\n  if (i, j) == wall:\n    return state\n  return (i, j)\n\ndef transition_probabilities(state, action):\n  if state in [goal, danger]:\n    return {state: 1.0} # Terminal states 100% probability\n  probs = {}\n  intended = next_state(state, action)\n  # Slips: define left and right turns\n  if action == "UP":\n    left, right = "LEFT", "RIGHT"\n  elif action == "DOWN":\n    left, right = "RIGHT", "LEFT"\n  elif action == "LEFT":\n    left, right = "DOWN", "UP"\n  else: # RIGHT\n    left, right = "UP", "DOWN"\n\n  slip_left = next_state(state, left)\n  slip_right = next_state(state, right)\n\n  probs[intended] = probs.get(intended, 0) + 0.8\n  probs[slip_left] = probs.get(slip_left, 0) + 0.1\n  probs[slip_right] = probs.get(slip_right, 0) + 0.1\n  return probs\n\ngamma = 0.9 # Discount factor\n# Example 1: From state (2,0), action = UP\nstate = (2, 0)\naction = "UP"\ntransitions = transition_probabilities(state, action)\nprint(f"From state {state}, action={action}:")\nfor next_s, prob in transitions.items():\n  print(f" -> {next_s} with P={prob:.2f}, Reward={reward(next_s)}")\n# Example 2: From state (0,2), action = RIGHT\nstate = (0, 2)\naction = "RIGHT"\ntransitions = transition_probabilities(state, action)\nprint(f"\\nFrom 
state {state}, action={action}:")\nfor next_s, prob in transitions.items():\n  print(f" -> {next_s} with P={prob:.2f}, Reward={reward(next_s)}")\n\nprint("States:", states)\nprint("Actions:", actions)\nprint("Rewards:", reward(goal), reward(danger))\n\ntemp_state = (2, 1)\ntemp_action = "UP"\nprint(transition_probabilities(temp_state, temp_action))\n\ntemp_state = (0, 2)\ntemp_action = "UP"\nprint(transition_probabilities(temp_state, temp_action))\n\ndef transition_probabilities_mod(state, action):\n  if state in [goal, danger]:\n    return {state: 1.0} # Terminal states 100% probability\n  probs = {}\n  intended = next_state(state, action)\n  # Slips: define left and right turns\n  if action == "UP":\n    left, right = "LEFT", "RIGHT"\n  elif action == "DOWN":\n    left, right = "RIGHT", "LEFT"\n  elif action == "LEFT":\n    left, right = "DOWN", "UP"\n  else: # RIGHT\n    left, right = "UP", "DOWN"\n\n  slip_left = next_state(state, left)\n  slip_right = next_state(state, right)\n\n  probs[intended] = probs.get(intended, 0) + 0.7\n  probs[slip_left] = probs.get(slip_left, 0) + 0.15\n  probs[slip_right] = probs.get(slip_right, 0) + 0.15\n  return probs\n\ngamma = 0.9 # Discount factor\n# Example 1: From state (2,0), action = UP\nstate = (2, 0)\naction = "UP"\ntransitions = transition_probabilities(state, action)\nprint(f"From state {state}, action={action}:")\nfor next_s, prob in transitions.items():\n  print(f" -> {next_s} with P={prob:.2f}, Reward={reward(next_s)}")\n# Example 2: From state (0,2), action = RIGHT\nstate = (0, 2)\naction = "RIGHT"\ntransitions = transition_probabilities(state, action)\nprint(f"\\nFrom state {state}, action={action}:")\nfor next_s, prob in transitions.items():\n  print(f" -> {next_s} with P={prob:.2f}, Reward={reward(next_s)}")',
    3: 'import numpy as np\nfrom matplotlib import pyplot as plt\n\nS = [\'c1\', \'c2\', \'c3\', \'pass\', \'rest\', \'tv\', \'sleep\']\nR = np.array([-2, -2, -2, +10, +1, -1, 0])\n\nP = np.array([\n    [0.0, 0.5, 0.0, 0.0, 0.0, 0.5, 0.0],\n    [0.0, 0.0, 0.8, 0.0, 0.0, 0.0, 0.2],\n    [0.0, 0.0, 0.0, 0.6, 0.4, 0.0, 0.0],\n    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],\n    [0.2, 0.4, 0.4, 0.0, 0.0, 0.0, 0.0],\n    [0.1, 0.0, 0.0, 0.0, 0.0, 0.9, 0.0],\n    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]\n])\n\ngamma = 0.5\n\nassert(np.all(np.sum(P, axis=1) == 1))\n\ndef sample_episode(P, s=0, log=True):\n    print_str = S[s] + \', \'\n    episode = [s]\n\n    while(S[episode[-1]] != \'sleep\'):\n        episode.append(np.random.choice(len(P), 1, p=P[episode[-1]])[0])\n        print_str += str(S[episode[-1]]) + \', \'\n    if log:\n        print(print_str)\n    return np.array(episode)\n\n# Generate sample episodes\nprint(\'first sample: \')\nepisode = sample_episode(P, s=0)\nprint(\'\\nsecond sample: \')\nepisode = sample_episode(P, s=0)\nprint(\'\\nthird sample: \')\nepisode = sample_episode(P, s=0)\n\nepisode = sample_episode(P, s=0)\nepisode_reward = R[episode]\nG_t = 0\n\nfor k in range(0, len(episode)):\n    G_t += gamma**k * episode_reward[k]\n    print("G_t = {:.4f}, gamma^k = {:.4f}".format(G_t, gamma**k))\n\nV = np.zeros(len(P))\nnum_episodes = 2000\n\nfor i in range(num_episodes):\n    for s in range(len(P)):\n        episode = sample_episode(P, s, log=False)\n        episode_reward = R[episode]\n        G_t = 0\n        for k in range(0, len(episode)):\n            G_t += gamma**k * episode_reward[k]\n        V[s] += G_t\n    if (i+1) % 100 == 0:\n        np.set_printoptions(precision=2)\n        print(V / (i + 1))\n\nV = V / num_episodes\nprint(V)\n\nI = np.identity(len(P))\nV = np.linalg.solve(I - gamma * P, R)\nprint(V)\n\ngamma = 0.9\nV = np.zeros(len(P))\nnum_episodes = 2000\n\nfor i in range(num_episodes):\n    for s in range(len(P)):\n        episode = 
sample_episode(P, s, log=False)\n        episode_reward = R[episode]\n        G_t = 0\n\n        for k in range(0, len(episode)):\n            G_t += gamma**k * episode_reward[k]\n\n        V[s] += G_t\n\n    if (i+1) % 100 == 0:\n        np.set_printoptions(precision=2)\n        print(V / (i + 1))\n\nV = V / num_episodes\nprint(V)\n\nepisode = sample_episode(P, s=0)\nepisode_reward = R[episode]\nG_t = 0\n\nfor k in range(0, len(episode)):\n    G_t += gamma**k * episode_reward[k]\n    print("G_t = {:.4f}, gamma^k = {:.4f}".format(G_t, gamma**k))\n\nI = np.identity(len(P))\nV = np.linalg.solve(I - gamma * P, R)\nprint(V)\n\nR = np.array([-2, -2, -2, +10, +1, +2, 0])\n\nepisode = sample_episode(P, s=0)\nepisode_reward = R[episode]\nG_t = 0\n\nfor k in range(0, len(episode)):\n    G_t += gamma**k * episode_reward[k]\n    print("G_t = {:.4f}, gamma^k = {:.4f}".format(G_t, gamma**k))\n\nV = np.zeros(len(P))\nnum_episodes = 2000\n\nfor i in range(num_episodes):\n    for s in range(len(P)):\n        episode = sample_episode(P, s, log=False)\n        episode_reward = R[episode]\n        G_t = 0\n        for k in range(0, len(episode)):\n            G_t += gamma**k * episode_reward[k]\n        V[s] += G_t\n    if (i+1) % 100 == 0:\n        np.set_printoptions(precision=2)\n        print(V / (i + 1))\n\nV = V / num_episodes\nprint(V)\n\nI = np.identity(len(P))\nV = np.linalg.solve(I - gamma * P, R)\nprint(V)\n\n# States and rewards\nS = [\'c1\', \'c2\', \'c3\', \'pass\', \'rest\', \'tv\', \'sleep\', \'go for a walk\']\nR = np.array([-2, -2, -2, +10, +1, -1, 0, +1])\n\n# Transition probability matrix\nP = np.array([\n    [0.0, 0.4, 0.0, 0.0, 0.0, 0.4, 0.0, 0.2],\n    [0.0, 0.0, 0.6, 0.0, 0.0, 0.0, 0.2, 0.2],\n    [0.0, 0.0, 0.0, 0.6, 0.4, 0.0, 0.0, 0.0],\n    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0],\n    [0.2, 0.4, 0.4, 0.0, 0.0, 0.0, 0.0, 0.0],\n    [0.1, 0.0, 0.0, 0.0, 0.0, 0.9, 0.0, 0.0],\n    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0],\n    [0.0, 0.3, 0.0, 0.0, 0.5, 0.2, 
0.0, 0.0]\n])\n\nepisode = sample_episode(P, s=0)\nepisode_reward = R[episode]\nG_t = 0\n\nfor k in range(0, len(episode)):\n    G_t += gamma**k * episode_reward[k]\n    print("G_t = {:.4f}, gamma^k = {:.4f}".format(G_t, gamma**k))\n\nV = np.zeros(len(P))\nnum_episodes = 2000\n\nfor i in range(num_episodes):\n    for s in range(len(P)):\n        episode = sample_episode(P, s, log=False)\n        episode_reward = R[episode]\n        G_t = 0\n        for k in range(0, len(episode)):\n            G_t += gamma**k * episode_reward[k]\n        V[s] += G_t\n    if (i+1) % 100 == 0:\n        np.set_printoptions(precision=2)\n        print(V / (i + 1))\n\nV = V / num_episodes\nprint(V)\n\n# After running Monte Carlo\nplt.figure(figsize=(10, 6))  # Set the figure size\nplt.bar(S, V)\nplt.xlabel("States")\nplt.ylabel("Estimated Value V(s)")\nplt.title("Monte Carlo Value Estimates (Final)")\nplt.show()',
    4: '!pip install setuptools==65.5.0 "wheel<0.40.0"\n!apt update\n!pip install pygame==2.5.2\n\nimport gymnasium as gym\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nenv = gym.make(\'FrozenLake-v1\', is_slippery=True, render_mode=\'ansi\')\nenv.reset()\n\ngamma = 0.99\ntheta = 1e-8\nV = np.zeros(env.observation_space.n)\n\nP = env.unwrapped.P\n\nprint(f\'The environments observation space: {env.observation_space}\')\nprint(f\'The environments actions space: {env.action_space}\')\nprint(f\'The environments reward range: {env.unwrapped.reward_range}\')\n\nprint(env.render())\n\nwhile True:\n    delta = 0\n    for s in range(env.observation_space.n):\n        v = V[s]\n        q_sa = []\n        for a in range(env.action_space.n):\n            q = 0\n            for prob, next_state, reward, done in P[s][a]:\n                q += prob * (reward + gamma * V[next_state])\n            q_sa.append(q)\n        V[s] = max(q_sa)\n        delta = max(delta, abs(v - V[s]))\n    if delta < theta:\n        break\n\npolicy = np.zeros((env.observation_space.n, env.action_space.n))\nfor s in range(env.observation_space.n):\n    q_sa = np.zeros(env.action_space.n)\n    for a in range(env.action_space.n):\n        for prob, next_state, reward, done in P[s][a]:\n            q_sa[a] += prob * (reward + gamma * V[next_state])\n    best_action = np.argmax(q_sa)\n    policy[s][best_action] = 1.0\n\ndef plot(V, policy, col_ramp=1, dpi=175, draw_vals=False):\n    """\n    Visualizes the FrozenLake environment grid, showing:\n    - State values (V)\n    - Policy directions (arrows)\n    - Special tiles: S (Start), F (Frozen), H (Hole), G (Goal)\n    """\n    plt.rcParams[\'figure.dpi\'] = dpi\n    plt.rcParams.update({\'axes.edgecolor\': (0.32, 0.36, 0.38)})\n    plt.rcParams.update({\'font.size\': 6 if env.unwrapped.nrow == 8 else 8})\n    plt.figure(figsize=(3, 3))\n\n    # Use environment layout (map)\n    desc = env.unwrapped.desc\n    nrow, ncol = desc.shape\n    V_sq = 
V.reshape((nrow, ncol))\n\n    # Set up the plot\n    plt.imshow(V_sq, cmap=\'cool\' if col_ramp else \'gray\', alpha=0.7)\n    ax = plt.gca()\n\n    # Define direction arrows\n    arrow_dict = {\n        0: \'←\',  # LEFT\n        1: \'↓\',  # DOWN\n        2: \'→\',  # RIGHT\n        3: \'↑\'   # UP\n    }\n\n    # Draw grid lines\n    for x in range(ncol + 1):\n        ax.axvline(x - 0.5, lw=0.5, color=\'black\')\n    for y in range(nrow + 1):\n        ax.axhline(y - 0.5, lw=0.5, color=\'black\')\n\n    # Fill each grid cell with value, symbol, and arrow\n    for r in range(nrow):\n        for c in range(ncol):\n            s = r * ncol + c\n            val = V[s]\n\n            # Tile text (S, F, H, G)\n            tile = desc[r, c].decode(\'utf-8\')\n            if tile == \'H\':\n                color = \'red\'\n            elif tile == \'G\':\n                color = \'green\'\n            elif tile == \'S\':\n                color = \'blue\'\n            else:\n                color = \'black\'\n\n            # Draw tile letter\n            plt.text(c, r, tile, ha=\'center\', va=\'center\', color=color, fontsize=10, fontweight=\'bold\')\n\n            # Draw state value\n            if draw_vals and tile not in [\'H\']:\n                plt.text(c, r + 0.3, f"{val:.2f}", ha=\'center\', va=\'center\', color=\'black\', fontsize=6)\n\n            # Draw arrow for best action\n            if policy is not None:\n                best_action = np.argmax(policy[s])\n                plt.text(c, r - 0.25, arrow_dict[best_action], ha=\'center\', va=\'center\', color=\'purple\',\n                         fontsize=12)\n\n    plt.title("FrozenLake: Policy and State Values")\n    plt.axis(\'off\')\n    plt.show()\n\nplot(V, policy, draw_vals=True)\n\nnS = env.observation_space.n\nnA = env.action_space.n\npolicy = np.ones([nS, nA]) / nA\nprint("Policy shape:", policy.shape)\n\ndef policy_evaluation(env, policy, discount_factor=1.0, theta=1e-9, draw=False):\n    nS = 
env.observation_space.n\n    nA = env.action_space.n\n    V = np.zeros(nS)\n    P = env.unwrapped.P\n\n    while True:\n        delta = 0\n        for s in range(nS):\n            v = 0\n            for a, action_prob in enumerate(policy[s]):\n                for prob, next_state, reward, done in P[s][a]:\n                    v += action_prob * prob * (reward + discount_factor * V[next_state])\n            delta = max(delta, abs(V[s] - v))\n            V[s] = v\n        if delta < theta:\n            break\n\n    if draw:\n        print("Value function after policy evaluation:")\n        print(V.reshape(int(np.sqrt(nS)), int(np.sqrt(nS))))\n    return V\n\nV = policy_evaluation(env, policy, draw=True)\nplot(V, policy, draw_vals=True)',
    5: 'import gymnasium as gym\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n\n# Step 2: Create the FrozenLake Environment\n\nenv = gym.make(\'FrozenLake-v1\', is_slippery=True, render_mode=\'ansi\')\n\n# Get essential environment properties\nnS = env.observation_space.n # Number of states (16 for 4x4)\nnA = env.action_space.n # Number of actions (4: Left, Down, Right, Up)\n\n# Helper function for Policy Evaluation\ndef policy_evaluation(env, policy, gamma=1.0, theta=1e-8):\n    V = np.zeros(nS)\n    while True:\n        delta = 0\n        for s in range(nS):\n            v_old = V[s]\n            # V(s) = sum_a [ pi(a|s) * Q(s, a) ]\n            v_new = 0\n            for a in range(nA):\n                # We assume a deterministic policy for now: policy[s, a] = 1 for the chosen action\n                action_prob = policy[s][a]\n                if action_prob > 0: # Only check the action chosen by the policy\n                    # Q(s, a) = sum_s\', r [ P(s\'|s, a) * (r + gamma * V(s\')) ]\n                    for prob, next_state, reward, done in env.unwrapped.P[s][a]:\n                        v_new += action_prob * prob * (reward + gamma * V[next_state])\n\n            V[s] = v_new\n            delta = max(delta, np.abs(v_old - V[s]))\n\n        if delta < theta:\n            break\n    return V\n\n# Step 3: Compute Q-Values from Value Function\ndef q_from_v(env, V, s, gamma=1.0):\n    q = np.zeros(nA)\n    for a in range(nA):\n        for prob, next_state, reward, done in env.unwrapped.P[s][a]:\n            q[a] += prob * (reward + gamma * V[next_state])\n    return q\n\n# Step 4: Implement Policy Improvement (Task 3\'s core function)\ndef policy_improvement(env, V, discount_factor=1.0):\n    # nS and nA are already defined globally\n\n    # Initialize a policy where all actions are equally likely (random)\n    # This is a placeholder for the policy we are about to improve.\n    # The output policy will be deterministic (one action per state)\n    
policy = np.zeros([nS, nA])\n\n    for s in range(nS):\n        # Calculate Q-values for all actions in the current state \'s\'\n        Q = q_from_v(env, V, s, discount_factor)\n\n        # Find the action that maximizes the Q-value (the greedy action)\n        best_action = np.argmax(Q)\n\n        # Update the policy: set the probability of the best action to 1 and others to 0\n        policy[s, best_action] = 1.0 # This creates a deterministic greedy policy\n\n    return policy\n\n\n# Step 5: Visualization Function\ndef plot(V, policy, title_suffix="Initial Policy", draw_vals=True):\n    nrow = env.unwrapped.nrow\n    ncol = env.unwrapped.ncol\n    arrow_symbols = {0: \'←\', 1: \'↓\', 2: \'→\', 3: \'↑\'}\n\n    # The policy is an array of probabilities, so we get the best action index\n    best_actions = np.argmax(policy, axis=1)\n\n    grid = np.reshape(V, (nrow, ncol))\n\n    plt.figure(figsize=(6, 6))\n    plt.imshow(grid, cmap=\'cool\', interpolation=\'none\')\n\n    for s in range(nrow * ncol):\n        row, col = divmod(s, ncol)\n\n        if draw_vals:\n            # Display value\n            plt.text(col, row, f\'{V[s]:.2f}\', ha=\'center\', va=\'center\', color=\'black\', fontsize=10)\n        else:\n            # Display best action arrow\n            plt.text(col, row, arrow_symbols[best_actions[s]], ha=\'center\', va=\'center\', color=\'white\', fontsize=16)\n\n    plt.title(("Value Function (" + title_suffix + ")") if draw_vals else ("Policy (" + title_suffix + ")"))\n    plt.axis(\'off\')\n    plt.colorbar(label=\'State Value\')\n    plt.show()\n\n\n# --- Task 3: Coding Implementation ---\nprint("--- Task 3: Policy Improvement on Random Values ---")\n\n# Initialize a random value function V\nV_random = np.random.rand(nS)\n\n# Create a truly random initial policy for comparison\ninitial_policy = np.full([nS, nA], 1/nA)\n\n# Display the initial state values and policy (based on random V and uniform policy)\nplot(V_random, initial_policy, "Random V 
(Before Improvement)", draw_vals=True)\nplot(V_random, initial_policy, "Uniform Policy (Before Improvement)", draw_vals=False)\n\n# Compute the new greedy policy based on V_random\nnew_policy_greedy = policy_improvement(env, V_random)\n\n# Evaluate the new policy to get its true value function\nV_improved = policy_evaluation(env, new_policy_greedy)\n\n# Display the *improved* value function and the new greedy policy\nplot(V_improved, new_policy_greedy, "Improved Policy V", draw_vals=True)\nplot(V_improved, new_policy_greedy, "Greedy Policy (After Improvement)", draw_vals=False)\n\n\n\n# --- Task 4: Policy Iteration Extension ---\nprint("\\n--- Task 4: Policy Iteration to Find Optimal Policy ---")\n\ndef policy_iteration(env, gamma=1.0, theta=1e-8):\n    # Start with a random uniform policy\n    policy = np.full([nS, nA], 1/nA)\n\n    iteration_count = 0\n    while True:\n        # 1. Policy Evaluation: Find the value function for the current policy\n        V = policy_evaluation(env, policy, gamma, theta)\n\n        # 2. Policy Improvement: Find the new greedy policy based on V\n        new_policy = policy_improvement(env, V, gamma)\n\n        # Check for stability: Is the new policy the same as the old one?\n        policy_stable = np.array_equal(policy, new_policy)\n\n        # Update the policy\n        policy = new_policy\n\n        iteration_count += 1\n\n        if policy_stable:\n            break\n\n    return V, policy, iteration_count\n\nV_optimal, policy_optimal, iterations = policy_iteration(env, gamma=0.9) # Using gamma=0.9 for standard RL\n\nprint(f"Policy Iteration converged in {iterations} iterations.")\n\n# Plot the optimal results\nplot(V_optimal, policy_optimal, f"Optimal V (Converged in {iterations} iters)", draw_vals=True)\nplot(V_optimal, policy_optimal, f"Optimal Policy (Converged in {iterations} iters)", draw_vals=False)',
    6: 'import gymnasium as gym\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nenv = gym.make(\'Taxi-v3\', render_mode=\'ansi\')\n\ndef value_iteration(env, discount_factor=0.99, theta=1e-6, max_iterations=10000):\n    nS = env.observation_space.n\n    nA = env.action_space.n\n    P = env.unwrapped.P\n    V = np.zeros(nS)\n\n    for i in range(max_iterations):\n        delta = 0\n        for s in range(nS):\n            q_sa = np.zeros(nA)\n            for a in range(nA):\n                for prob, next_state, reward, done in P[s][a]:\n                    q_sa[a] += prob * (reward + discount_factor * V[next_state])\n            new_v = np.max(q_sa)\n            delta = max(delta, np.abs(new_v - V[s]))\n            V[s] = new_v\n\n        if delta < theta:\n            break\n\n    policy = extract_policy_from_v(env, V, discount_factor)\n\n    return V, policy, i + 1\n\ndef extract_policy_from_v(env, V, discount_factor=0.99):\n    nS = env.observation_space.n\n    nA = env.action_space.n\n    P = env.unwrapped.P\n    policy = np.zeros((nS, nA))\n\n    for s in range(nS):\n        q_sa = np.zeros(nA)\n        for a in range(nA):\n            for prob, next_state, reward, done in P[s][a]:\n                q_sa[a] += prob * (reward + discount_factor * V[next_state])\n        best_a = np.argmax(q_sa)\n        policy[s] = np.eye(nA)[best_a]\n\n    return policy\n\ndef plot_values(env, V):\n  plt.figure(figsize=(10, 4))\n  plt.plot(V)\n  plt.title("Value Function for Taxi-v3")\n  plt.xlabel("State (0–499)")\n  plt.ylabel("Value")\n  plt.grid(True)\n  plt.show()\n\ndef plot_policy(env, policy):\n  nS = env.observation_space.n\n  actions = np.argmax(policy, axis=1)\n  plt.figure(figsize=(10, 4))\n  plt.bar(np.arange(nS), actions)\n  plt.title("Greedy Policy (Best Action per State)")\n  plt.xlabel("State")\n  plt.ylabel("Action (0=South, 1=North, 2=East, 3=West, 4=Pickup, 5=Dropoff)")\n  plt.show()\n\nenv = gym.make(\'Taxi-v3\')\ngamma = 0.99\nV_opt, policy_opt, 
iterations = value_iteration(env, discount_factor=gamma)\nprint(f"Converged in {iterations} iterations.")\nplot_values(env, V_opt)\nplot_policy(env, policy_opt)\n\ndef evaluate_policy(env, policy, n_episodes=1000):\n    success = 0\n    for _ in range(n_episodes):\n        obs, _ = env.reset()\n        done = False\n        while not done:\n            action = np.argmax(policy[obs])\n            obs, reward, terminated, truncated, _ = env.step(action)\n            done = terminated or truncated\n            if reward > 0:\n                success += 1\n    return success / n_episodes\n\nrate = evaluate_policy(env, policy_opt)\nprint(f"Success Rate: {rate * 100:.2f}%")\n\ndef evaluate_policy(env, policy, n_episodes=1000):\n    wins = 0\n    total_steps = 0\n    win_steps = 0\n\n    for _ in range(n_episodes):\n        s, _ = env.reset()\n        steps = 0\n        finished = False\n        while not finished:\n            a = np.argmax(policy[s])\n            s, r, term, trunc, _ = env.step(a)\n            steps += 1\n            finished = term or trunc\n            if r == 20:\n                wins += 1\n                win_steps += steps\n                break\n        total_steps += steps\n\n    rate = wins / n_episodes\n    avg = win_steps / wins if wins else 0\n    return rate, avg\n\ndiscounts = [0.6, 0.9, 0.99]\nsummary = []\n\nfor g in discounts:\n    taxi = gym.make(\'Taxi-v3\')\n    V_star, pi_star, it = value_iteration(taxi, discount_factor=g)\n    win_rate, steps = evaluate_policy(taxi, pi_star)\n    summary.append((g, it, win_rate, steps))\n    print(f"γ={g:.2f} → {it} iters | {win_rate:.1%} success | {steps:.1f} avg steps")\n\ng_vals = [0.6, 0.9, 0.99]\nfor g in g_vals:\n    e = gym.make(\'Taxi-v3\')\n    V, pi, its = value_iteration(e, discount_factor=g)\n    sr, ast = evaluate_policy(e, pi)\n    print(f"γ={g} | iters={its} | success={sr:.1%} | steps={ast:.1f}")\n\ndef value_iteration_with_delta(env, gamma=0.99, eps=1e-6):\n    nS = 
env.observation_space.n\n    nA = env.action_space.n\n    P = env.unwrapped.P\n    V = np.zeros(nS)\n    deltas = []\n\n    for _ in range(10000):\n        delta = 0\n        for s in range(nS):\n            q = np.zeros(nA)\n            for a in range(nA):\n                for pr, ns, rw, _ in P[s][a]:\n                    q[a] += pr * (rw + gamma * V[ns])\n            new_v = np.max(q)\n            delta = max(delta, abs(new_v - V[s]))\n            V[s] = new_v\n        deltas.append(delta)\n        if delta < eps:\n            break\n\n    pi = extract_policy_from_v(env, V, gamma)\n    return V, pi, len(deltas), deltas\n\n# Plot convergence\nenv = gym.make(\'Taxi-v3\')\n_, _, _, dlist = value_iteration_with_delta(env, gamma=0.99)\nplt.figure(figsize=(8,4))\nplt.plot(dlist, marker=\'.\')\nplt.yscale(\'log\')\nplt.title(\'Maximum ΔV per iteration\')\nplt.xlabel(\'Iteration\')\nplt.ylabel(\'ΔV (log)\')\nplt.grid(True, alpha=0.4)\nplt.show()\n\nclass ToughTaxi(gym.Wrapper):\n    def step(self, action):\n        obs, rew, done, trunc, info = self.env.step(action)\n        if rew == -1:   rew = -2      # harsher time penalty\n        if rew == -10:  rew = -30     # harsher illegal penalty\n        return obs, rew, done, trunc, info\n\nmod_env = ToughTaxi(gym.make(\'Taxi-v3\'))\nV_m, pi_m, it_m = value_iteration(mod_env, discount_factor=0.99)\nsr_m, st_m = evaluate_policy(mod_env, pi_m)\nprint(f"Modified → {it_m} iters | {sr_m:.1%} success | {st_m:.1f} steps")\n\nprint("\\n=== COMPARISON ===")\n\n# Taxi\nt_env = gym.make(\'Taxi-v3\')\nVt, pt, it_t = value_iteration(t_env, discount_factor=0.99)\nrt, st = evaluate_policy(t_env, pt)\nprint(f"Taxi-v3  : {t_env.observation_space.n} states | {it_t} iters | {rt:.1%} success")\n\n# FrozenLake (no slip)\nf_env = gym.make(\'FrozenLake-v1\', is_slippery=False)\nVf, pf, it_f = value_iteration(f_env, discount_factor=0.99)\nrf, sf = evaluate_policy(f_env, pf)\nprint(f"FrozenLake: {f_env.observation_space.n} states | {it_f} iters | 
{rf:.1%} success")',
    7: 'import numpy as np\nimport gymnasium as gym\nimport matplotlib.pyplot as plt\n\nenv = gym.make(\'FrozenLake-v1\', is_slippery = \'False\')\n\ndef monte_carlo(env, policy, episodes = 10000, df = 0.99):\n  V = np.zeros(env.observation_space.n)\n  returns = {s:[] for s in range(env.observation_space.n)}\n  V_hist = []\n\n  for ep in range(episodes):\n    episode = []\n    state, _ = env.reset()\n    done = False\n\n    while not done:\n      action = policy[state]\n      next_state, reward, terminated, truncated, _ = env.step(action)\n      done = terminated or truncated\n      episode.append((state, reward))\n      state = next_state\n\n    G = 0\n    visited_states = set()\n    for s, r in reversed(episode):\n      G = df * G + r\n      if s not in visited_states:\n        returns[s].append(G)\n        V[s] = np.mean(returns[s])\n        visited_states.add(s)\n\n    V_hist.append(V.copy())\n  return V, V_hist\n\ndef temp_diff(env, policy, episodes = 10000, lr = 0.05, df = 0.99):\n  V = np.zeros(env.observation_space.n)\n  V_hist = []\n\n  for ep in range(episodes):\n    episode = []\n    state, _ = env.reset()\n    done = False\n\n    while not done:\n      action  = policy[state]\n      next_state, reward, terminated, truncated , _ = env.step(action)\n      done = terminated or truncated\n      V[state] = V[state] + lr * (reward + df * V[next_state] - V[state])\n      state = next_state\n    V_hist.append(V.copy())\n  return V, V_hist\n\nnp.random.seed(42)\n\npolicy = {s: np.random.choice([0, 1, 2, 3]) for s in range(env.observation_space.n)}\n\nV_MC, V_MC_hist = monte_carlo(env, policy)\nV_TD, V_TD_hist = temp_diff(env, policy)\n\nprint("Monte Carlo Value: ")\nprint(V_MC)\nprint("Temporal Difference Value: ")\nprint(V_TD)\n\n\ndef convergence(V_track, title):\n  plt.figure(figsize = (8, 5))\n  for s in range(env.observation_space.n):\n    values = [v[s] for v in V_track]\n    plt.plot(values, label = f"State {s}")\n\n  plt.title(title)\n  
plt.xlabel("Episodes")\n  plt.ylabel("Value")\n  plt.legend()\n  plt.grid(True)\n  plt.show()\n\n\n\nconvergence(V_MC_hist, "Monte Carlo Value Convergence")\nconvergence(V_TD_hist, "Temporal Difference Value Convergence")',
    8: 'import gymnasium as gym\nimport numpy as np\nimport matplotlib.pyplot as plt\n\ndef monte_carlo_prediction(env, num_episodes=5000, gamma=0.99):\n    V = np.zeros(env.observation_space.n)\n    returns_sum = {s: 0 for s in range(env.observation_space.n)}\n    returns_count = {s: 0 for s in range(env.observation_space.n)}\n\n    V_over_time = []\n\n    for episode in range(num_episodes):\n        episode_memory = []\n        state, _ = env.reset()\n        done = False\n\n        while not done:\n            action = np.random.choice(env.action_space.n)\n            next_state, reward, terminated, truncated, _ = env.step(action)\n            done = terminated or truncated\n\n            episode_memory.append((state, reward))\n            state = next_state\n\n        G = 0\n        visited_states = set()\n        for t in reversed(range(len(episode_memory))):\n            state_t, reward_t = episode_memory[t]\n            G = reward_t + gamma * G\n\n            if state_t not in visited_states:\n                visited_states.add(state_t)\n                returns_sum[state_t] += G\n                returns_count[state_t] += 1\n                V[state_t] = returns_sum[state_t] / returns_count[state_t]\n\n        V_over_time.append(V.copy())\n\n    return np.array(V_over_time)\n\n\ndef td0_prediction(env, num_episodes=5000, gamma=0.99, alpha=0.1):\n    V = np.zeros(env.observation_space.n)\n    V_over_time = []\n\n    for episode in range(num_episodes):\n        state, _ = env.reset()\n        done = False\n\n        while not done:\n            action = np.random.choice(env.action_space.n)\n            next_state, reward, terminated, truncated, _ = env.step(action)\n            done = terminated or truncated\n\n            V[state] += alpha * (reward + gamma * V[next_state] - V[state])\n            state = next_state\n\n        V_over_time.append(V.copy())\n\n    return np.array(V_over_time)\n\n\ndef run_experiment(env_name, is_slippery):\n    env = 
gym.make(env_name, is_slippery=is_slippery)\n    print("\\nEnvironment:", env_name, "| slippery:", is_slippery)\n\n    mc_values = monte_carlo_prediction(env)\n    td_values = td0_prediction(env)\n\n    return mc_values, td_values, env\n\n\ndef plot_results(mc_values, td_values, env, title_suffix):\n    plt.figure(figsize=(14, 5))\n\n    plt.subplot(1, 2, 1)\n    plt.plot(mc_values[:, 0], label="State 0")\n    plt.plot(mc_values[:, -1], label=f"State {env.observation_space.n - 1}")\n    plt.title("Monte Carlo Value Evolution " + title_suffix)\n    plt.xlabel("Episodes")\n    plt.ylabel("Value")\n    plt.legend()\n\n    plt.subplot(1, 2, 2)\n    plt.plot(td_values[:, 0], label="State 0")\n    plt.plot(td_values[:, -1], label=f"State {env.observation_space.n - 1}")\n    plt.title("TD(0) Value Evolution " + title_suffix)\n    plt.xlabel("Episodes")\n    plt.ylabel("Value")\n    plt.legend()\n\n    plt.show()\n\n\n\nimport gymnasium as gym\nimport numpy as np\nimport matplotlib.pyplot as plt\n# Create FrozenLake environment\nenv = gym.make("FrozenLake-v1", is_slippery=False)\n# Initialize Q-table\nQ = np.zeros((env.observation_space.n, env.action_space.n))\n# Hyperparameters\nalpha = 0.8\ngamma = 0.95\nepsilon = 0.1\nepisodes = 2000\nreward_list = []\ndef epsilon_greedy(state):\n  if np.random.random() < epsilon:\n   return env.action_space.sample()\n  else:\n   return np.argmax(Q[state])\nfor episode in range(episodes):\n  state, info = env.reset()\n  total_reward = 0\n  done = False\n  while not done:\n    action = epsilon_greedy(state)\n    next_state, reward, terminated, truncated, info = env.step(action)\n    done = terminated or truncated\n    # Q-learning update\n    best_next_action = np.max(Q[next_state])\n    Q[state, action] += alpha * (reward + gamma * best_next_action - Q[state, action])\n    state = next_state\n    total_reward += reward\n  reward_list.append(total_reward)\nprint("Training finished using 
Q-Learning!")\nplt.plot(reward_list)\nplt.title("Q-Learning Rewards")\nplt.xlabel("Episode")\nplt.ylabel("Reward")\nplt.show()\n\nimport numpy as np\nif not hasattr(np, "bool8"):\n    np.bool8 = np.bool_\n\n\nimport gym\nimport numpy as np\nimport matplotlib.pyplot as plt\nenv = gym.make("FrozenLake-v1", is_slippery=False)\nQ = np.zeros((env.observation_space.n, env.action_space.n))\nalpha = 0.8\ngamma = 0.95\nepsilon = 0.1\nepisodes = 2000\nreward_list = []\ndef epsilon_greedy(state):\n  if np.random.random() < epsilon:\n   return env.action_space.sample()\n  else:\n   return np.argmax(Q[state])\nfor episode in range(episodes):\n  state = env.reset()\n\n  action = epsilon_greedy(state)\n  total_reward = 0\n  done = False\n  while not done:\n    next_state, reward, done, info = env.step(action)\n    next_action = epsilon_greedy(next_state)\n    Q[state, action] += alpha * (reward + gamma * Q[next_state, next_action] - Q[state,\n    action])\n    state = next_state\n    action = next_action\n    total_reward += reward\n    reward_list.append(total_reward)\nprint("Training finished using SARSA!")\nplt.plot(reward_list)\nplt.title("SARSA Rewards")\nplt.xlabel("Episode")\nplt.ylabel("Reward")\nplt.show()\n\nimport gym\nimport numpy as np\nimport matplotlib.pyplot as plt\nenv = gym.make("FrozenLake-v1", is_slippery=False)\nQ = np.zeros((env.observation_space.n, env.action_space.n))\nalpha = 0.1\ngamma = 0.5\nepsilon = 0.01\nepisodes = 2000\nreward_list = []\ndef epsilon_greedy(state):\n  if np.random.random() < epsilon:\n   return env.action_space.sample()\n  else:\n   return np.argmax(Q[state])\nfor episode in range(episodes):\n  state = env.reset()\n\n  action = epsilon_greedy(state)\n  total_reward = 0\n  done = False\n  while not done:\n    next_state, reward, done, info = env.step(action)\n    next_action = epsilon_greedy(next_state)\n    Q[state, action] += alpha * (reward + gamma * Q[next_state, next_action] - Q[state,\n    action])\n    state = next_state\n    
action = next_action\n    total_reward += reward\n    reward_list.append(total_reward)\nprint("Training finished using SARSA!")\nplt.plot(reward_list)\nplt.title("SARSA Rewards")\nplt.xlabel("Episode")\nplt.ylabel("Reward")\nplt.show()\n\nimport gym\nimport numpy as np\nimport matplotlib.pyplot as plt\nenv = gym.make("FrozenLake-v1", is_slippery=False)\nQ = np.zeros((env.observation_space.n, env.action_space.n))\nalpha = 0.9\ngamma = 0.99\nepsilon = 0.3\nepisodes = 2000\nreward_list = []\ndef epsilon_greedy(state):\n  if np.random.random() < epsilon:\n   return env.action_space.sample()\n  else:\n   return np.argmax(Q[state])\nfor episode in range(episodes):\n  state = env.reset()\n\n  action = epsilon_greedy(state)\n  total_reward = 0\n  done = False\n  while not done:\n    next_state, reward, done, info = env.step(action)\n    next_action = epsilon_greedy(next_state)\n    Q[state, action] += alpha * (reward + gamma * Q[next_state, next_action] - Q[state,\n    action])\n    state = next_state\n    action = next_action\n    total_reward += reward\n    reward_list.append(total_reward)\nprint("Training finished using SARSA!")\nplt.plot(reward_list)\nplt.title("SARSA Rewards")\nplt.xlabel("Episode")\nplt.ylabel("Reward")\nplt.show()',
    9: 'import gymnasium as gym\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nenv = gym.make("Blackjack-v1", sab=True)\n\nQ_weighted = {}\nC_weighted = {}\nQ_ordinary = {}\nreturns_sum = {}\nreturns_count = {}\n\ngamma = 1.0\nnum_episodes = 500_000\n\ndef init_state(state):\n    if state not in Q_weighted:\n        Q_weighted[state] = np.zeros(2)\n        C_weighted[state] = np.zeros(2)\n        Q_ordinary[state] = np.zeros(2)\n        returns_sum[state] = np.zeros(2)\n        returns_count[state] = np.zeros(2)\n\ndef behavior_policy():\n    return np.random.randint(0, 2)\n\ndef target_policy(state):\n    return int(np.argmax(Q_weighted[state]))\n\nfor episode_idx in range(num_episodes):\n    episode = []\n    state, _ = env.reset()\n    done = False\n\n    while not done:\n        action = behavior_policy()\n        next_state, reward, terminated, truncated, _ = env.step(action)\n        done = terminated or truncated\n        episode.append((state, action, reward))\n        state = next_state\n\n    G = 0.0\n    W = 1.0\n\n    for (s, a, r) in reversed(episode):\n        init_state(s)\n        G = gamma * G + r\n\n        if a != target_policy(s):\n            break\n        W *= 2.0\n\n        returns_sum[s][a] += W * G\n        returns_count[s][a] += 1\n        Q_ordinary[s][a] = returns_sum[s][a] / returns_count[s][a] if returns_count[s][a] > 0 else 0\n\n        C_weighted[s][a] += W\n        if C_weighted[s][a] > 0:\n            Q_weighted[s][a] += (W / C_weighted[s][a]) * (G - Q_weighted[s][a])\n\nstates_to_check = [(21, 10, True), (20, 5, False), (18, 10, False), (15, 10, True)]\n\nprint("Variance Comparison (Ordinary vs Weighted IS):")\nprint("-" * 65)\nprint(f"{\'State\':<22} {\'Action\':<8} {\'Ordinary Q\':<12} {\'Weighted Q\':<12} {\'Ordinary Updates\':<15}")\nprint("-" * 65)\nfor state in states_to_check:\n    if state in Q_ordinary:\n        for a in [0, 1]:\n            ord_q = Q_ordinary[state][a]\n            wtd_q = 
Q_weighted[state][a]\n            count = returns_count[state][a]\n            print(f"{str(state):<22} {a:<8} {ord_q:8.3f}     {wtd_q:8.3f}     {int(count):<15}")\n\nenv_fl = gym.make("FrozenLake-v1", is_slippery=False)\nn_states = env_fl.observation_space.n\nn_actions = env_fl.action_space.n\n\ndef run_qlearning(epsilon, episodes=10_000, alpha=0.1, gamma=0.99):\n    Q = np.zeros((n_states, n_actions))\n    episode_rewards = np.zeros(episodes)\n\n    for ep in range(episodes):\n        state, _ = env_fl.reset()\n        done = False\n        total_reward = 0\n\n        while not done:\n            if np.random.rand() < epsilon:\n                action = env_fl.action_space.sample()\n            else:\n                action = np.argmax(Q[state])\n\n            next_state, reward, terminated, truncated, _ = env_fl.step(action)\n            done = terminated or truncated\n\n            if not terminated and not truncated:\n                reward = -0.01\n            elif terminated and reward == 0:\n                reward = -1.0\n\n            best_next = np.max(Q[next_state])\n            Q[state, action] += alpha * (reward + gamma * best_next - Q[state, action])\n\n            state = next_state\n            total_reward += reward\n\n        episode_rewards[ep] = total_reward\n\n    return Q, episode_rewards\n\nprint("Running Task 2: Q-Learning with ε = 0.1, 0.3, 0.8...")\nepsilons = [0.1, 0.3, 0.8]\nall_rewards = {}\n\nfor eps in epsilons:\n    print(f"   Training with ε = {eps}...")\n    Q, rewards = run_qlearning(eps, episodes=10000)\n    all_rewards[eps] = np.cumsum(rewards) / (np.arange(len(rewards)) + 1)\n\n# Plot learning curves\nplt.figure(figsize=(10, 6))\nfor eps in epsilons:\n    plt.plot(all_rewards[eps], label=f\'ε = {eps}\')\nplt.title(\'Q-Learning: Average Reward per Episode (FrozenLake)\')\nplt.xlabel(\'Episodes\')\nplt.ylabel(\'Average Cumulative Reward\')\nplt.legend()\nplt.grid(True)\nplt.show()\n\nprint("→ ε = 0.3 gives the best balance of 
exploration and exploitation.\\n")\n\nQ_final = run_qlearning(epsilon=0.3, episodes=5000)[0]\n\nprint("Task 3: Behavior Policy vs Target Policy (ε=0.0.3)")\nprint("-" * 60)\nprint(f"{\'State\':<6} {\'Behavior Action\':<16} {\'Target Action\':<14} {\'Match?\'}")\nprint("-" * 60)\n\nmismatch = 0\ntotal = 0\nstate, _ = env_fl.reset()\ndone = False\n\nwhile not done:\n    if np.random.rand() < 0.3:\n        beh_action = env_fl.action_space.sample()\n    else:\n        beh_action = np.argmax(Q_final[state])\n\n    tar_action = np.argmax(Q_final[state])\n\n    match = beh_action == tar_action\n    if not match:\n        mismatch += 1\n    total += 1\n\n    print(f"{state:<6} {beh_action:<16} {tar_action:<14} {\'Yes\' if match else \'No\'}")\n\n    state, _, done, _, _ = env_fl.step(beh_action)\n    if done:\n        break\n\nprint(f"\\nMismatch rate: {mismatch/total:.1%}~30% (as expected with ε=0.3)\\n")\nprint("→ This mismatch is what makes Q-Learning OFF-POLICY!\\n")\n\n_, ql_rewards = run_qlearning(epsilon=0.3, episodes=6000)\nql_avg = np.cumsum(ql_rewards) / (np.arange(6000) + 1)\n\nQ_mc = np.zeros((n_states, n_actions))\nC_mc = np.zeros((n_states, n_actions))\nmc_rewards = []\n\nfor ep in range(6000):\n    episode = []\n    state, _ = env_fl.reset()\n    done = False\n    while not done:\n        action = np.random.randint(4)\n        next_state, reward, terminated, truncated, _ = env_fl.step(action)\n        done = terminated or truncated\n        if terminated and reward == 0:\n            reward = -1\n        episode.append((state, action, reward))\n        state = next_state\n\n    G = 0.0\n    W = 1.0\n    for (s, a, r) in reversed(episode):\n        G = gamma * G + r\n        if a != np.argmax(Q_mc[s]):\n            break\n        W *= 4.0\n\n        C_mc[s, a] += W\n        if C_mc[s, a] > 0:\n            Q_mc[s, a] += (W / C_mc[s, a]) * (G - Q_mc[s, a])\n\n    mc_rewards.append(G)\n\nmc_avg = np.cumsum(mc_rewards) / (np.arange(6000) + 
1)\n\nplt.figure(figsize=(10, 6))\nplt.plot(ql_avg, label=\'Q-Learning (TD, Off-Policy)\', linewidth=2.5)\nplt.plot(mc_avg, label=\'Off-Policy Monte Carlo (IS)\', linewidth=2.5)\nplt.title(\'Off-Policy MC vs Q-Learning (FrozenLake)\')\nplt.xlabel(\'Episodes\')\nplt.ylabel(\'Average Return\')\nplt.legend()\nplt.grid(True)\nplt.show()\n\nprint("TASK 4 ANSWERS:")\nprint("• Which converges faster? → Q-LEARNING (by a large margin)")\nprint("• Why does Q-Learning not require full episodes?")\nprint("   → Because it uses Temporal Difference (bootstrapping):")\nprint("     Q(s,a) ← Q(s,a) + α [r + γ max Q(s\',a\') − Q(s,a)]")\nprint("     → Updates after every step using estimated future value")\nprint("     → No need to wait until episode end")\nprint("   → Off-Policy MC must wait for full episode + uses high-variance IS")',
    10: 'import gymnasium as gym\nimport numpy as np\nimport torch\nimport torch.nn as nn\n\ndef features(state):\n  return np.array([1, state])\n\nw = np.zeros(2)\nalpha = 0.01\ngamma = 0.99\n\ndef update(w, s, r, s_next):\n  td_error = r + gamma * np.dot(w, features(s_next)) - np.dot(w, features(s))\n  w += alpha * td_error * features(s)\n  return w\n\nclass ValueNet(nn.Module):\n  def __init__(self):\n    super().__init__()\n    self.fc = nn.Sequential(\n    nn.Linear(1, 64),\n    nn.ReLU(),\n    nn.Linear(64, 1)\n    )\n\n  def forward(self, x):\n    return self.fc(x)\n\nmodel = ValueNet()\noptimizer = torch.optim.Adam(model.parameters(), lr=0.001)\nloss_fn = nn.MSELoss()\n\nenv = gym.make(\'CartPole-v1\')\nstate, _ = env.reset()\nfor episode in range(500):\n  state, _ = env.reset()\n  done = False\n  while not done:\n    action = env.action_space.sample() # fixed/random policy\n    next_state, reward, terminated, truncated, _ = env.step(action)\n    done = terminated or truncated\n    # use a single state dimension for simplicity\n    s = state[0]\n    s_next = next_state[0]\n    w = update(w, s, reward, s_next)\n    state = next_state\n\ndef features(state):\n  return np.array([1, state, state**2])\n\nw = np.zeros(3)\n\nmodel = ValueNet()\noptimizer = torch.optim.Adam(model.parameters(), lr=0.001)\nloss_fn = nn.MSELoss()\ngamma = 0.99\nenv = gym.make(\'CartPole-v1\')\nfor episode in range(500):\n  state, _ = env.reset()\n  done = False\n  while not done:\n    state_tensor = torch.tensor([state[0]], dtype=torch.float32)\n    value = model(state_tensor)\n    action = env.action_space.sample()\n    next_state, reward, terminated, truncated, _ = env.step(action)\n    done = terminated or truncated\n    # Calculate target without gradient tracking for the target value estimation\n    with torch.no_grad():\n      next_state_tensor = torch.tensor([next_state[0]], dtype=torch.float32)\n      next_value = model(next_state_tensor)\n      target = reward + gamma * 
next_value\n\n    # Calculate loss and perform backpropagation\n    loss = loss_fn(value, target)\n    optimizer.zero_grad()\n    loss.backward()\n    optimizer.step()\n    state = next_state\n\nimport gymnasium as gym\nimport numpy as np\n\ndef features(state):\n  return np.array([1, state, state**2])\n\ndef update(w_local, s, r, s_next):\n\n  td_error = r + gamma * np.dot(w_local, features(s_next)) - np.dot(w_local, features(s))\n  w_local += alpha * td_error * features(s)\n  return w_local\n\nrewards_per_episode_tabular = []\ntd_errors_per_episode_tabular = []\n\n\nw = np.zeros(3)\nalpha = 0.01\ngamma = 0.99\n\nenv = gym.make(\'CartPole-v1\')\n\nfor episode in range(500):\n  state, _ = env.reset()\n  done = False\n  episode_reward = 0\n  current_episode_td_errors = []\n\n  while not done:\n    s = state[0]\n    action = env.action_space.sample()\n    next_state, reward, terminated, truncated, _ = env.step(action)\n    done = terminated or truncated\n\n    s_next = next_state[0]\n\n\n    td_error = reward + gamma * np.dot(w, features(s_next)) - np.dot(w, features(s))\n    current_episode_td_errors.append(abs(td_error))\n\n\n    w = update(w, s, reward, s_next)\n\n    episode_reward += reward\n    state = next_state\n\n  rewards_per_episode_tabular.append(episode_reward)\n  if current_episode_td_errors:\n    td_errors_per_episode_tabular.append(np.mean(current_episode_td_errors))\n  else:\n    td_errors_per_episode_tabular.append(0)\n\n\nw_memory_bytes_tabular = w.nbytes\nprint(f"Memory usage of weight vector \'w\' (tabular method): {w_memory_bytes_tabular} bytes")\n\nimport torch\nimport torch.nn as nn\nimport gymnasium as gym\nimport numpy as np\n\n\nclass ValueNet(nn.Module):\n  def __init__(self):\n    super().__init__()\n    self.fc = nn.Sequential(\n    nn.Linear(1, 64),\n    nn.ReLU(),\n    nn.Linear(64, 1)\n    )\n\n  def forward(self, x):\n    return self.fc(x)\n\n\nmodel = ValueNet()\noptimizer = torch.optim.Adam(model.parameters(), lr=0.001)\nloss_fn = 
nn.MSELoss()\ngamma = 0.99\nenv = gym.make(\'CartPole-v1\')\n\nrewards_per_episode_nn = []\nlosses_per_episode_nn = []\n\nfor episode in range(500):\n  state, _ = env.reset()\n  done = False\n  episode_reward = 0\n  current_episode_losses = []\n\n  while not done:\n\n    state_tensor = torch.tensor([state[0]], dtype=torch.float32).unsqueeze(0)\n    value = model(state_tensor)\n\n\n    action = env.action_space.sample()\n    next_state, reward, terminated, truncated, _ = env.step(action)\n    done = terminated or truncated\n\n\n    with torch.no_grad():\n      next_state_tensor = torch.tensor([next_state[0]], dtype=torch.float32).unsqueeze(0)\n      next_value = model(next_state_tensor)\n      target = reward + gamma * next_value\n\n\n    loss = loss_fn(value, target)\n    optimizer.zero_grad()\n    loss.backward()\n    optimizer.step()\n\n    episode_reward += reward\n    current_episode_losses.append(loss.item())\n    state = next_state\n\n  rewards_per_episode_nn.append(episode_reward)\n  if current_episode_losses:\n    losses_per_episode_nn.append(np.mean(current_episode_losses))\n  else:\n    losses_per_episode_nn.append(0)\n\n\nmodel_memory_bytes_nn = sum(p.numel() * p.element_size() for p in model.parameters())\nprint(f"Memory usage of neural network model: {model_memory_bytes_nn} bytes")\n\nimport matplotlib.pyplot as plt\n\n\nplt.figure(figsize=(12, 6))\nplt.plot(rewards_per_episode_tabular, label=\'Tabular Method\')\nplt.plot(rewards_per_episode_nn, label=\'Neural Network Method\')\nplt.xlabel(\'Episode\')\nplt.ylabel(\'Total Reward\')\nplt.title(\'Rewards Per Episode: Tabular vs. Neural Network\')\nplt.legend()\nplt.grid(True)\nplt.show()\n\n\nplt.figure(figsize=(12, 6))\nplt.plot(td_errors_per_episode_tabular, label=\'Tabular Method (Mean TD Error)\')\nplt.plot(losses_per_episode_nn, label=\'Neural Network Method (Mean MSE Loss)\')\nplt.xlabel(\'Episode\')\nplt.ylabel(\'Mean Error/Loss\')\nplt.title(\'Mean Error/Loss Per Episode: Tabular vs. 
Neural Network\')\nplt.legend()\nplt.grid(True)\nplt.show()\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport torch\nimport torch.nn as nn\nimport gymnasium as gym\n\n\nclass ValueNet(nn.Module):\n  def __init__(self):\n    super().__init__()\n    self.fc = nn.Sequential(\n    nn.Linear(1, 64),\n    nn.ReLU(),\n    nn.Linear(64, 1)\n    )\n\n  def forward(self, x):\n    return self.fc(x)\n\n\nlearning_rates = [0.0001, 0.001, 0.01]\n\ndef train_nn(learning_rate):\n  model = ValueNet()\n  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)\n  loss_fn = nn.MSELoss()\n  gamma = 0.99\n  env = gym.make(\'CartPole-v1\')\n\n  rewards_per_episode_nn = []\n  losses_per_episode_nn = []\n\n\n  for episode in range(500):\n    state, _ = env.reset()\n    done = False\n    episode_reward = 0\n    current_episode_losses = []\n\n    while not done:\n\n      state_tensor = torch.tensor([state[0]], dtype=torch.float32).unsqueeze(0)\n      value = model(state_tensor)\n      action = env.action_space.sample()\n      next_state, reward, terminated, truncated, _ = env.step(action)\n      done = terminated or truncated\n      with torch.no_grad():\n\n        next_state_tensor = torch.tensor([next_state[0]], dtype=torch.float32).unsqueeze(0)\n        next_value = model(next_state_tensor)\n        target = reward + gamma * next_value\n\n      loss = loss_fn(value, target)\n      optimizer.zero_grad()\n      loss.backward()\n      optimizer.step()\n      state = next_state\n\n      episode_reward += reward\n      current_episode_losses.append(loss.item())\n\n    rewards_per_episode_nn.append(episode_reward)\n    if current_episode_losses:\n      losses_per_episode_nn.append(np.mean(current_episode_losses))\n    else:\n      losses_per_episode_nn.append(0)\n\n\n  model_memory_bytes = sum(p.numel() * p.element_size() for p in model.parameters())\n  print(f"Learning Rate: {learning_rate}, Memory usage of neural network model: {model_memory_bytes} bytes")\n\n  return 
rewards_per_episode_nn, losses_per_episode_nn\n\n\nresults = {}\nfor lr in learning_rates:\n  print(f"\\nRunning experiment with learning rate: {lr}")\n  rewards, losses = train_nn(lr)\n  results[lr] = {\n      \'rewards\': rewards,\n      \'losses\': losses\n  }\n\n\nplt.figure(figsize=(12, 7))\nfor lr, data in results.items():\n  plt.plot(data[\'rewards\'], label=f\'LR = {lr}\')\nplt.xlabel(\'Episode\')\nplt.ylabel(\'Total Reward\')\nplt.title(\'Neural Network: Rewards Per Episode for Different Learning Rates\')\nplt.legend()\nplt.grid(True)\nplt.show()\n\n\nplt.figure(figsize=(12, 7))\nfor lr, data in results.items():\n  plt.plot(data[\'losses\'], label=f\'LR = {lr}\')\nplt.xlabel(\'Episode\')\nplt.ylabel(\'Mean Loss\')\nplt.title(\'Neural Network: Mean Loss Per Episode for Different Learning Rates\')\nplt.legend()\nplt.grid(True)\nplt.show()',
    11: 'import numpy as np\nimport matplotlib.pyplot as plt\nfrom matplotlib.ticker import MaxNLocator\nimport seaborn as sns\n\n# Set style for better visualizations\nsns.set_style("whitegrid")\nplt.rcParams[\'figure.figsize\'] = (12, 8)\nplt.rcParams[\'font.size\'] = 12\n\n# Task 1: Compare different discount factors (gamma)\ndef task1_discount_factor_comparison():\n    print("=" * 60)\n    print("TASK 1: DISCOUNT FACTOR (GAMMA) COMPARISON")\n    print("=" * 60)\n\n    # Fixed batch dataset for MC\n    episodes = [\n        [(0, 1), (1, 1), (2, 0)],\n        [(0, 1), (1, 0), (2, 1)],\n        [(0, 0), (1, 1), (2, 1)]\n    ]\n\n    # Fixed transitions for TD methods\n    transitions = [\n        (0, 1, 1),\n        (1, 1, 2),\n        (2, 0, 2),\n        (0, 1, 1),\n        (1, 0, 2)\n    ]\n\n    # Different gamma values to test\n    gamma_values = [0.1, 0.5, 0.9, 0.99]\n    mc_results = []\n    td0_results = []\n    tdl_results = []\n\n    # True values for calculating MSE (approximated from many runs with gamma=0.9)\n    true_values = np.array([1.86, 1.78, 0.90])\n\n    print("\\nBatch Monte Carlo Results with different gamma values:")\n    print("-" * 40)\n    for gamma in gamma_values:\n        V = np.zeros(3)\n        returns = {s: [] for s in range(3)}\n\n        for episode in episodes:\n            G = 0\n            for (state, reward) in reversed(episode):\n                G = reward + gamma * G\n                returns[state].append(G)\n\n        for state in returns:\n            if returns[state]:  # Check if there are returns for this state\n                V[state] = np.mean(returns[state])\n\n        mse = np.mean((V - true_values) ** 2)\n        mc_results.append((gamma, V.copy(), mse))\n        print(f"Gamma = {gamma:.2f} | Values: {V} | MSE: {mse:.4f}")\n\n    print("\\nBatch TD(0) Results with different gamma values:")\n    print("-" * 40)\n    for gamma in gamma_values:\n        num_states = 3\n        V = np.zeros(num_states)\n        alpha 
= 0.1\n        epochs = 50\n\n        for epoch in range(epochs):\n            td_sums = np.zeros(num_states)\n            td_counts = np.zeros(num_states)\n\n            for s, r, s_next in transitions:\n                td_error = r + gamma * V[s_next] - V[s]\n                td_sums[s] += td_error\n                td_counts[s] += 1\n\n            for s in range(num_states):\n                if td_counts[s] > 0:\n                    V[s] += alpha * (td_sums[s] / td_counts[s])\n\n        mse = np.mean((V - true_values) ** 2)\n        td0_results.append((gamma, V.copy(), mse))\n        print(f"Gamma = {gamma:.2f} | Values: {V} | MSE: {mse:.4f}")\n\n    print("\\nBatch TD(λ) Results with different gamma values (λ=0.8):")\n    print("-" * 40)\n    for gamma in gamma_values:\n        V = np.zeros(3)\n        eligibility = np.zeros(3)\n        alpha = 0.1\n        lam = 0.8\n        epochs = 50\n\n        for epoch in range(epochs):\n            eligibility[:] = 0\n            for s, r, s_next in transitions:\n                td_error = r + gamma * V[s_next] - V[s]\n                eligibility[s] += 1\n                V += alpha * td_error * eligibility\n                eligibility *= gamma * lam\n\n        mse = np.mean((V - true_values) ** 2)\n        tdl_results.append((gamma, V.copy(), mse))\n        print(f"Gamma = {gamma:.2f} | Values: {V} | MSE: {mse:.4f}")\n\n    # Plot results for Task 1\n    plt.figure(figsize=(15, 10))\n\n    # Plot 1: Value estimates comparison\n    plt.subplot(2, 2, 1)\n    states = [\'State 0\', \'State 1\', \'State 2\']\n    x = np.arange(len(states))\n    width = 0.2\n\n    # Plot MC values for different gammas\n    for i, (gamma, values, _) in enumerate(mc_results):\n        plt.bar(x + i*width, values, width, label=f\'MC γ={gamma}\')\n\n    plt.xlabel(\'States\')\n    plt.ylabel(\'Value Estimates\')\n    plt.title(\'Batch Monte Carlo: Effect of Discount Factor\')\n    plt.xticks(x + width*1.5, states)\n    plt.legend()\n    
plt.grid(True, alpha=0.3)\n\n    # Plot 2: TD(0) values comparison\n    plt.subplot(2, 2, 2)\n    for i, (gamma, values, _) in enumerate(td0_results):\n        plt.bar(x + i*width, values, width, label=f\'TD(0) γ={gamma}\')\n\n    plt.xlabel(\'States\')\n    plt.ylabel(\'Value Estimates\')\n    plt.title(\'Batch TD(0): Effect of Discount Factor\')\n    plt.xticks(x + width*1.5, states)\n    plt.legend()\n    plt.grid(True, alpha=0.3)\n\n    # Plot 3: MSE comparison\n    plt.subplot(2, 2, 3)\n    mc_mse = [mse for _, _, mse in mc_results]\n    td0_mse = [mse for _, _, mse in td0_results]\n    tdl_mse = [mse for _, _, mse in tdl_results]\n\n    plt.plot(gamma_values, mc_mse, \'o-\', linewidth=2, label=\'Monte Carlo\')\n    plt.plot(gamma_values, td0_mse, \'s-\', linewidth=2, label=\'TD(0)\')\n    plt.plot(gamma_values, tdl_mse, \'^-\', linewidth=2, label=\'TD(λ)\')\n    plt.xlabel(\'Discount Factor (γ)\')\n    plt.ylabel(\'Mean Squared Error (MSE)\')\n    plt.title(\'MSE vs Discount Factor\')\n    plt.legend()\n    plt.grid(True, alpha=0.3)\n\n    # Plot 4: Value estimates for highest gamma\n    plt.subplot(2, 2, 4)\n    mc_vals = mc_results[-1][1]\n    td0_vals = td0_results[-1][1]\n    tdl_vals = tdl_results[-1][1]\n\n    plt.bar(x - 0.2, mc_vals, 0.2, label=\'Monte Carlo\')\n    plt.bar(x, td0_vals, 0.2, label=\'TD(0)\')\n    plt.bar(x + 0.2, tdl_vals, 0.2, label=\'TD(λ)\')\n    plt.axhline(y=true_values[0], color=\'r\', linestyle=\'--\', alpha=0.3)\n    plt.axhline(y=true_values[1], color=\'r\', linestyle=\'--\', alpha=0.3)\n    plt.axhline(y=true_values[2], color=\'r\', linestyle=\'--\', alpha=0.3)\n\n    plt.xlabel(\'States\')\n    plt.ylabel(\'Value Estimates\')\n    plt.title(f\'Algorithm Comparison (γ={gamma_values[-1]})\')\n    plt.xticks(x, states)\n    plt.legend()\n    plt.grid(True, alpha=0.3)\n\n    plt.tight_layout()\n    plt.savefig(\'task1_discount_comparison.png\')\n    plt.show()\n\n    return mc_results, td0_results, tdl_results\n\n# Task 2: 
Analyze convergence with different number of epochs\ndef task2_convergence_analysis():\n    print("\\n" + "=" * 60)\n    print("TASK 2: CONVERGENCE ANALYSIS WITH DIFFERENT EPOCHS")\n    print("=" * 60)\n\n    transitions = [\n        (0, 1, 1),\n        (1, 1, 2),\n        (2, 0, 2),\n        (0, 1, 1),\n        (1, 0, 2)\n    ]\n\n    episodes = [\n        [(0, 1), (1, 1), (2, 0)],\n        [(0, 1), (1, 0), (2, 1)],\n        [(0, 0), (1, 1), (2, 1)]\n    ]\n\n    gamma = 0.9\n    alpha = 0.1\n    lam = 0.8\n\n    # Different epoch values to test\n    epoch_values = [1, 5, 10, 20, 30, 40, 50, 100]\n\n    # True values (reference)\n    true_values = np.array([1.86, 1.78, 0.90])\n\n    # Store results\n    mc_errors = []\n    td0_errors = []\n    tdl_errors = []\n\n    print("\\nConvergence Analysis:")\n    print("-" * 60)\n    print(f"{\'Epochs\':<10} {\'MC MSE\':<15} {\'TD(0) MSE\':<15} {\'TD(λ) MSE\':<15}")\n    print("-" * 60)\n\n    for epochs in epoch_values:\n        # Batch Monte Carlo (note: MC doesn\'t use epochs in the same way)\n        V_mc = np.zeros(3)\n        returns = {s: [] for s in range(3)}\n\n        for episode in episodes:\n            G = 0\n            for (state, reward) in reversed(episode):\n                G = reward + gamma * G\n                returns[state].append(G)\n\n        for state in returns:\n            if returns[state]:\n                V_mc[state] = np.mean(returns[state])\n\n        mse_mc = np.mean((V_mc - true_values) ** 2)\n        mc_errors.append(mse_mc)\n\n        # Batch TD(0)\n        V_td0 = np.zeros(3)\n        for epoch in range(epochs):\n            td_sums = np.zeros(3)\n            td_counts = np.zeros(3)\n\n            for s, r, s_next in transitions:\n                td_error = r + gamma * V_td0[s_next] - V_td0[s]\n                td_sums[s] += td_error\n                td_counts[s] += 1\n\n            for s in range(3):\n                if td_counts[s] > 0:\n                    V_td0[s] += alpha * 
(td_sums[s] / td_counts[s])\n\n        mse_td0 = np.mean((V_td0 - true_values) ** 2)\n        td0_errors.append(mse_td0)\n\n        # Batch TD(λ)\n        V_tdl = np.zeros(3)\n        for epoch in range(epochs):\n            eligibility = np.zeros(3)\n            for s, r, s_next in transitions:\n                td_error = r + gamma * V_tdl[s_next] - V_tdl[s]\n                eligibility[s] += 1\n                V_tdl += alpha * td_error * eligibility\n                eligibility *= gamma * lam\n\n        mse_tdl = np.mean((V_tdl - true_values) ** 2)\n        tdl_errors.append(mse_tdl)\n\n        print(f"{epochs:<10} {mse_mc:<15.6f} {mse_td0:<15.6f} {mse_tdl:<15.6f}")\n\n    # Plot convergence results\n    plt.figure(figsize=(15, 10))\n\n    # Plot 1: MSE vs Epochs\n    plt.subplot(2, 2, 1)\n    plt.plot(epoch_values, mc_errors, \'o-\', linewidth=2, label=\'Monte Carlo\')\n    plt.plot(epoch_values, td0_errors, \'s-\', linewidth=2, label=\'TD(0)\')\n    plt.plot(epoch_values, tdl_errors, \'^-\', linewidth=2, label=\'TD(λ)\')\n    plt.xlabel(\'Number of Epochs\')\n    plt.ylabel(\'Mean Squared Error (MSE)\')\n    plt.title(\'Convergence Analysis: MSE vs Epochs\')\n    plt.legend()\n    plt.grid(True, alpha=0.3)\n    plt.yscale(\'log\')  # Log scale to better see convergence\n\n    # Plot 2: Value estimates progression for TD(0)\n    plt.subplot(2, 2, 2)\n    V_progress = np.zeros((len(epoch_values), 3))\n    V = np.zeros(3)\n\n    epoch_idx = 0\n    for epoch in range(1, max(epoch_values) + 1):\n        td_sums = np.zeros(3)\n        td_counts = np.zeros(3)\n\n        for s, r, s_next in transitions:\n            td_error = r + gamma * V[s_next] - V[s]\n            td_sums[s] += td_error\n            td_counts[s] += 1\n\n        for s in range(3):\n            if td_counts[s] > 0:\n                V[s] += alpha * (td_sums[s] / td_counts[s])\n\n        if epoch in epoch_values:\n            V_progress[epoch_idx] = V.copy()\n            epoch_idx += 1\n\n    for state 
in range(3):\n        plt.plot(epoch_values, V_progress[:, state], \'o-\', linewidth=2, label=f\'State {state}\')\n        plt.axhline(y=true_values[state], color=\'r\', linestyle=\'--\', alpha=0.7)\n\n    plt.xlabel(\'Number of Epochs\')\n    plt.ylabel(\'Value Estimates\')\n    plt.title(\'TD(0): Value Estimates Convergence\')\n    plt.legend()\n    plt.grid(True, alpha=0.3)\n\n    # Plot 3: Value estimates progression for TD(λ)\n    plt.subplot(2, 2, 3)\n    V_progress = np.zeros((len(epoch_values), 3))\n    V = np.zeros(3)\n\n    epoch_idx = 0\n    for epoch in range(1, max(epoch_values) + 1):\n        eligibility = np.zeros(3)\n        for s, r, s_next in transitions:\n            td_error = r + gamma * V[s_next] - V[s]\n            eligibility[s] += 1\n            V += alpha * td_error * eligibility\n            eligibility *= gamma * lam\n\n        if epoch in epoch_values:\n            V_progress[epoch_idx] = V.copy()\n            epoch_idx += 1\n\n    for state in range(3):\n        plt.plot(epoch_values, V_progress[:, state], \'o-\', linewidth=2, label=f\'State {state}\')\n        plt.axhline(y=true_values[state], color=\'r\', linestyle=\'--\', alpha=0.7)\n\n    plt.xlabel(\'Number of Epochs\')\n    plt.ylabel(\'Value Estimates\')\n    plt.title(\'TD(λ): Value Estimates Convergence\')\n    plt.legend()\n    plt.grid(True, alpha=0.3)\n\n    # Plot 4: Final values comparison\n    plt.subplot(2, 2, 4)\n    x = np.arange(3)\n    width = 0.25\n\n    # Get final values from last epoch\n    final_mc = np.zeros(3)\n    returns = {s: [] for s in range(3)}\n    for episode in episodes:\n        G = 0\n        for (state, reward) in reversed(episode):\n            G = reward + gamma * G\n            returns[state].append(G)\n    for state in returns:\n        if returns[state]:\n            final_mc[state] = np.mean(returns[state])\n\n    plt.bar(x - width, final_mc, width, label=\'Monte Carlo\')\n    plt.bar(x, V_td0, width, label=\'TD(0)\')\n    plt.bar(x + width, 
V_tdl, width, label=\'TD(λ)\')\n    plt.plot(x, true_values, \'ro-\', linewidth=2, markersize=8, label=\'True Values\')\n\n    plt.xlabel(\'States\')\n    plt.ylabel(\'Value Estimates\')\n    plt.title(\'Final Value Estimates Comparison\')\n    plt.xticks(x, [\'State 0\', \'State 1\', \'State 2\'])\n    plt.legend()\n    plt.grid(True, alpha=0.3)\n\n    plt.tight_layout()\n    plt.savefig(\'task2_convergence_analysis.png\')\n    plt.show()\n\n    return epoch_values, mc_errors, td0_errors, tdl_errors\n\n# Task 3: Compare TD(0) and TD(λ) for different lambda values\ndef task3_lambda_comparison():\n    print("\\n" + "=" * 60)\n    print("TASK 3: TD(0) vs TD(λ) COMPARISON WITH DIFFERENT LAMBDA VALUES")\n    print("=" * 60)\n\n    transitions = [\n        (0, 1, 1),\n        (1, 1, 2),\n        (2, 0, 2),\n        (0, 1, 1),\n        (1, 0, 2)\n    ]\n\n    gamma = 0.9\n    alpha = 0.1\n    epochs = 50\n\n    # Different lambda values to test\n    lambda_values = [0.0, 0.2, 0.5, 0.8, 1.0]  # 0.0 is TD(0), 1.0 is MC-like\n\n    # True values (reference)\n    true_values = np.array([1.86, 1.78, 0.90])\n\n    # Store results\n    td_results = []\n    tdl_results = []\n    td_mse = []\n    tdl_mse = []\n\n    print("\\nLambda Comparison Results:")\n    print("-" * 70)\n    print(f"{\'Lambda\':<10} {\'TD(0) Values\':<30} {\'TD(λ) Values\':<30} {\'TD(0) MSE\':<10} {\'TD(λ) MSE\':<10}")\n    print("-" * 70)\n\n    # TD(0) is the same for all lambda comparisons (lambda=0)\n    V_td0 = np.zeros(3)\n    for epoch in range(epochs):\n        td_sums = np.zeros(3)\n        td_counts = np.zeros(3)\n\n        for s, r, s_next in transitions:\n            td_error = r + gamma * V_td0[s_next] - V_td0[s]\n            td_sums[s] += td_error\n            td_counts[s] += 1\n\n        for s in range(3):\n            if td_counts[s] > 0:\n                V_td0[s] += alpha * (td_sums[s] / td_counts[s])\n\n    mse_td0 = np.mean((V_td0 - true_values) ** 2)\n\n    for lam in lambda_values:\n     
   # Run TD(λ) with current lambda\n        V_tdl = np.zeros(3)\n        for epoch in range(epochs):\n            eligibility = np.zeros(3)\n            for s, r, s_next in transitions:\n                td_error = r + gamma * V_tdl[s_next] - V_tdl[s]\n                eligibility[s] += 1\n                V_tdl += alpha * td_error * eligibility\n                eligibility *= gamma * lam\n\n        mse_tdl = np.mean((V_tdl - true_values) ** 2)\n\n        td_results.append((lam, V_td0.copy(), mse_td0))\n        tdl_results.append((lam, V_tdl.copy(), mse_tdl))\n        td_mse.append(mse_td0)\n        tdl_mse.append(mse_tdl)\n\n        print(f"{lam:<10.1f} {str(np.round(V_td0, 4)):<30} {str(np.round(V_tdl, 4)):<30} {mse_td0:<10.6f} {mse_tdl:<10.6f}")\n\n    # Plot lambda comparison results\n    plt.figure(figsize=(15, 12))\n\n    # Plot 1: MSE vs Lambda\n    plt.subplot(3, 1, 1)\n    plt.plot(lambda_values, [mse_td0]*len(lambda_values), \'s--\', linewidth=2, label=\'TD(0)\')\n    plt.plot(lambda_values, tdl_mse, \'o-\', linewidth=2, label=\'TD(λ)\')\n    plt.xlabel(\'Lambda (λ)\')\n    plt.ylabel(\'Mean Squared Error (MSE)\')\n    plt.title(\'TD(0) vs TD(λ): MSE Comparison\')\n    plt.legend()\n    plt.grid(True, alpha=0.3)\n    plt.xticks(lambda_values)\n\n    # Plot 2: Value estimates for different lambdas (State 0)\n    plt.subplot(3, 1, 2)\n    state0_vals = [V[0] for _, V, _ in tdl_results]\n    state1_vals = [V[1] for _, V, _ in tdl_results]\n    state2_vals = [V[2] for _, V, _ in tdl_results]\n\n    plt.plot(lambda_values, [V_td0[0]]*len(lambda_values), \'s--\', linewidth=2, label=\'TD(0) State 0\')\n    plt.plot(lambda_values, state0_vals, \'o-\', linewidth=2, label=\'TD(λ) State 0\')\n    plt.axhline(y=true_values[0], color=\'r\', linestyle=\'--\', alpha=0.7, label=\'True Value State 0\')\n\n    plt.xlabel(\'Lambda (λ)\')\n    plt.ylabel(\'Value Estimate\')\n    plt.title(\'State 0 Value Estimates vs Lambda\')\n    plt.legend()\n    plt.grid(True, alpha=0.3)\n  
  plt.xticks(lambda_values)\n\n    # Plot 3: Value estimates for different lambdas (State 1 and 2)\n    plt.subplot(3, 1, 3)\n    plt.plot(lambda_values, [V_td0[1]]*len(lambda_values), \'s--\', linewidth=2, label=\'TD(0) State 1\')\n    plt.plot(lambda_values, state1_vals, \'o-\', linewidth=2, label=\'TD(λ) State 1\')\n    plt.plot(lambda_values, [V_td0[2]]*len(lambda_values), \'s--\', linewidth=2, label=\'TD(0) State 2\')\n    plt.plot(lambda_values, state2_vals, \'o-\', linewidth=2, label=\'TD(λ) State 2\')\n    plt.axhline(y=true_values[1], color=\'r\', linestyle=\'--\', alpha=0.5)\n    plt.axhline(y=true_values[2], color=\'g\', linestyle=\'--\', alpha=0.5)\n\n    plt.xlabel(\'Lambda (λ)\')\n    plt.ylabel(\'Value Estimate\')\n    plt.title(\'States 1 & 2 Value Estimates vs Lambda\')\n    plt.legend()\n    plt.grid(True, alpha=0.3)\n    plt.xticks(lambda_values)\n\n    plt.tight_layout()\n    plt.savefig(\'task3_lambda_comparison.png\')\n    plt.show()\n\n    # Additional detailed comparison plot\n    plt.figure(figsize=(15, 8))\n\n    # For each lambda, show the value estimates\n    x = np.arange(3)  # Three states\n    width = 0.15\n\n    plt.subplot(1, 2, 1)\n    for i, (lam, values, _) in enumerate(tdl_results):\n        plt.bar(x + (i-2)*width, values, width, label=f\'λ={lam}\')\n\n    plt.bar(x + (len(tdl_results)-2)*width + 0.15, V_td0, width, label=\'TD(0)\', color=\'black\', alpha=0.7)\n    plt.plot(x, true_values, \'ro-\', linewidth=2, markersize=8, label=\'True Values\')\n\n    plt.xlabel(\'States\')\n    plt.ylabel(\'Value Estimates\')\n    plt.title(\'Value Estimates for Different Lambda Values\')\n    plt.xticks(x, [\'State 0\', \'State 1\', \'State 2\'])\n    plt.legend(bbox_to_anchor=(1.05, 1), loc=\'upper left\')\n    plt.grid(True, alpha=0.3)\n\n    # Learning curve comparison for different lambdas\n    plt.subplot(1, 2, 2)\n    epochs_range = range(1, epochs + 1)\n\n    # Track MSE over epochs for different lambdas\n    mse_history = {lam: [] 
for lam in lambda_values}\n\n    # TD(0) learning curve\n    V_td0_learning_curve = np.zeros(3)\n    mse_history_td0 = []\n    for epoch in range(epochs):\n        td_sums = np.zeros(3)\n        td_counts = np.zeros(3)\n\n        for s, r, s_next in transitions:\n            td_error = r + gamma * V_td0_learning_curve[s_next] - V_td0_learning_curve[s]\n            td_sums[s] += td_error\n            td_counts[s] += 1\n\n        for s in range(3):\n            if td_counts[s] > 0:\n                V_td0_learning_curve[s] += alpha * (td_sums[s] / td_counts[s])\n\n        mse = np.mean((V_td0_learning_curve - true_values) ** 2)\n        mse_history_td0.append(mse)\n    mse_history[0.0] = mse_history_td0\n\n    # TD(λ) learning curves for different lambdas\n    for lam in lambda_values:\n        if lam == 0.0:\n            continue\n\n        V_tdl_learning_curve = np.zeros(3)\n        for epoch in range(epochs):\n            eligibility = np.zeros(3)\n            for s, r, s_next in transitions:\n                td_error = r + gamma * V_tdl_learning_curve[s_next] - V_tdl_learning_curve[s]\n                eligibility[s] += 1\n                V_tdl_learning_curve += alpha * td_error * eligibility\n                eligibility *= gamma * lam\n\n            mse = np.mean((V_tdl_learning_curve - true_values) ** 2)\n            mse_history[lam].append(mse)\n\n    # Plot learning curves\n    for lam in sorted(mse_history.keys()):\n        if lam == 0.0:\n            plt.plot(epochs_range, mse_history[lam], linewidth=2.5, label=f\'TD(0)\')\n        else:\n            plt.plot(epochs_range, mse_history[lam], linewidth=2, label=f\'TD(λ={lam})\')\n\n    plt.xlabel(\'Epochs\')\n    plt.ylabel(\'MSE\')\n    plt.title(\'Convergence Speed for Different Lambda Values\')\n    plt.legend()\n    plt.grid(True, alpha=0.3)\n    plt.yscale(\'log\')\n\n    plt.tight_layout()\n    plt.savefig(\'task3_detailed_comparison.png\')\n    plt.show()\n\n    return lambda_values, td_results, 
tdl_results\n\n# Main execution function\ndef main():\n    print("BATCH REINFORCEMENT LEARNING LAB")\n    print("=" * 60)\n\n    # Execute all tasks\n    mc_results, td0_results, tdl_results = task1_discount_factor_comparison()\n    epoch_values, mc_errors, td0_errors, tdl_errors = task2_convergence_analysis()\n    lambda_values, td_results, tdl_results = task3_lambda_comparison()\n\n    print("\\n" + "=" * 60)\n    print("SUMMARY OF KEY INSIGHTS")\n    print("=" * 60)\n    print("1. Discount Factor (γ) Effects:")\n    print("   - Higher γ values lead to more consideration of future rewards")\n    print("   - Optimal γ depends on the specific problem and desired behavior")\n    print("   - TD methods generally converge faster than Monte Carlo")\n\n    print("\\n2. Convergence Analysis:")\n    print("   - More epochs lead to better convergence (lower MSE)")\n    print("   - TD(λ) typically converges faster than TD(0) or MC")\n    print("   - Diminishing returns after a certain number of epochs")\n\n    print("\\n3. Lambda Parameter Effects:")\n    print("   - λ=0 corresponds to TD(0), λ=1 corresponds to Monte Carlo-like behavior")\n    print("   - Intermediate λ values (0.5-0.8) often provide best performance")\n    print("   - TD(λ) combines benefits of both bootstrapping and full returns")\n\n    print("\\nLab completed successfully! Check the generated plots for visual analysis.")\n\nif __name__ == "__main__":\n    main()',
    12: '!pip uninstall -y gym\n!pip install gymnasium\nimport gymnasium as gym\nimport torch\nimport torch.nn as nn\nimport torch.optim as optim\nimport random\nimport numpy as np\nfrom collections import deque\nimport matplotlib.pyplot as plt\n\n\n# --- Step 1: Define the Q-Network ---\nclass QNetwork(nn.Module):\n    def __init__(self, state_dim, action_dim):\n        super(QNetwork, self).__init__()\n        # Three-layer fully connected architecture\n        self.fc1 = nn.Linear(state_dim, 64)\n        self.fc2 = nn.Linear(64, 64)\n        self.fc3 = nn.Linear(64, action_dim)\n\n    def forward(self, x):\n        # ReLU activation for non-linearity\n        x = torch.relu(self.fc1(x))\n        x = torch.relu(self.fc2(x))\n        return self.fc3(x)\n\n\n# --- Step 2: Helper Functions for Training ---\ndef select_action(state, epsilon, env, q_net):\n    """Selects action using Epsilon-Greedy strategy."""\n    if random.random() < epsilon:\n        return env.action_space.sample()  # Exploration\n    else:\n        state = torch.FloatTensor(state).unsqueeze(0)\n        with torch.no_grad():\n            return q_net(state).argmax().item()  # Exploitation\n\ndef replay(memory, batch_size, q_net, target_net, optimizer, gamma):\n    """Performs a batch update from experience replay memory."""\n    if len(memory) < batch_size:\n        return\n\n    # Randomly sample a mini-batch\n    batch = random.sample(memory, batch_size)\n    states, actions, rewards, next_states, dones = zip(*batch)\n\n    # Convert to tensors\n    states = torch.FloatTensor(np.array(states))\n    actions = torch.LongTensor(actions).unsqueeze(1)\n    rewards = torch.FloatTensor(rewards).unsqueeze(1)\n    next_states = torch.FloatTensor(np.array(next_states))\n    dones = torch.FloatTensor(dones).unsqueeze(1)\n\n    # Current Q-values (predicted by the main network)\n    q_values = q_net(states).gather(1, actions)\n\n    # Target Q-values (predicted by the target network)\n    with 
torch.no_grad():\n        next_q_values = target_net(next_states).max(1)[0].unsqueeze(1)\n        # Bellman Equation: y = r + gamma * max(Q_target(s\', a\'))\n        targets = rewards + gamma * next_q_values * (1 - dones)\n\n    # Compute loss and optimize\n    loss = nn.MSELoss()(q_values, targets)\n    optimizer.zero_grad()\n    loss.backward()\n    optimizer.step()\n\n\n\n# --- Step 3: Main Training Loop ---\ndef train_dqn():\n    # Environment Setup\n    env = gym.make(\'CartPole-v1\')\n    state_dim = env.observation_space.shape[0]\n    action_dim = env.action_space.n\n\n    # Networks Initialization\n    q_net = QNetwork(state_dim, action_dim)\n    target_net = QNetwork(state_dim, action_dim)\n    target_net.load_state_dict(q_net.state_dict()) # Initial sync\n\n    optimizer = optim.Adam(q_net.parameters(), lr=0.001)\n    memory = deque(maxlen=2000)\n\n    # Hyperparameters\n    batch_size = 64\n    gamma = 0.99\n    target_update = 10\n    episodes = 200\n    epsilon = 1.0\n    epsilon_decay = 0.995\n    epsilon_min = 0.01\n\n    rewards_list = []\n\n    print("Starting training...")\n    for ep in range(episodes):\n        state, _ = env.reset() # Fix: Correctly unpack (observation, info) from gymnasium\n        total_reward = 0\n        done = False\n\n        while not done:\n            action = select_action(state, epsilon, env, q_net)\n            next_state, reward, terminated, truncated, _ = env.step(action)\n            done = terminated or truncated\n\n            # Store experience\n            memory.append((state, action, reward, next_state, done))\n\n            state = next_state\n            total_reward += reward\n\n            # Batch Learning\n            replay(memory, batch_size, q_net, target_net, optimizer, gamma)\n\n        # Update exploration rate\n        epsilon = max(epsilon_min, epsilon * epsilon_decay)\n\n        # Periodically update the target network\n        if ep % target_update == 0:\n            
target_net.load_state_dict(q_net.state_dict())\n\n        rewards_list.append(total_reward)\n\n        if (ep + 1) % 10 == 0:\n            print(f"Episode: {ep+1} | Reward: {total_reward} | Epsilon: {epsilon:.3f}")\n\n    env.close()\n    return rewards_list\n\n\n\n\n\n# --- Step 4: Visualization (Task 8) ---\ndef plot_results(rewards):\n    plt.figure(figsize=(10, 5))\n    plt.plot(rewards, label=\'Episode Reward\', alpha=0.3, color=\'blue\')\n\n    # Calculate Moving Average (Window = 10 as per Task 8)\n    if len(rewards) >= 10:\n        moving_avg = np.convolve(rewards, np.ones(10)/10, mode=\'valid\')\n        plt.plot(range(9, len(rewards)), moving_avg, label=\'Moving Average (10)\', color=\'red\')\n\n    plt.axhline(y=195, color=\'green\', linestyle=\'--\', label=\'Solved Threshold (195)\')\n    plt.xlabel(\'Episode\')\n    plt.ylabel(\'Reward\')\n    plt.title(\'DQN Learning Curve on CartPole-v1\')\n    plt.legend()\n    plt.show()\n\nif __name__ == "__main__":\n    training_rewards = train_dqn()\n    plot_results(training_rewards)',
    'OEL1': 'import gymnasium as gym\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom pprint import pprint\n\ndef policy_evaluation(env, policy, V, gamma=1.0, theta=1e-8):\n  state_func_ave = []\n\n  while True:\n    delta = 0\n    for s in range(env.observation_space.n):\n        v = 0\n        for a, action_prob in enumerate(policy[s]):\n            for prob, next_state, reward, done in P[s][a]:\n                v += action_prob * prob * (reward + gamma * V[next_state])\n        delta = max(delta, abs(v - V[s]))\n        V[s] = v\n\n    if delta < theta:\n        break\n\n    state_func_ave.append(np.mean(V))\n\n  return V, state_func_ave\n\ndef q_from_v(env, V, s, gamma=1):\n    q = np.zeros(env.action_space.n)\n    for a in range(env.action_space.n):\n        for prob, next_state, reward, done in P[s][a]:\n            q[a] += prob * (reward + gamma * V[next_state])\n    return q\n\ndef policy_improvement(env, V, discount_factor=1.0):\n    nS = env.observation_space.n\n    nA = env.action_space.n\n    policy = np.zeros([nS, nA])\n\n    for s in range(nS):\n        Q = q_from_v(env, V, s, discount_factor)\n        best_action = np.argmax(Q)\n        policy[s] = np.eye(nA)[best_action]\n\n    return policy\n\ndef policy_iteration(env, gamma=1.0, theta=1e-8):\n    nS = env.observation_space.n\n    nA = env.action_space.n\n\n    policy = np.ones([nS, nA]) / nA\n    V = np.zeros(nS)\n\n    iteration = 0\n    policy_stable = False\n\n    while not policy_stable:\n        iteration += 1\n\n        V, _ = policy_evaluation(env, policy, V, gamma, theta) # Get only the state value function\n\n        policy_stable = True\n        for s in range(nS):\n            old_action = np.argmax(policy[s])\n\n            Q = q_from_v(env, V, s, gamma)\n            best_action = np.argmax(Q)\n\n            new_policy_s = np.eye(nA)[best_action]\n\n            if not np.array_equal(policy[s], new_policy_s):\n                policy_stable = False\n                policy[s] = 
new_policy_s\n\n    return policy, V, iteration\n\ndef simple_plot(value_function_list, policy=None):\n    total_rows = total_columns = int(np.sqrt(len(value_function_list)))\n    value_function_matrix = value_function_list.reshape((total_rows, total_columns))\n    figure, axes = plt.subplots()\n    colored_axes = axes.matshow(value_function_matrix, cmap=\'cool\')\n    figure.colorbar(colored_axes)\n\n    description = env.unwrapped.desc\n    arrows = {0: \'←\', 1: \'↓\', 2: \'→\', 3: \'↑\'}\n\n    for i in range(total_rows):\n        for j in range(total_columns):\n            state_index = i * total_columns + j\n            tile = description[i, j].decode(\'utf-8\')\n            text = tile\n\n            if policy is not None:\n                best_action = np.argmax(policy[state_index])\n                text += \'\\n\' + arrows[best_action]\n                axes.text(j, i, text, ha=\'center\', va=\'center\', color=\'black\')\n                axes.text(j, i + 0.3, f"{value_function_list[state_index]:.2f}", ha=\'center\', va=\'center\', color=\'black\', fontsize=8)\n\n    plt.title("FrozenLake Values")\n    plt.show()\n\nenv = gym.make(\'FrozenLake-v1\', is_slippery=True, render_mode=\'ansi\')\nenv.reset()\nP = env.unwrapped.P\n\nprint(f\'The environments observation space: {env.observation_space}\')\nprint(f\'The environments actions space: {env.action_space}\')\nprint(f\'The environments reward range: {env.unwrapped.reward_range}\')\n\nprint(env.render())\n\npprint(P)\n\n# Initialize a random policy\nnS = env.observation_space.n\nnA = env.action_space.n\nrandom_policy = np.ones([nS, nA]) / nA\ngamma = 1.0\n\nV = np.zeros(env.observation_space.n)\nV_random_evaluated, V_average_list = policy_evaluation(env, random_policy, V, gamma=gamma)\n\nplt.plot(V_average_list)\nplt.title("Average Value Function per Iteration")\nplt.xlabel("Iteration")\nplt.ylabel("Average Value Function")\nplt.show()\n\noptimal_policy, optimal_V, iterations = policy_iteration(env, gamma=0.9, 
theta=1e-8)\nprint(f\'Convergence occured in {iterations} steps.\')\nsimple_plot(optimal_V, optimal_policy)\n\nenv_slippery = gym.make(\'FrozenLake-v1\', is_slippery=False, render_mode=\'ansi\')\n\noptimal_policy_slippery, optimal_V_slippery, iterations_slippery = policy_iteration(env_slippery, gamma=0.9, theta=1e-8)\nprint(f\'Convergence with non-slippery environment occured in {iterations_slippery} steps.\')\nsimple_plot(optimal_V_slippery, optimal_policy_slippery)',
    'OEL2': 'import gymnasium as gym\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n# Create environment\nenv = gym.make("FrozenLake-v1", is_slippery=True)\n\nnum_states = env.observation_space.n\nnum_actions = env.action_space.n\n\nprint("Number of states.",num_states)\nprint("Number of action space: ", num_actions)\n\n# Random Policy\n\ndef random_policy(state):\n    # Choose an action randomly\n    return np.random.choice(num_actions)\n\n# Monte Carlo Prediction\n\ndef mc_prediction(env, gamma=0.99, episodes=5000):\n    V = np.zeros(num_states)\n    returns = {s: [] for s in range(num_states)}  # store returns for averaging\n    V_record = []  # store values over time to plot convergence\n\n    for ep in range(episodes):\n        episode = []\n        state = env.reset()[0]\n\n        done = False\n        while not done:\n            action = random_policy(state)\n            next_state, reward, done, truncated, info = env.step(action)\n            episode.append((state, reward))\n            state = next_state\n\n        G = 0\n        visited = set()\n        # backward return calculation\n        for t in reversed(range(len(episode))):\n            s_t, r_t = episode[t]\n            G = gamma * G + r_t\n\n            if s_t not in visited:\n                returns[s_t].append(G)\n                V[s_t] = np.mean(returns[s_t])\n                visited.add(s_t)\n\n        V_record.append(V.copy())\n\n    return V, V_record\n\n# TD(0) Prediction\n\ndef td_zero_prediction(env, gamma=0.99, alpha=0.1, episodes=5000):\n    V = np.zeros(num_states)\n    V_record = []\n\n    for ep in range(episodes):\n        state = env.reset()[0]\n\n        done = False\n        while not done:\n            action = random_policy(state)\n            next_state, reward, done, truncated, info = env.step(action)\n\n            # TD update rule\n            V[state] = V[state] + alpha * (reward + gamma * V[next_state] - V[state])\n            state = next_state\n\n        
V_record.append(V.copy())\n\n    return V, V_record\n\n\n# Run Experiments\n\nmc_V, mc_record = mc_prediction(env, episodes=2000)\ntd_V, td_record = td_zero_prediction(env, episodes=2000)\n\n# Convert lists to arrays for plotting\nmc_record = np.array(mc_record)\ntd_record = np.array(td_record)\n\n\n# Plot Value Convergence by Sohail\n\nplt.figure(figsize=(10,6))\nplt.plot(mc_record[:, 0], label="MC State 0")\nplt.plot(td_record[:, 0], label="TD State 0")\nplt.xlabel("Episodes")\nplt.ylabel("Value Estimate of State 0")\nplt.title("MC vs TD(0) Value Convergence (State 0)")\nplt.legend()\nplt.grid()\nplt.show()\n\n\n# Plot Final Value Functions\n# ---------------------------\nplt.figure(figsize=(10,6))\nplt.plot(mc_V, label="Final MC Values")\nplt.plot(td_V, label="Final TD Values")\nplt.xlabel("State Number")\nplt.ylabel("Estimated Value")\nplt.title("Final Value Estimates: MC vs TD(0)")\nplt.legend()\nplt.grid()\nplt.show()\n\n\n\n\n# TASK 2: SARSA vs Q-Learning (All Subparts)\n\n\nimport gymnasium as gym\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom collections import defaultdict\n\n\n# ε-greedy action selection\ndef epsilon_greedy(Q, state, epsilon, nA):\n    if np.random.rand() < epsilon:\n        return np.random.randint(nA)\n    return np.argmax(Q[state])\n\n\n# SARSA (On-Policy)\n\ndef sarsa(env, episodes=5000, alpha=0.1, gamma=0.99, epsilon=1.0, decay="linear"):\n    Q = defaultdict(lambda: np.zeros(env.action_space.n))\n    rewards = []\n\n    for ep in range(episodes):\n        state, _ = env.reset()\n        action = epsilon_greedy(Q, state, epsilon, env.action_space.n)\n        total_reward = 0\n        done = False\n\n        while not done:\n            next_state, reward, terminated, truncated, _ = env.step(action)\n            next_action = epsilon_greedy(Q, next_state, epsilon, env.action_space.n)\n\n            # SARSA Update\n            Q[state][action] += alpha * (\n                reward + gamma * Q[next_state][next_action] - 
Q[state][action]\n            )\n\n            state = next_state\n            action = next_action\n            done = terminated or truncated\n            total_reward += reward\n\n        rewards.append(total_reward)\n\n        # decays\n        if decay == "linear":\n            epsilon = max(0.01, epsilon - 1/episodes)\n        elif decay == "exp":\n            epsilon = max(0.01, epsilon * 0.995)\n\n    return Q, rewards\n\n\n\n# Q-Learning (Off-Policy)\n\ndef q_learning(env, episodes=5000, alpha=0.1, gamma=0.99, epsilon=1.0, decay="linear"):\n    Q = defaultdict(lambda: np.zeros(env.action_space.n))\n    rewards = []\n\n    for ep in range(episodes):\n        state, _ = env.reset()\n        total_reward = 0\n        done = False\n\n        while not done:\n            action = epsilon_greedy(Q, state, epsilon, env.action_space.n)\n            next_state, reward, terminated, truncated, _ = env.step(action)\n\n            # Q-Learning Update\n            Q[state][action] += alpha * (\n                reward + gamma * np.max(Q[next_state]) - Q[state][action]\n            )\n\n            state = next_state\n            done = terminated or truncated\n            total_reward += reward\n\n        rewards.append(total_reward)\n\n        # decays\n        if decay == "linear":\n            epsilon = max(0.01, epsilon - 1/episodes)\n        elif decay == "exp":\n            epsilon = max(0.01, epsilon * 0.995)\n\n    return Q, rewards\n\n\n\n# RUN ON TAXI-V3 (more stable)\n\nenv = gym.make("Taxi-v3")\n\nQ_sarsa, R_sarsa = sarsa(env, decay="linear")\nQ_ql, R_ql = q_learning(env, decay="linear")\n\nplt.figure(figsize=(10,5))\nplt.plot(R_sarsa, label="SARSA Rewards")\nplt.plot(R_ql, label="Q-Learning Rewards")\nplt.title("SARSA vs Q-Learning Performance")\nplt.legend()\nplt.grid()\nplt.show()\n\n\n# --- 1. ENVIRONMENT SETUP ---\nenv = gym.make(\'FrozenLake-v1\', is_slippery=True)\nnum_states = env.observation_space.n\nnum_actions = env.action_space.n\n\n# --- 2. 
COMMON PARAMETERS ---\nepisodes = 10000\ngamma = 0.95\nalpha = 0.1\nepsilon_start = 1.0\nepsilon_min = 0.01\n\n# Helper function to choose action using Epsilon-Greedy Strategy (re-used)\ndef epsilon_greedy_action(Q, state, epsilon, num_actions):\n    if np.random.random() < epsilon:\n        return env.action_space.sample()\n    else:\n        return np.argmax(Q[state, :])\n\n# --- 3. Q-Learning Function with Epsilon Schedule ---\ndef q_learning_with_decay(env, episodes, gamma, alpha, epsilon_start, epsilon_min, decay_type):\n    Q = np.zeros((num_states, num_actions))\n    reward_history = []\n\n    # Calculate decay rate based on type\n    if decay_type == \'linear\':\n        epsilon_decay_rate = (epsilon_start - epsilon_min) / episodes\n    elif decay_type == \'exponential\':\n        # This rate is for epsilon_start * (rate)^episodes = epsilon_min\n        epsilon_decay_rate = np.power(epsilon_min / epsilon_start, 1.0 / episodes)\n\n    for episode in range(episodes):\n        # --- Epsilon Update based on Decay Type ---\n        epsilon = epsilon_start\n        if decay_type == \'constant\':\n            epsilon = 0.1 # Keep a fixed epsilon for comparison\n        elif decay_type == \'linear\':\n            epsilon = max(epsilon_min, epsilon_start - episode * epsilon_decay_rate)\n        elif decay_type == \'exponential\':\n            epsilon = max(epsilon_min, epsilon_start * epsilon_decay_rate**episode)\n\n        # --- Q-Learning Steps (re-used) ---\n        state, _ = env.reset()\n        done = False\n        total_reward = 0\n        while not done:\n            action = epsilon_greedy_action(Q, state, epsilon, num_actions)\n            new_state, reward, terminated, truncated, _ = env.step(action)\n            done = terminated or truncated\n            total_reward += reward\n\n            max_future_q = np.max(Q[new_state, :])\n            Q[state, action] += alpha * (reward + gamma * max_future_q - Q[state, action])\n            state = 
new_state\n\n        reward_history.append(total_reward)\n\n    window_size = 100\n    smoothed_rewards = np.convolve(reward_history, np.ones(window_size)/window_size, mode=\'valid\')\n    return smoothed_rewards\n\n# --- 4. RUN AND PLOT ---\n# Use Q-Learning for the comparison (works for SARSA too)\nrewards_constant = q_learning_with_decay(env, episodes, gamma, alpha, epsilon_start, epsilon_min, \'constant\')\nrewards_linear = q_learning_with_decay(env, episodes, gamma, alpha, epsilon_start, epsilon_min, \'linear\')\nrewards_exponential = q_learning_with_decay(env, episodes, gamma, alpha, epsilon_start, epsilon_min, \'exponential\')\n\nplt.figure(figsize=(10, 6))\nplt.plot(rewards_constant, label=\'Constant $\\epsilon=0.1$\')\nplt.plot(rewards_linear, label=\'Linear Decay\')\nplt.plot(rewards_exponential, label=\'Exponential Decay\')\nplt.title(f\'Q-Learning Performance with Different $\\epsilon$-Decay Schedules\', fontsize=14)\nplt.xlabel(f\'Episode (Smoothed by Window of 100)\', fontsize=12)\nplt.ylabel(\'Average Reward\', fontsize=12)\nplt.legend()\nplt.grid(True, alpha=0.5)\nplt.show()\n\nenv.close()\n\n\n# TASK 3: Bellman Update Experiment by Sohail\n\n\nimport gymnasium as gym\nimport numpy as np\n\ndef bellman_update(V, state, next_state, reward, gamma):\n    # manual Bellman expectation update\n    return reward + gamma * V[next_state]\n\nenv = gym.make("FrozenLake-v1")\ngamma_values = [0.5, 0.9, 0.99]\nresults = {}\n\nfor gamma in gamma_values:\n    V = np.zeros(env.observation_space.n)\n    for _ in range(5000):\n        state, _ = env.reset()\n        done = False\n        while not done:\n            action = env.action_space.sample()\n            next_state, reward, done, truncated, _ = env.step(action)\n            V[state] = bellman_update(V, state, next_state, reward, gamma)\n            state = next_state\n    results[gamma] = V\n\nfor gamma in results:\n    print(f"\\nGamma = {gamma}\\nValue Function:\\n", results[gamma])\n\n\n\n# TASK 4: 
Off-Policy MC using Ordinary IS & Weighted IS\n\n\nimport gymnasium as gym\nimport numpy as np\nfrom collections import defaultdict\nimport matplotlib.pyplot as plt\n\ndef off_policy_mc(env, episodes=3000, gamma=0.99, behavior_eps=1.0):\n    V_ordinary = defaultdict(float)\n    V_weighted = defaultdict(float)\n    C = defaultdict(float)  # cumulative weights\n\n    ordinary_curve = []\n    weighted_curve = []\n\n    for ep in range(episodes):\n        state, _ = env.reset()\n        episode = []\n        done = False\n\n        # generate behavior policy episode (random or high-epsilon greedy)\n        while not done:\n            if np.random.rand() < behavior_eps:\n                action = env.action_space.sample()\n            else:\n                action = 0\n            next_state, reward, terminated, truncated, _ = env.step(action)\n            episode.append((state, action, reward))\n            state = next_state\n            done = terminated or truncated\n\n        G = 0\n        W = 1\n\n        for (s, a, r) in reversed(episode):\n            G = gamma * G + r\n\n            # behavior policy prob = uniform (1/n)\n            behavior_prob = 1 / env.action_space.n\n\n            # target policy prob (always pick action 0)\n            target_prob = 1 if a == 0 else 0\n\n            # importance weight\n            if behavior_prob == 0:\n                continue\n            W *= target_prob / behavior_prob\n\n            # Ordinary IS\n            V_ordinary[s] += (W * (G - V_ordinary[s]))\n\n            # Weighted IS\n            C[s] += W\n            if C[s] != 0:\n                V_weighted[s] += (W / C[s]) * (G - V_weighted[s])\n\n        ordinary_curve.append(np.mean(list(V_ordinary.values())))\n        weighted_curve.append(np.mean(list(V_weighted.values())))\n\n    return ordinary_curve, weighted_curve\n\n\nenv = gym.make("FrozenLake-v1")\no_curve, w_curve = off_policy_mc(env, behavior_eps=0.9)\n\nplt.plot(o_curve, label="Ordinary 
IS")\nplt.plot(w_curve, label="Weighted IS")\nplt.title("Off-Policy MC Using Importance Sampling")\nplt.legend()\nplt.grid()\nplt.show()\n\n\n\n# TASK 5: Q-Learning (alpha and gamma comparisons)\n\n\nimport gymnasium as gym\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom collections import defaultdict\n\ndef test_q(env, alpha, gamma):\n    Q = defaultdict(lambda: np.zeros(env.action_space.n))\n    rewards = []\n\n    for ep in range(2000):\n        state, _ = env.reset()\n        done = False\n        total_reward = 0\n\n        while not done:\n            action = np.random.randint(env.action_space.n)\n            next_state, reward, terminated, truncated, _ = env.step(action)\n\n            Q[state][action] += alpha * (\n                reward + gamma * np.max(Q[next_state]) - Q[state][action]\n            )\n\n            state = next_state\n            done = terminated or truncated\n            total_reward += reward\n        rewards.append(total_reward)\n    return rewards\n\nenv = gym.make("Taxi-v3")\n\nalphas = [0.1, 0.5, 0.9]\nfor a in alphas:\n    r = test_q(env, a, 0.99)\n    plt.plot(r, label=f"α = {a}")\n\nplt.title("Different Learning Rates in Q-Learning")\nplt.legend()\nplt.grid()\nplt.show()\n\ngammas = [0.5, 0.9, 0.99]\nfor g in gammas:\n    r = test_q(env, 0.5, g)\n    plt.plot(r, label=f"γ = {g}")\n\nplt.title("Different Discount Factors in Q-Learning")\nplt.legend()\nplt.grid()\nplt.show()\n\n\n\n# TASK 6: Environment Experiment + Harder Env + State Visit Counter\n\n\nimport gymnasium as gym\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom collections import defaultdict\n\ndef q_learning_visits(env, episodes=2000):\n    Q = defaultdict(lambda: np.zeros(env.action_space.n))\n    visits = defaultdict(int)\n    rewards = []\n\n    for ep in range(episodes):\n        state, _ = env.reset()\n        done = False\n        total_reward = 0\n\n        while not done:\n            visits[state] += 1\n            action = 
np.random.randint(env.action_space.n)\n            next_state, reward, terminated, truncated, _ = env.step(action)\n\n            Q[state][action] += 0.1 * (\n                reward + 0.99 * np.max(Q[next_state]) - Q[state][action]\n            )\n\n            state = next_state\n            done = terminated or truncated\n            total_reward += reward\n\n        rewards.append(total_reward)\n\n    return Q, rewards, visits\n\n\nenv_easy = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=True)\nenv_hard = gym.make("FrozenLake-v1", map_name="8x8", is_slippery=True)\n\n_, r_easy, v_easy = q_learning_visits(env_easy)\n_, r_hard, v_hard = q_learning_visits(env_hard)\n\nplt.plot(r_easy, label="Easy Env")\nplt.plot(r_hard, label="Hard Env")\nplt.legend()\nplt.title("Environment Difficulty Comparison")\nplt.grid()\nplt.show()\n\nprint("Most visited states (easy):", dict(list(v_easy.items())[:8]))\nprint("Most visited states (hard):", dict(list(v_hard.items())[:8]))\n\n\nfrom google.colab import ai\nans = ai.generate_text("which one is better sara or q learning")\n\n\n# TASK 7: Heatmap + Noise + Exploration Strategies\n\n\nimport gymnasium as gym\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom collections import defaultdict\n\n\n# Heatmap Function\n\ndef plot_heatmap(Q):\n    table = np.array([Q[s] for s in sorted(Q.keys())])\n    plt.figure(figsize=(8,4))\n    sns.heatmap(table, annot=False)\n    plt.title("Q-Table Heatmap")\n    plt.show()\n\n\n\n# Noisy Environment Wrapper\n\nclass NoisyWrapper(gym.Wrapper):\n    def step(self, action):\n        if np.random.rand() < 0.2:   # 20% noise\n            action = self.action_space.sample()\n        return super().step(action)\n\n\n# Softmax exploration\n\ndef softmax(q, tau=1.0):\n    ex = np.exp(q / tau)\n    return ex / np.sum(ex)\n\n\ndef choose_softmax(Q, s):\n    probs = softmax(Q[s])\n    return np.random.choice(len(Q[s]), p=probs)\n\n\n# Train with Softmax\n\ndef 
q_softmax(env):\n    Q = defaultdict(lambda: np.zeros(env.action_space.n))\n    for ep in range(2000):\n        s, _ = env.reset()\n        done = False\n        while not done:\n            a = choose_softmax(Q, s)\n            ns, r, term, trunc, _ = env.step(a)\n            Q[s][a] += 0.1 * (r + 0.99*np.max(Q[ns]) - Q[s][a])\n            s = ns\n            done = term or trunc\n    return Q\n\n\n\n# RUN ON NORMAL AND NOISY ENV\n\nenv = gym.make("Taxi-v3")\nnoisy_env = NoisyWrapper(gym.make("Taxi-v3"))\n\nQ_normal = q_softmax(env)\nQ_noisy = q_softmax(noisy_env)\n\nprint("Normal Q-table heatmap:")\nplot_heatmap(Q_normal)\n\nprint("Noisy Q-table heatmap:")\nplot_heatmap(Q_noisy)\n',
}

def get_lab_code(lab_id):
    """Return the embedded notebook source registered for ``lab_id``.

    The lookup is performed against the module-level ``LAB_CODE`` mapping.
    A missing key does not raise: a human-readable "not found" message
    naming the requested id is returned in place of the code.
    """
    try:
        return LAB_CODE[lab_id]
    except KeyError:
        # Unknown lab: hand back a message string rather than propagating.
        return f"Lab {lab_id} code not found"
