In [4]:
%reset
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
import matplotlib.pyplot as plt

# Hyperparameters for the DQN
GAMMA = 0.99                # Discount factor for future rewards
LEARNING_RATE = 0.001       # Learning rate for the neural network
BATCH_SIZE = 64             # Number of transitions to sample for training
MEMORY_SIZE = 100000        # Maximum size of the replay memory
TARGET_UPDATE_FREQ = 10     # Frequency of updating the target network
EPSILON_START = 1.0         # Initial epsilon for the epsilon-greedy policy
EPSILON_MIN = 0.01          # Minimum value for epsilon
EPSILON_DECAY = 0.995       # Decay rate for epsilon after each episode
NUM_EPISODES = 50         # Number of episodes to train the agent
MAX_STEPS = 100            # Maximum number of steps per episode

# STEP 1: Define the neural network architecture
# This network approximates the Q-value function Q(s, a)
class DQNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(DQNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, 128)  # First fully connected layer
        self.fc2 = nn.Linear(128, 128)        # Second fully connected layer
        self.fc3 = nn.Linear(128, action_size)  # Output layer for Q-values

    def forward(self, x):
        # Forward pass: compute Q-values for a given state
        x = torch.relu(self.fc1(x))  # Apply ReLU activation to first layer
        x = torch.relu(self.fc2(x))  # Apply ReLU activation to second layer
        x = self.fc3(x)              # No activation in the output layer (raw Q-values)
        return x

# STEP 2: Define the replay memory
# This is used to store transitions and sample them randomly for training
class ReplayMemory:
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)  # Fixed-size queue for storing transitions

    def push(self, transition):
        # Add a new transition to the memory
        self.memory.append(transition)

    def sample(self, batch_size):
        # Randomly sample a batch of transitions from the memory
        return random.sample(self.memory, batch_size)

    def __len__(self):
        # Return the current size of the memory
        return len(self.memory)

# STEP 3: Define the DQN agent
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.epsilon = EPSILON_START  # Initialize epsilon for the epsilon-greedy policy
        self.memory = ReplayMemory(MEMORY_SIZE)

        # Initialize the policy network (learns Q-values) and target network (stable reference)
        self.policy_net = DQNetwork(state_size, action_size).to(device)
        self.target_net = DQNetwork(state_size, action_size).to(device)

        # Use Adam optimizer to train the policy network
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=LEARNING_RATE)

        # Synchronize the target network with the policy network
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()  # Target network is not trained directly

    def select_action(self, state):
        # Use epsilon-greedy policy to select an action
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.action_size)  # Explore: random action
        state = torch.FloatTensor(state).unsqueeze(0).to(device)  # Convert state to tensor
        with torch.no_grad():
            q_values = self.policy_net(state)  # Predict Q-values using the policy network
        return q_values.argmax().item()  # Exploit: select action with highest Q-value

    def update_epsilon(self):
        # Decay epsilon to reduce exploration over time
        self.epsilon = max(EPSILON_MIN, self.epsilon * EPSILON_DECAY)

    def store_transition(self, transition):
        # Store the transition in replay memory
        self.memory.push(transition)

    def train(self):
        # Train the policy network using a batch of transitions from replay memory
        if len(self.memory) < BATCH_SIZE:
            return  # Wait until the memory has enough samples

        # Sample a batch of transitions
        transitions = self.memory.sample(BATCH_SIZE)
        states, actions, rewards, next_states, dones = zip(*transitions)

        # Convert transitions to tensors
        states = torch.FloatTensor(states).to(device)
        actions = torch.LongTensor(actions).unsqueeze(1).to(device)
        rewards = torch.FloatTensor(rewards).to(device)
        next_states = torch.FloatTensor(next_states).to(device)
        dones = torch.FloatTensor(dones).to(device)

        # Compute Q(s, a) using the policy network
        q_values = self.policy_net(states).gather(1, actions)

        # Compute target Q-values: r + γ * max Q(s', a') for non-terminal states
        with torch.no_grad():
            next_q_values = self.target_net(next_states).max(1)[0]
            target_q_values = rewards + (GAMMA * next_q_values * (1 - dones))

        # Compute the loss between the Q-values and the target Q-values
        loss = nn.MSELoss()(q_values.squeeze(), target_q_values)

        # Perform a gradient descent step to update the policy network
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_network(self):
        # Copy the weights from the policy network to the target network
        self.target_net.load_state_dict(self.policy_net.state_dict())

# STEP 4: Train the DQN agent
env = gym.make("LunarLander-v3", render_mode="human")  # Initialize the Lunar Lander environment
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

state_size = env.observation_space.shape[0]  # State size from the environment
action_size = env.action_space.n             # Number of possible actions
agent = DQNAgent(state_size, action_size)    # Create the DQN agent

rewards = []  # List to store rewards for each episode

for episode in range(NUM_EPISODES):
    state, _ = env.reset()  # Reset the environment at the start of each episode
    total_reward = 0  # Initialize the total reward for the episode

    for step in range(MAX_STEPS):
        # Select an action using the epsilon-greedy policy
        action = agent.select_action(state)

        # Take the action in the environment
        next_state, reward, done, truncated, _ = env.step(action)

        # Store the transition in replay memory
        agent.store_transition((state, action, reward, next_state, done))

        # Train the policy network using experience replay
        agent.train()

        total_reward += reward
        state = next_state  # Update the current state

        if done or truncated:
            break

    rewards.append(total_reward)  # Store the total reward for this episode
    agent.update_epsilon()  # Decay epsilon for the next episode

    # Update the target network periodically
    if episode % TARGET_UPDATE_FREQ == 0:
        agent.update_target_network()

    # Print progress every 10 episodes
    if (episode + 1) % 2 == 0:
        print(f"Episode {episode + 1}/{NUM_EPISODES}, Reward: {total_reward:.2f}, Epsilon: {agent.epsilon:.3f}")

# Plot the rewards over episodes
plt.plot(rewards)
plt.xlabel("Episode")
plt.ylabel("Total Reward")
plt.title("DQN on LunarLander-v2")
plt.show()

# Test the trained agent
state, _ = env.reset()
total_reward = 0

for step in range(MAX_STEPS):
    env.render()  # Render the environment
    action = agent.select_action(state)  # Select the best action
    next_state, reward, done, truncated, _ = env.step(action)
    total_reward += reward
    state = next_state

    if done or truncated:
        break

env.close()
print(f"Total Reward in Test: {total_reward}")

Once deleted, variables cannot be recovered. Proceed (y/[n])? y
Episode 2/100, Reward: -436.30, Epsilon: 0.990
Episode 4/100, Reward: 17.82, Epsilon: 0.980
Episode 6/100, Reward: -318.87, Epsilon: 0.970
Episode 8/100, Reward: -316.76, Epsilon: 0.961


KeyboardInterrupt: 

In [7]:
# Testing the trained DQN agent
state, _ = env.reset()  # Reset the environment to get the initial state
total_reward = 0        # Variable to store the total reward during the test episode
agent.epsilon = 0       # Set epsilon to 0 to disable exploration (pure exploitation)

for step in range(MAX_STEPS):
    env.render()  # Render the environment for visualization
    action = agent.select_action(state)  # Select the best action using the policy network
    next_state, reward, done, truncated, _ = env.step(action)  # Take the action in the environment
    total_reward += reward  # Accumulate the reward
    state = next_state  # Update the current state

    if done or truncated:
        break  # Exit the loop if the episode ends

env.close()  # Close the environment
print(f"Total Reward in Test: {total_reward:.2f}")

error: display Surface quit

	Environment Basics:
	•	What does gym.make("LunarLander-v3") do in this script?
	•	How is the state_size and action_size determined in the script?
	2.	Replay Memory:
	•	What is the purpose of the ReplayMemory class in the script?
	•	Why do we use random sampling (random.sample) in the replay memory?
	3.	Neural Network:
	•	Describe the architecture of the DQNetwork used in the script. What does each layer do?
	•	Why does the output layer of the neural network not use an activation function?
	4.	Epsilon-Greedy Policy:
	•	Explain how the epsilon-greedy policy works in the script.
	•	What happens as epsilon decays during training?
	5.	Target Network:
	•	Why does the script use a separate target network (self.target_net) in addition to the policy network?
	•	How often is the target network updated, and why?
	6.	Training Loop:
	•	What triggers the training of the policy network in the train() method?
	•	How does the script calculate the target Q-values during training?
    Intermediate Questions

	7.	Hyperparameters:
	•	What role does the GAMMA parameter play in the calculation of Q-values?
	•	How might increasing the BATCH_SIZE affect the training process?
	8.	Testing:
	•	How is testing different from training in terms of exploration vs. exploitation?
	•	What modifications would you make to evaluate the agent over 100 test episodes?
	9.	Experience Replay:
	•	What would happen if we didn’t use experience replay in the DQN?
	•	How could you modify the script to implement prioritized experience replay?
	
    10.	Agent Performance:
	•	Based on the rewards plot, how can you tell if the agent is improving during training?
	•	Suggest ways to improve the agent’s performance if the rewards plateau early.
    
    
    11. Change the environemnt and apply DQN on Cartpole environemnt. 