# Deep Deterministic Policy Gradients (DDPG)
---
In this notebook, we train a DDPG agent on Gymnasium's `InvertedPendulum-v5` environment.

### 1. Import the Necessary Packages

In [1]:
%reset
import gymnasium as gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
#matplotlib inline
seed = 10
# Seed the global generators of Python's `random`, NumPy and PyTorch up front so that
# a fresh "Restart Kernel -> Run All" starts every stochastic component from the same state.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
#from ddpg_agent import Agent


import numpy as np
import random
import copy
from collections import namedtuple, deque

from model import Actor, Critic

import torch
import torch.nn.functional as F
import torch.optim as optim

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


### 2. Instantiate the Environment and Agent

In [2]:

# --- DDPG hyperparameters -------------------------------------------------
BUFFER_SIZE = 1_000_000  # Capacity of the replay buffer (number of stored transitions).
BATCH_SIZE = 128         # Transitions drawn from the buffer for each learning step.
GAMMA = 0.99             # Discount applied to the bootstrapped next-state value.
TAU = 1e-3               # Interpolation factor used when soft-updating the target networks.
LR_ACTOR = 1e-4          # Adam learning rate for the actor.
LR_CRITIC = 3e-4         # Adam learning rate for the critic.
WEIGHT_DECAY = 1e-4      # `weight_decay` passed to the critic's Adam optimizer.

# --- Compute device --------------------------------------------------------
# Use the first CUDA device when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")




class Agent():
    """Interacts with and learns from the environment.

    Holds a local/target pair of actor networks, a local/target pair of critic
    networks, an exploration-noise process and a replay buffer. Every call to
    `step` stores one transition and, once the buffer holds more than
    BATCH_SIZE transitions, performs one learning update.
    """

    def __init__(self, state_size, action_size, random_seed):
        """
        Initialize the agent with actor and critic networks, replay buffer, and noise process.

        Params:
        - state_size: Size of the state space.
        - action_size: Size of the action space.
        - random_seed: Seed for reproducibility.
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the global generator first and
        # keep the seed value itself on the instance.
        random.seed(random_seed)
        self.seed = random_seed

        # Initialize Actor networks
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)  # The main actor network.
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)  # Target actor network for stability.
        # Start the target from exactly the local weights instead of relying on
        # both constructors producing the same initialization.
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)  # Optimizer for the actor.

        # Initialize Critic networks
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)  # The main critic network.
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)  # Target critic network.
        self.critic_target.load_state_dict(self.critic_local.state_dict())
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Ornstein-Uhlenbeck Noise for exploration in continuous action space
        self.noise = OUNoise(action_size, random_seed)

        # Replay buffer to store past experiences
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done):
        """
        Save experience in the replay buffer and train the agent if enough samples are available.

        Params:
        - state, action, reward, next_state: One observed transition.
        - done: Flag stored with the transition; `learn` uses it to drop the
          bootstrapped next-state value when it is set.
        """
        self.memory.add(state, action, reward, next_state, done)  # Add the experience to the replay buffer.

        # Train the agent using a batch of experiences when the buffer has enough samples
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()  # Sample a batch of experiences.
            self.learn(experiences, GAMMA)  # Perform learning step using sampled experiences.

    def act(self, state, add_noise=True):
        """
        Return an action for a given state based on the current policy.
        If `add_noise` is True, exploration noise is added to the action.

        Params:
        - state: NumPy array holding the observation.
        - add_noise: Whether to add a sample from the noise process.

        Returns:
        - NumPy array with the action, clipped element-wise to [-1, 1].
        """
        state = torch.from_numpy(state).float().to(device)  # Convert state to PyTorch tensor.
        self.actor_local.eval()  # Evaluation mode while choosing the action.
        with torch.no_grad():  # No gradients are needed for action selection.
            action = self.actor_local(state).cpu().data.numpy()  # Compute the action using the actor network.
        self.actor_local.train()  # Switch back to training mode.
        if add_noise:  # Add exploration noise if enabled.
            action += self.noise.sample()
        return np.clip(action, -1, 1)  # Noise can push the action outside [-1, 1]; clip it back.

    def reset(self):
        """
        Reset the noise process (called at the beginning of each episode).
        """
        self.noise.reset()

    def learn(self, experiences, gamma):
        """
        Update the actor and critic networks using a batch of experience tuples.

        Params:
        - experiences: A batch of (state, action, reward, next_state, done) tuples.
        - gamma: Discount factor for future rewards.
        """
        states, actions, rewards, next_states, dones = experiences

        # --- Update Critic ---
        # The TD target is a fixed regression target: build it without autograd
        # so the backward pass does not run through the target networks or
        # accumulate gradients on their parameters.
        with torch.no_grad():
            actions_next = self.actor_target(next_states)  # Next actions from target actor.
            Q_targets_next = self.critic_target(next_states, actions_next)  # Target Q-values.
            # (1 - dones) removes the bootstrapped term for transitions flagged as done.
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss as the mean squared error between expected and target Q-values.
        Q_expected = self.critic_local(states, actions)  # Expected Q-values from local critic.
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Optimize the critic network.
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # --- Update Actor ---
        # Compute actor loss as the negative mean Q-value of the predicted actions.
        actions_pred = self.actor_local(states)  # Predicted actions from local actor.
        actor_loss = -self.critic_local(states, actions_pred).mean()  # Negative mean Q-value.
        # Optimize the actor network. Gradients this leaves on the critic are
        # cleared by critic_optimizer.zero_grad() before the next critic update.
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # --- Update Target Networks ---
        # Soft update both actor and critic target networks.
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """
        Soft update model parameters.
        New target parameters = tau * local_parameters + (1 - tau) * target_parameters.

        Params:
        - local_model: Model the weights are read from.
        - target_model: Model whose weights are overwritten in place.
        - tau: Interpolation factor.
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

class OUNoise:
    """Ornstein-Uhlenbeck process for generating correlated noise.

    Each instance owns its own NumPy random generator created from `seed`, so
    two processes built with the same seed produce the same noise sequence
    regardless of what else draws from NumPy's global generator.
    """

    def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
        """
        Initialize the parameters of the noise process.

        Params:
        - size: Dimensionality of the noise.
        - seed: Random seed (an int, or None for an unpredictable sequence).
        - mu: Long-term mean of the noise process.
        - theta: Rate of mean reversion.
        - sigma: Volatility (spread) of the noise.
        """
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        # Kept from the original implementation so code relying on this
        # constructor re-seeding Python's global `random` sees no change.
        random.seed(seed)
        self.seed = seed
        # The Gaussian increments are drawn from this private generator; the
        # global `random` seed above never influenced NumPy's sampling.
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        """
        Reset the noise to its mean value (mu).
        """
        self.state = copy.copy(self.mu)

    def sample(self):
        """
        Generate a noise sample by updating the internal state.

        Returns:
        - NumPy array of length `size` holding the new state of the process.
        """
        x = self.state
        # Pull towards mu plus a standard-normal increment scaled by sigma.
        dx = self.theta * (self.mu - x) + self.sigma * self.rng.standard_normal(len(x))
        self.state = x + dx  # New array each call; earlier return values are not mutated.
        return self.state

class ReplayBuffer:
    """Bounded store of experience tuples; the oldest entries are dropped once full."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """
        Set up an empty buffer.

        Params:
        - action_size: Size of the action space (stored on the instance).
        - buffer_size: Maximum number of experiences kept at once.
        - batch_size: Number of experiences returned by `sample`.
        - seed: Value passed to `random.seed`.
        """
        self.action_size = action_size
        self.batch_size = batch_size
        # Record type used for every stored transition.
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"],
        )
        # A deque with maxlen discards from the left when it overflows.
        self.memory = deque(maxlen=buffer_size)
        # Seeds Python's global `random`; random.seed() returns None, which is what gets stored.
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """
        Append one transition to the buffer.
        """
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """
        Draw `batch_size` distinct stored experiences at random and return
        them as a tuple of float tensors on `device`:
        (states, actions, rewards, next_states, dones).
        """
        batch = random.sample(self.memory, k=self.batch_size)
        # Transpose the list of records into one tuple per field.
        state_col, action_col, reward_col, next_state_col, done_col = zip(*batch)

        def as_float_tensor(stacked):
            # NumPy array -> float tensor on the module-level device.
            return torch.from_numpy(stacked).float().to(device)

        return (
            as_float_tensor(np.vstack(state_col)),
            as_float_tensor(np.vstack(action_col)),
            as_float_tensor(np.vstack(reward_col)),
            as_float_tensor(np.vstack(next_state_col)),
            # Done flags go through uint8 before becoming 0.0 / 1.0 floats.
            as_float_tensor(np.vstack(done_col).astype(np.uint8)),
        )

    def __len__(self):
        """
        Number of experiences currently held.
        """
        return len(self.memory)
    
    


In [3]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F


def hidden_init(layer):
    """
    Calculate the limits for uniform weight initialization for a given layer.

    This function uses the formula `±1/sqrt(fan_in)` where `fan_in` is the number of input units
    to the layer. This initialization ensures that the weights are scaled properly to
    avoid exploding or vanishing gradients during training.

    Params:
    - layer: PyTorch layer (e.g., nn.Linear)

    Returns:
    - Tuple of (lower_limit, upper_limit) for uniform initialization.
    """
    # nn.Linear stores its weight as (out_features, in_features), so the number
    # of input units is dimension 1; dimension 0 would be the output count.
    fan_in = layer.weight.data.size()[1]
    lim = 1. / np.sqrt(fan_in)  # Compute the scaling limit
    return (-lim, lim)  # Return lower and upper bounds


class Actor(nn.Module):
    """
    Actor (Policy) Model: Maps states to actions.

    The Actor model is responsible for determining which action to take in a given state.
    It outputs actions in a continuous space, bounded between -1 and 1.

    Attributes:
    - Fully connected layers (fc1 and fc2)
    - Activation functions (ReLU and Tanh)
    """

    def __init__(self, state_size, action_size, seed, fc_units=256):
        """
        Initialize the Actor model.

        Params:
        - state_size: Dimension of the state space.
        - action_size: Dimension of the action space.
        - seed: Random seed for reproducibility.
        - fc_units: Number of units in the hidden layer.
        """
        super(Actor, self).__init__()
        # torch.manual_seed seeds PyTorch's global generator and returns it.
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc_units)  # Hidden layer
        self.fc2 = nn.Linear(fc_units, action_size)  # Output layer
        self.reset_parameters()  # Initialize weights

    def reset_parameters(self):
        """
        Re-initialize the layer weights from uniform distributions: bounds from
        `hidden_init` for fc1 and a fixed +/-3e-3 for the output layer.
        Biases are left as created by nn.Linear.
        """
        self.fc1.weight.data.uniform_(*hidden_init(self.fc1))  # Initialize fc1 weights
        self.fc2.weight.data.uniform_(-3e-3, 3e-3)  # Smaller range for output layer weights

    def forward(self, state):
        """
        Perform a forward pass through the network to map states to actions.

        Activation functions:
        - ReLU for the hidden layer to introduce non-linearity.
        - Tanh for the output layer to ensure actions are bounded between -1 and 1.

        Params:
        - state: Input state tensor.

        Returns:
        - Action tensor with values in the range [-1, 1].
        """
        x = F.relu(self.fc1(state))  # Apply ReLU to the hidden layer
        # torch.tanh replaces the deprecated torch.nn.functional.tanh.
        return torch.tanh(self.fc2(x))  # Apply Tanh to the output layer to bound actions


class Critic(nn.Module):
    """
    Critic (Value) Model: scores a (state, action) pair with a single Q-value.

    The state is first passed through its own linear layer; the resulting
    features are concatenated with the action and pushed through two further
    hidden layers and a one-unit output layer.
    """

    def __init__(self, state_size, action_size, seed, fcs1_units=256, fc2_units=256, fc3_units=128):
        """
        Build the layers and initialize their weights.

        Params:
        - state_size: Dimension of the state space.
        - action_size: Dimension of the action space.
        - seed: Value passed to `torch.manual_seed`.
        - fcs1_units: Width of the state-only first layer.
        - fc2_units: Width of the layer that receives state features plus action.
        - fc3_units: Width of the last hidden layer.
        """
        super(Critic, self).__init__()
        # Seeds PyTorch's global generator; the returned generator is kept on the instance.
        self.seed = torch.manual_seed(seed)
        # Layers are created in network order (state -> state+action -> hidden -> Q).
        self.fcs1 = nn.Linear(state_size, fcs1_units)
        self.fc2 = nn.Linear(fcs1_units + action_size, fc2_units)
        self.fc3 = nn.Linear(fc2_units, fc3_units)
        self.fc4 = nn.Linear(fc3_units, 1)
        self.reset_parameters()

    def reset_parameters(self):
        """
        Redraw the weights: the three hidden layers use the bounds returned by
        `hidden_init`, the output layer uses a fixed +/-3e-3 range.
        """
        for hidden_layer in (self.fcs1, self.fc2, self.fc3):
            lower, upper = hidden_init(hidden_layer)
            hidden_layer.weight.data.uniform_(lower, upper)
        self.fc4.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state, action):
        """
        Map a batch of (state, action) pairs to Q-values.

        Params:
        - state: Input state tensor.
        - action: Input action tensor; concatenated to the state features along dim 1.

        Returns:
        - Tensor produced by the final one-unit layer (no output activation).
        """
        state_features = F.leaky_relu(self.fcs1(state))
        # The action joins the network only after the state has been encoded.
        hidden = torch.cat((state_features, action), dim=1)
        for layer in (self.fc2, self.fc3):
            hidden = F.leaky_relu(layer(hidden))
        return self.fc4(hidden)


In [4]:
# Create the environment with a human-visible render mode.
# NOTE(review): with render_mode="human" Gymnasium is documented to render during
# step()/reset(), so per-episode gating of env.render() may have no effect — confirm.
env = gym.make("InvertedPendulum-v5", render_mode="human")

# Size the agent from the first dimension of the observation and action spaces.
agent = Agent(state_size=env.observation_space.shape[0], action_size=env.action_space.shape[0], random_seed=10)

In [5]:
# Seed the random samplers attached to the action and observation spaces.
# NOTE(review): this does not appear to seed the environment itself; env.reset()
# is called without a seed elsewhere in this notebook — confirm whether that is intended.
env.action_space.seed(seed)
env.observation_space.seed(seed)

10

### 3. Train the Agent with DDPG

Run the code cells below to train the agent from scratch. Alternatively, you can skip to Section 4 to load previously saved weights from file.

In [7]:
def ddpg_with_visualization(n_episodes=2000, max_t=700, render_every=100):
    """
    Train a DDPG agent with optional environment visualization during training.

    Uses the module-level `env` and `agent`. Each episode resets the environment
    and the agent's noise process, then alternates acting and learning until the
    environment reports termination or truncation, or `max_t` steps have elapsed.
    Actor and critic weights are written to 'checkpoint_actor.pth' and
    'checkpoint_critic.pth' every 100 episodes, and the environment is closed
    when training finishes.

    Params:
    - n_episodes (int): Total number of training episodes.
    - max_t (int): Maximum number of time steps per episode.
    - render_every (int): `env.render()` is called on every step of episodes whose
      number is a multiple of this value.
      NOTE(review): if the environment was created with render_mode="human" it may
      already render on every step regardless of this setting — confirm.

    Returns:
    - scores (list): A list of cumulative rewards (scores) for each episode.
    """

    # Most recent 100 episode scores, used for the running average in the progress output
    scores_deque = deque(maxlen=100)
    scores = []  # List to store scores for all episodes

    # Loop over episodes for training
    for i_episode in range(1, n_episodes + 1):
        # Reset the environment and unpack the initial state and info dictionary
        state, _ = env.reset()

        # Reset the agent's internal states (e.g., noise process)
        agent.reset()

        # Initialize the score for this episode
        score = 0

        # Loop over time steps in the current episode
        for t in range(max_t):
            # Render the environment only if the episode number is a multiple of `render_every`
            if i_episode % render_every == 0:
                env.render()  # Visualize the environment

            # Let the agent decide an action based on the current state
            action = agent.act(state)

            # Perform the action in the environment and observe the next state, reward, and termination status
            next_state, reward, done, truncated, _ = env.step(action)

            # Store only true termination as the terminal flag. A truncated episode
            # was cut short, not ended by the task, so the critic must still
            # bootstrap from `next_state`; storing `done or truncated` would zero
            # that bootstrap term and bias the Q-targets.
            agent.step(state, action, reward, next_state, done)

            # Update the current state to the next state
            state = next_state

            # Accumulate the reward for this episode
            score += reward

            # Break the loop if the episode ends (either `done` or `truncated`)
            if done or truncated:
                break

        # Append the score for this episode to the deque and the scores list
        scores_deque.append(score)
        scores.append(score)

        # Print the current episode's score and the running average score
        print(
            '\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(
                i_episode, np.mean(scores_deque), score
            ),
            end=""  # Use end="" to overwrite the line for better readability
        )

        # Every 100 episodes, save the model weights and print the average score
        if i_episode % 100 == 0:
            # Save the actor and critic networks' weights
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')

            # Print the average score over the last 100 episodes
            print(
                '\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_deque)
                )
            )

    # Close the environment after training to free resources
    env.close()

    # Return the list of scores for all episodes (useful for plotting or analysis)
    return scores

In [8]:
# Run training with the default settings and keep the per-episode scores.
scores = ddpg_with_visualization()

# Learning curve: one point per episode, with episodes numbered from 1.
episode_numbers = np.arange(1, len(scores)+1)
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(episode_numbers, scores)
plt.xlabel('Episode #')
plt.ylabel('Score')
plt.show()

Episode 100	Average Score: 5.11	Score: 3.000
Episode 200	Average Score: 3.33	Score: 3.00
Episode 300	Average Score: 3.32	Score: 3.00
Episode 400	Average Score: 3.25	Score: 3.00


KeyboardInterrupt: 

### 4. Watch a Smart Agent!

In the next code cell, you will load the trained weights from file to watch a smart agent!

In [10]:
# Load the trained model.
# map_location remaps the saved tensors onto the current `device`, so a checkpoint
# written on a CUDA machine can still be loaded on a CPU-only one.
agent.actor_local.load_state_dict(torch.load('checkpoint_actor.pth', map_location=device))
agent.critic_local.load_state_dict(torch.load('checkpoint_critic.pth', map_location=device))

# Set the actor to evaluation mode
agent.actor_local.eval()

# Reset the environment
state, _ = env.reset()  # Unpack the tuple returned by reset()
agent.reset()  # Reset the agent's noise process (if any)

# Run a single episode; the environment is closed even if the loop raises
# or is interrupted from the keyboard.
try:
    while True:
        action = agent.act(state, add_noise=False)  # Disable noise for deterministic behavior
        env.render()  # Render the environment
        next_state, reward, done, truncated, _ = env.step(action)  # Handle truncated flag
        state = next_state
        if done or truncated:  # Break if the episode ends
            break
finally:
    env.close()  # Close the environment