Module easyagents.agents
This module contains the public API of the EasyAgents reinforcement learning library.
It consists mainly of the class hierarchy of the available agents (algorithms) as well as the registration and management of the available backends. In their implementation the agents forward their calls to the chosen backend.
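A minimal quick-start sketch (assuming gym's 'CartPole-v0' environment is available; the layer sizes follow the fc_layers example from the constructor documentation, they are not a recommendation):

from easyagents.agents import PpoAgent

agent = PpoAgent('CartPole-v0', fc_layers=(75, 40))  # actor/critic networks with two hidden layers
agent.train(num_iterations=10)                       # trains and, by default, plots loss/steps/rewards
agent.play(num_episodes=3)                           # plays 3 episodes with the trained policy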
View Source

"""This module contains the public API of the EasyAgents reinforcement learning library.

It consists mainly of the class hierarchy of the available agents (algorithms) as well as the
registration and management of the available backends. In their implementation the agents
forward their calls to the chosen backend.
"""
from abc import ABC
from collections import namedtuple
import json
import os
import statistics
from typing import Dict, List, Optional, Tuple, Type, Union

from easyagents import core
from easyagents.backends import core as bcore
from easyagents.callbacks import plot
import easyagents.backends.default
import easyagents.backends.tfagents
import tensorflow as tf

# import easyagents.backends.tforce

_backends: [bcore.BackendAgentFactory] = []

"""The seed used for all agents and gym environments. If None no seed is set (default)."""
seed: Optional[int] = None


def load(directory: str,
         callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None):
    """Loads an agent from directory.

    After a successful load play() may be called directly. The agent, model, backend, seed and
    play policy are restored according to the previously saved agent.

    Args:
        directory: the directory containing the previously saved policy.
        callbacks: list of callbacks called during load (eg log.Agent)

    Returns:
        a new instance of EasyAgent
    """
    assert directory
    agent_json_path = os.path.join(directory, EasyAgent._KEY_EASYAGENT_FILENAME)
    policy_directory = os.path.join(directory, EasyAgent._KEY_POLICY_DIRECTORY)
    assert os.path.isdir(directory), f'directory "{directory}" not found.'
    assert os.path.isfile(agent_json_path), f'file "{agent_json_path}" not found.'
    assert os.path.isdir(policy_directory), f'directory "{policy_directory}" not found.'

    with open(agent_json_path) as jsonfile:
        agent_dict = json.load(jsonfile)
    result = EasyAgent._from_dict(agent_dict)
    callbacks = result._to_callback_list(callbacks=callbacks)
    result._backend_agent.load(directory=policy_directory, callbacks=callbacks)
    return result


def register_backend(backend: bcore.BackendAgentFactory):
    """registers a backend as a factory for agent implementations.

    If another backend with the same name is already registered, the old backend is replaced
    by the given backend.
    """
    assert backend
    old_backends = [b for b in _backends if b.backend_name == backend.backend_name]
    for old_backend in old_backends:
        _backends.remove(old_backend)
    _backends.append(backend)


def activate_tensorforce():
    """registers the tensorforce backend.

    Due to an incompatibility between tensorforce and tf-agents, both libraries may not run in
    the same python instance. Thus - for the time being - once this method is called, the
    tfagents backend may not be used anymore.
    """
    import easyagents.backends.tforce

    global _backends
    assert easyagents.backends.core._tf_eager_execution_active is None or \
           easyagents.backends.core._tf_eager_execution_active == False, \
        "tensorforce can not be activated, since tensorflow eager execution mode was already activated."
    _backends = []
    register_backend(easyagents.backends.default.DefaultAgentFactory(register_tensorforce=True))
    register_backend(easyagents.backends.tforce.TensorforceAgentFactory())


def _activate_tfagents():
    """registers the tfagents backend.

    Due to an incompatibility between tensorforce and tf-agents, both libraries may not run in
    the same python instance.
    """
    global _backends
    assert easyagents.backends.core._tf_eager_execution_active is None or \
           easyagents.backends.core._tf_eager_execution_active == True, \
        "tfagents can not be activated, since tensorflow eager execution mode was already disabled."
    _backends = []
    register_backend(easyagents.backends.default.DefaultAgentFactory(register_tensorforce=False))
    register_backend(easyagents.backends.tfagents.TfAgentAgentFactory())


_activate_tfagents()


class EasyAgent(ABC):
    """Abstract base class for all easy reinforcement learning agents.

    Besides forwarding train and play it implements persistence."""

    _KEY_BACKEND = 'backend'
    _KEY_EASYAGENT_CLASS = 'easyagent_class'
    _KEY_EASYAGENT_FILENAME = 'easyagent.json'
    _KEY_MODEL_CONFIG = 'model_config'
    _KEY_POLICY_DIRECTORY = 'policy'
    _KEY_VERSION = 'version'

    def __init__(self,
                 gym_env_name: str,
                 fc_layers: Union[Tuple[int, ...], int, None] = None,
                 backend: str = None):
        """
        Args:
            gym_env_name: name of an OpenAI gym environment to be used for training and evaluation
            fc_layers: defines the neural network to be used, a sequence of fully connected layers
                of the given size. Eg (75,40) yields a neural network consisting out of 2 hidden
                layers, the first one containing 75 and the second layer containing 40 neurons.
            backend: the backend to be used (eg 'tfagents'), if None a default implementation is
                used. call get_backends() to get a list of the available backends.
        """
        model_config = core.ModelConfig(gym_env_name=gym_env_name, fc_layers=fc_layers, seed=seed)
        self._initialize(model_config=model_config, backend_name=backend)
        return

    def _initialize(self, model_config: core.ModelConfig, backend_name: str = None):
        if backend_name is None:
            backend_name = easyagents.backends.default.DefaultAgentFactory.backend_name
        backend: bcore.BackendAgentFactory = _get_backend(backend_name)
        assert model_config is not None, "model_config not set."
        assert backend, f'Backend "{backend_name}" not found. The registered backends are {get_backends()}.'
        self._model_config: core.ModelConfig = model_config
        backend_agent = backend.create_agent(easyagent_type=type(self), model_config=model_config)
        assert backend_agent, f'Backend "{backend_name}" does not implement "{type(self).__name__}". ' + \
                              f'Choose one of the following backend {get_backends(type(self))}.'
        self._backend_agent: Optional[bcore._BackendAgent] = backend_agent
        self._backend_name: str = backend_name
        self._backend_agent._agent_context._agent_saver = self.save
        return

    def _add_plot_callbacks(self,
                            callbacks: List[core.AgentCallback],
                            default_plots: Optional[bool],
                            default_plot_callbacks: List[plot._PlotCallback]) -> List[core.AgentCallback]:
        """Adds the default callbacks and sorts all callbacks in the order
        _PreProcessCallbacks, AgentCallbacks, _PostProcessCallbacks.

        Args:
            callbacks: existing callbacks to prepare
            default_plots: if set or if None and callbacks does not contain plots then the default plots are added
            default_plot_callbacks: plot callbacks to add.
        """
        pre_process: List[core.AgentCallback] = [plot._PreProcess()]
        agent: List[core.AgentCallback] = []
        post_process: List[core.AgentCallback] = [plot._PostProcess()]
        if default_plots is None:
            default_plots = True
            for c in callbacks:
                default_plots = default_plots and (not isinstance(c, plot._PlotCallback))
        if default_plots:
            agent = default_plot_callbacks
        for c in callbacks:
            if isinstance(c, core._PreProcessCallback):
                pre_process.append(c)
            else:
                if isinstance(c, core._PostProcessCallback):
                    post_process.append(c)
                else:
                    agent.append(c)
        result: List[core.AgentCallback] = pre_process + agent + post_process
        return result

    @staticmethod
    def _from_dict(param_dict: Dict[str, object]):
        """recreates a new agent instance according to the definition previously created by _to_dict.

        Returns:
            new agent instance (excluding any trained policy), the agent type is preserved.
        """
        assert param_dict
        mc: core.ModelConfig = core.ModelConfig._from_dict(param_dict[EasyAgent._KEY_MODEL_CONFIG])
        agent_class = globals()[param_dict[EasyAgent._KEY_EASYAGENT_CLASS]]
        backend: str = param_dict[EasyAgent._KEY_BACKEND]
        result = agent_class(gym_env_name=mc.original_env_name, backend=backend)
        result._initialize(model_config=mc, backend_name=backend)
        return result

    def _to_callback_list(self,
                          callbacks: Union[Optional[core.AgentCallback], List[core.AgentCallback]]
                          ) -> List[core.AgentCallback]:
        """maps callbacks to an admissible callback list.

        if callbacks is None an empty list is returned.
        if callbacks is an AgentCallback a list containing only this callback is returned
        otherwise callbacks is returned
        """
        result: List[core.AgentCallback] = []
        if not callbacks is None:
            if isinstance(callbacks, core.AgentCallback):
                result = [callbacks]
            else:
                assert isinstance(callbacks, list), "callback not an AgentCallback or a list thereof."
                result = callbacks
        return result

    def _to_dict(self) -> Dict[str, object]:
        """saves the agent definition to a dict.

        Returns:
            dict containing all parameters to recreate the agent (excluding a trained policy)
        """
        result: Dict[str, object] = dict()
        result[EasyAgent._KEY_VERSION] = easyagents.__version__
        result[EasyAgent._KEY_EASYAGENT_CLASS] = self.__class__.__name__
        result[EasyAgent._KEY_BACKEND] = self._backend_name
        result[EasyAgent._KEY_MODEL_CONFIG] = self._model_config._to_dict()
        result[EasyAgent._KEY_POLICY_DIRECTORY] = 'policy'
        return result

    def evaluate(self, num_episodes: int = 50, max_steps_per_episode: int = 50):
        """Plays num_episodes with the current policy and computes metrics on rewards.

        Args:
            num_episodes: number of episodes to play
            max_steps_per_episode: max steps per episode

        Returns:
            extensible score metrics
        """
        play_context = core.PlayContext()
        play_context.max_steps_per_episode = max_steps_per_episode
        play_context.num_episodes = num_episodes
        self.play(play_context=play_context, default_plots=False)
        Metrics = namedtuple('Metrics', 'steps rewards')
        Rewards = namedtuple('Rewards', 'mean std min max all')
        all_rewards = list(play_context.sum_of_rewards.values())
        mean_reward, std_reward, min_reward, max_reward = \
            statistics.mean(all_rewards), statistics.stdev(all_rewards), min(all_rewards), max(all_rewards)
        rewards = Rewards(mean=mean_reward, std=std_reward, min=min_reward, max=max_reward, all=all_rewards)
        Steps = namedtuple('Steps', 'mean std min max all')
        all_num_steps = []
        for i in play_context.rewards.keys():
            all_num_steps.append(len(play_context.rewards[i]))
        mean_steps, std_steps, min_steps, max_steps = \
            statistics.mean(all_num_steps), statistics.stdev(all_num_steps), min(all_num_steps), max(all_num_steps)
        steps = Steps(mean=mean_steps, std=std_steps, min=min_steps, max=max_steps, all=all_num_steps)
        metrics = Metrics(rewards=rewards, steps=steps)
        return metrics

    def play(self,
             callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None,
             num_episodes: int = 1,
             max_steps_per_episode: int = 1000,
             play_context: core.PlayContext = None,
             default_plots: bool = None):
        """Plays num_episodes with the current policy.

        Args:
            callbacks: list of callbacks called during each episode played
            num_episodes: number of episodes to play
            max_steps_per_episode: max steps per episode
            play_context: play configuration to be used. If set, overrides all other play context arguments
            default_plots: if set, adds a set of default callbacks (plot.State, plot.Rewards, ...)

        Returns:
            play_context containing the actions taken and the rewards received during play
        """
        assert self._backend_agent._agent_context._is_policy_trained, \
            "No trained policy available. Call train() first."
        if play_context is None:
            play_context = core.PlayContext()
            play_context.max_steps_per_episode = max_steps_per_episode
            play_context.num_episodes = num_episodes
        callbacks = self._to_callback_list(callbacks=callbacks)
        callbacks = self._add_plot_callbacks(callbacks, default_plots, [plot.Steps(), plot.Rewards()])
        self._backend_agent.play(play_context=play_context, callbacks=callbacks)
        return play_context

    def save(self,
             directory: Optional[str] = None,
             callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None) -> str:
        """Saves the currently trained actor policy in directory.

        If save is called before a trained policy is created, eg by calling train, an exception is raised.

        Args:
            directory: the directory to save the policy weights to. if the directory does not exist yet,
                a new directory is created. if None the policy is saved in a temp directory.
            callbacks: list of callbacks called during save (eg log.Agent)

        Returns:
            the absolute path to the directory containing the saved policy.
        """
        if directory is None:
            directory = bcore._get_temp_path()
        assert directory
        assert self._backend_agent._agent_context._is_policy_trained, \
            "No trained policy available. Call train() first."
        directory = bcore._mkdir(directory)
        agent_json_path = os.path.join(directory, EasyAgent._KEY_EASYAGENT_FILENAME)
        with open(agent_json_path, 'w') as jsonfile:
            agent_dict = self._to_dict()
            json.dump(agent_dict, jsonfile, sort_keys=True, indent=2)
        callbacks = self._to_callback_list(callbacks=callbacks)
        policy_directory = os.path.join(directory, EasyAgent._KEY_POLICY_DIRECTORY)
        policy_directory = bcore._mkdir(policy_directory)
        self._backend_agent.save(directory=policy_directory, callbacks=callbacks)
        return directory

    def train(self,
              train_context: core.TrainContext,
              callbacks: Union[List[core.AgentCallback], core.AgentCallback, None],
              default_plots: Optional[bool]):
        """Trains a new model using the gym environment passed during instantiation.

        Args:
            callbacks: list of callbacks called during the training and evaluation
            train_context: training configuration to be used (num_iterations,num_episodes_per_iteration,...)
            default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...).
                if None default callbacks are only added if the callbacks list is empty
        """
        assert train_context, "train_context not set."
        callbacks = self._to_callback_list(callbacks=callbacks)
        callbacks = self._add_plot_callbacks(callbacks, default_plots, [plot.Loss(), plot.Steps(), plot.Rewards()])
        self._backend_agent.train(train_context=train_context, callbacks=callbacks)


def get_backends(agent: Optional[Type[EasyAgent]] = None):
    """returns a list of all registered backends containing an implementation for the EasyAgent type agent.

    Args:
        agent: type deriving from EasyAgent for which the backend identifiers are returned.

    Returns:
        a list of admissible values for the 'backend' argument of the EasyAgent constructors or
        a list of all available backends if agent is None.
    """
    result = [b.backend_name for b in _backends]
    if agent:
        result = [b.backend_name for b in _backends if agent in b.get_algorithms()]
    return result


def _get_backend(backend_name: str):
    """Yields the backend with the given name.

    Returns:
        the backend instance or None if no backend is found."""
    assert backend_name
    backends = [b for b in _backends if b.backend_name == backend_name]
    assert len(backends) <= 1, f'no backend found with name "{backend_name}". Available backends = {get_backends()}'
    result = None
    if backends:
        result = backends[0]
    return result


class CemAgent(EasyAgent):
    """creates a new agent based on the cross-entropy-method algorithm.

    From https://learning.mpi-sws.org/mlss2016/slides/2016-MLSS-RL.pdf:
        Initialize µ ∈ Rd, σ ∈ Rd
        for iteration = 1,2,... num_iterations do
            Collect num_episodes_per_iteration samples of θi ∼ N(µ,diag(σ))
            Perform a noisy evaluation Ri ∼ θi
            Select the top elite_set_fraction of samples (e.g. p = 0.2), which we'll call the elite set
            Fit a Gaussian distribution, with diagonal covariance, to the elite set, obtaining a new µ,σ.
        end for
        Return the final µ.

    see https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.81.6579&rep=rep1&type=pdf
    """

    def __init__(self, gym_env_name: str, fc_layers: Optional[Tuple[int, ...]] = None, backend: str = None):
        super().__init__(gym_env_name, fc_layers, backend)
        assert False, "CemAgent is currently not available (pending migration of keras-rl to tf2.0)"

    def train(self,
              callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None,
              num_iterations: int = 100,
              num_episodes_per_iteration: int = 50,
              max_steps_per_episode: int = 500,
              elite_set_fraction: float = 0.1,
              num_iterations_between_eval: int = 5,
              num_episodes_per_eval: int = 10,
              train_context: core.CemTrainContext = None,
              default_plots: bool = None):
        """Trains a new model using the gym environment passed during instantiation.

        Args:
            callbacks: list of callbacks called during training and evaluation
            num_iterations: number of times the training is repeated (with additional data)
            num_episodes_per_iteration: number of episodes played in each iteration.
                for each episode a new policy is sampled from the current weight distribution.
            max_steps_per_episode: maximum number of steps per episode
            elite_set_fraction: the fraction of policies which are members of the elite set.
                These policies are used to fit a new weight distribution in each iteration.
            num_iterations_between_eval: number of training iterations before the current policy is evaluated.
                if 0 no evaluation is performed.
            num_episodes_per_eval: number of episodes played to estimate the average return and steps
            train_context: training configuration to be used. if set overrides all other training context arguments.
            default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...).
                if None default callbacks are only added if the callbacks list is empty

        Returns:
            train_context: the training configuration containing the loss and sum of rewards encountered
                during training
        """
        if train_context is None:
            train_context = core.CemTrainContext()
            train_context.num_iterations = num_iterations
            train_context.max_steps_per_episode = max_steps_per_episode
            train_context.elite_set_fraction = elite_set_fraction
            train_context.num_iterations_between_eval = num_iterations_between_eval
            train_context.num_episodes_per_eval = num_episodes_per_eval
        super().train(train_context=train_context, callbacks=callbacks, default_plots=default_plots)
        return train_context


class DqnAgent(EasyAgent):
    """creates a new agent based on the Dqn algorithm.

    From wikipedia:
        The DeepMind system used a deep convolutional neural network, with layers of tiled convolutional
        filters to mimic the effects of receptive fields. Reinforcement learning is unstable or divergent
        when a nonlinear function approximator such as a neural network is used to represent Q.
        This instability comes from the correlations present in the sequence of observations, the fact
        that small updates to Q may significantly change the policy and the data distribution, and the
        correlations between Q and the target values.

        The technique used experience replay, a biologically inspired mechanism that uses a random sample
        of prior actions instead of the most recent action to proceed.[2] This removes correlations in the
        observation sequence and smooths changes in the data distribution. Iterative update adjusts Q
        towards target values that are only periodically updated, further reducing correlations with the
        target.[17]

    see also: https://deepmind.com/research/publications/human-level-control-through-deep-reinforcement-learning
    """

    def train(self,
              callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None,
              num_iterations: int = 20000,
              max_steps_per_episode: int = 500,
              num_steps_per_iteration: int = 1,
              num_steps_buffer_preload=1000,
              num_steps_sampled_from_buffer=64,
              num_iterations_between_eval: int = 1000,
              num_episodes_per_eval: int = 10,
              learning_rate: float = 0.001,
              train_context: core.StepsTrainContext = None,
              default_plots: bool = None):
        """Trains a new model using the gym environment passed during instantiation.

        Args:
            callbacks: list of callbacks called during training and evaluation
            num_iterations: number of times the training is repeated (with additional data)
            max_steps_per_episode: maximum number of steps per episode
            num_steps_per_iteration: number of steps played per training iteration
            num_steps_buffer_preload: number of initial collect steps to preload the buffer
            num_steps_sampled_from_buffer: the number of steps sampled from the buffer for each training iteration
            num_iterations_between_eval: number of training iterations before the current policy is evaluated.
                if 0 no evaluation is performed.
            num_episodes_per_eval: number of episodes played to estimate the average return and steps
            learning_rate: the learning rate used in the next iteration's policy training (0,1]
            train_context: training configuration to be used. if set overrides all other training context arguments.
            default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...).
                if None default callbacks are only added if the callbacks list is empty

        Returns:
            train_context: the training configuration containing the loss and sum of rewards encountered
                during training
        """
        if train_context is None:
            train_context = core.StepsTrainContext()
            train_context.num_iterations = num_iterations
            train_context.max_steps_per_episode = max_steps_per_episode
            train_context.num_steps_per_iteration = num_steps_per_iteration
            train_context.num_steps_buffer_preload = num_steps_buffer_preload
            train_context.num_steps_sampled_from_buffer = num_steps_sampled_from_buffer
            train_context.num_iterations_between_eval = num_iterations_between_eval
            train_context.num_episodes_per_eval = num_episodes_per_eval
            train_context.learning_rate = learning_rate
        super().train(train_context=train_context, callbacks=callbacks, default_plots=default_plots)
        return train_context


class DoubleDqnAgent(DqnAgent):
    """Agent based on the Double Dqn algorithm (https://arxiv.org/abs/1509.06461)"""


class DuelingDqnAgent(DqnAgent):
    """Agent based on the Dueling Dqn algorithm (https://arxiv.org/abs/1511.06581)."""


class PpoAgent(EasyAgent):
    """creates a new agent based on the PPO algorithm.

    PPO is an actor-critic algorithm using 2 neural networks. The actor network to predict the next
    action to be taken and the critic network to estimate the value of the game state we are currently
    in (the expected, discounted sum of future rewards when following the current actor network).

    see also: https://spinningup.openai.com/en/latest/algorithms/ppo.html
    """

    def train(self,
              callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None,
              num_iterations: int = 100,
              num_episodes_per_iteration: int = 10,
              max_steps_per_episode: int = 500,
              num_epochs_per_iteration: int = 10,
              num_iterations_between_eval: int = 5,
              num_episodes_per_eval: int = 10,
              learning_rate: float = 0.001,
              train_context: core.PpoTrainContext = None,
              default_plots: bool = None):
        """Trains a new model using the gym environment passed during instantiation.

        Args:
            callbacks: list of callbacks called during training and evaluation
            num_iterations: number of times the training is repeated (with additional data)
            num_episodes_per_iteration: number of episodes played per training iteration
            max_steps_per_episode: maximum number of steps per episode
            num_epochs_per_iteration: number of times the data collected for the current iteration
                is used to retrain the current policy
            num_iterations_between_eval: number of training iterations before the current policy is evaluated.
                if 0 no evaluation is performed.
            num_episodes_per_eval: number of episodes played to estimate the average return and steps
            learning_rate: the learning rate used in the next iteration's policy training (0,1]
            train_context: training configuration to be used. if set overrides all other training context arguments.
            default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...).
                if None default callbacks are only added if the callbacks list is empty

        Returns:
            train_context: the training configuration containing the loss and sum of rewards encountered
                during training
        """
        if train_context is None:
            train_context = core.PpoTrainContext()
            train_context.num_iterations = num_iterations
            train_context.num_episodes_per_iteration = num_episodes_per_iteration
            train_context.max_steps_per_episode = max_steps_per_episode
            train_context.num_epochs_per_iteration = num_epochs_per_iteration
            train_context.num_iterations_between_eval = num_iterations_between_eval
            train_context.num_episodes_per_eval = num_episodes_per_eval
            train_context.learning_rate = learning_rate
        super().train(train_context=train_context, callbacks=callbacks, default_plots=default_plots)
        return train_context


class RandomAgent(EasyAgent):
    """Agent which always chooses uniform random actions."""

    def train(self,
              callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None,
              num_iterations: int = 10,
              max_steps_per_episode: int = 1000,
              num_episodes_per_eval: int = 10,
              train_context: core.TrainContext = None,
              default_plots: bool = None):
        """Evaluates the environment using a uniform random policy.

        The evaluation is performed in batches of num_episodes_per_eval episodes.

        Args:
            callbacks: list of callbacks called during training and evaluation
            num_iterations: number of times a batch of num_episodes_per_eval episodes is evaluated.
            max_steps_per_episode: maximum number of steps per episode
            num_episodes_per_eval: number of episodes played to estimate the average return and steps
            train_context: training configuration to be used. if set overrides all other training context arguments.
            default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...)

        Returns:
            train_context: the training configuration containing the loss and sum of rewards encountered
                during training
        """
        if train_context is None:
            train_context = core.TrainContext()
            train_context.num_iterations = num_iterations
            train_context.max_steps_per_episode = max_steps_per_episode
            train_context.num_epochs_per_iteration = 0
            train_context.num_iterations_between_eval = 1
            train_context.num_episodes_per_eval = num_episodes_per_eval
            train_context.learning_rate = 1
        super().train(train_context=train_context, callbacks=callbacks, default_plots=default_plots)
        return train_context


class ReinforceAgent(EasyAgent):
    """creates a new agent based on the Reinforce algorithm.

    Reinforce is a vanilla policy gradient algorithm using a single actor network.

    see also: www-anw.cs.umass.edu/~barto/courses/cs687/williams92simple.pdf
    """

    def train(self,
              callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None,
              num_iterations: int = 100,
              num_episodes_per_iteration: int = 10,
              max_steps_per_episode: int = 500,
              num_epochs_per_iteration: int = 10,
              num_iterations_between_eval: int = 5,
              num_episodes_per_eval: int = 10,
              learning_rate: float = 0.001,
              train_context: core.EpisodesTrainContext = None,
              default_plots: bool = None):
        """Trains a new model using the gym environment passed during instantiation.

        Args:
            callbacks: list of callbacks called during training and evaluation
            num_iterations: number of times the training is repeated (with additional data)
            num_episodes_per_iteration: number of episodes played per training iteration
            max_steps_per_episode: maximum number of steps per episode
            num_epochs_per_iteration: number of times the data collected for the current iteration
                is used to retrain the current policy
            num_iterations_between_eval: number of training iterations before the current policy is evaluated.
                if 0 no evaluation is performed.
            num_episodes_per_eval: number of episodes played to estimate the average return and steps
            learning_rate: the learning rate used in the next iteration's policy training (0,1]
            train_context: training configuration to be used. if set overrides all other training context arguments.
            default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...).
                if None default callbacks are only added if the callbacks list is empty

        Returns:
            train_context: the training configuration containing the loss and sum of rewards encountered
                during training
        """
        if train_context is None:
            train_context = core.EpisodesTrainContext()
            train_context.num_iterations = num_iterations
            train_context.num_episodes_per_iteration = num_episodes_per_iteration
            train_context.max_steps_per_episode = max_steps_per_episode
            train_context.num_epochs_per_iteration = num_epochs_per_iteration
            train_context.num_iterations_between_eval = num_iterations_between_eval
            train_context.num_episodes_per_eval = num_episodes_per_eval
            train_context.learning_rate = learning_rate
        super().train(train_context=train_context, callbacks=callbacks, default_plots=default_plots)
        return train_context


class SacAgent(DqnAgent):
    """Agent based on the Soft-Actor-Critic algorithm (https://arxiv.org/abs/1812.05905)."""
Variables
seed
The seed used for all agents and gym environments. If None no seed is set (default).
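Since the value is read when an agent is constructed, set it before creating the agent; a short sketch (42 is just an example value):

import easyagents.agents as agents

agents.seed = 42                        # applied to all subsequently created agents and gym environments
agent = agents.PpoAgent('CartPole-v0')  # this agent now uses the seed set above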
Functions
activate_tensorforce
def activate_tensorforce( )
registers the tensorforce backend.
Due to an incompatibility between tensorforce and tf-agents, both libraries may not run in the same python instance. Thus - for the time being - once this method is called, the tfagents backend may not be used anymore.
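A small sketch of the intended call order (it must happen before any agent was created or trained with the default tf-agents backend in the same python instance):

import easyagents.agents as agents

agents.activate_tensorforce()   # replaces the tf-agents registration for this python instance
print(agents.get_backends())    # lists the backend names that are now available as 'backend' arguments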
get_backends
def get_backends( agent: Union[Type[easyagents.agents.EasyAgent], NoneType] = None )
returns a list of all registered backends containing an implementation for the EasyAgent type agent.
Args: agent: type deriving from EasyAgent for which the backend identifiers are returned.
Returns: a list of admissible values for the 'backend' argument of the EasyAgent constructors, or a list of all available backends if agent is None.
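For example (the returned names depend on the backends registered in the current python instance):

from easyagents import agents

print(agents.get_backends())                 # all registered backend names
print(agents.get_backends(agents.PpoAgent))  # only backends providing a PpoAgent implementation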
load
def load( directory: str, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None )
Loads an agent from directory.
After a successful load play() may be called directly. The agent, model, backend, seed and play policy are restored according to the previously saved agent.
Args:
    directory: the directory containing the previously saved policy.
    callbacks: list of callbacks called during load (eg log.Agent)
Returns: a new instance of EasyAgent
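A save/load round trip might look like this (a sketch, assuming a successful training run; save() without arguments writes to a temp directory whose path is returned):

from easyagents import agents

ppo = agents.PpoAgent('CartPole-v0')
ppo.train(num_iterations=10, default_plots=False)
directory = ppo.save()             # contains easyagent.json and a policy subdirectory
restored = agents.load(directory)  # recreates the agent, backend and trained policy
restored.play(num_episodes=1, default_plots=False)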
register_backend
def register_backend( backend: easyagents.backends.core.BackendAgentFactory )
registers a backend as a factory for agent implementations.
If another backend with the same name is already registered, the old backend is replaced by the given backend.
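register_backend is also what the library calls internally when activating a backend; re-registering the bundled tf-agents factory, for example, looks like this:

import easyagents.backends.tfagents
from easyagents import agents

agents.register_backend(easyagents.backends.tfagents.TfAgentAgentFactory())
print(agents.get_backends())   # the tfagents backend is still listed exactly once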
Classes
CemAgent
class CemAgent( gym_env_name: str, fc_layers: Union[Tuple[int, ...], NoneType] = None, backend: str = None )
creates a new agent based on the cross-entropy-method algorithm.
From https://learning.mpi-sws.org/mlss2016/slides/2016-MLSS-RL.pdf:
    Initialize µ ∈ Rd, σ ∈ Rd
    for iteration = 1,2,... num_iterations do
        Collect num_episodes_per_iteration samples of θi ∼ N(µ,diag(σ))
        Perform a noisy evaluation Ri ∼ θi
        Select the top elite_set_fraction of samples (e.g. p = 0.2), which we’ll call the elite set
        Fit a Gaussian distribution, with diagonal covariance, to the elite set, obtaining a new µ,σ.
    end for
    Return the final µ.
see https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.81.6579&rep=rep1&type=pdf
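A self-contained numpy sketch of this loop (illustrative only, not the backend implementation; a simple quadratic objective stands in for the noisy policy evaluation):

import numpy as np

def cem_sketch(evaluate, dim, num_iterations=50, num_samples=50, elite_fraction=0.2):
    """Cross-entropy method: fit a diagonal Gaussian to the best samples of each iteration."""
    mu, sigma = np.zeros(dim), np.ones(dim)
    for _ in range(num_iterations):
        thetas = np.random.normal(mu, sigma, size=(num_samples, dim))  # theta_i ~ N(mu, diag(sigma))
        returns = np.array([evaluate(t) for t in thetas])              # noisy evaluation R_i
        elite = thetas[np.argsort(returns)[-int(elite_fraction * num_samples):]]
        mu, sigma = elite.mean(axis=0), elite.std(axis=0) + 1e-6       # refit the Gaussian
    return mu

# toy objective: maximize -||theta - 3||^2, so the result approaches (3, ..., 3)
print(cem_sketch(lambda t: -np.sum((t - 3.0) ** 2), dim=4))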
Ancestors (in MRO)
- easyagents.agents.EasyAgent
- abc.ABC
Methods
evaluate
def evaluate( self, num_episodes: int = 50, max_steps_per_episode: int = 50 )
Plays num_episodes with the current policy and computes metrics on rewards.
Args: num_episodes: number of episodes to play max_steps_per_episode: max steps per episode
Returns: extensible score metrics
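The returned namedtuples can be used like this (a sketch, assuming a previously trained agent):

metrics = agent.evaluate(num_episodes=20, max_steps_per_episode=200)
print(f'mean reward {metrics.rewards.mean:.1f} (std {metrics.rewards.std:.1f}, '
      f'min {metrics.rewards.min}, max {metrics.rewards.max})')
print(f'mean steps per episode {metrics.steps.mean:.1f}')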
play
def play( self, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None, num_episodes: int = 1, max_steps_per_episode: int = 1000, play_context: easyagents.core.PlayContext = None, default_plots: bool = None )
Plays num_episodes with the current policy.
Args:
    callbacks: list of callbacks called during each episode played
    num_episodes: number of episodes to play
    max_steps_per_episode: max steps per episode
    play_context: play configuration to be used. If set, overrides all other play context arguments
    default_plots: if set, adds a set of default callbacks (plot.State, plot.Rewards, ...)
Returns: the play_context containing the actions taken and the rewards received during play
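A typical call (a sketch, assuming train() was called before; the plot callbacks shown are the same ones default_plots would add):

from easyagents.callbacks import plot

play_context = agent.play(num_episodes=5, max_steps_per_episode=200,
                          callbacks=[plot.Steps(), plot.Rewards()])
print(play_context.sum_of_rewards)   # maps each played episode to its sum of rewards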
save
def save( self, directory: Union[str, NoneType] = None, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None ) -> str
Saves the currently trained actor policy in directory.
If save is called before a trained policy is created, eg by calling train, an exception is raised.
Args:
    directory: the directory to save the policy weights to. if the directory does not exist yet, a new directory is created. if None the policy is saved in a temp directory.
    callbacks: list of callbacks called during save (eg log.Agent)
Returns: the absolute path to the directory containing the saved policy.
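For example (a sketch; the layout with easyagent.json and a policy subdirectory is what load() expects):

directory = agent.save('./my_cartpole_agent')   # created if it does not exist yet
print(directory)                                # absolute path of the saved agent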
train
def train( self, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None, num_iterations: int = 100, num_episodes_per_iteration: int = 50, max_steps_per_episode: int = 500, elite_set_fraction: float = 0.1, num_iterations_between_eval: int = 5, num_episodes_per_eval: int = 10, train_context: easyagents.core.CemTrainContext = None, default_plots: bool = None )
Trains a new model using the gym environment passed during instantiation.
Args:
    callbacks: list of callbacks called during training and evaluation
    num_iterations: number of times the training is repeated (with additional data)
    num_episodes_per_iteration: number of episodes played in each iteration. for each episode a new policy is sampled from the current weight distribution.
    max_steps_per_episode: maximum number of steps per episode
    elite_set_fraction: the fraction of policies which are members of the elite set. These policies are used to fit a new weight distribution in each iteration.
    num_iterations_between_eval: number of training iterations before the current policy is evaluated. if 0 no evaluation is performed.
    num_episodes_per_eval: number of episodes played to estimate the average return and steps
    train_context: training configuration to be used. if set overrides all other training context arguments.
    default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...). if None default callbacks are only added if the callbacks list is empty
Returns: train_context: the training configuration containing the loss and sum of rewards encountered during training
DoubleDqnAgent
class DoubleDqnAgent( gym_env_name: str, fc_layers: Union[Tuple[int, ...], int, NoneType] = None, backend: str = None )
Agent based on the Double Dqn algorithm (https://arxiv.org/abs/1509.06461)
Ancestors (in MRO)
- easyagents.agents.DqnAgent
- easyagents.agents.EasyAgent
- abc.ABC
Methods
evaluate
def evaluate( self, num_episodes: int = 50, max_steps_per_episode: int = 50 )
Plays num_episodes with the current policy and computes metrics on rewards.
Args: num_episodes: number of episodes to play max_steps_per_episode: max steps per episode
Returns: extensible score metrics
play
def play( self, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None, num_episodes: int = 1, max_steps_per_episode: int = 1000, play_context: easyagents.core.PlayContext = None, default_plots: bool = None )
Plays num_episodes with the current policy.
Args:
    callbacks: list of callbacks called during each episode played
    num_episodes: number of episodes to play
    max_steps_per_episode: max steps per episode
    play_context: play configuration to be used. If set, overrides all other play context arguments
    default_plots: if set, adds a set of default callbacks (plot.State, plot.Rewards, ...)
Returns: the play_context containing the actions taken and the rewards received during play
save
def save( self, directory: Union[str, NoneType] = None, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None ) -> str
Saves the currently trained actor policy in directory.
If save is called before a trained policy is created, eg by calling train, an exception is raised.
Args:
    directory: the directory to save the policy weights to. if the directory does not exist yet, a new directory is created. if None the policy is saved in a temp directory.
    callbacks: list of callbacks called during save (eg log.Agent)
Returns: the absolute path to the directory containing the saved policy.
train
def train( self, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None, num_iterations: int = 20000, max_steps_per_episode: int = 500, num_steps_per_iteration: int = 1, num_steps_buffer_preload=1000, num_steps_sampled_from_buffer=64, num_iterations_between_eval: int = 1000, num_episodes_per_eval: int = 10, learning_rate: float = 0.001, train_context: easyagents.core.StepsTrainContext = None, default_plots: bool = None )
Trains a new model using the gym environment passed during instantiation.
Args:
    callbacks: list of callbacks called during training and evaluation
    num_iterations: number of times the training is repeated (with additional data)
    max_steps_per_episode: maximum number of steps per episode
    num_steps_per_iteration: number of steps played per training iteration
    num_steps_buffer_preload: number of initial collect steps to preload the buffer
    num_steps_sampled_from_buffer: the number of steps sampled from the buffer for each training iteration
    num_iterations_between_eval: number of training iterations before the current policy is evaluated. if 0 no evaluation is performed.
    num_episodes_per_eval: number of episodes played to estimate the average return and steps
    learning_rate: the learning rate used in the next iteration's policy training (0,1]
    train_context: training configuration to be used. if set overrides all other training context arguments.
    default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...). if None default callbacks are only added if the callbacks list is empty
Returns: train_context: the training configuration containing the loss and sum of rewards encountered during training
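For example, on an already constructed agent, to trade replay-buffer settings against training speed (a sketch; the values are illustrative, not tuned):

train_context = agent.train(num_iterations=10000,
                            num_steps_buffer_preload=2000,
                            num_steps_sampled_from_buffer=128,
                            num_iterations_between_eval=500,
                            learning_rate=0.0005,
                            default_plots=False)
# train_context is the core.StepsTrainContext holding the losses and rewards seen during training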
DqnAgent
class DqnAgent( gym_env_name: str, fc_layers: Union[Tuple[int, ...], int, NoneType] = None, backend: str = None )
creates a new agent based on the Dqn algorithm.
From wikipedia: The DeepMind system used a deep convolutional neural network, with layers of tiled convolutional filters to mimic the effects of receptive fields. Reinforcement learning is unstable or divergent when a nonlinear function approximator such as a neural network is used to represent Q. This instability comes from the correlations present in the sequence of observations, the fact that small updates to Q may significantly change the policy and the data distribution, and the correlations between Q and the target values.
The technique used experience replay, a biologically inspired mechanism that uses a random sample of prior actions instead of the most recent action to proceed.[2] This removes correlations in the observation sequence and smooths changes in the data distribution. Iterative update adjusts Q towards target values that are only periodically updated, further reducing correlations with the target.[17]
see also: https://deepmind.com/research/publications/human-level-control-through-deep-reinforcement-learning
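A minimal usage sketch (the fc_layers value is an arbitrary choice, not a recommendation):

from easyagents.agents import DqnAgent

agent = DqnAgent('CartPole-v0', fc_layers=(100, 50))
agent.train(num_iterations=5000, num_iterations_between_eval=500)
agent.play(num_episodes=3)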
Ancestors (in MRO)
- easyagents.agents.EasyAgent
- abc.ABC
Descendants
- easyagents.agents.DoubleDqnAgent
- easyagents.agents.DuelingDqnAgent
- easyagents.agents.SacAgent
Methods
evaluate
def evaluate( self, num_episodes: int = 50, max_steps_per_episode: int = 50 )
Plays num_episodes with the current policy and computes metrics on rewards.
Args: num_episodes: number of episodes to play max_steps_per_episode: max steps per episode
Returns: extensible score metrics
View Source
def evaluate(self, num_episodes: int = 50, max_steps_per_episode: int = 50): """Plays num_episodes with the current policy and computes metrics on rewards. Args: num_episodes: number of episodes to play max_steps_per_episode: max steps per episode Returns: extensible score metrics """ play_context = core.PlayContext() play_context.max_steps_per_episode = max_steps_per_episode play_context.num_episodes = num_episodes self.play(play_context=play_context, default_plots=False) Metrics = namedtuple('Metrics', 'steps rewards') Rewards = namedtuple('Rewards', 'mean std min max all') all_rewards = list(play_context.sum_of_rewards.values()) mean_reward, std_reward, min_reward, max_reward = statistics.mean(all_rewards), statistics.stdev( all_rewards), min(all_rewards), max(all_rewards) rewards = Rewards(mean=mean_reward, std=std_reward, min=min_reward, max=max_reward, all=all_rewards) Steps = namedtuple('Steps', 'mean std min max all') all_num_steps = [] for i in play_context.rewards.keys(): all_num_steps.append(len(play_context.rewards[i])) mean_steps, std_steps, min_steps, max_steps = statistics.mean(all_num_steps), statistics.stdev( all_num_steps), min(all_num_steps), max(all_num_steps) steps = Steps(mean=mean_steps, std=std_steps, min=min_steps, max=max_steps, all=all_num_steps) metrics = Metrics(rewards=rewards, steps=steps) return metrics
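The returned Metrics value is a namedtuple whose rewards and steps fields each carry mean, std, min, max and all. A short sketch of reading it, continuing the DqnAgent example above:

# at least 2 episodes are required since statistics.stdev is used internally
metrics = dqn.evaluate(num_episodes=20, max_steps_per_episode=200)

print(f'mean return: {metrics.rewards.mean:.1f} +/- {metrics.rewards.std:.1f}')
print(f'mean episode length: {metrics.steps.mean:.1f} steps')
print(f'all returns: {metrics.rewards.all}')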
play
def play( self, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None, num_episodes: int = 1, max_steps_per_episode: int = 1000, play_context: easyagents.core.PlayContext = None, default_plots: bool = None )
Plays num_episodes with the current policy.
Args: callbacks: list of callbacks called during each episode played num_episodes: number of episodes to play max_steps_per_episode: max steps per episode play_context: play configuration to be used. If set, overrides all other play context arguments default_plots: if set, adds a set of default callbacks (plot.State, plot.Rewards, ...)
Returns: the play_context containing the actions taken and the rewards received during play
View Source
def play(self, callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None, num_episodes: int = 1, max_steps_per_episode: int = 1000, play_context: core.PlayContext = None, default_plots: bool = None): """Plays num_episodes with the current policy. Args: callbacks: list of callbacks called during each episode play num_episodes: number of episodes to play max_steps_per_episode: max steps per episode play_context: play configuration to be used. If set override all other play context arguments default_plots: if set addes a set of default callbacks (plot.State, plot.Rewards, ...) Returns: play_context containg the actions taken and the rewards received during training """ assert self._backend_agent._agent_context._is_policy_trained, "No trained policy available. Call train() first." if play_context is None: play_context = core.PlayContext() play_context.max_steps_per_episode = max_steps_per_episode play_context.num_episodes = num_episodes callbacks = self._to_callback_list(callbacks=callbacks) callbacks = self._add_plot_callbacks(callbacks, default_plots, [plot.Steps(), plot.Rewards()]) self._backend_agent.play(play_context=play_context, callbacks=callbacks) return play_context
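Continuing the sketch above, callbacks and episode limits can be passed directly; the returned PlayContext exposes per-episode data such as sum_of_rewards (a mapping from episode number to total reward, as used by evaluate):

from easyagents.callbacks import plot

# render step and reward plots while playing; a trained policy is required
play_context = dqn.play([plot.Steps(), plot.Rewards()], num_episodes=2, max_steps_per_episode=200)

# total reward per episode, keyed by episode number
print(play_context.sum_of_rewards)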
save
def save( self, directory: Union[str, NoneType] = None, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None ) -> str
Saves the currently trained actor policy in directory.
If save is called before a trained policy is created, eg by calling train, an exception is raised.
Args: directory: the directory to save the policy weights to. if the directory does not exist yet, a new directory is created. if None the policy is saved in a temp directory. callbacks: list of callbacks called during save (eg log.Agent)
Returns: the absolute path to the directory containing the saved policy.
View Source
def save(self, directory: Optional[str] = None, callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None) -> str: """Saves the currently trained actor policy in directory. If save is called before a trained policy is created, eg by calling train, an exception is raised. Args: directory: the directory to save the policy weights to. if the directory does not exist yet, a new directory is created. if None the policy is saved in a temp directory. callbacks: list of callbacks called during save (eg log.Agent) Returns: the absolute path to the directory containing the saved policy. """ if directory is None: directory = bcore._get_temp_path() assert directory assert self._backend_agent._agent_context._is_policy_trained, "No trained policy available. Call train() first." directory = bcore._mkdir(directory) agent_json_path = os.path.join(directory, EasyAgent._KEY_EASYAGENT_FILENAME) with open(agent_json_path, 'w') as jsonfile: agent_dict = self._to_dict() json.dump(agent_dict, jsonfile, sort_keys=True, indent=2) callbacks = self._to_callback_list(callbacks=callbacks) policy_directory = os.path.join(directory, EasyAgent._KEY_POLICY_DIRECTORY) policy_directory = bcore._mkdir(policy_directory) self._backend_agent.save(directory=policy_directory, callbacks=callbacks) return directory
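A save/restore round trip could look like the following sketch; load is the module-level easyagents.agents.load function:

import easyagents.agents

# with directory=None the policy is written to a temp directory; the path is returned
directory = dqn.save()

# restore agent, model, backend and policy and play without retraining
restored = easyagents.agents.load(directory)
restored.play(num_episodes=1)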
train
def train( self, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None, num_iterations: int = 20000, max_steps_per_episode: int = 500, num_steps_per_iteration: int = 1, num_steps_buffer_preload=1000, num_steps_sampled_from_buffer=64, num_iterations_between_eval: int = 1000, num_episodes_per_eval: int = 10, learning_rate: float = 0.001, train_context: easyagents.core.StepsTrainContext = None, default_plots: bool = None )
Trains a new model using the gym environment passed during instantiation.
Args: callbacks: list of callbacks called during training and evaluation num_iterations: number of times the training is repeated (with additional data) max_steps_per_episode: maximum number of steps per episode num_steps_per_iteration: number of steps played per training iteration num_steps_buffer_preload: number of initial collect steps to preload the buffer num_steps_sampled_from_buffer: the number of steps sampled from the buffer for each training iteration num_iterations_between_eval: number of training iterations before the current policy is evaluated. if 0 no evaluation is performed. num_episodes_per_eval: number of episodes played to estimate the average return and steps learning_rate: the learning rate used in the next iteration's policy training (0,1] train_context: training configuration to be used. if set overrides all other training context arguments. default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...). if None default callbacks are only added if the callbacks list is empty
Returns: train_context: the training configuration containing the loss and sum of rewards encountered during training
View Source
def train(self, callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None, num_iterations: int = 20000, max_steps_per_episode: int = 500, num_steps_per_iteration: int = 1, num_steps_buffer_preload=1000, num_steps_sampled_from_buffer=64, num_iterations_between_eval: int = 1000, num_episodes_per_eval: int = 10, learning_rate: float = 0.001, train_context: core.StepsTrainContext = None, default_plots: bool = None): """Trains a new model using the gym environment passed during instantiation. Args: callbacks: list of callbacks called during training and evaluation num_iterations: number of times the training is repeated (with additional data) max_steps_per_episode: maximum number of steps per episode num_steps_per_iteration: number of steps played per training iteration num_steps_buffer_preload: number of initial collect steps to preload the buffer num_steps_sampled_from_buffer: the number of steps sampled from buffer for each iteration training num_iterations_between_eval: number of training iterations before the current policy is evaluated. if 0 no evaluation is performed. num_episodes_per_eval: number of episodes played to estimate the average return and steps learning_rate: the learning rate used in the next iteration's policy training (0,1] train_context: training configuration to be used. if set overrides all other training context arguments. default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...). if None default callbacks are only added if the callbacks list is empty Returns: train_context: the training configuration containing the loss and sum of rewards encountered during training """ if train_context is None: train_context = core.StepsTrainContext() train_context.num_iterations = num_iterations train_context.max_steps_per_episode = max_steps_per_episode train_context.num_steps_per_iteration = num_steps_per_iteration train_context.num_steps_buffer_preload = num_steps_buffer_preload train_context.num_steps_sampled_from_buffer = num_steps_sampled_from_buffer train_context.num_iterations_between_eval = num_iterations_between_eval train_context.num_episodes_per_eval = num_episodes_per_eval train_context.learning_rate = learning_rate super().train(train_context=train_context, callbacks=callbacks, default_plots=default_plots) return train_context
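The keyword arguments above only populate a fresh core.StepsTrainContext when train_context is None. An equivalent sketch that configures the context explicitly (attribute names taken from the source above, values illustrative):

from easyagents import core
from easyagents.agents import DqnAgent

dqn = DqnAgent('CartPole-v0')

# a train_context overrides all other training keyword arguments
tc = core.StepsTrainContext()
tc.num_iterations = 10000
tc.num_steps_buffer_preload = 2000
tc.num_steps_sampled_from_buffer = 128
tc.learning_rate = 0.0005

dqn.train(train_context=tc)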
DuelingDqnAgent
class DuelingDqnAgent( gym_env_name: str, fc_layers: Union[Tuple[int, ...], int, NoneType] = None, backend: str = None )
Agent based on the Dueling Dqn algorithm (https://arxiv.org/abs/1511.06581).
View Source
class DuelingDqnAgent(DqnAgent): """Agent based on the Dueling Dqn algorithm (https://arxiv.org/abs/1511.06581)."""
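Since DuelingDqnAgent only swaps the algorithm variant, it is used exactly like DqnAgent (a sketch; environment name and layer sizes are illustrative):

from easyagents.agents import DuelingDqnAgent

dueling = DuelingDqnAgent('CartPole-v0', fc_layers=(100,))
dueling.train(num_iterations=5000)
dueling.play(num_episodes=1)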
Ancestors (in MRO)
- easyagents.agents.DqnAgent
- easyagents.agents.EasyAgent
- abc.ABC
Methods
evaluate
def evaluate( self, num_episodes: int = 50, max_steps_per_episode: int = 50 )
Plays num_episodes with the current policy and computes metrics on rewards.
Args: num_episodes: number of episodes to play max_steps_per_episode: max steps per episode
Returns: extensible score metrics
View Source
def evaluate(self, num_episodes: int = 50, max_steps_per_episode: int = 50): """Plays num_episodes with the current policy and computes metrics on rewards. Args: num_episodes: number of episodes to play max_steps_per_episode: max steps per episode Returns: extensible score metrics """ play_context = core.PlayContext() play_context.max_steps_per_episode = max_steps_per_episode play_context.num_episodes = num_episodes self.play(play_context=play_context, default_plots=False) Metrics = namedtuple('Metrics', 'steps rewards') Rewards = namedtuple('Rewards', 'mean std min max all') all_rewards = list(play_context.sum_of_rewards.values()) mean_reward, std_reward, min_reward, max_reward = statistics.mean(all_rewards), statistics.stdev( all_rewards), min(all_rewards), max(all_rewards) rewards = Rewards(mean=mean_reward, std=std_reward, min=min_reward, max=max_reward, all=all_rewards) Steps = namedtuple('Steps', 'mean std min max all') all_num_steps = [] for i in play_context.rewards.keys(): all_num_steps.append(len(play_context.rewards[i])) mean_steps, std_steps, min_steps, max_steps = statistics.mean(all_num_steps), statistics.stdev( all_num_steps), min(all_num_steps), max(all_num_steps) steps = Steps(mean=mean_steps, std=std_steps, min=min_steps, max=max_steps, all=all_num_steps) metrics = Metrics(rewards=rewards, steps=steps) return metrics
play
def play( self, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None, num_episodes: int = 1, max_steps_per_episode: int = 1000, play_context: easyagents.core.PlayContext = None, default_plots: bool = None )
Plays num_episodes with the current policy.
Args: callbacks: list of callbacks called during each episode played num_episodes: number of episodes to play max_steps_per_episode: max steps per episode play_context: play configuration to be used. If set, overrides all other play context arguments default_plots: if set, adds a set of default callbacks (plot.State, plot.Rewards, ...)
Returns: the play_context containing the actions taken and the rewards received during play
View Source
def play(self, callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None, num_episodes: int = 1, max_steps_per_episode: int = 1000, play_context: core.PlayContext = None, default_plots: bool = None): """Plays num_episodes with the current policy. Args: callbacks: list of callbacks called during each episode play num_episodes: number of episodes to play max_steps_per_episode: max steps per episode play_context: play configuration to be used. If set override all other play context arguments default_plots: if set addes a set of default callbacks (plot.State, plot.Rewards, ...) Returns: play_context containg the actions taken and the rewards received during training """ assert self._backend_agent._agent_context._is_policy_trained, "No trained policy available. Call train() first." if play_context is None: play_context = core.PlayContext() play_context.max_steps_per_episode = max_steps_per_episode play_context.num_episodes = num_episodes callbacks = self._to_callback_list(callbacks=callbacks) callbacks = self._add_plot_callbacks(callbacks, default_plots, [plot.Steps(), plot.Rewards()]) self._backend_agent.play(play_context=play_context, callbacks=callbacks) return play_context
save
def save( self, directory: Union[str, NoneType] = None, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None ) -> str
Saves the currently trained actor policy in directory.
If save is called before a trained policy is created, eg by calling train, an exception is raised.
Args: directory: the directory to save the policy weights to. if the directory does not exist yet, a new directory is created. if None the policy is saved in a temp directory. callbacks: list of callbacks called during save (eg log.Agent)
Returns: the absolute path to the directory containing the saved policy.
View Source
def save(self, directory: Optional[str] = None, callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None) -> str: """Saves the currently trained actor policy in directory. If save is called before a trained policy is created, eg by calling train, an exception is raised. Args: directory: the directory to save the policy weights to. if the directory does not exist yet, a new directory is created. if None the policy is saved in a temp directory. callbacks: list of callbacks called during save (eg log.Agent) Returns: the absolute path to the directory containing the saved policy. """ if directory is None: directory = bcore._get_temp_path() assert directory assert self._backend_agent._agent_context._is_policy_trained, "No trained policy available. Call train() first." directory = bcore._mkdir(directory) agent_json_path = os.path.join(directory, EasyAgent._KEY_EASYAGENT_FILENAME) with open(agent_json_path, 'w') as jsonfile: agent_dict = self._to_dict() json.dump(agent_dict, jsonfile, sort_keys=True, indent=2) callbacks = self._to_callback_list(callbacks=callbacks) policy_directory = os.path.join(directory, EasyAgent._KEY_POLICY_DIRECTORY) policy_directory = bcore._mkdir(policy_directory) self._backend_agent.save(directory=policy_directory, callbacks=callbacks) return directory
train
def train( self, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None, num_iterations: int = 20000, max_steps_per_episode: int = 500, num_steps_per_iteration: int = 1, num_steps_buffer_preload=1000, num_steps_sampled_from_buffer=64, num_iterations_between_eval: int = 1000, num_episodes_per_eval: int = 10, learning_rate: float = 0.001, train_context: easyagents.core.StepsTrainContext = None, default_plots: bool = None )
Trains a new model using the gym environment passed during instantiation.
Args: callbacks: list of callbacks called during training and evaluation num_iterations: number of times the training is repeated (with additional data) max_steps_per_episode: maximum number of steps per episode num_steps_per_iteration: number of steps played per training iteration num_steps_buffer_preload: number of initial collect steps to preload the buffer num_steps_sampled_from_buffer: the number of steps sampled from the buffer for each training iteration num_iterations_between_eval: number of training iterations before the current policy is evaluated. if 0 no evaluation is performed. num_episodes_per_eval: number of episodes played to estimate the average return and steps learning_rate: the learning rate used in the next iteration's policy training (0,1] train_context: training configuration to be used. if set overrides all other training context arguments. default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...). if None default callbacks are only added if the callbacks list is empty
Returns: train_context: the training configuration containing the loss and sum of rewards encountered during training
View Source
def train(self, callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None, num_iterations: int = 20000, max_steps_per_episode: int = 500, num_steps_per_iteration: int = 1, num_steps_buffer_preload=1000, num_steps_sampled_from_buffer=64, num_iterations_between_eval: int = 1000, num_episodes_per_eval: int = 10, learning_rate: float = 0.001, train_context: core.StepsTrainContext = None, default_plots: bool = None): """Trains a new model using the gym environment passed during instantiation. Args: callbacks: list of callbacks called during training and evaluation num_iterations: number of times the training is repeated (with additional data) max_steps_per_episode: maximum number of steps per episode num_steps_per_iteration: number of steps played per training iteration num_steps_buffer_preload: number of initial collect steps to preload the buffer num_steps_sampled_from_buffer: the number of steps sampled from buffer for each iteration training num_iterations_between_eval: number of training iterations before the current policy is evaluated. if 0 no evaluation is performed. num_episodes_per_eval: number of episodes played to estimate the average return and steps learning_rate: the learning rate used in the next iteration's policy training (0,1] train_context: training configuration to be used. if set overrides all other training context arguments. default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...). if None default callbacks are only added if the callbacks list is empty Returns: train_context: the training configuration containing the loss and sum of rewards encountered during training """ if train_context is None: train_context = core.StepsTrainContext() train_context.num_iterations = num_iterations train_context.max_steps_per_episode = max_steps_per_episode train_context.num_steps_per_iteration = num_steps_per_iteration train_context.num_steps_buffer_preload = num_steps_buffer_preload train_context.num_steps_sampled_from_buffer = num_steps_sampled_from_buffer train_context.num_iterations_between_eval = num_iterations_between_eval train_context.num_episodes_per_eval = num_episodes_per_eval train_context.learning_rate = learning_rate super().train(train_context=train_context, callbacks=callbacks, default_plots=default_plots) return train_context
EasyAgent
class EasyAgent( gym_env_name: str, fc_layers: Union[Tuple[int, ...], int, NoneType] = None, backend: str = None )
Abstract base class for all easy reinforcement learning agents.
Besides forwarding train and play, it implements persistence.
View Source
class EasyAgent(ABC): """Abstract base class for all easy reinforcment learning agents. Besides forwarding train and play it implements persistence.""" _KEY_BACKEND = 'backend' _KEY_EASYAGENT_CLASS = 'easyagent_class' _KEY_EASYAGENT_FILENAME = 'easyagent.json' _KEY_MODEL_CONFIG = 'model_config' _KEY_POLICY_DIRECTORY = 'policy' _KEY_VERSION = 'version' def __init__(self, gym_env_name: str, fc_layers: Union[Tuple[int, ...], int, None] = None, backend: str = None): """ Args: gym_env_name: name of an OpenAI gym environment to be used for training and evaluation fc_layers: defines the neural network to be used, a sequence of fully connected layers of the given size. Eg (75,40) yields a neural network consisting out of 2 hidden layers, the first one containing 75 and the second layer containing 40 neurons. backend=the backend to be used (eg 'tfagents'), if None a default implementation is used. call get_backends() to get a list of the available backends. """ model_config = core.ModelConfig(gym_env_name=gym_env_name, fc_layers=fc_layers, seed=seed) self._initialize(model_config=model_config, backend_name=backend) return def _initialize(self, model_config: core.ModelConfig, backend_name: str = None): if backend_name is None: backend_name = easyagents.backends.default.DefaultAgentFactory.backend_name backend: bcore.BackendAgentFactory = _get_backend(backend_name) assert model_config is not None, "model_config not set." assert backend, f'Backend "{backend_name}" not found. The registered backends are {get_backends()}.' self._model_config: core.ModelConfig = model_config backend_agent = backend.create_agent(easyagent_type=type(self), model_config=model_config) assert backend_agent, f'Backend "{backend_name}" does not implement "{type(self).__name__}". ' + \ f'Choose one of the following backend {get_backends(type(self))}.' self._backend_agent: Optional[bcore._BackendAgent] = backend_agent self._backend_name: str = backend_name self._backend_agent._agent_context._agent_saver = self.save return def _add_plot_callbacks(self, callbacks: List[core.AgentCallback], default_plots: Optional[bool], default_plot_callbacks: List[plot._PlotCallback] ) -> List[core.AgentCallback]: """Adds the default callbacks and sorts all callbacks in the order _PreProcessCallbacks, AgentCallbacks, _PostProcessCallbacks. Args: callbacks: existing callbacks to prepare default_plots: if set or if None and callbacks does not contain plots then the default plots are added default_plot_callbacks: plot callbacks to add. """ pre_process: List[core.AgentCallback] = [plot._PreProcess()] agent: List[core.AgentCallback] = [] post_process: List[core.AgentCallback] = [plot._PostProcess()] if default_plots is None: default_plots = True for c in callbacks: default_plots = default_plots and (not isinstance(c, plot._PlotCallback)) if default_plots: agent = default_plot_callbacks for c in callbacks: if isinstance(c, core._PreProcessCallback): pre_process.append(c) else: if isinstance(c, core._PostProcessCallback): post_process.append(c) else: agent.append(c) result: List[core.AgentCallback] = pre_process + agent + post_process return result @staticmethod def _from_dict(param_dict: Dict[str, object]): """recreates a new agent instance according to the definition previously created by _to_dict. Returns: new agent instance (excluding any trained policy), the agent type is preserved. 
""" assert param_dict mc: core.ModelConfig = core.ModelConfig._from_dict(param_dict[EasyAgent._KEY_MODEL_CONFIG]) agent_class = globals()[param_dict[EasyAgent._KEY_EASYAGENT_CLASS]] backend: str = param_dict[EasyAgent._KEY_BACKEND] result = agent_class(gym_env_name=mc.original_env_name, backend=backend) result._initialize(model_config=mc, backend_name=backend) return result def _to_callback_list(self, callbacks: Union[Optional[core.AgentCallback], List[core.AgentCallback]]) -> List[ core.AgentCallback]: """maps callbacks to an admissible callback list. if callbacks is None an empty list is returned. if callbacks is an AgentCallback a list containing only this callback is returned otherwise callbacks is returned """ result: List[core.AgentCallback] = [] if not callbacks is None: if isinstance(callbacks, core.AgentCallback): result = [callbacks] else: assert isinstance(callbacks, list), "callback not an AgentCallback or a list thereof." result = callbacks return result def _to_dict(self) -> Dict[str, object]: """saves the agent definition to a dict. Returns: dict containing all parameters to recreate the agent (excluding a trained policy) """ result: Dict[str, object] = dict() result[EasyAgent._KEY_VERSION] = easyagents.__version__ result[EasyAgent._KEY_EASYAGENT_CLASS] = self.__class__.__name__ result[EasyAgent._KEY_BACKEND] = self._backend_name result[EasyAgent._KEY_MODEL_CONFIG] = self._model_config._to_dict() result[EasyAgent._KEY_POLICY_DIRECTORY] = 'policy' return result def evaluate(self, num_episodes: int = 50, max_steps_per_episode: int = 50): """Plays num_episodes with the current policy and computes metrics on rewards. Args: num_episodes: number of episodes to play max_steps_per_episode: max steps per episode Returns: extensible score metrics """ play_context = core.PlayContext() play_context.max_steps_per_episode = max_steps_per_episode play_context.num_episodes = num_episodes self.play(play_context=play_context, default_plots=False) Metrics = namedtuple('Metrics', 'steps rewards') Rewards = namedtuple('Rewards', 'mean std min max all') all_rewards = list(play_context.sum_of_rewards.values()) mean_reward, std_reward, min_reward, max_reward = statistics.mean(all_rewards), statistics.stdev( all_rewards), min(all_rewards), max(all_rewards) rewards = Rewards(mean=mean_reward, std=std_reward, min=min_reward, max=max_reward, all=all_rewards) Steps = namedtuple('Steps', 'mean std min max all') all_num_steps = [] for i in play_context.rewards.keys(): all_num_steps.append(len(play_context.rewards[i])) mean_steps, std_steps, min_steps, max_steps = statistics.mean(all_num_steps), statistics.stdev( all_num_steps), min(all_num_steps), max(all_num_steps) steps = Steps(mean=mean_steps, std=std_steps, min=min_steps, max=max_steps, all=all_num_steps) metrics = Metrics(rewards=rewards, steps=steps) return metrics def play(self, callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None, num_episodes: int = 1, max_steps_per_episode: int = 1000, play_context: core.PlayContext = None, default_plots: bool = None): """Plays num_episodes with the current policy. Args: callbacks: list of callbacks called during each episode play num_episodes: number of episodes to play max_steps_per_episode: max steps per episode play_context: play configuration to be used. If set override all other play context arguments default_plots: if set addes a set of default callbacks (plot.State, plot.Rewards, ...) 
Returns: play_context containg the actions taken and the rewards received during training """ assert self._backend_agent._agent_context._is_policy_trained, "No trained policy available. Call train() first." if play_context is None: play_context = core.PlayContext() play_context.max_steps_per_episode = max_steps_per_episode play_context.num_episodes = num_episodes callbacks = self._to_callback_list(callbacks=callbacks) callbacks = self._add_plot_callbacks(callbacks, default_plots, [plot.Steps(), plot.Rewards()]) self._backend_agent.play(play_context=play_context, callbacks=callbacks) return play_context def save(self, directory: Optional[str] = None, callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None) -> str: """Saves the currently trained actor policy in directory. If save is called before a trained policy is created, eg by calling train, an exception is raised. Args: directory: the directory to save the policy weights to. if the directory does not exist yet, a new directory is created. if None the policy is saved in a temp directory. callbacks: list of callbacks called during save (eg log.Agent) Returns: the absolute path to the directory containing the saved policy. """ if directory is None: directory = bcore._get_temp_path() assert directory assert self._backend_agent._agent_context._is_policy_trained, "No trained policy available. Call train() first." directory = bcore._mkdir(directory) agent_json_path = os.path.join(directory, EasyAgent._KEY_EASYAGENT_FILENAME) with open(agent_json_path, 'w') as jsonfile: agent_dict = self._to_dict() json.dump(agent_dict, jsonfile, sort_keys=True, indent=2) callbacks = self._to_callback_list(callbacks=callbacks) policy_directory = os.path.join(directory, EasyAgent._KEY_POLICY_DIRECTORY) policy_directory = bcore._mkdir(policy_directory) self._backend_agent.save(directory=policy_directory, callbacks=callbacks) return directory def train(self, train_context: core.TrainContext, callbacks: Union[List[core.AgentCallback], core.AgentCallback, None], default_plots: Optional[bool]): """Trains a new model using the gym environment passed during instantiation. Args: callbacks: list of callbacks called during the training and evaluation train_context: training configuration to be used (num_iterations,num_episodes_per_iteration,...) default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...). if None default callbacks are only added if the callbacks list is empty """ assert train_context, "train_context not set." callbacks = self._to_callback_list(callbacks=callbacks) callbacks = self._add_plot_callbacks(callbacks, default_plots, [plot.Loss(), plot.Steps(), plot.Rewards()]) self._backend_agent.train(train_context=train_context, callbacks=callbacks)
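EasyAgent itself is abstract; its constructor arguments are shared by every concrete agent. A sketch of choosing the network layout and a specific backend (get_backends is assumed to be the module-level helper referenced in the constructor docstring, and 'tfagents' one of the registered backend names):

from easyagents.agents import PpoAgent, get_backends

# names of the currently registered backends
print(get_backends())

# two fully connected hidden layers with 75 and 40 neurons, trained by the tfagents backend
ppo = PpoAgent('CartPole-v0', fc_layers=(75, 40), backend='tfagents')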
Ancestors (in MRO)
- abc.ABC
Descendants
- easyagents.agents.CemAgent
- easyagents.agents.DqnAgent
- easyagents.agents.PpoAgent
- easyagents.agents.RandomAgent
- easyagents.agents.ReinforceAgent
Methods
evaluate
def evaluate( self, num_episodes: int = 50, max_steps_per_episode: int = 50 )
Plays num_episodes with the current policy and computes metrics on rewards.
Args: num_episodes: number of episodes to play max_steps_per_episode: max steps per episode
Returns: extensible score metrics
View Source
def evaluate(self, num_episodes: int = 50, max_steps_per_episode: int = 50): """Plays num_episodes with the current policy and computes metrics on rewards. Args: num_episodes: number of episodes to play max_steps_per_episode: max steps per episode Returns: extensible score metrics """ play_context = core.PlayContext() play_context.max_steps_per_episode = max_steps_per_episode play_context.num_episodes = num_episodes self.play(play_context=play_context, default_plots=False) Metrics = namedtuple('Metrics', 'steps rewards') Rewards = namedtuple('Rewards', 'mean std min max all') all_rewards = list(play_context.sum_of_rewards.values()) mean_reward, std_reward, min_reward, max_reward = statistics.mean(all_rewards), statistics.stdev( all_rewards), min(all_rewards), max(all_rewards) rewards = Rewards(mean=mean_reward, std=std_reward, min=min_reward, max=max_reward, all=all_rewards) Steps = namedtuple('Steps', 'mean std min max all') all_num_steps = [] for i in play_context.rewards.keys(): all_num_steps.append(len(play_context.rewards[i])) mean_steps, std_steps, min_steps, max_steps = statistics.mean(all_num_steps), statistics.stdev( all_num_steps), min(all_num_steps), max(all_num_steps) steps = Steps(mean=mean_steps, std=std_steps, min=min_steps, max=max_steps, all=all_num_steps) metrics = Metrics(rewards=rewards, steps=steps) return metrics
play
def play( self, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None, num_episodes: int = 1, max_steps_per_episode: int = 1000, play_context: easyagents.core.PlayContext = None, default_plots: bool = None )
Plays num_episodes with the current policy.
Args: callbacks: list of callbacks called during each episode played num_episodes: number of episodes to play max_steps_per_episode: max steps per episode play_context: play configuration to be used. If set, overrides all other play context arguments default_plots: if set, adds a set of default callbacks (plot.State, plot.Rewards, ...)
Returns: the play_context containing the actions taken and the rewards received during play
View Source
def play(self, callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None, num_episodes: int = 1, max_steps_per_episode: int = 1000, play_context: core.PlayContext = None, default_plots: bool = None): """Plays num_episodes with the current policy. Args: callbacks: list of callbacks called during each episode play num_episodes: number of episodes to play max_steps_per_episode: max steps per episode play_context: play configuration to be used. If set override all other play context arguments default_plots: if set addes a set of default callbacks (plot.State, plot.Rewards, ...) Returns: play_context containg the actions taken and the rewards received during training """ assert self._backend_agent._agent_context._is_policy_trained, "No trained policy available. Call train() first." if play_context is None: play_context = core.PlayContext() play_context.max_steps_per_episode = max_steps_per_episode play_context.num_episodes = num_episodes callbacks = self._to_callback_list(callbacks=callbacks) callbacks = self._add_plot_callbacks(callbacks, default_plots, [plot.Steps(), plot.Rewards()]) self._backend_agent.play(play_context=play_context, callbacks=callbacks) return play_context
save
def save( self, directory: Union[str, NoneType] = None, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None ) -> str
Saves the currently trained actor policy in directory.
If save is called before a trained policy is created, eg by calling train, an exception is raised.
Args: directory: the directory to save the policy weights to. if the directory does not exist yet, a new directory is created. if None the policy is saved in a temp directory. callbacks: list of callbacks called during save (eg log.Agent)
Returns: the absolute path to the directory containing the saved policy.
View Source
def save(self, directory: Optional[str] = None, callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None) -> str: """Saves the currently trained actor policy in directory. If save is called before a trained policy is created, eg by calling train, an exception is raised. Args: directory: the directory to save the policy weights to. if the directory does not exist yet, a new directory is created. if None the policy is saved in a temp directory. callbacks: list of callbacks called during save (eg log.Agent) Returns: the absolute path to the directory containing the saved policy. """ if directory is None: directory = bcore._get_temp_path() assert directory assert self._backend_agent._agent_context._is_policy_trained, "No trained policy available. Call train() first." directory = bcore._mkdir(directory) agent_json_path = os.path.join(directory, EasyAgent._KEY_EASYAGENT_FILENAME) with open(agent_json_path, 'w') as jsonfile: agent_dict = self._to_dict() json.dump(agent_dict, jsonfile, sort_keys=True, indent=2) callbacks = self._to_callback_list(callbacks=callbacks) policy_directory = os.path.join(directory, EasyAgent._KEY_POLICY_DIRECTORY) policy_directory = bcore._mkdir(policy_directory) self._backend_agent.save(directory=policy_directory, callbacks=callbacks) return directory
train
def train( self, train_context: easyagents.core.TrainContext, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType], default_plots: Union[bool, NoneType] )
Trains a new model using the gym environment passed during instantiation.
Args: callbacks: list of callbacks called during the training and evaluation train_context: training configuration to be used (num_iterations,num_episodes_per_iteration,...) default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...). if None default callbacks are only added if the callbacks list is empty
View Source
def train(self, train_context: core.TrainContext, callbacks: Union[List[core.AgentCallback], core.AgentCallback, None], default_plots: Optional[bool]): """Trains a new model using the gym environment passed during instantiation. Args: callbacks: list of callbacks called during the training and evaluation train_context: training configuration to be used (num_iterations,num_episodes_per_iteration,...) default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...). if None default callbacks are only added if the callbacks list is empty """ assert train_context, "train_context not set." callbacks = self._to_callback_list(callbacks=callbacks) callbacks = self._add_plot_callbacks(callbacks, default_plots, [plot.Loss(), plot.Steps(), plot.Rewards()]) self._backend_agent.train(train_context=train_context, callbacks=callbacks)
PpoAgent
class PpoAgent( gym_env_name: str, fc_layers: Union[Tuple[int, ...], int, NoneType] = None, backend: str = None )
creates a new agent based on the PPO algorithm.
PPO is an actor-critic algorithm using 2 neural networks: the actor network predicts the next action to be taken, and the critic network estimates the value of the current game state (the expected, discounted sum of future rewards when following the current actor network).
see also: https://spinningup.openai.com/en/latest/algorithms/ppo.html
View Source
class PpoAgent(EasyAgent): """creates a new agent based on the PPO algorithm. PPO is an actor-critic algorithm using 2 neural networks. The actor network to predict the next action to be taken and the critic network to estimate the value of the game state we are currently in (the expected, discounted sum of future rewards when following the current actor network). see also: https://spinningup.openai.com/en/latest/algorithms/ppo.html """ def train(self, callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None, num_iterations: int = 100, num_episodes_per_iteration: int = 10, max_steps_per_episode: int = 500, num_epochs_per_iteration: int = 10, num_iterations_between_eval: int = 5, num_episodes_per_eval: int = 10, learning_rate: float = 0.001, train_context: core.PpoTrainContext = None, default_plots: bool = None): """Trains a new model using the gym environment passed during instantiation. Args: callbacks: list of callbacks called during training and evaluation num_iterations: number of times the training is repeated (with additional data) num_episodes_per_iteration: number of episodes played per training iteration max_steps_per_episode: maximum number of steps per episode num_epochs_per_iteration: number of times the data collected for the current iteration is used to retrain the current policy num_iterations_between_eval: number of training iterations before the current policy is evaluated. if 0 no evaluation is performed. num_episodes_per_eval: number of episodes played to estimate the average return and steps learning_rate: the learning rate used in the next iteration's policy training (0,1] train_context: training configuration to be used. if set overrides all other training context arguments. default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...). if None default callbacks are only added if the callbacks list is empty Returns: train_context: the training configuration containing the loss and sum of rewards encountered during training """ if train_context is None: train_context = core.PpoTrainContext() train_context.num_iterations = num_iterations train_context.num_episodes_per_iteration = num_episodes_per_iteration train_context.max_steps_per_episode = max_steps_per_episode train_context.num_epochs_per_iteration = num_epochs_per_iteration train_context.num_iterations_between_eval = num_iterations_between_eval train_context.num_episodes_per_eval = num_episodes_per_eval train_context.learning_rate = learning_rate super().train(train_context=train_context, callbacks=callbacks, default_plots=default_plots) return train_context
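A typical PPO training run might look like the following sketch (environment name and hyperparameters are illustrative; plot.Loss and plot.Rewards are among the default training plots):

from easyagents.agents import PpoAgent
from easyagents.callbacks import plot

ppo = PpoAgent('CartPole-v0', fc_layers=(75, 40))

# 50 iterations of 10 episodes each, evaluating the current policy every 5 iterations
ppo.train([plot.Rewards(), plot.Loss()], num_iterations=50,
          num_episodes_per_iteration=10, num_iterations_between_eval=5)
ppo.play(num_episodes=3)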
Ancestors (in MRO)
- easyagents.agents.EasyAgent
- abc.ABC
Methods
evaluate
def evaluate( self, num_episodes: int = 50, max_steps_per_episode: int = 50 )
Plays num_episodes with the current policy and computes metrics on rewards.
Args: num_episodes: number of episodes to play max_steps_per_episode: max steps per episode
Returns: extensible score metrics
View Source
def evaluate(self, num_episodes: int = 50, max_steps_per_episode: int = 50): """Plays num_episodes with the current policy and computes metrics on rewards. Args: num_episodes: number of episodes to play max_steps_per_episode: max steps per episode Returns: extensible score metrics """ play_context = core.PlayContext() play_context.max_steps_per_episode = max_steps_per_episode play_context.num_episodes = num_episodes self.play(play_context=play_context, default_plots=False) Metrics = namedtuple('Metrics', 'steps rewards') Rewards = namedtuple('Rewards', 'mean std min max all') all_rewards = list(play_context.sum_of_rewards.values()) mean_reward, std_reward, min_reward, max_reward = statistics.mean(all_rewards), statistics.stdev( all_rewards), min(all_rewards), max(all_rewards) rewards = Rewards(mean=mean_reward, std=std_reward, min=min_reward, max=max_reward, all=all_rewards) Steps = namedtuple('Steps', 'mean std min max all') all_num_steps = [] for i in play_context.rewards.keys(): all_num_steps.append(len(play_context.rewards[i])) mean_steps, std_steps, min_steps, max_steps = statistics.mean(all_num_steps), statistics.stdev( all_num_steps), min(all_num_steps), max(all_num_steps) steps = Steps(mean=mean_steps, std=std_steps, min=min_steps, max=max_steps, all=all_num_steps) metrics = Metrics(rewards=rewards, steps=steps) return metrics
play
def play( self, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None, num_episodes: int = 1, max_steps_per_episode: int = 1000, play_context: easyagents.core.PlayContext = None, default_plots: bool = None )
Plays num_episodes with the current policy.
Args: callbacks: list of callbacks called during each episode played num_episodes: number of episodes to play max_steps_per_episode: max steps per episode play_context: play configuration to be used. If set, overrides all other play context arguments default_plots: if set, adds a set of default callbacks (plot.State, plot.Rewards, ...)
Returns: the play_context containing the actions taken and the rewards received during play
View Source
def play(self, callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None, num_episodes: int = 1, max_steps_per_episode: int = 1000, play_context: core.PlayContext = None, default_plots: bool = None): """Plays num_episodes with the current policy. Args: callbacks: list of callbacks called during each episode play num_episodes: number of episodes to play max_steps_per_episode: max steps per episode play_context: play configuration to be used. If set override all other play context arguments default_plots: if set addes a set of default callbacks (plot.State, plot.Rewards, ...) Returns: play_context containg the actions taken and the rewards received during training """ assert self._backend_agent._agent_context._is_policy_trained, "No trained policy available. Call train() first." if play_context is None: play_context = core.PlayContext() play_context.max_steps_per_episode = max_steps_per_episode play_context.num_episodes = num_episodes callbacks = self._to_callback_list(callbacks=callbacks) callbacks = self._add_plot_callbacks(callbacks, default_plots, [plot.Steps(), plot.Rewards()]) self._backend_agent.play(play_context=play_context, callbacks=callbacks) return play_context
save
def save( self, directory: Union[str, NoneType] = None, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None ) -> str
Saves the currently trained actor policy in directory.
If save is called before a trained policy is created, eg by calling train, an exception is raised.
Args: directory: the directory to save the policy weights to. if the directory does not exist yet, a new directory is created. if None the policy is saved in a temp directory. callbacks: list of callbacks called during save (eg log.Agent)
Returns: the absolute path to the directory containing the saved policy.
View Source
def save(self, directory: Optional[str] = None, callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None) -> str: """Saves the currently trained actor policy in directory. If save is called before a trained policy is created, eg by calling train, an exception is raised. Args: directory: the directory to save the policy weights to. if the directory does not exist yet, a new directory is created. if None the policy is saved in a temp directory. callbacks: list of callbacks called during save (eg log.Agent) Returns: the absolute path to the directory containing the saved policy. """ if directory is None: directory = bcore._get_temp_path() assert directory assert self._backend_agent._agent_context._is_policy_trained, "No trained policy available. Call train() first." directory = bcore._mkdir(directory) agent_json_path = os.path.join(directory, EasyAgent._KEY_EASYAGENT_FILENAME) with open(agent_json_path, 'w') as jsonfile: agent_dict = self._to_dict() json.dump(agent_dict, jsonfile, sort_keys=True, indent=2) callbacks = self._to_callback_list(callbacks=callbacks) policy_directory = os.path.join(directory, EasyAgent._KEY_POLICY_DIRECTORY) policy_directory = bcore._mkdir(policy_directory) self._backend_agent.save(directory=policy_directory, callbacks=callbacks) return directory
train
def train( self, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None, num_iterations: int = 100, num_episodes_per_iteration: int = 10, max_steps_per_episode: int = 500, num_epochs_per_iteration: int = 10, num_iterations_between_eval: int = 5, num_episodes_per_eval: int = 10, learning_rate: float = 0.001, train_context: easyagents.core.PpoTrainContext = None, default_plots: bool = None )
Trains a new model using the gym environment passed during instantiation.
Args: callbacks: list of callbacks called during training and evaluation num_iterations: number of times the training is repeated (with additional data) num_episodes_per_iteration: number of episodes played per training iteration max_steps_per_episode: maximum number of steps per episode num_epochs_per_iteration: number of times the data collected for the current iteration is used to retrain the current policy num_iterations_between_eval: number of training iterations before the current policy is evaluated. if 0 no evaluation is performed. num_episodes_per_eval: number of episodes played to estimate the average return and steps learning_rate: the learning rate used in the next iteration's policy training (0,1] train_context: training configuration to be used. if set overrides all other training context arguments. default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...). if None default callbacks are only added if the callbacks list is empty
Returns: train_context: the training configuration containing the loss and sum of rewards encountered during training
View Source
def train(self, callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None, num_iterations: int = 100, num_episodes_per_iteration: int = 10, max_steps_per_episode: int = 500, num_epochs_per_iteration: int = 10, num_iterations_between_eval: int = 5, num_episodes_per_eval: int = 10, learning_rate: float = 0.001, train_context: core.PpoTrainContext = None, default_plots: bool = None): """Trains a new model using the gym environment passed during instantiation. Args: callbacks: list of callbacks called during training and evaluation num_iterations: number of times the training is repeated (with additional data) num_episodes_per_iteration: number of episodes played per training iteration max_steps_per_episode: maximum number of steps per episode num_epochs_per_iteration: number of times the data collected for the current iteration is used to retrain the current policy num_iterations_between_eval: number of training iterations before the current policy is evaluated. if 0 no evaluation is performed. num_episodes_per_eval: number of episodes played to estimate the average return and steps learning_rate: the learning rate used in the next iteration's policy training (0,1] train_context: training configuration to be used. if set overrides all other training context arguments. default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...). if None default callbacks are only added if the callbacks list is empty Returns: train_context: the training configuration containing the loss and sum of rewards encountered during training """ if train_context is None: train_context = core.PpoTrainContext() train_context.num_iterations = num_iterations train_context.num_episodes_per_iteration = num_episodes_per_iteration train_context.max_steps_per_episode = max_steps_per_episode train_context.num_epochs_per_iteration = num_epochs_per_iteration train_context.num_iterations_between_eval = num_iterations_between_eval train_context.num_episodes_per_eval = num_episodes_per_eval train_context.learning_rate = learning_rate super().train(train_context=train_context, callbacks=callbacks, default_plots=default_plots) return train_context
RandomAgent
class RandomAgent( gym_env_name: str, fc_layers: Union[Tuple[int, ...], int, NoneType] = None, backend: str = None )
Agent which always chooses uniform random actions.
View Source
class RandomAgent(EasyAgent): """Agent which always chooses uniform random actions.""" def train(self, callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None, num_iterations: int = 10, max_steps_per_episode: int = 1000, num_episodes_per_eval: int = 10, train_context: core.TrainContext = None, default_plots: bool = None): """Evaluates the environment using a uniform random policy. The evaluation is performed in batches of num_episodes_per_eval episodes. Args: callbacks: list of callbacks called during training and evaluation num_iterations: number of times a batch of num_episodes_per_eval episodes is evaluated. max_steps_per_episode: maximum number of steps per episode num_episodes_per_eval: number of episodes played to estimate the average return and steps train_context: training configuration to be used. if set overrides all other training context arguments. default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...) Returns: train_context: the training configuration containing the loss and sum of rewards encountered during training """ if train_context is None: train_context = core.TrainContext() train_context.num_iterations = num_iterations train_context.max_steps_per_episode = max_steps_per_episode train_context.num_epochs_per_iteration = 0 train_context.num_iterations_between_eval = 1 train_context.num_episodes_per_eval = num_episodes_per_eval train_context.learning_rate = 1 super().train(train_context=train_context, callbacks=callbacks, default_plots=default_plots) return train_context
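RandomAgent is mainly useful as a baseline: train() learns nothing but plays batches of random episodes, so the resulting train_context can be compared against trained agents. A sketch (environment name illustrative):

from easyagents.agents import RandomAgent

baseline = RandomAgent('CartPole-v0')

# play 10 batches of 10 random episodes each and record their returns
tc = baseline.train(num_iterations=10, num_episodes_per_eval=10, max_steps_per_episode=200)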
Ancestors (in MRO)
- easyagents.agents.EasyAgent
- abc.ABC
Methods
evaluate
def evaluate( self, num_episodes: int = 50, max_steps_per_episode: int = 50 )
Plays num_episodes with the current policy and computes metrics on rewards.
Args: num_episodes: number of episodes to play max_steps_per_episode: max steps per episode
Returns: extensible score metrics
View Source
def evaluate(self, num_episodes: int = 50, max_steps_per_episode: int = 50): """Plays num_episodes with the current policy and computes metrics on rewards. Args: num_episodes: number of episodes to play max_steps_per_episode: max steps per episode Returns: extensible score metrics """ play_context = core.PlayContext() play_context.max_steps_per_episode = max_steps_per_episode play_context.num_episodes = num_episodes self.play(play_context=play_context, default_plots=False) Metrics = namedtuple('Metrics', 'steps rewards') Rewards = namedtuple('Rewards', 'mean std min max all') all_rewards = list(play_context.sum_of_rewards.values()) mean_reward, std_reward, min_reward, max_reward = statistics.mean(all_rewards), statistics.stdev( all_rewards), min(all_rewards), max(all_rewards) rewards = Rewards(mean=mean_reward, std=std_reward, min=min_reward, max=max_reward, all=all_rewards) Steps = namedtuple('Steps', 'mean std min max all') all_num_steps = [] for i in play_context.rewards.keys(): all_num_steps.append(len(play_context.rewards[i])) mean_steps, std_steps, min_steps, max_steps = statistics.mean(all_num_steps), statistics.stdev( all_num_steps), min(all_num_steps), max(all_num_steps) steps = Steps(mean=mean_steps, std=std_steps, min=min_steps, max=max_steps, all=all_num_steps) metrics = Metrics(rewards=rewards, steps=steps) return metrics
play
def play( self, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None, num_episodes: int = 1, max_steps_per_episode: int = 1000, play_context: easyagents.core.PlayContext = None, default_plots: bool = None )
Plays num_episodes with the current policy.
Args: callbacks: list of callbacks called during each episode played num_episodes: number of episodes to play max_steps_per_episode: max steps per episode play_context: play configuration to be used. If set, overrides all other play context arguments default_plots: if set, adds a set of default callbacks (plot.State, plot.Rewards, ...)
Returns: the play_context containing the actions taken and the rewards received during play
View Source
def play(self, callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None, num_episodes: int = 1, max_steps_per_episode: int = 1000, play_context: core.PlayContext = None, default_plots: bool = None): """Plays num_episodes with the current policy. Args: callbacks: list of callbacks called during each episode play num_episodes: number of episodes to play max_steps_per_episode: max steps per episode play_context: play configuration to be used. If set override all other play context arguments default_plots: if set addes a set of default callbacks (plot.State, plot.Rewards, ...) Returns: play_context containg the actions taken and the rewards received during training """ assert self._backend_agent._agent_context._is_policy_trained, "No trained policy available. Call train() first." if play_context is None: play_context = core.PlayContext() play_context.max_steps_per_episode = max_steps_per_episode play_context.num_episodes = num_episodes callbacks = self._to_callback_list(callbacks=callbacks) callbacks = self._add_plot_callbacks(callbacks, default_plots, [plot.Steps(), plot.Rewards()]) self._backend_agent.play(play_context=play_context, callbacks=callbacks) return play_context
save
def save( self, directory: Union[str, NoneType] = None, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None ) -> str
Saves the currently trained actor policy in directory.
If save is called before a trained policy is created, eg by calling train, an exception is raised.
Args: directory: the directory to save the policy weights to. if the directory does not exist yet, a new directory is created. if None the policy is saved in a temp directory. callbacks: list of callbacks called during save (eg log.Agent)
Returns: the absolute path to the directory containing the saved policy.
View Source
def save(self, directory: Optional[str] = None, callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None) -> str: """Saves the currently trained actor policy in directory. If save is called before a trained policy is created, eg by calling train, an exception is raised. Args: directory: the directory to save the policy weights to. if the directory does not exist yet, a new directory is created. if None the policy is saved in a temp directory. callbacks: list of callbacks called during save (eg log.Agent) Returns: the absolute path to the directory containing the saved policy. """ if directory is None: directory = bcore._get_temp_path() assert directory assert self._backend_agent._agent_context._is_policy_trained, "No trained policy available. Call train() first." directory = bcore._mkdir(directory) agent_json_path = os.path.join(directory, EasyAgent._KEY_EASYAGENT_FILENAME) with open(agent_json_path, 'w') as jsonfile: agent_dict = self._to_dict() json.dump(agent_dict, jsonfile, sort_keys=True, indent=2) callbacks = self._to_callback_list(callbacks=callbacks) policy_directory = os.path.join(directory, EasyAgent._KEY_POLICY_DIRECTORY) policy_directory = bcore._mkdir(policy_directory) self._backend_agent.save(directory=policy_directory, callbacks=callbacks) return directory
train
def train( self, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None, num_iterations: int = 10, max_steps_per_episode: int = 1000, num_episodes_per_eval: int = 10, train_context: easyagents.core.TrainContext = None, default_plots: bool = None )
Evaluates the environment using a uniform random policy.
The evaluation is performed in batches of num_episodes_per_eval episodes.
Args: callbacks: list of callbacks called during training and evaluation num_iterations: number of times a batch of num_episodes_per_eval episodes is evaluated. max_steps_per_episode: maximum number of steps per episode num_episodes_per_eval: number of episodes played to estimate the average return and steps train_context: training configuration to be used. if set overrides all other training context arguments. default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...)
Returns: train_context: the training configuration containing the loss and sum of rewards encountered during training
View Source
def train(self,
          callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None,
          num_iterations: int = 10,
          max_steps_per_episode: int = 1000,
          num_episodes_per_eval: int = 10,
          train_context: core.TrainContext = None,
          default_plots: bool = None):
    """Evaluates the environment using a uniform random policy.

    The evaluation is performed in batches of num_episodes_per_eval episodes.

    Args:
        callbacks: list of callbacks called during training and evaluation
        num_iterations: number of times a batch of num_episodes_per_eval episodes is evaluated.
        max_steps_per_episode: maximum number of steps per episode
        num_episodes_per_eval: number of episodes played to estimate the average return and steps
        train_context: training configuration to be used. if set overrides all other training context arguments.
        default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...)

    Returns:
        train_context: the training configuration containing the loss and sum of rewards encountered during training
    """
    if train_context is None:
        train_context = core.TrainContext()
        train_context.num_iterations = num_iterations
        train_context.max_steps_per_episode = max_steps_per_episode
        train_context.num_epochs_per_iteration = 0
        train_context.num_iterations_between_eval = 1
        train_context.num_episodes_per_eval = num_episodes_per_eval
        train_context.learning_rate = 1
    super().train(train_context=train_context, callbacks=callbacks, default_plots=default_plots)
    return train_context
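For illustration, a minimal sketch of running this random-policy baseline. The class name RandomAgent and the 'CartPole-v0' environment are assumptions not shown in this excerpt; the train() call here only evaluates batches of random episodes, it does not learn a policy.

from easyagents.agents import RandomAgent  # assumed class name for the random-policy baseline

# evaluate a uniform random policy in 5 batches of 10 episodes each
random_agent = RandomAgent(gym_env_name='CartPole-v0')
random_agent.train(num_iterations=5, num_episodes_per_eval=10, max_steps_per_episode=200)
random_agent.play(num_episodes=3)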
ReinforceAgent
class ReinforceAgent( gym_env_name: str, fc_layers: Union[Tuple[int, ...], int, NoneType] = None, backend: str = None )
creates a new agent based on the Reinforce algorithm.
Reinforce is a vanilla policy gradient algorithm using a single actor network.
see also: www-anw.cs.umass.edu/~barto/courses/cs687/williams92simple.pdf
View Source
class ReinforceAgent(EasyAgent):
    """creates a new agent based on the Reinforce algorithm.

    Reinforce is a vanilla policy gradient algorithm using a single actor network.

    see also: www-anw.cs.umass.edu/~barto/courses/cs687/williams92simple.pdf
    """

    def train(self,
              callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None,
              num_iterations: int = 100,
              num_episodes_per_iteration: int = 10,
              max_steps_per_episode: int = 500,
              num_epochs_per_iteration: int = 10,
              num_iterations_between_eval: int = 5,
              num_episodes_per_eval: int = 10,
              learning_rate: float = 0.001,
              train_context: core.EpisodesTrainContext = None,
              default_plots: bool = None):
        """Trains a new model using the gym environment passed during instantiation.

        Args:
            callbacks: list of callbacks called during training and evaluation
            num_iterations: number of times the training is repeated (with additional data)
            num_episodes_per_iteration: number of episodes played per training iteration
            max_steps_per_episode: maximum number of steps per episode
            num_epochs_per_iteration: number of times the data collected for the current iteration
                is used to retrain the current policy
            num_iterations_between_eval: number of training iterations before the current policy is evaluated.
                if 0 no evaluation is performed.
            num_episodes_per_eval: number of episodes played to estimate the average return and steps
            learning_rate: the learning rate used in the next iteration's policy training (0,1]
            train_context: training configuration to be used. if set overrides all other training context arguments.
            default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...).
                if None default callbacks are only added if the callbacks list is empty

        Returns:
            train_context: the training configuration containing the loss and sum of rewards encountered during training
        """
        if train_context is None:
            train_context = core.EpisodesTrainContext()
            train_context.num_iterations = num_iterations
            train_context.num_episodes_per_iteration = num_episodes_per_iteration
            train_context.max_steps_per_episode = max_steps_per_episode
            train_context.num_epochs_per_iteration = num_epochs_per_iteration
            train_context.num_iterations_between_eval = num_iterations_between_eval
            train_context.num_episodes_per_eval = num_episodes_per_eval
            train_context.learning_rate = learning_rate
        super().train(train_context=train_context, callbacks=callbacks, default_plots=default_plots)
        return train_context
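As a quick orientation, a minimal usage sketch for ReinforceAgent; the 'CartPole-v0' environment and the fc_layers value are illustrative assumptions, the defaults of train() match the signature above (100 iterations of 10 episodes each).

from easyagents.agents import ReinforceAgent

# create an agent for a gym environment with a small actor network
agent = ReinforceAgent('CartPole-v0', fc_layers=(100,))
agent.train()                 # train with the default schedule
agent.play(num_episodes=3)    # replay a few episodes with the trained policy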
Ancestors (in MRO)
- easyagents.agents.EasyAgent
- abc.ABC
Methods
evaluate
def evaluate( self, num_episodes: int = 50, max_steps_per_episode: int = 50 )
Plays num_episodes with the current policy and computes metrics on rewards.
Args: num_episodes: number of episodes to play max_steps_per_episode: max steps per episode
Returns: extensible score metrics
View Source
def evaluate(self, num_episodes: int = 50, max_steps_per_episode: int = 50):
    """Plays num_episodes with the current policy and computes metrics on rewards.

    Args:
        num_episodes: number of episodes to play
        max_steps_per_episode: max steps per episode

    Returns:
        extensible score metrics
    """
    play_context = core.PlayContext()
    play_context.max_steps_per_episode = max_steps_per_episode
    play_context.num_episodes = num_episodes
    self.play(play_context=play_context, default_plots=False)

    Metrics = namedtuple('Metrics', 'steps rewards')

    Rewards = namedtuple('Rewards', 'mean std min max all')
    all_rewards = list(play_context.sum_of_rewards.values())
    mean_reward, std_reward, min_reward, max_reward = \
        statistics.mean(all_rewards), statistics.stdev(all_rewards), min(all_rewards), max(all_rewards)
    rewards = Rewards(mean=mean_reward, std=std_reward, min=min_reward, max=max_reward, all=all_rewards)

    Steps = namedtuple('Steps', 'mean std min max all')
    all_num_steps = []
    for i in play_context.rewards.keys():
        all_num_steps.append(len(play_context.rewards[i]))
    mean_steps, std_steps, min_steps, max_steps = \
        statistics.mean(all_num_steps), statistics.stdev(all_num_steps), min(all_num_steps), max(all_num_steps)
    steps = Steps(mean=mean_steps, std=std_steps, min=min_steps, max=max_steps, all=all_num_steps)

    metrics = Metrics(rewards=rewards, steps=steps)
    return metrics
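Accessing the returned metrics might look like the sketch below; the field names follow the namedtuples built in the source above, while the agent itself is assumed to be a previously trained ReinforceAgent.

# agent: a previously trained ReinforceAgent (see the quick-start sketch above)
metrics = agent.evaluate(num_episodes=20, max_steps_per_episode=200)
print(f"reward: mean={metrics.rewards.mean:.1f} std={metrics.rewards.std:.1f} "
      f"min={metrics.rewards.min:.1f} max={metrics.rewards.max:.1f}")
print(f"episode length: mean={metrics.steps.mean:.1f} steps over {len(metrics.steps.all)} episodes")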
play
def play( self, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None, num_episodes: int = 1, max_steps_per_episode: int = 1000, play_context: easyagents.core.PlayContext = None, default_plots: bool = None )
Plays num_episodes with the current policy.
Args: callbacks: list of callbacks called during each episode play num_episodes: number of episodes to play max_steps_per_episode: max steps per episode play_context: play configuration to be used. If set, overrides all other play context arguments default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, ...)
Returns: play_context containing the actions taken and the rewards received during play
View Source
def play(self,
         callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None,
         num_episodes: int = 1,
         max_steps_per_episode: int = 1000,
         play_context: core.PlayContext = None,
         default_plots: bool = None):
    """Plays num_episodes with the current policy.

    Args:
        callbacks: list of callbacks called during each episode play
        num_episodes: number of episodes to play
        max_steps_per_episode: max steps per episode
        play_context: play configuration to be used. If set, overrides all other play context arguments
        default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, ...)

    Returns:
        play_context containing the actions taken and the rewards received during play
    """
    assert self._backend_agent._agent_context._is_policy_trained, \
        "No trained policy available. Call train() first."
    if play_context is None:
        play_context = core.PlayContext()
        play_context.max_steps_per_episode = max_steps_per_episode
        play_context.num_episodes = num_episodes
    callbacks = self._to_callback_list(callbacks=callbacks)
    callbacks = self._add_plot_callbacks(callbacks, default_plots, [plot.Steps(), plot.Rewards()])
    self._backend_agent.play(play_context=play_context, callbacks=callbacks)
    return play_context
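A sketch of passing plot callbacks to play(); a single callback or a list is accepted (see _to_callback_list in the source above), and plot.Steps/plot.Rewards are the defaults added when no callbacks are given. The episode counts are illustrative.

from easyagents.callbacks import plot

# agent: a previously trained ReinforceAgent
play_context = agent.play(num_episodes=5,
                          max_steps_per_episode=200,
                          callbacks=[plot.Steps(), plot.Rewards()])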
save
def save( self, directory: Union[str, NoneType] = None, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None ) -> str
Saves the currently trained actor policy in directory.
If save is called before a trained policy is created, eg by calling train, an exception is raised.
Args: directory: the directory to save the policy weights to. if the directory does not exist yet, a new directory is created. if None the policy is saved in a temp directory. callbacks: list of callbacks called during save (eg log.Agent)
Returns: the absolute path to the directory containing the saved policy.
View Source
def save(self,
         directory: Optional[str] = None,
         callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None) -> str:
    """Saves the currently trained actor policy in directory.

    If save is called before a trained policy is created, eg by calling train, an exception is raised.

    Args:
        directory: the directory to save the policy weights to.
            if the directory does not exist yet, a new directory is created.
            if None the policy is saved in a temp directory.
        callbacks: list of callbacks called during save (eg log.Agent)

    Returns:
        the absolute path to the directory containing the saved policy.
    """
    if directory is None:
        directory = bcore._get_temp_path()
    assert directory
    assert self._backend_agent._agent_context._is_policy_trained, \
        "No trained policy available. Call train() first."
    directory = bcore._mkdir(directory)
    agent_json_path = os.path.join(directory, EasyAgent._KEY_EASYAGENT_FILENAME)
    with open(agent_json_path, 'w') as jsonfile:
        agent_dict = self._to_dict()
        json.dump(agent_dict, jsonfile, sort_keys=True, indent=2)
    callbacks = self._to_callback_list(callbacks=callbacks)
    policy_directory = os.path.join(directory, EasyAgent._KEY_POLICY_DIRECTORY)
    policy_directory = bcore._mkdir(policy_directory)
    self._backend_agent.save(directory=policy_directory, callbacks=callbacks)
    return directory
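A save/load round trip might look like the sketch below; load() is the module-level function of easyagents.agents, and passing directory=None lets save() pick a temp directory as described in the docstring. The variable names are illustrative.

import easyagents.agents

# agent: a previously trained ReinforceAgent
saved_dir = agent.save()                       # None -> the policy is written to a temp directory
restored = easyagents.agents.load(saved_dir)   # restore the agent from the saved directory
restored.play(num_episodes=2)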
train
def train( self, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None, num_iterations: int = 100, num_episodes_per_iteration: int = 10, max_steps_per_episode: int = 500, num_epochs_per_iteration: int = 10, num_iterations_between_eval: int = 5, num_episodes_per_eval: int = 10, learning_rate: float = 0.001, train_context: easyagents.core.EpisodesTrainContext = None, default_plots: bool = None )
Trains a new model using the gym environment passed during instantiation.
Args: callbacks: list of callbacks called during training and evaluation num_iterations: number of times the training is repeated (with additional data) num_episodes_per_iteration: number of episodes played per training iteration max_steps_per_episode: maximum number of steps per episode num_epochs_per_iteration: number of times the data collected for the current iteration is used to retrain the current policy num_iterations_between_eval: number of training iterations before the current policy is evaluated. if 0 no evaluation is performed. num_episodes_per_eval: number of episodes played to estimate the average return and steps learning_rate: the learning rate used in the next iteration's policy training (0,1] train_context: training configuration to be used. if set overrides all other training context arguments. default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...). if None default callbacks are only added if the callbacks list is empty
Returns: train_context: the training configuration containing the loss and sum of rewards encountered during training
View Source
def train(self,
          callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None,
          num_iterations: int = 100,
          num_episodes_per_iteration: int = 10,
          max_steps_per_episode: int = 500,
          num_epochs_per_iteration: int = 10,
          num_iterations_between_eval: int = 5,
          num_episodes_per_eval: int = 10,
          learning_rate: float = 0.001,
          train_context: core.EpisodesTrainContext = None,
          default_plots: bool = None):
    """Trains a new model using the gym environment passed during instantiation.

    Args:
        callbacks: list of callbacks called during training and evaluation
        num_iterations: number of times the training is repeated (with additional data)
        num_episodes_per_iteration: number of episodes played per training iteration
        max_steps_per_episode: maximum number of steps per episode
        num_epochs_per_iteration: number of times the data collected for the current iteration
            is used to retrain the current policy
        num_iterations_between_eval: number of training iterations before the current policy is evaluated.
            if 0 no evaluation is performed.
        num_episodes_per_eval: number of episodes played to estimate the average return and steps
        learning_rate: the learning rate used in the next iteration's policy training (0,1]
        train_context: training configuration to be used. if set overrides all other training context arguments.
        default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...).
            if None default callbacks are only added if the callbacks list is empty

    Returns:
        train_context: the training configuration containing the loss and sum of rewards encountered during training
    """
    if train_context is None:
        train_context = core.EpisodesTrainContext()
        train_context.num_iterations = num_iterations
        train_context.num_episodes_per_iteration = num_episodes_per_iteration
        train_context.max_steps_per_episode = max_steps_per_episode
        train_context.num_epochs_per_iteration = num_epochs_per_iteration
        train_context.num_iterations_between_eval = num_iterations_between_eval
        train_context.num_episodes_per_eval = num_episodes_per_eval
        train_context.learning_rate = learning_rate
    super().train(train_context=train_context, callbacks=callbacks, default_plots=default_plots)
    return train_context
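A sketch of a customized ReinforceAgent training run; the hyperparameter values are arbitrary examples, and plot.Loss/plot.Rewards are the plotting callbacks referenced in the docstring above.

from easyagents.callbacks import plot

# agent: a ReinforceAgent as in the quick-start sketch above
train_context = agent.train(callbacks=[plot.Loss(), plot.Rewards()],
                            num_iterations=50,
                            num_episodes_per_iteration=20,
                            num_iterations_between_eval=10,
                            learning_rate=0.0005)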
SacAgent
class SacAgent( gym_env_name: str, fc_layers: Union[Tuple[int, ...], int, NoneType] = None, backend: str = None )
Agent based on the Soft-Actor-Critic algorithm (https://arxiv.org/abs/1812.05905).
View Source
class SacAgent(DqnAgent):
    """Agent based on the Soft-Actor-Critic algorithm (https://arxiv.org/abs/1812.05905)."""
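SacAgent is created like the other agents. Note that soft actor-critic is typically used with continuous action spaces, so the 'Pendulum-v0' environment and the fc_layers value in the sketch below are illustrative assumptions rather than part of this documentation.

from easyagents.agents import SacAgent

sac = SacAgent('Pendulum-v0', fc_layers=(256, 256))
sac.train(num_iterations=5000, num_iterations_between_eval=500)
sac.play(num_episodes=2)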
Ancestors (in MRO)
- easyagents.agents.DqnAgent
- easyagents.agents.EasyAgent
- abc.ABC
Methods
evaluate
def evaluate( self, num_episodes: int = 50, max_steps_per_episode: int = 50 )
Plays num_episodes with the current policy and computes metrics on rewards.
Args: num_episodes: number of episodes to play max_steps_per_episode: max steps per episode
Returns: extensible score metrics
View Source
def evaluate(self, num_episodes: int = 50, max_steps_per_episode: int = 50):
    """Plays num_episodes with the current policy and computes metrics on rewards.

    Args:
        num_episodes: number of episodes to play
        max_steps_per_episode: max steps per episode

    Returns:
        extensible score metrics
    """
    play_context = core.PlayContext()
    play_context.max_steps_per_episode = max_steps_per_episode
    play_context.num_episodes = num_episodes
    self.play(play_context=play_context, default_plots=False)

    Metrics = namedtuple('Metrics', 'steps rewards')

    Rewards = namedtuple('Rewards', 'mean std min max all')
    all_rewards = list(play_context.sum_of_rewards.values())
    mean_reward, std_reward, min_reward, max_reward = \
        statistics.mean(all_rewards), statistics.stdev(all_rewards), min(all_rewards), max(all_rewards)
    rewards = Rewards(mean=mean_reward, std=std_reward, min=min_reward, max=max_reward, all=all_rewards)

    Steps = namedtuple('Steps', 'mean std min max all')
    all_num_steps = []
    for i in play_context.rewards.keys():
        all_num_steps.append(len(play_context.rewards[i]))
    mean_steps, std_steps, min_steps, max_steps = \
        statistics.mean(all_num_steps), statistics.stdev(all_num_steps), min(all_num_steps), max(all_num_steps)
    steps = Steps(mean=mean_steps, std=std_steps, min=min_steps, max=max_steps, all=all_num_steps)

    metrics = Metrics(rewards=rewards, steps=steps)
    return metrics
play
def play( self, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None, num_episodes: int = 1, max_steps_per_episode: int = 1000, play_context: easyagents.core.PlayContext = None, default_plots: bool = None )
Plays num_episodes with the current policy.
Args: callbacks: list of callbacks called during each episode play num_episodes: number of episodes to play max_steps_per_episode: max steps per episode play_context: play configuration to be used. If set, overrides all other play context arguments default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, ...)
Returns: play_context containing the actions taken and the rewards received during play
View Source
def play(self,
         callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None,
         num_episodes: int = 1,
         max_steps_per_episode: int = 1000,
         play_context: core.PlayContext = None,
         default_plots: bool = None):
    """Plays num_episodes with the current policy.

    Args:
        callbacks: list of callbacks called during each episode play
        num_episodes: number of episodes to play
        max_steps_per_episode: max steps per episode
        play_context: play configuration to be used. If set, overrides all other play context arguments
        default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, ...)

    Returns:
        play_context containing the actions taken and the rewards received during play
    """
    assert self._backend_agent._agent_context._is_policy_trained, \
        "No trained policy available. Call train() first."
    if play_context is None:
        play_context = core.PlayContext()
        play_context.max_steps_per_episode = max_steps_per_episode
        play_context.num_episodes = num_episodes
    callbacks = self._to_callback_list(callbacks=callbacks)
    callbacks = self._add_plot_callbacks(callbacks, default_plots, [plot.Steps(), plot.Rewards()])
    self._backend_agent.play(play_context=play_context, callbacks=callbacks)
    return play_context
save
def save( self, directory: Union[str, NoneType] = None, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None ) -> str
Saves the currently trained actor policy in directory.
If save is called before a trained policy is created, eg by calling train, an exception is raised.
Args: directory: the directory to save the policy weights to. if the directory does not exist yet, a new directory is created. if None the policy is saved in a temp directory. callbacks: list of callbacks called during save (eg log.Agent)
Returns: the absolute path to the directory containing the saved policy.
View Source
def save(self,
         directory: Optional[str] = None,
         callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None) -> str:
    """Saves the currently trained actor policy in directory.

    If save is called before a trained policy is created, eg by calling train, an exception is raised.

    Args:
        directory: the directory to save the policy weights to.
            if the directory does not exist yet, a new directory is created.
            if None the policy is saved in a temp directory.
        callbacks: list of callbacks called during save (eg log.Agent)

    Returns:
        the absolute path to the directory containing the saved policy.
    """
    if directory is None:
        directory = bcore._get_temp_path()
    assert directory
    assert self._backend_agent._agent_context._is_policy_trained, \
        "No trained policy available. Call train() first."
    directory = bcore._mkdir(directory)
    agent_json_path = os.path.join(directory, EasyAgent._KEY_EASYAGENT_FILENAME)
    with open(agent_json_path, 'w') as jsonfile:
        agent_dict = self._to_dict()
        json.dump(agent_dict, jsonfile, sort_keys=True, indent=2)
    callbacks = self._to_callback_list(callbacks=callbacks)
    policy_directory = os.path.join(directory, EasyAgent._KEY_POLICY_DIRECTORY)
    policy_directory = bcore._mkdir(policy_directory)
    self._backend_agent.save(directory=policy_directory, callbacks=callbacks)
    return directory
train
def train( self, callbacks: Union[List[easyagents.core.AgentCallback], easyagents.core.AgentCallback, NoneType] = None, num_iterations: int = 20000, max_steps_per_episode: int = 500, num_steps_per_iteration: int = 1, num_steps_buffer_preload=1000, num_steps_sampled_from_buffer=64, num_iterations_between_eval: int = 1000, num_episodes_per_eval: int = 10, learning_rate: float = 0.001, train_context: easyagents.core.StepsTrainContext = None, default_plots: bool = None )
Trains a new model using the gym environment passed during instantiation.
Args: callbacks: list of callbacks called during training and evaluation num_iterations: number of times the training is repeated (with additional data) max_steps_per_episode: maximum number of steps per episode num_steps_per_iteration: number of steps played per training iteration num_steps_buffer_preload: number of initial collect steps to preload the buffer num_steps_sampled_from_buffer: the number of steps sampled from the buffer for each training iteration num_iterations_between_eval: number of training iterations before the current policy is evaluated. if 0 no evaluation is performed. num_episodes_per_eval: number of episodes played to estimate the average return and steps learning_rate: the learning rate used in the next iteration's policy training (0,1] train_context: training configuration to be used. if set overrides all other training context arguments. default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...). if None default callbacks are only added if the callbacks list is empty
Returns: train_context: the training configuration containing the loss and sum of rewards encountered during training
View Source
def train(self,
          callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None,
          num_iterations: int = 20000,
          max_steps_per_episode: int = 500,
          num_steps_per_iteration: int = 1,
          num_steps_buffer_preload=1000,
          num_steps_sampled_from_buffer=64,
          num_iterations_between_eval: int = 1000,
          num_episodes_per_eval: int = 10,
          learning_rate: float = 0.001,
          train_context: core.StepsTrainContext = None,
          default_plots: bool = None):
    """Trains a new model using the gym environment passed during instantiation.

    Args:
        callbacks: list of callbacks called during training and evaluation
        num_iterations: number of times the training is repeated (with additional data)
        max_steps_per_episode: maximum number of steps per episode
        num_steps_per_iteration: number of steps played per training iteration
        num_steps_buffer_preload: number of initial collect steps to preload the buffer
        num_steps_sampled_from_buffer: the number of steps sampled from the buffer for each training iteration
        num_iterations_between_eval: number of training iterations before the current policy is evaluated.
            if 0 no evaluation is performed.
        num_episodes_per_eval: number of episodes played to estimate the average return and steps
        learning_rate: the learning rate used in the next iteration's policy training (0,1]
        train_context: training configuration to be used. if set overrides all other training context arguments.
        default_plots: if set adds a set of default callbacks (plot.State, plot.Rewards, plot.Loss,...).
            if None default callbacks are only added if the callbacks list is empty

    Returns:
        train_context: the training configuration containing the loss and sum of rewards encountered during training
    """
    if train_context is None:
        train_context = core.StepsTrainContext()
        train_context.num_iterations = num_iterations
        train_context.max_steps_per_episode = max_steps_per_episode
        train_context.num_steps_per_iteration = num_steps_per_iteration
        train_context.num_steps_buffer_preload = num_steps_buffer_preload
        train_context.num_steps_sampled_from_buffer = num_steps_sampled_from_buffer
        train_context.num_iterations_between_eval = num_iterations_between_eval
        train_context.num_episodes_per_eval = num_episodes_per_eval
        train_context.learning_rate = learning_rate
    super().train(train_context=train_context, callbacks=callbacks, default_plots=default_plots)
    return train_context
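Because this training loop is step-based (core.StepsTrainContext), shortening a run means reducing num_iterations (the number of collect/train steps) rather than episodes. A sketch with an explicit schedule follows; all values are illustrative and the sac variable is the SacAgent from the sketch further above.

# sac: a SacAgent as created in the earlier sketch
train_context = sac.train(num_iterations=10000,            # training iterations (steps collected)
                          num_steps_per_iteration=1,        # steps played per training iteration
                          num_steps_buffer_preload=1000,    # initial collect steps before training
                          num_steps_sampled_from_buffer=64, # batch sampled from the buffer per iteration
                          num_iterations_between_eval=1000,
                          num_episodes_per_eval=5,
                          learning_rate=0.0005)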