
Removing pytorch_rl code because it is broken. Will update README.

Maxime Chevalier-Boisvert, 6 years ago
parent commit a9653e81de

+ 0 - 21
pytorch_rl/LICENSE

@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2017 Ilya Kostrikov
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.

+ 0 - 70
pytorch_rl/arguments.py

@@ -1,70 +0,0 @@
-import argparse
-
-import torch
-
-
-def get_args():
-    parser = argparse.ArgumentParser(description='RL')
-    parser.add_argument('--algo', default='a2c',
-                        help='algorithm to use: a2c | ppo | acktr')
-    parser.add_argument('--lr', type=float, default=7e-4,
-                        help='learning rate (default: 7e-4)')
-    parser.add_argument('--eps', type=float, default=1e-5,
-                        help='RMSprop optimizer epsilon (default: 1e-5)')
-    parser.add_argument('--alpha', type=float, default=0.99,
-                        help='RMSprop optimizer alpha (default: 0.99)')
-    parser.add_argument('--gamma', type=float, default=0.99,
-                        help='discount factor for rewards (default: 0.99)')
-    parser.add_argument('--use-gae', action='store_true', default=False,
-                        help='use generalized advantage estimation')
-    parser.add_argument('--tau', type=float, default=0.95,
-                        help='gae parameter (default: 0.95)')
-    parser.add_argument('--entropy-coef', type=float, default=0.01,
-                        help='entropy term coefficient (default: 0.01)')
-    parser.add_argument('--value-loss-coef', type=float, default=0.5,
-                        help='value loss coefficient (default: 0.5)')
-    parser.add_argument('--max-grad-norm', type=float, default=0.5,
-                        help='max norm of gradients (default: 0.5)')
-    parser.add_argument('--seed', type=int, default=1,
-                        help='random seed (default: 1)')
-    parser.add_argument('--num-processes', type=int, default=32,
-                        help='how many training CPU processes to use (default: 32)')
-    parser.add_argument('--num-steps', type=int, default=5,
-                        help='number of forward steps in A2C (default: 5)')
-    parser.add_argument('--ppo-epoch', type=int, default=4,
-                        help='number of ppo epochs (default: 4)')
-    parser.add_argument('--num-mini-batch', type=int, default=32,
-                        help='number of batches for ppo (default: 32)')
-    parser.add_argument('--clip-param', type=float, default=0.2,
-                        help='ppo clip parameter (default: 0.2)')
-    parser.add_argument('--num-stack', type=int, default=1,
-                        help='number of frames to stack (default: 1)')
-    parser.add_argument('--log-interval', type=int, default=10,
-                        help='log interval, one log per n updates (default: 10)')
-    parser.add_argument('--save-interval', type=int, default=100,
-                        help='save interval, one save per n updates (default: 100)')
-    parser.add_argument('--vis-interval', type=int, default=10,
-                        help='vis interval, one plot update per n updates (default: 10)')
-    parser.add_argument('--num-frames', type=int, default=10e6,
-                        help='number of frames to train (default: 10e6)')
-    parser.add_argument('--env-name', default='PongNoFrameskip-v4',
-                        help='environment to train on (default: PongNoFrameskip-v4)')
-    parser.add_argument('--log-dir', default='/tmp/gym/',
-                        help='directory to save agent logs (default: /tmp/gym)')
-    parser.add_argument('--save-dir', default='./trained_models/',
-                        help='directory to save trained models (default: ./trained_models/)')
-    parser.add_argument('--no-cuda', action='store_true', default=False,
-                        help='disables CUDA training')
-    parser.add_argument('--recurrent-policy', action='store_true', default=True,
-                        help='use a recurrent policy')
-    parser.add_argument('--no-vis', action='store_true', default=False,
-                        help='disables visdom visualization')
-    args = parser.parse_args()
-
-    args.cuda = not args.no_cuda and torch.cuda.is_available()
-    args.vis = not args.no_vis
-
-    if not args.cuda:
-        print('*** WARNING: CUDA NOT ENABLED ***')
-
-    return args

+ 0 - 81
pytorch_rl/distributions.py

@@ -1,81 +0,0 @@
-import math
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.autograd import Variable
-from utils import AddBias
-
-
-class Categorical(nn.Module):
-    def __init__(self, num_inputs, num_outputs):
-        super(Categorical, self).__init__()
-        self.linear = nn.Linear(num_inputs, num_outputs)
-
-    def forward(self, x):
-        x = self.linear(x)
-        return x
-
-    def sample(self, x, deterministic):
-        x = self(x)
-
-        probs = F.softmax(x, dim=1)
-        if deterministic is False:
-            action = probs.multinomial()
-        else:
-            action = probs.max(1, keepdim=True)[1]
-        return action
-
-    def logprobs_and_entropy(self, x, actions):
-        x = self(x)
-
-        log_probs = F.log_softmax(x, dim=1)
-        probs = F.softmax(x, dim=1)
-
-        action_log_probs = log_probs.gather(1, actions)
-
-        dist_entropy = -(log_probs * probs).sum(-1).mean()
-        return action_log_probs, dist_entropy
-
-
-class DiagGaussian(nn.Module):
-    def __init__(self, num_inputs, num_outputs):
-        super(DiagGaussian, self).__init__()
-        self.fc_mean = nn.Linear(num_inputs, num_outputs)
-        self.logstd = AddBias(torch.zeros(num_outputs))
-
-    def forward(self, x):
-        action_mean = self.fc_mean(x)
-
-        #  An ugly hack for my KFAC implementation.
-        zeros = Variable(torch.zeros(action_mean.size()), volatile=x.volatile)
-        if x.is_cuda:
-            zeros = zeros.cuda()
-
-        action_logstd = self.logstd(zeros)
-        return action_mean, action_logstd
-
-    def sample(self, x, deterministic):
-        action_mean, action_logstd = self(x)
-
-        action_std = action_logstd.exp()
-
-        if deterministic is False:
-            noise = Variable(torch.randn(action_std.size()))
-            if action_std.is_cuda:
-                noise = noise.cuda()
-            action = action_mean + action_std * noise
-        else:
-            action = action_mean
-        return action
-
-    def logprobs_and_entropy(self, x, actions):
-        action_mean, action_logstd = self(x)
-
-        action_std = action_logstd.exp()
-
-        action_log_probs = -0.5 * ((actions - action_mean) / action_std).pow(2) - 0.5 * math.log(2 * math.pi) - action_logstd
-        action_log_probs = action_log_probs.sum(-1, keepdim=True)
-        dist_entropy = 0.5 + 0.5 * math.log(2 * math.pi) + action_logstd
-        dist_entropy = dist_entropy.sum(-1).mean()
-        return action_log_probs, dist_entropy

+ 0 - 77
pytorch_rl/enjoy.py

@@ -1,77 +0,0 @@
-import argparse
-import os
-import sys
-import types
-import time
-
-import numpy as np
-import torch
-from torch.autograd import Variable
-from vec_env.dummy_vec_env import DummyVecEnv
-
-from envs import make_env
-
-parser = argparse.ArgumentParser(description='RL')
-parser.add_argument('--seed', type=int, default=1,
-                    help='random seed (default: 1)')
-parser.add_argument('--num-stack', type=int, default=1,
-                    help='number of frames to stack (default: 1)')
-parser.add_argument('--log-interval', type=int, default=10,
-                    help='log interval, one log per n updates (default: 10)')
-parser.add_argument('--env-name', default='PongNoFrameskip-v4',
-                    help='environment to train on (default: PongNoFrameskip-v4)')
-parser.add_argument('--load-dir', default='./trained_models/',
-                    help='directory to load the trained model from (default: ./trained_models/)')
-args = parser.parse_args()
-
-env = make_env(args.env_name, args.seed, 0, None)
-env = DummyVecEnv([env])
-
-actor_critic, ob_rms = torch.load(os.path.join(args.load_dir, args.env_name + ".pt"))
-
-render_func = env.envs[0].render
-
-obs_shape = env.observation_space.shape
-obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
-current_obs = torch.zeros(1, *obs_shape)
-states = torch.zeros(1, actor_critic.state_size)
-masks = torch.zeros(1, 1)
-
-def update_current_obs(obs):
-    shape_dim0 = env.observation_space.shape[0]
-    obs = torch.from_numpy(obs).float()
-    if args.num_stack > 1:
-        current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
-    current_obs[:, -shape_dim0:] = obs
-
-render_func('human')
-obs = env.reset()
-update_current_obs(obs)
-
-while True:
-    value, action, _, states = actor_critic.act(
-        Variable(current_obs, volatile=True),
-        Variable(states, volatile=True),
-        Variable(masks, volatile=True),
-        deterministic=True
-    )
-    states = states.data
-    cpu_actions = action.data.squeeze(1).cpu().numpy()
-
-    # Observation, reward and next obs
-    obs, reward, done, _ = env.step(cpu_actions)
-
-    time.sleep(0.05)
-
-    masks.fill_(0.0 if done else 1.0)
-
-    if current_obs.dim() == 4:
-        current_obs *= masks.unsqueeze(2).unsqueeze(2)
-    else:
-        current_obs *= masks
-    update_current_obs(obs)
-
-    renderer = render_func('human')
-
-    if not renderer.window:
-        sys.exit(0)

+ 0 - 24
pytorch_rl/envs.py

@@ -1,24 +0,0 @@
-import os
-import numpy
-import gym
-from gym import spaces
-
-try:
-    import gym_minigrid
-    from gym_minigrid.wrappers import *
-except ImportError:
-    pass
-
-def make_env(env_id, seed, rank, log_dir):
-    def _thunk():
-        env = gym.make(env_id)
-
-        env.seed(seed + rank)
-
-        # Maxime: until RL code supports dict observations, squash observations into a flat vector
-        if isinstance(env.observation_space, spaces.Dict):
-            env = FlatObsWrapper(env)
-
-        return env
-
-    return _thunk

+ 0 - 239
pytorch_rl/kfac.py

@@ -1,239 +0,0 @@
-import math
-
-import torch
-import torch.optim as optim
-import torch.nn as nn
-import torch.nn.functional as F
-from utils import AddBias
-
-# TODO: In order to make this code faster:
-# 1) Implement _extract_patches as a single cuda kernel
-# 2) Compute QR decomposition in a separate process
-# 3) Actually make a general KFAC optimizer so it fits PyTorch
-
-
-def _extract_patches(x, kernel_size, stride, padding):
-    if padding[0] + padding[1] > 0:
-        x = F.pad(x, (padding[1], padding[1], padding[0],
-                      padding[0])).data  # Actually check dims
-    x = x.unfold(2, kernel_size[0], stride[0])
-    x = x.unfold(3, kernel_size[1], stride[1])
-    x = x.transpose_(1, 2).transpose_(2, 3).contiguous()
-    x = x.view(
-        x.size(0), x.size(1), x.size(2), x.size(3) * x.size(4) * x.size(5))
-    return x
-
-
-def compute_cov_a(a, classname, layer_info, fast_cnn):
-    batch_size = a.size(0)
-
-    if classname == 'Conv2d':
-        if fast_cnn:
-            a = _extract_patches(a, *layer_info)
-            a = a.view(a.size(0), -1, a.size(-1))
-            a = a.mean(1)
-        else:
-            a = _extract_patches(a, *layer_info)
-            a = a.view(-1, a.size(-1)).div_(a.size(1)).div_(a.size(2))
-    elif classname == 'AddBias':
-        is_cuda = a.is_cuda
-        a = torch.ones(a.size(0), 1)
-        if is_cuda:
-            a = a.cuda()
-
-    return a.t() @ (a / batch_size)
-
-
-def compute_cov_g(g, classname, layer_info, fast_cnn):
-    batch_size = g.size(0)
-
-    if classname == 'Conv2d':
-        if fast_cnn:
-            g = g.view(g.size(0), g.size(1), -1)
-            g = g.sum(-1)
-        else:
-            g = g.transpose(1, 2).transpose(2, 3).contiguous()
-            g = g.view(-1, g.size(-1)).mul_(g.size(1)).mul_(g.size(2))
-    elif classname == 'AddBias':
-        g = g.view(g.size(0), g.size(1), -1)
-        g = g.sum(-1)
-
-    g_ = g * batch_size
-    return g_.t() @ (g_ / g.size(0))
-
-
-def update_running_stat(aa, m_aa, momentum):
-    # Update the running average in place, keeping aa unchanged and avoiding extra tensor allocations
-    m_aa *= momentum / (1 - momentum)
-    m_aa += aa
-    m_aa *= (1 - momentum)
-
-
-class SplitBias(nn.Module):
-    def __init__(self, module):
-        super(SplitBias, self).__init__()
-        self.module = module
-        self.add_bias = AddBias(module.bias.data)
-        self.module.bias = None
-
-    def forward(self, input):
-        x = self.module(input)
-        x = self.add_bias(x)
-        return x
-
-
-class KFACOptimizer(optim.Optimizer):
-    def __init__(self,
-                 model,
-                 lr=0.25,
-                 momentum=0.9,
-                 stat_decay=0.99,
-                 kl_clip=0.001,
-                 damping=1e-2,
-                 weight_decay=0,
-                 fast_cnn=False,
-                 Ts=1,
-                 Tf=10):
-        defaults = dict()
-
-        def split_bias(module):
-            for mname, child in module.named_children():
-                if hasattr(child, 'bias'):
-                    module._modules[mname] = SplitBias(child)
-                else:
-                    split_bias(child)
-
-        split_bias(model)
-
-        super(KFACOptimizer, self).__init__(model.parameters(), defaults)
-
-        self.known_modules = {'Linear', 'Conv2d', 'AddBias'}
-
-        self.modules = []
-        self.grad_outputs = {}
-
-        self.model = model
-        self._prepare_model()
-
-        self.steps = 0
-
-        self.m_aa, self.m_gg = {}, {}
-        self.Q_a, self.Q_g = {}, {}
-        self.d_a, self.d_g = {}, {}
-
-        self.momentum = momentum
-        self.stat_decay = stat_decay
-
-        self.lr = lr
-        self.kl_clip = kl_clip
-        self.damping = damping
-        self.weight_decay = weight_decay
-
-        self.fast_cnn = fast_cnn
-
-        self.Ts = Ts
-        self.Tf = Tf
-
-        self.optim = optim.SGD(
-            model.parameters(),
-            lr=self.lr * (1 - self.momentum),
-            momentum=self.momentum)
-
-    def _save_input(self, module, input):
-        if input[0].volatile == False and self.steps % self.Ts == 0:
-            classname = module.__class__.__name__
-            layer_info = None
-            if classname == 'Conv2d':
-                layer_info = (module.kernel_size, module.stride,
-                              module.padding)
-
-            aa = compute_cov_a(input[0].data, classname, layer_info,
-                               self.fast_cnn)
-
-            # Initialize buffers
-            if self.steps == 0:
-                self.m_aa[module] = aa.clone()
-
-            update_running_stat(aa, self.m_aa[module], self.stat_decay)
-
-    def _save_grad_output(self, module, grad_input, grad_output):
-        if self.acc_stats:
-            classname = module.__class__.__name__
-            layer_info = None
-            if classname == 'Conv2d':
-                layer_info = (module.kernel_size, module.stride,
-                              module.padding)
-
-            gg = compute_cov_g(grad_output[0].data, classname,
-                               layer_info, self.fast_cnn)
-
-            # Initialize buffers
-            if self.steps == 0:
-                self.m_gg[module] = gg.clone()
-
-            update_running_stat(gg, self.m_gg[module], self.stat_decay)
-
-    def _prepare_model(self):
-        for module in self.model.modules():
-            classname = module.__class__.__name__
-            if classname in self.known_modules:
-                assert not ((classname in ['Linear', 'Conv2d']) and module.bias is not None), \
-                                    "Bias must be handled by a separate AddBias layer"
-
-                self.modules.append(module)
-                module.register_forward_pre_hook(self._save_input)
-                module.register_backward_hook(self._save_grad_output)
-
-    def step(self):
-        # Add weight decay
-        if self.weight_decay > 0:
-            for p in self.model.parameters():
-                p.grad.data.add_(self.weight_decay, p.data)
-
-        updates = {}
-        for i, m in enumerate(self.modules):
-            assert len(list(m.parameters())
-                       ) == 1, "Can handle only one parameter at the moment"
-            classname = m.__class__.__name__
-            p = next(m.parameters())
-
-            la = self.damping + self.weight_decay
-
-            if self.steps % self.Tf == 0:
-                # My asynchronous implementation exists, I will add it later.
-                # Experimenting with different ways to do this in PyTorch.
-                self.d_a[m], self.Q_a[m] = torch.symeig(
-                    self.m_aa[m], eigenvectors=True)
-                self.d_g[m], self.Q_g[m] = torch.symeig(
-                    self.m_gg[m], eigenvectors=True)
-
-                self.d_a[m].mul_((self.d_a[m] > 1e-6).float())
-                self.d_g[m].mul_((self.d_g[m] > 1e-6).float())
-
-            if classname == 'Conv2d':
-                p_grad_mat = p.grad.data.view(p.grad.data.size(0), -1)
-            else:
-                p_grad_mat = p.grad.data
-
-            v1 = self.Q_g[m].t() @ p_grad_mat @ self.Q_a[m]
-            v2 = v1 / (
-                self.d_g[m].unsqueeze(1) * self.d_a[m].unsqueeze(0) + la)
-            v = self.Q_g[m] @ v2 @ self.Q_a[m].t()
-
-            v = v.view(p.grad.data.size())
-            updates[p] = v
-
-        vg_sum = 0
-        for p in self.model.parameters():
-            v = updates[p]
-            vg_sum += (v * p.grad.data * self.lr * self.lr).sum()
-
-        nu = min(1, math.sqrt(self.kl_clip / vg_sum))
-
-        for p in self.model.parameters():
-            v = updates[p]
-            p.grad.data.copy_(v)
-            p.grad.data.mul_(nu)
-
-        self.optim.step()
-        self.steps += 1

+ 0 - 267
pytorch_rl/main.py

@@ -1,267 +0,0 @@
-import copy
-import glob
-import os
-import time
-import operator
-from functools import reduce
-
-import gym
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.optim as optim
-from torch.autograd import Variable
-
-from arguments import get_args
-from vec_env.dummy_vec_env import DummyVecEnv
-from vec_env.subproc_vec_env import SubprocVecEnv
-from envs import make_env
-from kfac import KFACOptimizer
-from model import Policy
-from storage import RolloutStorage
-from visualize import visdom_plot
-
-args = get_args()
-
-assert args.algo in ['a2c', 'ppo', 'acktr']
-if args.recurrent_policy:
-    assert args.algo in ['a2c', 'ppo'], 'Recurrent policy is not implemented for ACKTR'
-
-num_updates = int(args.num_frames) // args.num_steps // args.num_processes
-
-torch.manual_seed(args.seed)
-if args.cuda:
-    torch.cuda.manual_seed(args.seed)
-
-try:
-    os.makedirs(args.log_dir)
-except OSError:
-    files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
-    for f in files:
-        os.remove(f)
-
-def main():
-    os.environ['OMP_NUM_THREADS'] = '1'
-
-    envs = [make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes)]
-
-    if args.num_processes > 1:
-        envs = SubprocVecEnv(envs)
-    else:
-        envs = DummyVecEnv(envs)
-
-    obs_shape = envs.observation_space.shape
-    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
-    obs_numel = reduce(operator.mul, obs_shape, 1)
-
-    actor_critic = Policy(obs_numel, envs.action_space)
-
-    # Maxime: log some info about the model and its size
-    modelSize = 0
-    for p in actor_critic.parameters():
-        pSize = reduce(operator.mul, p.size(), 1)
-        modelSize += pSize
-    print(str(actor_critic))
-    print('Total model size: %d' % modelSize)
-
-    if envs.action_space.__class__.__name__ == "Discrete":
-        action_shape = 1
-    else:
-        action_shape = envs.action_space.shape[0]
-
-    if args.cuda:
-        actor_critic.cuda()
-
-    if args.algo == 'a2c':
-        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha)
-    elif args.algo == 'ppo':
-        optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
-    elif args.algo == 'acktr':
-        optimizer = KFACOptimizer(actor_critic)
-
-    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size)
-    current_obs = torch.zeros(args.num_processes, *obs_shape)
-
-    def update_current_obs(obs):
-        shape_dim0 = envs.observation_space.shape[0]
-        obs = torch.from_numpy(obs).float()
-        if args.num_stack > 1:
-            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
-        current_obs[:, -shape_dim0:] = obs
-
-    obs = envs.reset()
-    update_current_obs(obs)
-    rollouts.observations[0].copy_(current_obs)
-
-    # These variables are used to compute average rewards for all processes.
-    episode_rewards = torch.zeros([args.num_processes, 1])
-    final_rewards = torch.zeros([args.num_processes, 1])
-
-    if args.cuda:
-        current_obs = current_obs.cuda()
-        rollouts.cuda()
-
-    start = time.time()
-    for j in range(num_updates):
-        for step in range(args.num_steps):
-            # Sample actions
-            value, action, action_log_prob, states = actor_critic.act(
-                Variable(rollouts.observations[step], volatile=True),
-                Variable(rollouts.states[step], volatile=True),
-                Variable(rollouts.masks[step], volatile=True)
-            )
-            cpu_actions = action.data.squeeze(1).cpu().numpy()
-
-            # Observe reward and next obs
-            obs, reward, done, info = envs.step(cpu_actions)
-            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
-            episode_rewards += reward
-
-            # If done then clean the history of observations.
-            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
-            final_rewards *= masks
-            final_rewards += (1 - masks) * episode_rewards
-            episode_rewards *= masks
-
-            if args.cuda:
-                masks = masks.cuda()
-
-            if current_obs.dim() == 4:
-                current_obs *= masks.unsqueeze(2).unsqueeze(2)
-            elif current_obs.dim() == 3:
-                current_obs *= masks.unsqueeze(2)
-            else:
-                current_obs *= masks
-
-            update_current_obs(obs)
-            rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks)
-
-        next_value = actor_critic(
-            Variable(rollouts.observations[-1], volatile=True),
-            Variable(rollouts.states[-1], volatile=True),
-            Variable(rollouts.masks[-1], volatile=True)
-        )[0].data
-
-        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
-
-        if args.algo in ['a2c', 'acktr']:
-            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
-                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
-                Variable(rollouts.states[:-1].view(-1, actor_critic.state_size)),
-                Variable(rollouts.masks[:-1].view(-1, 1)),
-                Variable(rollouts.actions.view(-1, action_shape))
-            )
-
-            values = values.view(args.num_steps, args.num_processes, 1)
-            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)
-
-            advantages = Variable(rollouts.returns[:-1]) - values
-            value_loss = advantages.pow(2).mean()
-
-            action_loss = -(Variable(advantages.data) * action_log_probs).mean()
-
-            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
-                # Sampled fisher, see Martens 2014
-                actor_critic.zero_grad()
-                pg_fisher_loss = -action_log_probs.mean()
-
-                value_noise = Variable(torch.randn(values.size()))
-                if args.cuda:
-                    value_noise = value_noise.cuda()
-
-                sample_values = values + value_noise
-                vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean()
-
-                fisher_loss = pg_fisher_loss + vf_fisher_loss
-                optimizer.acc_stats = True
-                fisher_loss.backward(retain_graph=True)
-                optimizer.acc_stats = False
-
-            optimizer.zero_grad()
-            (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward()
-
-            if args.algo == 'a2c':
-                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
-
-            optimizer.step()
-        elif args.algo == 'ppo':
-            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
-            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)
-
-            for e in range(args.ppo_epoch):
-                if args.recurrent_policy:
-                    data_generator = rollouts.recurrent_generator(advantages, args.num_mini_batch)
-                else:
-                    data_generator = rollouts.feed_forward_generator(advantages, args.num_mini_batch)
-
-                for sample in data_generator:
-                    observations_batch, states_batch, actions_batch, \
-                       return_batch, masks_batch, old_action_log_probs_batch, \
-                            adv_targ = sample
-
-                    # Reshape to do in a single forward pass for all steps
-                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
-                        Variable(observations_batch),
-                        Variable(states_batch),
-                        Variable(masks_batch),
-                        Variable(actions_batch)
-                    )
-
-                    adv_targ = Variable(adv_targ)
-                    ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch))
-                    surr1 = ratio * adv_targ
-                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
-                    action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP)
-
-                    value_loss = (Variable(return_batch) - values).pow(2).mean()
-
-                    optimizer.zero_grad()
-                    (value_loss + action_loss - dist_entropy * args.entropy_coef).backward()
-                    nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
-                    optimizer.step()
-
-        rollouts.after_update()
-
-        if j % args.save_interval == 0 and args.save_dir != "":
-            save_path = os.path.join(args.save_dir, args.algo)
-            try:
-                os.makedirs(save_path)
-            except OSError:
-                pass
-
-            # A really ugly way to save a model to CPU
-            save_model = actor_critic
-            if args.cuda:
-                save_model = copy.deepcopy(actor_critic).cpu()
-
-            save_model = [save_model,
-                            hasattr(envs, 'ob_rms') and envs.ob_rms or None]
-
-            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))
-
-        if j % args.log_interval == 0:
-            end = time.time()
-            total_num_steps = (j + 1) * args.num_processes * args.num_steps
-            print(
-                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
-                format(
-                    j,
-                    total_num_steps,
-                    int(total_num_steps / (end - start)),
-                    final_rewards.mean(),
-                    final_rewards.median(),
-                    final_rewards.min(),
-                    final_rewards.max(), dist_entropy.data[0],
-                    value_loss.data[0], action_loss.data[0]
-                )
-            )
-
-        if args.vis and j % args.vis_interval == 0:
-            win = visdom_plot(
-                total_num_steps,
-                final_rewards.mean()
-            )
-
-if __name__ == "__main__":
-    main()

+ 0 - 101
pytorch_rl/model.py

@@ -1,101 +0,0 @@
-import operator
-from functools import reduce
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from distributions import Categorical, DiagGaussian
-from utils import orthogonal
-
-class FFPolicy(nn.Module):
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, inputs, states, masks):
-        raise NotImplementedError
-
-    def act(self, inputs, states, masks, deterministic=False):
-        value, x, states = self(inputs, states, masks)
-        action = self.dist.sample(x, deterministic=deterministic)
-        action_log_probs, dist_entropy = self.dist.logprobs_and_entropy(x, action)
-        return value, action, action_log_probs, states
-
-    def evaluate_actions(self, inputs, states, masks, actions):
-        value, x, states = self(inputs, states, masks)
-        action_log_probs, dist_entropy = self.dist.logprobs_and_entropy(x, actions)
-        return value, action_log_probs, dist_entropy, states
-
-def weights_init_mlp(m):
-    classname = m.__class__.__name__
-    if classname.find('Linear') != -1:
-        nn.init.xavier_normal(m.weight)
-        if m.bias is not None:
-            m.bias.data.fill_(0)
-
-class Policy(FFPolicy):
-    def __init__(self, num_inputs, action_space):
-        super().__init__()
-
-        self.action_space = action_space
-        assert action_space.__class__.__name__ == "Discrete"
-        num_outputs = action_space.n
-
-        self.fc1 = nn.Linear(num_inputs, 128)
-        self.fc2 = nn.Linear(128, 128)
-
-        # Input size, hidden state size
-        self.gru = nn.GRUCell(128, 128)
-
-        self.a_fc1 = nn.Linear(128, 128)
-        self.a_fc2 = nn.Linear(128, 128)
-        self.dist = Categorical(128, num_outputs)
-
-        self.v_fc1 = nn.Linear(128, 128)
-        self.v_fc2 = nn.Linear(128, 128)
-        self.v_fc3 = nn.Linear(128, 1)
-
-        self.train()
-        self.reset_parameters()
-
-    @property
-    def state_size(self):
-        """
-        Size of the recurrent state of the model (propagated between steps)
-        """
-        return 128
-
-    def reset_parameters(self):
-        self.apply(weights_init_mlp)
-
-        orthogonal(self.gru.weight_ih.data)
-        orthogonal(self.gru.weight_hh.data)
-        self.gru.bias_ih.data.fill_(0)
-        self.gru.bias_hh.data.fill_(0)
-
-        if self.dist.__class__.__name__ == "DiagGaussian":
-            self.dist.fc_mean.weight.data.mul_(0.01)
-
-    def forward(self, inputs, states, masks):
-        batch_numel = reduce(operator.mul, inputs.size()[1:], 1)
-        inputs = inputs.view(-1, batch_numel)
-
-        x = self.fc1(inputs)
-        x = F.tanh(x)
-        x = self.fc2(x)
-        x = F.tanh(x)
-
-        assert inputs.size(0) == states.size(0)
-        states = self.gru(x, states * masks)
-
-        x = self.a_fc1(states)
-        x = F.tanh(x)
-        x = self.a_fc2(x)
-        actions = x
-
-        x = self.v_fc1(states)
-        x = F.tanh(x)
-        x = self.v_fc2(x)
-        x = F.tanh(x)
-        x = self.v_fc3(x)
-        value = x
-
-        return value, actions, states

+ 0 - 116
pytorch_rl/storage.py

@@ -1,116 +0,0 @@
-import torch
-from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
-
-
-class RolloutStorage(object):
-    def __init__(self, num_steps, num_processes, obs_shape, action_space, state_size):
-        self.observations = torch.zeros(num_steps + 1, num_processes, *obs_shape)
-        self.states = torch.zeros(num_steps + 1, num_processes, state_size)
-        self.rewards = torch.zeros(num_steps, num_processes, 1)
-        self.value_preds = torch.zeros(num_steps + 1, num_processes, 1)
-        self.returns = torch.zeros(num_steps + 1, num_processes, 1)
-        self.action_log_probs = torch.zeros(num_steps, num_processes, 1)
-        if action_space.__class__.__name__ == 'Discrete':
-            action_shape = 1
-        else:
-            action_shape = action_space.shape[0]
-        self.actions = torch.zeros(num_steps, num_processes, action_shape)
-        if action_space.__class__.__name__ == 'Discrete':
-            self.actions = self.actions.long()
-        self.masks = torch.ones(num_steps + 1, num_processes, 1)
-
-    def cuda(self):
-        self.observations = self.observations.cuda()
-        self.states = self.states.cuda()
-        self.rewards = self.rewards.cuda()
-        self.value_preds = self.value_preds.cuda()
-        self.returns = self.returns.cuda()
-        self.action_log_probs = self.action_log_probs.cuda()
-        self.actions = self.actions.cuda()
-        self.masks = self.masks.cuda()
-
-    def insert(self, step, current_obs, state, action, action_log_prob, value_pred, reward, mask):
-        self.observations[step + 1].copy_(current_obs)
-        self.states[step + 1].copy_(state)
-        self.actions[step].copy_(action)
-        self.action_log_probs[step].copy_(action_log_prob)
-        self.value_preds[step].copy_(value_pred)
-        self.rewards[step].copy_(reward)
-        self.masks[step + 1].copy_(mask)
-
-    def after_update(self):
-        self.observations[0].copy_(self.observations[-1])
-        self.states[0].copy_(self.states[-1])
-        self.masks[0].copy_(self.masks[-1])
-
-    def compute_returns(self, next_value, use_gae, gamma, tau):
-        if use_gae:
-            self.value_preds[-1] = next_value
-            gae = 0
-            for step in reversed(range(self.rewards.size(0))):
-                delta = self.rewards[step] + gamma * self.value_preds[step + 1] * self.masks[step + 1] - self.value_preds[step]
-                gae = delta + gamma * tau * self.masks[step + 1] * gae
-                self.returns[step] = gae + self.value_preds[step]
-        else:
-            self.returns[-1] = next_value
-            for step in reversed(range(self.rewards.size(0))):
-                self.returns[step] = self.returns[step + 1] * \
-                    gamma * self.masks[step + 1] + self.rewards[step]
-
-
-    def feed_forward_generator(self, advantages, num_mini_batch):
-        num_steps, num_processes = self.rewards.size()[0:2]
-        batch_size = num_processes * num_steps
-        mini_batch_size = batch_size // num_mini_batch
-        sampler = BatchSampler(SubsetRandomSampler(range(batch_size)), mini_batch_size, drop_last=False)
-        for indices in sampler:
-            indices = torch.LongTensor(indices)
-
-            if advantages.is_cuda:
-                indices = indices.cuda()
-
-            observations_batch = self.observations[:-1].view(-1,
-                                        *self.observations.size()[2:])[indices]
-            states_batch = self.states[:-1].view(-1, self.states.size(-1))[indices]
-            actions_batch = self.actions.view(-1, self.actions.size(-1))[indices]
-            return_batch = self.returns[:-1].view(-1, 1)[indices]
-            masks_batch = self.masks[:-1].view(-1, 1)[indices]
-            old_action_log_probs_batch = self.action_log_probs.view(-1, 1)[indices]
-            adv_targ = advantages.view(-1, 1)[indices]
-
-            yield observations_batch, states_batch, actions_batch, \
-                return_batch, masks_batch, old_action_log_probs_batch, adv_targ
-
-    def recurrent_generator(self, advantages, num_mini_batch):
-        num_processes = self.rewards.size(1)
-        num_envs_per_batch = num_processes // num_mini_batch
-        perm = torch.randperm(num_processes)
-        for start_ind in range(0, num_processes, num_envs_per_batch):
-            observations_batch = []
-            states_batch = []
-            actions_batch = []
-            return_batch = []
-            masks_batch = []
-            old_action_log_probs_batch = []
-            adv_targ = []
-
-            for offset in range(num_envs_per_batch):
-                ind = perm[start_ind + offset]
-                observations_batch.append(self.observations[:-1, ind])
-                states_batch.append(self.states[0:1, ind])
-                actions_batch.append(self.actions[:, ind])
-                return_batch.append(self.returns[:-1, ind])
-                masks_batch.append(self.masks[:-1, ind])
-                old_action_log_probs_batch.append(self.action_log_probs[:, ind])
-                adv_targ.append(advantages[:, ind])
-
-            observations_batch = torch.cat(observations_batch, 0)
-            states_batch = torch.cat(states_batch, 0)
-            actions_batch = torch.cat(actions_batch, 0)
-            return_batch = torch.cat(return_batch, 0)
-            masks_batch = torch.cat(masks_batch, 0)
-            old_action_log_probs_batch = torch.cat(old_action_log_probs_batch, 0)
-            adv_targ = torch.cat(adv_targ, 0)
-
-            yield observations_batch, states_batch, actions_batch, \
-                return_batch, masks_batch, old_action_log_probs_batch, adv_targ

+ 0 - 45
pytorch_rl/utils.py

@@ -1,45 +0,0 @@
-import torch
-import torch.nn as nn
-
-
-# Necessary for my KFAC implementation.
-class AddBias(nn.Module):
-    def __init__(self, bias):
-        super(AddBias, self).__init__()
-        self._bias = nn.Parameter(bias.unsqueeze(1))
-
-    def forward(self, x):
-        if x.dim() == 2:
-            bias = self._bias.t().view(1, -1)
-        else:
-            bias = self._bias.t().view(1, -1, 1, 1)
-
-        return x + bias
-
-# A temporary solution from the master branch.
-# https://github.com/pytorch/pytorch/blob/7752fe5d4e50052b3b0bbc9109e599f8157febc0/torch/nn/init.py#L312
-# Remove after the next version of PyTorch gets released.
-def orthogonal(tensor, gain=1):
-    if tensor.ndimension() < 2:
-        raise ValueError("Only tensors with 2 or more dimensions are supported")
-
-    rows = tensor.size(0)
-    cols = tensor[0].numel()
-    flattened = torch.Tensor(rows, cols).normal_(0, 1)
-
-    if rows < cols:
-        flattened.t_()
-
-    # Compute the qr factorization
-    q, r = torch.qr(flattened)
-    # Make Q uniform according to https://arxiv.org/pdf/math-ph/0609050.pdf
-    d = torch.diag(r, 0)
-    ph = d.sign()
-    q *= ph.expand_as(q)
-
-    if rows < cols:
-        q.t_()
-
-    tensor.view_as(q).copy_(q)
-    tensor.mul_(gain)
-    return tensor

+ 0 - 21
pytorch_rl/vec_env/LICENSE

@@ -1,21 +0,0 @@
-The MIT License
-
-Copyright (c) 2017 OpenAI (http://openai.com)
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.

+ 0 - 100
pytorch_rl/vec_env/__init__.py

@@ -1,100 +0,0 @@
-from abc import ABC, abstractmethod
-
-class VecEnv(ABC):
-    """
-    An abstract asynchronous, vectorized environment.
-    """
-
-    def __init__(self, num_envs, observation_space, action_space):
-        self.num_envs = num_envs
-        self.observation_space = observation_space
-        self.action_space = action_space
-
-    @abstractmethod
-    def reset(self):
-        """
-        Reset all the environments and return an array of
-        observations.
-
-        If step_async is still doing work, that work will
-        be cancelled and step_wait() should not be called
-        until step_async() is invoked again.
-        """
-        pass
-
-    @abstractmethod
-    def step_async(self, actions):
-        """
-        Tell all the environments to start taking a step
-        with the given actions.
-        Call step_wait() to get the results of the step.
-
-        You should not call this if a step_async run is
-        already pending.
-        """
-        pass
-
-    @abstractmethod
-    def step_wait(self):
-        """
-        Wait for the step taken with step_async().
-
-        Returns (obs, rews, dones, infos):
-         - obs: an array of observations
-         - rews: an array of rewards
-         - dones: an array of "episode done" booleans
-         - infos: an array of info objects
-        """
-        pass
-
-    @abstractmethod
-    def close(self):
-        """
-        Clean up the environments' resources.
-        """
-        pass
-
-    def step(self, actions):
-        self.step_async(actions)
-        return self.step_wait()
-
-    def render(self):
-        print('Render not defined for %s' % self)
-
-class VecEnvWrapper(VecEnv):
-    def __init__(self, venv, observation_space=None, action_space=None):
-        self.venv = venv
-        VecEnv.__init__(self,
-            num_envs=venv.num_envs,
-            observation_space=observation_space or venv.observation_space,
-            action_space=action_space or venv.action_space)
-
-    def step_async(self, actions):
-        self.venv.step_async(actions)
-
-    @abstractmethod
-    def reset(self):
-        pass
-
-    @abstractmethod
-    def step_wait(self):
-        pass
-
-    def close(self):
-        return self.venv.close()
-
-    def render(self):
-        self.venv.render()
-
-class CloudpickleWrapper(object):
-    """
-    Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
-    """
-    def __init__(self, x):
-        self.x = x
-    def __getstate__(self):
-        import cloudpickle
-        return cloudpickle.dumps(self.x)
-    def __setstate__(self, ob):
-        import pickle
-        self.x = pickle.loads(ob)

+ 0 - 31
pytorch_rl/vec_env/dummy_vec_env.py

@@ -1,31 +0,0 @@
-import numpy as np
-from . import VecEnv
-
-class DummyVecEnv(VecEnv):
-    def __init__(self, env_fns):
-        self.envs = [fn() for fn in env_fns]
-        env = self.envs[0]        
-        VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space)
-        self.ts = np.zeros(len(self.envs), dtype='int')        
-        self.actions = None
-
-    def step_async(self, actions):
-        self.actions = actions
-
-    def step_wait(self):
-        results = [env.step(a) for (a,env) in zip(self.actions, self.envs)]
-        obs, rews, dones, infos = map(np.array, zip(*results))
-        self.ts += 1
-        for (i, done) in enumerate(dones):
-            if done: 
-                obs[i] = self.envs[i].reset()
-                self.ts[i] = 0
-        self.actions = None
-        return np.array(obs), np.array(rews), np.array(dones), infos
-
-    def reset(self):        
-        results = [env.reset() for env in self.envs]
-        return np.array(results)
-
-    def close(self):
-        return

+ 0 - 82
pytorch_rl/vec_env/subproc_vec_env.py

@@ -1,82 +0,0 @@
-import numpy as np
-from multiprocessing import Process, Pipe
-from vec_env import VecEnv, CloudpickleWrapper
-
-def worker(remote, parent_remote, env_fn_wrapper):
-    parent_remote.close()
-    env = env_fn_wrapper.x()
-    while True:
-        cmd, data = remote.recv()
-        if cmd == 'step':
-            ob, reward, done, info = env.step(data)
-            if done:
-                ob = env.reset()
-            remote.send((ob, reward, done, info))
-        elif cmd == 'reset':
-            ob = env.reset()
-            remote.send(ob)
-        elif cmd == 'reset_task':
-            ob = env.reset_task()
-            remote.send(ob)
-        elif cmd == 'close':
-            remote.close()
-            break
-        elif cmd == 'get_spaces':
-            remote.send((env.observation_space, env.action_space))
-        else:
-            raise NotImplementedError
-
-
-class SubprocVecEnv(VecEnv):
-    def __init__(self, env_fns, spaces=None):
-        """
-        envs: list of gym environments to run in subprocesses
-        """
-        self.waiting = False
-        self.closed = False
-        nenvs = len(env_fns)
-        self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
-        self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn)))
-            for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)]
-        for p in self.ps:
-            p.daemon = True # if the main process crashes, we should not cause things to hang
-            p.start()
-        for remote in self.work_remotes:
-            remote.close()
-
-        self.remotes[0].send(('get_spaces', None))
-        observation_space, action_space = self.remotes[0].recv()
-        VecEnv.__init__(self, len(env_fns), observation_space, action_space)
-
-    def step_async(self, actions):
-        for remote, action in zip(self.remotes, actions):
-            remote.send(('step', action))
-        self.waiting = True
-
-    def step_wait(self):
-        results = [remote.recv() for remote in self.remotes]
-        self.waiting = False
-        obs, rews, dones, infos = zip(*results)
-        return np.stack(obs), np.stack(rews), np.stack(dones), infos
-
-    def reset(self):
-        for remote in self.remotes:
-            remote.send(('reset', None))
-        return np.stack([remote.recv() for remote in self.remotes])
-
-    def reset_task(self):
-        for remote in self.remotes:
-            remote.send(('reset_task', None))
-        return np.stack([remote.recv() for remote in self.remotes])
-
-    def close(self):
-        if self.closed:
-            return
-        if self.waiting:
-            for remote in self.remotes:
-                remote.recv()
-        for remote in self.remotes:
-            remote.send(('close', None))
-        for p in self.ps:
-            p.join()
-        self.closed = True

+ 0 - 38
pytorch_rl/vec_env/vec_frame_stack.py

@@ -1,38 +0,0 @@
-from vec_env import VecEnvWrapper
-import numpy as np
-from gym import spaces
-
-class VecFrameStack(VecEnvWrapper):
-    """
-    Frame-stacking wrapper for vectorized environments
-    """
-    def __init__(self, venv, nstack):
-        self.venv = venv
-        self.nstack = nstack
-        wos = venv.observation_space # wrapped ob space
-        low = np.repeat(wos.low, self.nstack, axis=-1)
-        high = np.repeat(wos.high, self.nstack, axis=-1)
-        self.stackedobs = np.zeros((venv.num_envs,)+low.shape, low.dtype)
-        observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype)
-        VecEnvWrapper.__init__(self, venv, observation_space=observation_space)
-
-    def step_wait(self):
-        obs, rews, news, infos = self.venv.step_wait()
-        self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1)
-        for (i, new) in enumerate(news):
-            if new:
-                self.stackedobs[i] = 0
-        self.stackedobs[..., -obs.shape[-1]:] = obs
-        return self.stackedobs, rews, news, infos
-
-    def reset(self):
-        """
-        Reset all environments
-        """
-        obs = self.venv.reset()
-        self.stackedobs[...] = 0
-        self.stackedobs[..., -obs.shape[-1]:] = obs
-        return self.stackedobs
-
-    def close(self):
-        self.venv.close()

+ 0 - 54
pytorch_rl/visualize.py

@@ -1,54 +0,0 @@
-import numpy as np
-
-vis = None
-
-win = None
-
-avg_reward = 0
-
-X = []
-Y = []
-
-def visdom_plot(
-    total_num_steps,
-    mean_reward
-):
-    # Lazily import visdom so that people don't need to install visdom
-    # if they're not actually using it
-    from visdom import Visdom
-
-    global vis
-    global win
-    global avg_reward
-
-    if vis is None:
-        vis = Visdom()
-        assert vis.check_connection()
-
-        # Close all existing plots
-        vis.close()
-
-    # Running average for curve smoothing
-    avg_reward = avg_reward * 0.9 + 0.1 * mean_reward
-
-    X.append(total_num_steps)
-    Y.append(avg_reward)
-
-    # The plot with the handle 'win' is updated each time this is called
-    win = vis.line(
-        X = np.array(X),
-        Y = np.array(Y),
-        opts = dict(
-            #title = 'All Environments',
-            xlabel='Total time steps',
-            ylabel='Reward per episode',
-            ytickmin=0,
-            #ytickmax=1,
-            #ytickstep=0.1,
-            #legend=legend,
-            #showlegend=True,
-            width=900,
-            height=500
-        ),
-        win = win
-    )