@@ -1,267 +0,0 @@
-import copy
-import glob
-import os
-import time
-import operator
-from functools import reduce
-
-import gym
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.optim as optim
-from torch.autograd import Variable
-
-from arguments import get_args
-from vec_env.dummy_vec_env import DummyVecEnv
-from vec_env.subproc_vec_env import SubprocVecEnv
-from envs import make_env
-from kfac import KFACOptimizer
-from model import Policy
-from storage import RolloutStorage
-from visualize import visdom_plot
-
-args = get_args()
-
-assert args.algo in ['a2c', 'ppo', 'acktr']
-if args.recurrent_policy:
-    assert args.algo in ['a2c', 'ppo'], 'Recurrent policy is not implemented for ACKTR'
-
-num_updates = int(args.num_frames) // args.num_steps // args.num_processes
-
-torch.manual_seed(args.seed)
-if args.cuda:
-    torch.cuda.manual_seed(args.seed)
-
-try:
-    os.makedirs(args.log_dir)
-except OSError:
-    files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
-    for f in files:
-        os.remove(f)
-
-def main():
-    os.environ['OMP_NUM_THREADS'] = '1'
-
-    envs = [make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes)]
-
-    if args.num_processes > 1:
-        envs = SubprocVecEnv(envs)
-    else:
-        envs = DummyVecEnv(envs)
-
-    obs_shape = envs.observation_space.shape
-    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
-    obs_numel = reduce(operator.mul, obs_shape, 1)
-
-    actor_critic = Policy(obs_numel, envs.action_space)
-
-    # Maxime: log some info about the model and its size
-    modelSize = 0
-    for p in actor_critic.parameters():
-        pSize = reduce(operator.mul, p.size(), 1)
-        modelSize += pSize
-    print(str(actor_critic))
-    print('Total model size: %d' % modelSize)
-
-    if envs.action_space.__class__.__name__ == "Discrete":
-        action_shape = 1
-    else:
-        action_shape = envs.action_space.shape[0]
-
-    if args.cuda:
-        actor_critic.cuda()
-
-    if args.algo == 'a2c':
-        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha)
-    elif args.algo == 'ppo':
-        optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
-    elif args.algo == 'acktr':
-        optimizer = KFACOptimizer(actor_critic)
-
-    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size)
-    current_obs = torch.zeros(args.num_processes, *obs_shape)
-
-    def update_current_obs(obs):
-        shape_dim0 = envs.observation_space.shape[0]
-        obs = torch.from_numpy(obs).float()
-        if args.num_stack > 1:
-            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
-        current_obs[:, -shape_dim0:] = obs
-
-    obs = envs.reset()
-    update_current_obs(obs)
-    rollouts.observations[0].copy_(current_obs)
-
-    # These variables are used to compute average rewards for all processes.
-    episode_rewards = torch.zeros([args.num_processes, 1])
-    final_rewards = torch.zeros([args.num_processes, 1])
-
-    if args.cuda:
-        current_obs = current_obs.cuda()
-        rollouts.cuda()
-
-    start = time.time()
-    for j in range(num_updates):
-        for step in range(args.num_steps):
-            # Sample actions
-            value, action, action_log_prob, states = actor_critic.act(
-                Variable(rollouts.observations[step], volatile=True),
-                Variable(rollouts.states[step], volatile=True),
-                Variable(rollouts.masks[step], volatile=True)
-            )
-            cpu_actions = action.data.squeeze(1).cpu().numpy()
-
-            # Obser reward and next obs
-            obs, reward, done, info = envs.step(cpu_actions)
-            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
-            episode_rewards += reward
-
-            # If done then clean the history of observations.
-            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
-            final_rewards *= masks
-            final_rewards += (1 - masks) * episode_rewards
-            episode_rewards *= masks
-
-            if args.cuda:
-                masks = masks.cuda()
-
-            if current_obs.dim() == 4:
-                current_obs *= masks.unsqueeze(2).unsqueeze(2)
-            elif current_obs.dim() == 3:
-                current_obs *= masks.unsqueeze(2)
-            else:
-                current_obs *= masks
-
-            update_current_obs(obs)
-            rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks)
-
-        next_value = actor_critic(
-            Variable(rollouts.observations[-1], volatile=True),
-            Variable(rollouts.states[-1], volatile=True),
-            Variable(rollouts.masks[-1], volatile=True)
-        )[0].data
-
-        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
-
-        if args.algo in ['a2c', 'acktr']:
-            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
-                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
-                Variable(rollouts.states[:-1].view(-1, actor_critic.state_size)),
-                Variable(rollouts.masks[:-1].view(-1, 1)),
-                Variable(rollouts.actions.view(-1, action_shape))
-            )
-
-            values = values.view(args.num_steps, args.num_processes, 1)
-            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)
-
-            advantages = Variable(rollouts.returns[:-1]) - values
-            value_loss = advantages.pow(2).mean()
-
-            action_loss = -(Variable(advantages.data) * action_log_probs).mean()
-
-            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
-                # Sampled fisher, see Martens 2014
-                actor_critic.zero_grad()
-                pg_fisher_loss = -action_log_probs.mean()
-
-                value_noise = Variable(torch.randn(values.size()))
-                if args.cuda:
-                    value_noise = value_noise.cuda()
-
-                sample_values = values + value_noise
-                vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean()
-
-                fisher_loss = pg_fisher_loss + vf_fisher_loss
-                optimizer.acc_stats = True
-                fisher_loss.backward(retain_graph=True)
-                optimizer.acc_stats = False
-
-            optimizer.zero_grad()
-            (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward()
-
-            if args.algo == 'a2c':
-                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
-
-            optimizer.step()
-        elif args.algo == 'ppo':
-            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
-            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)
-
-            for e in range(args.ppo_epoch):
-                if args.recurrent_policy:
-                    data_generator = rollouts.recurrent_generator(advantages, args.num_mini_batch)
-                else:
-                    data_generator = rollouts.feed_forward_generator(advantages, args.num_mini_batch)
-
-                for sample in data_generator:
-                    observations_batch, states_batch, actions_batch, \
-                        return_batch, masks_batch, old_action_log_probs_batch, \
-                        adv_targ = sample
-
-                    # Reshape to do in a single forward pass for all steps
-                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
-                        Variable(observations_batch),
-                        Variable(states_batch),
-                        Variable(masks_batch),
-                        Variable(actions_batch)
-                    )
-
-                    adv_targ = Variable(adv_targ)
-                    ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch))
-                    surr1 = ratio * adv_targ
-                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
-                    action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP)
-
-                    value_loss = (Variable(return_batch) - values).pow(2).mean()
-
-                    optimizer.zero_grad()
-                    (value_loss + action_loss - dist_entropy * args.entropy_coef).backward()
-                    nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
-                    optimizer.step()
-
-        rollouts.after_update()
-
-        if j % args.save_interval == 0 and args.save_dir != "":
-            save_path = os.path.join(args.save_dir, args.algo)
-            try:
-                os.makedirs(save_path)
-            except OSError:
-                pass
-
-            # A really ugly way to save a model to CPU
-            save_model = actor_critic
-            if args.cuda:
-                save_model = copy.deepcopy(actor_critic).cpu()
-
-            save_model = [save_model,
-                          hasattr(envs, 'ob_rms') and envs.ob_rms or None]
-
-            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))
-
-        if j % args.log_interval == 0:
-            end = time.time()
-            total_num_steps = (j + 1) * args.num_processes * args.num_steps
-            print(
-                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
-                format(
-                    j,
-                    total_num_steps,
-                    int(total_num_steps / (end - start)),
-                    final_rewards.mean(),
-                    final_rewards.median(),
-                    final_rewards.min(),
-                    final_rewards.max(), dist_entropy.data[0],
-                    value_loss.data[0], action_loss.data[0]
-                )
-            )
-
-        if args.vis and j % args.vis_interval == 0:
-            win = visdom_plot(
-                total_num_steps,
-                final_rewards.mean()
-            )
-
-if __name__ == "__main__":
-    main()