import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

from utils import AddBias


class Categorical(nn.Module):
    """Categorical (softmax) policy head for discrete action spaces."""

    def __init__(self, num_inputs, num_outputs):
        super(Categorical, self).__init__()
        self.linear = nn.Linear(num_inputs, num_outputs)

    def forward(self, x):
        # Return the unnormalized action logits.
        x = self.linear(x)
        return x

    def sample(self, x, deterministic):
        x = self(x)

        probs = F.softmax(x, dim=1)
        if deterministic is False:
            # Stochastic: draw one action per row from the softmax
            # distribution. num_samples is explicit here; newer PyTorch
            # versions no longer default it to 1.
            action = probs.multinomial(num_samples=1)
        else:
            # Greedy: take the most probable action.
            action = probs.max(1, keepdim=True)[1]
        return action

    def logprobs_and_entropy(self, x, actions):
        x = self(x)

        log_probs = F.log_softmax(x, dim=1)
        probs = F.softmax(x, dim=1)

        # log pi(a|s) for the taken actions, shape (batch, 1).
        action_log_probs = log_probs.gather(1, actions)

        # Mean entropy of the categorical distribution over the batch.
        dist_entropy = -(log_probs * probs).sum(-1).mean()

        return action_log_probs, dist_entropy


class DiagGaussian(nn.Module):
    """Diagonal Gaussian policy head for continuous action spaces.

    The mean is state-dependent; the log standard deviation is a single
    learned, state-independent bias vector.
    """

    def __init__(self, num_inputs, num_outputs):
        super(DiagGaussian, self).__init__()
        self.fc_mean = nn.Linear(num_inputs, num_outputs)
        self.logstd = AddBias(torch.zeros(num_outputs))

    def forward(self, x):
        action_mean = self.fc_mean(x)

        # An ugly hack for my KFAC implementation: feed zeros through
        # AddBias so the log-std parameter participates in the graph.
        zeros = Variable(torch.zeros(action_mean.size()), volatile=x.volatile)
        if x.is_cuda:
            zeros = zeros.cuda()

        action_logstd = self.logstd(zeros)
        return action_mean, action_logstd

    def sample(self, x, deterministic):
        action_mean, action_logstd = self(x)

        action_std = action_logstd.exp()

        if deterministic is False:
            # Reparameterized sample: mean + std * standard normal noise.
            noise = Variable(torch.randn(action_std.size()))
            if action_std.is_cuda:
                noise = noise.cuda()
            action = action_mean + action_std * noise
        else:
            action = action_mean
        return action

    def logprobs_and_entropy(self, x, actions):
        action_mean, action_logstd = self(x)

        action_std = action_logstd.exp()

        # Per-dimension Gaussian log density, summed over action dimensions:
        # log N(a; mu, sigma) = -(a - mu)^2 / (2 sigma^2)
        #                       - 0.5 * log(2 pi) - log sigma
        action_log_probs = (
            -0.5 * ((actions - action_mean) / action_std).pow(2)
            - 0.5 * math.log(2 * math.pi)
            - action_logstd
        )
        action_log_probs = action_log_probs.sum(-1, keepdim=True)

        # Entropy of a diagonal Gaussian, averaged over the batch:
        # sum_i (0.5 + 0.5 * log(2 pi) + log sigma_i)
        dist_entropy = 0.5 + 0.5 * math.log(2 * math.pi) + action_logstd
        dist_entropy = dist_entropy.sum(-1).mean()

        return action_log_probs, dist_entropy
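
# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the original module): shows how
# one of these heads is typically driven during rollout and update in an
# actor-critic loop. The sizes and the random `features` batch below are
# hypothetical placeholders standing in for the output of a policy trunk.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    num_inputs, num_actions = 64, 6
    dist = Categorical(num_inputs, num_actions)

    # Pretend trunk output: a batch of 8 feature vectors.
    features = Variable(torch.randn(8, num_inputs))

    # Rollout: sample stochastic actions from the softmax policy.
    actions = dist.sample(features, deterministic=False)

    # Update: recover log pi(a|s) and the entropy bonus for those actions;
    # a policy-gradient loss would combine these with advantage estimates.
    action_log_probs, dist_entropy = dist.logprobs_and_entropy(features, actions)
    assert action_log_probs.size() == (8, 1)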
 |