# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Gamma mixture model."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys

import numpy as np
import tensorflow as tf

from ops import rmsprop
from ops import util

st = tf.contrib.bayesflow.stochastic_tensor
distributions = tf.contrib.distributions


def train(optimizer):
  """Trains a gamma-normal mixture model with black-box variational inference.

  From http://ajbc.io/resources/bbvi_for_gammas.pdf.

  Args:
    optimizer: string specifying whether to use the manual rmsprop updates
      ('rmsprop_manual') or the rmsprop optimizer that works on IndexedSlices
      ('rmsprop_indexed_slices').

  Returns:
    Means learned using variational inference with the given optimizer.
  """
  tf.reset_default_graph()
  np_dtype = np.float64
  np.random.seed(11)
  # hyperparameters of the gamma prior over the mixture means
  alpha_0 = np.array(0.1, dtype=np_dtype)
  mu_0 = np.array(5., dtype=np_dtype)
  # number of components
  n_components = 12
  # number of datapoints
  n_data = 100
  # generate synthetic data: gamma-distributed means plus unit-variance noise
  mu = np.random.gamma(alpha_0, mu_0 / alpha_0, n_components)
  x = np.random.normal(mu, 1., (n_data, n_components))

  ## set up for inference
  # the number of samples to draw for each parameter
  n_samples = 40
  batch_size = 1
  np.random.seed(123232)
  tf.set_random_seed(25343)
  tf_dtype = tf.float64
  inv_softplus = util.inv_softplus
  lambda_alpha_var = tf.get_variable(
      'lambda_alpha', shape=[1, n_components], dtype=tf_dtype,
      initializer=tf.constant_initializer(value=0.1))
  lambda_mu_var = tf.get_variable(
      'lambda_mu', shape=[1, n_components], dtype=tf_dtype,
      initializer=tf.constant_initializer(value=0.1))
  x_indices = tf.placeholder(shape=[batch_size], dtype=tf.int64)
  if optimizer == 'rmsprop_indexed_slices':
    lambda_alpha = tf.nn.embedding_lookup(lambda_alpha_var, x_indices)
    lambda_mu = tf.nn.embedding_lookup(lambda_mu_var, x_indices)
  elif optimizer == 'rmsprop_manual':
    lambda_alpha = lambda_alpha_var
    lambda_mu = lambda_mu_var

  # the variational posterior over the means is a gamma distribution
  # parameterized by its shape and mean; softplus keeps both positive
  variational = st.StochasticTensor(
      distributions.Gamma,
      alpha=tf.nn.softplus(lambda_alpha),
      beta=tf.nn.softplus(lambda_alpha) / tf.nn.softplus(lambda_mu),
      dist_value_type=st.SampleValue(n=n_samples),
      validate_args=False)
  # truncate samples (don't sample zero)
  sample_mus = tf.maximum(variational.value(), 1e-300)

  # probability of samples given the prior
  prior = distributions.Gamma(alpha=alpha_0, beta=alpha_0 / mu_0,
                              validate_args=False)
  p = prior.log_pdf(sample_mus)
  # probability of samples given the variational parameters
  q = variational.distribution.log_pdf(sample_mus)
  likelihood = distributions.Normal(mu=tf.expand_dims(sample_mus, 1),
                                    sigma=np.array(1., dtype=np_dtype),
                                    validate_args=False)
  # probability of observations given the samples
  x_ph = tf.expand_dims(tf.constant(x, dtype=tf_dtype), 0)
  p += tf.reduce_sum(likelihood.log_pdf(x_ph), 2)
  elbo = p - q

  # run BBVI for a fixed number of iterations
  iteration = tf.Variable(0, trainable=False)
  increment_iteration = tf.assign(iteration, iteration + 1)
  # Robbins-Monro sequence for the step size
  rho = tf.pow(tf.cast(iteration, tf_dtype) + 1024., -0.7)
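
  # A brief note on the estimator implemented below (a sketch of the method in
  # the note cited in the docstring, describing the code rather than adding to
  # it): BBVI maximizes the ELBO with the score-function gradient
  #   grad ELBO = E_q[ grad log q(z; lambda) * (log p(z, x) - log q(z)) ],
  # estimated by Monte Carlo over the n_samples draws z ~ q. To cut the
  # variance of that estimate, each variational parameter gets a control
  # variate with scaling a* = Cov(g * f, g) / Var(g), where g = grad log q
  # and f = log p - log q, so the manual update below averages g * (f - a*)
  # over samples instead of g * f.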
  if optimizer == 'rmsprop_manual':
    # control variates to decrease the variance of the gradient;
    # one for each variational parameter
    g_alpha = tf.pack([tf.gradients(q_sample, lambda_alpha)[0]
                       for q_sample in tf.unpack(q)])
    g_mu = tf.pack([tf.gradients(q_sample, lambda_mu)[0]
                    for q_sample in tf.unpack(q)])

    def cov(a, b):
      v = (a - tf.reduce_mean(a, 0)) * (b - tf.reduce_mean(b, 0))
      return tf.reduce_mean(v, 0)

    _, var_g_alpha = tf.nn.moments(g_alpha, [0])
    _, var_g_mu = tf.nn.moments(g_mu, [0])
    cov_alpha = cov(g_alpha * (p - q), g_alpha)
    cov_mu = cov(g_mu * (p - q), g_mu)
    cv_alpha = cov_alpha / var_g_alpha
    cv_mu = cov_mu / var_g_mu

    # rmsprop-style running averages of the squared gradients
    ms_mu = tf.Variable(tf.ones_like(g_mu), trainable=False)
    ms_alpha = tf.Variable(tf.ones_like(g_alpha), trainable=False)

    def update_ms(ms, var):
      return tf.assign(ms, 0.9 * ms + 0.1 * tf.reduce_sum(tf.square(var), 0))

    update_ms_ops = [update_ms(ms_mu, g_mu), update_ms(ms_alpha, g_alpha)]

    # update each variational parameter with the sample average
    alpha_step = rho * tf.reduce_mean(
        g_alpha / tf.sqrt(ms_alpha) * (p - q - cv_alpha), 0)
    update_alpha = tf.assign(lambda_alpha, lambda_alpha + alpha_step)
    mu_step = rho * tf.reduce_mean(
        g_mu / tf.sqrt(ms_mu) * (p - q - cv_mu), 0)
    update_mu = tf.assign(lambda_mu, lambda_mu + mu_step)
    train_ops = tf.group(update_mu, update_alpha)
  elif optimizer == 'rmsprop_indexed_slices':
    variable_list = [lambda_mu_var, lambda_alpha_var]
    train_ops = rmsprop.maximize_with_control_variate(
        rho, elbo, q, variable_list, iteration)

  # truncate the variational parameters to a numerically safe range
  # (in the softplus-transformed space)
  get_min = lambda var, val: tf.assign(var, tf.maximum(var, inv_softplus(val)))
  get_max = lambda var, val: tf.assign(var, tf.minimum(var, inv_softplus(val)))
  get_min_ops = [get_min(lambda_alpha_var, 0.005),
                 get_min(lambda_mu_var, 1e-5)]
  get_max_ops = [get_max(var, sys.float_info.max)
                 for var in [lambda_mu_var, lambda_alpha_var]]
  truncate_ops = get_min_ops + get_max_ops
  with tf.control_dependencies([train_ops]):
    train_ops = tf.group(*truncate_ops)

  with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    fd = {x_indices: [0]}
    print('running variational inference using: %s' % optimizer)
    for i in range(100):
      if i % 10 == 0:
        print('iteration %d\telbo %.3e'
              % (i, np.mean(np.sum(elbo.eval(fd), axis=1))))
      if optimizer == 'rmsprop_manual':
        sess.run(update_ms_ops)
      sess.run(train_ops, fd)
      sess.run(increment_iteration)
    # return the learned variational means
    np_mu = sess.run(tf.nn.softplus(lambda_mu), fd)
  return np_mu
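

# A minimal usage sketch, not part of the original module: it runs variational
# inference with both optimizers and reports the learned means. It assumes the
# `ops` package (rmsprop, util) imported above is on the path and that the two
# optimizer strings accepted by train() are as defined there.
if __name__ == '__main__':
  means_manual = train('rmsprop_manual')
  means_slices = train('rmsprop_indexed_slices')
  print('means (manual rmsprop):        %s' % means_manual)
  print('means (IndexedSlices rmsprop): %s' % means_slices)
  # report how closely the two optimizers agree on this toy problem
  print('max absolute difference: %.3e'
        % np.max(np.abs(means_manual - means_slices)))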