9 年之前 · 0dea688a68
--- a/privacy/README.md
+++ b/privacy/README.md
@@ -8,8 +8,7 @@ Knowledge acquired by teachers is transferred to the student in a differentially
 
				 private manner by noisily aggregating the teacher decisions before feeding them
			
 
				 to the student during training.
			
 
				 
			
 
				-A paper describing the approach is in preparation. A link will be added to this 
			
 
				-README when available.
			
 
				+The paper describing the approach is [arXiv:1610.05755](https://arxiv.org/abs/1610.05755)
			
 
				 
			
 
				 ## Dependencies
			
 
				 
			
@@ -20,9 +19,9 @@ respective documentations.
 
				 
			
 
				 ## How to run
			
 
				 
			
 
				-This repository supports the MNIST, CIFAR10, and SVHN datasets. The following
			
 
				+This repository supports the MNIST and SVHN datasets. The following
			
 
				 instructions are given for MNIST but can easily be adapted by replacing the 
			
 
				-flag `--dataset=mnist` by `--dataset=cifar10` or `--dataset=svhn`.
			
 
				+flag `--dataset=mnist` by `--dataset=svhn`.
			
 
				 There are 2 steps: teacher training and student training. Data will be 
			
 
				 automatically downloaded when you start the teacher training. 
			
 
				 
			
@@ -72,6 +71,20 @@ functions `inference` and `inference_deeper`. Use the flag `--deeper=true`
 
				 to switch to that model when launching `train_teachers.py` and 
			
 
				 `train_student.py`. 
			
 
				 
			
 
				+## Privacy analysis
			
 
				+
			
 
				+In the paper, we detail how data-dependent differential privacy bounds can be
			
 
				+computed to estimate the cost of training the student. In order to reproduce 
			
 
				+the bounds given in the paper, we include the labels predicted by our two
			
 
				+teacher ensembles: MNIST and SVHN. You can run the privacy analysis for each
			
 
				+dataset with the following commands:
			
 
				+
			
 
				+```
			
 
				+python analysis.py --counts_file=mnist_250_teachers_labels.npy --indices_file=mnist_250_teachers_100_indices_used_by_student.npy
			
 
				+
			
 
				+python analysis.py --counts_file=svhn_250_teachers_labels.npy --max_examples=1000 --delta=1e-6
			
 
				+```
			
 
				+
			
 
				 ## Contact
			
 
				 
			
 
				 To ask questions, please email `nicolas@papernot.fr` or open an issue on 
			
--- a/privacy/analysis.py
+++ b/privacy/analysis.py
@@ -0,0 +1,288 @@
 
				+"""
			
 
				+This script computes bounds on the privacy cost of training the
			
 
				+student model from noisy aggregation of labels predicted by teachers.
			
 
				+It should be used only after training the student (and therefore the
			
 
				+teachers as well). We however include the label files required to
			
 
				+reproduce key results from our paper (https://arxiv.org/abs/1610.05755): 
			
 
				+the epsilon bounds for MNIST and SVHN students.
			
 
				+
			
 
				+The command that computes the epsilon bound associated
			
 
				+with the training of the MNIST student model (100 label queries
			
 
				+with a (1/20)*2=0.1 epsilon bound each) is:
			
 
				+
			
 
				+python analysis.py 
			
 
				+  --counts_file=mnist_250_teachers_labels.npy 
			
 
				+  --indices_file=mnist_250_teachers_100_indices_used_by_student.npy
			
 
				+
			
 
				+The command that computes the epsilon bound associated
			
 
				+with the training of the SVHN student model (1000 label queries
			
 
				+with a (1/20)*2=0.1 epsilon bound each) is:
			
 
				+
			
 
				+python analysis.py 
			
 
				+  --counts_file=svhn_250_teachers_labels.npy 
			
 
				+  --max_examples=1000
			
 
				+  --delta=1e-6
			
 
				+"""
			
 
				+import os
			
 
				+import math
			
 
				+import numpy as np
			
 
				+import tensorflow as tf
			
 
				+
			
 
				+from input import maybe_download
			
 
				+
			
 
				+# These parameters can be changed to compute bounds for different failure rates
			
 
				+# or different model predictions.
			
 
				+
			
 
				+tf.flags.DEFINE_integer("moments",8, "Number of moments")
			
 
				+tf.flags.DEFINE_float("noise_eps", 0.1, "Eps value for each call to noisymax.")
			
 
				+tf.flags.DEFINE_float("delta", 1e-5, "Target value of delta.")
			
 
				+tf.flags.DEFINE_float("beta", 0.09, "Value of beta for smooth sensitivity")
			
 
				+tf.flags.DEFINE_string("counts_file","","Numpy matrix with raw counts")
			
 
				+tf.flags.DEFINE_string("indices_file","",
			
 
				+    "File containting a numpy matrix with indices used."
			
 
				+    "Optional. Use the first max_examples indices if this is not provided.")
			
 
				+tf.flags.DEFINE_integer("max_examples",1000,
			
 
				+    "Number of examples to use. We will use the first"
			
 
				+    " max_examples many examples from the counts_file"
			
 
				+    " or indices_file to do the privacy cost estimate")
			
 
				+tf.flags.DEFINE_float("too_small", 1e-10, "Small threshold to avoid log of 0")
			
 
				+tf.flags.DEFINE_bool("input_is_counts", False, "False if labels, True if counts")
			
 
				+
			
 
				+FLAGS = tf.flags.FLAGS
			
 
				+
			
 
				+
			
 
				+def compute_q_noisy_max(counts, noise_eps):
			
 
				+  """returns ~ Pr[outcome != winner].
			
 
				+
			
 
				+  Args:
			
 
				+    counts: a list of scores
			
 
				+    noise_eps: privacy parameter for noisy_max
			
 
				+  Returns:
			
 
				+    q: the probability that outcome is different from true winner.
			
 
				+  """
			
 
				+  # For noisy max, we only get an upper bound.
			
 
				+  # Pr[ j beats i*] \leq (2+gap(j,i*))/ 4 exp(gap(j,i*)
			
 
				+  # proof at http://mathoverflow.net/questions/66763/
			
 
				+  # tight-bounds-on-probability-of-sum-of-laplace-random-variables
			
 
				+
			
 
				+  winner = np.argmax(counts)
			
 
				+  counts_normalized = noise_eps * (counts - counts[winner])
			
 
				+  counts_rest = np.array(
			
 
				+      [counts_normalized[i] for i in xrange(len(counts)) if i != winner])
			
 
				+  q = 0.0
			
 
				+  for c in counts_rest:
			
 
				+    gap = -c
			
 
				+    q += (gap + 2.0) / (4.0 * math.exp(gap))
			
 
				+  return min(q, 1.0 - (1.0/len(counts)))
			
 
				+
			
 
				+
			
 
				+def compute_q_noisy_max_approx(counts, noise_eps):
			
 
				+  """returns ~ Pr[outcome != winner].
			
 
				+
			
 
				+  Args:
			
 
				+    counts: a list of scores
			
 
				+    noise_eps: privacy parameter for noisy_max
			
 
				+  Returns:
			
 
				+    q: the probability that outcome is different from true winner.
			
 
				+  """
			
 
				+  # For noisy max, we only get an upper bound.
			
 
				+  # Pr[ j beats i*] \leq (2+gap(j,i*))/ 4 exp(gap(j,i*)
			
 
				+  # proof at http://mathoverflow.net/questions/66763/
			
 
				+  # tight-bounds-on-probability-of-sum-of-laplace-random-variables
			
 
				+  # This code uses an approximation that is faster and easier
			
 
				+  # to get local sensitivity bound on.
			
 
				+
			
 
				+  winner = np.argmax(counts)
			
 
				+  counts_normalized = noise_eps * (counts - counts[winner])
			
 
				+  counts_rest = np.array(
			
 
				+      [counts_normalized[i] for i in xrange(len(counts)) if i != winner])
			
 
				+  gap = -max(counts_rest)
			
 
				+  q = (len(counts) - 1) * (gap + 2.0) / (4.0 * math.exp(gap))
			
 
				+  return min(q, 1.0 - (1.0/len(counts)))
			
 
				+
			
 
				+
			
 
				+def logmgf_exact(q, priv_eps, l):
			
 
				+  """Computes the logmgf value given q and privacy eps.
			
 
				+
			
 
				+  The bound used is the min of three terms. The first term is from
			
 
				+  https://arxiv.org/pdf/1605.02065.pdf.
			
 
				+  The second term is based on the fact that when event has probability (1-q) for
			
 
				+  q close to zero, q can only change by exp(eps), which corresponds to a
			
 
				+  much smaller multiplicative change in (1-q)
			
 
				+  The third term comes directly from the privacy guarantee.
			
 
				+  Args:
			
 
				+    q: pr of non-optimal outcome
			
 
				+    priv_eps: eps parameter for DP
			
 
				+    l: moment to compute.
			
 
				+  Returns:
			
 
				+    Upper bound on logmgf
			
 
				+  """
			
 
				+  if q < 0.5:
			
 
				+    t_one = (1-q) * math.pow((1-q) / (1 - math.exp(priv_eps) * q), l)
			
 
				+    t_two = q * math.exp(priv_eps * l)
			
 
				+    t = t_one + t_two
			
 
				+    try:
			
 
				+      log_t = math.log(t)
			
 
				+    except ValueError:
			
 
				+      print "Got ValueError in math.log for values :" + str((q, priv_eps, l, t))
			
 
				+      log_t = priv_eps * l
			
 
				+  else:
			
 
				+    log_t = priv_eps * l
			
 
				+
			
 
				+  return min(0.5 * priv_eps * priv_eps * l * (l + 1), log_t, priv_eps * l)
			
 
				+
			
 
				+
			
 
				+def logmgf_from_counts(counts, noise_eps, l):
			
 
				+  """
			
 
				+  ReportNoisyMax mechanism with noise_eps with 2*noise_eps-DP
			
 
				+  in our setting where one count can go up by one and another
			
 
				+  can go down by 1.
			
 
				+  """
			
 
				+
			
 
				+  q = compute_q_noisy_max(counts, noise_eps)
			
 
				+  return logmgf_exact(q, 2.0 * noise_eps, l)
			
 
				+
			
 
				+
			
 
				+def sens_at_k(counts, noise_eps, l, k):
			
 
				+  """Return sensitivity at distane k.
			
 
				+
			
 
				+  Args:
			
 
				+    counts: an array of scores
			
 
				+    noise_eps: noise parameter used
			
 
				+    l: moment whose sensitivity is being computed
			
 
				+    k: distance
			
 
				+  Returns:
			
 
				+    sensitivity: at distance k
			
 
				+  """
			
 
				+  counts_sorted = sorted(counts, reverse=True)
			
 
				+  if 0.5 * noise_eps * l > 1:
			
 
				+    print "l too large to compute sensitivity"
			
 
				+    return 0
			
 
				+  # Now we can assume that at k, gap remains positive
			
 
				+  # or we have reached the point where logmgf_exact is
			
 
				+  # determined by the first term and ind of q.
			
 
				+  if counts[0] < counts[1] + k:
			
 
				+    return 0
			
 
				+  counts_sorted[0] -= k
			
 
				+  counts_sorted[1] += k
			
 
				+  val = logmgf_from_counts(counts_sorted, noise_eps, l)
			
 
				+  counts_sorted[0] -= 1
			
 
				+  counts_sorted[1] += 1
			
 
				+  val_changed = logmgf_from_counts(counts_sorted, noise_eps, l)
			
 
				+  return val_changed - val
			
 
				+
			
 
				+
			
 
				+def smoothed_sens(counts, noise_eps, l, beta):
			
 
				+  """Compute beta-smooth sensitivity.
			
 
				+
			
 
				+  Args:
			
 
				+    counts: array of scors
			
 
				+    noise_eps: noise parameter
			
 
				+    l: moment of interest
			
 
				+    beta: smoothness parameter
			
 
				+  Returns:
			
 
				+    smooth_sensitivity: a beta smooth upper bound
			
 
				+  """
			
 
				+  k = 0
			
 
				+  smoothed_sensitivity = sens_at_k(counts, noise_eps, l, k)
			
 
				+  while k < max(counts):
			
 
				+    k += 1
			
 
				+    sensitivity_at_k = sens_at_k(counts, noise_eps, l, k)
			
 
				+    smoothed_sensitivity = max(
			
 
				+        smoothed_sensitivity,
			
 
				+        math.exp(-beta * k) * sensitivity_at_k)
			
 
				+    if sensitivity_at_k == 0.0:
			
 
				+      break
			
 
				+  return smoothed_sensitivity
			
 
				+
			
 
				+
			
 
				+def main(unused_argv):
			
 
				+  ##################################################################
			
 
				+  # If we are reproducing results from paper https://arxiv.org/abs/1610.05755,
			
 
				+  # download the required binaries with label information.
			
 
				+  ##################################################################
			
 
				+  
			
 
				+  # Binaries for MNIST results
			
 
				+  paper_binaries_mnist = \
			
 
				+    ["https://github.com/npapernot/multiple-teachers-for-privacy/blob/master/mnist_250_teachers_labels.npy?raw=true", 
			
 
				+    "https://github.com/npapernot/multiple-teachers-for-privacy/blob/master/mnist_250_teachers_100_indices_used_by_student.npy?raw=true"]
			
 
				+  if FLAGS.counts_file == "mnist_250_teachers_labels.npy" \
			
 
				+    or FLAGS.indices_file == "mnist_250_teachers_100_indices_used_by_student.npy":
			
 
				+    maybe_download(paper_binaries_mnist, os.getcwd())
			
 
				+
			
 
				+  # Binaries for SVHN results
			
 
				+  paper_binaries_svhn = ["https://github.com/npapernot/multiple-teachers-for-privacy/blob/master/svhn_250_teachers_labels.npy?raw=true"]
			
 
				+  if FLAGS.counts_file == "svhn_250_teachers_labels.npy":
			
 
				+    maybe_download(paper_binaries_svhn, os.getcwd())
			
 
				+
			
 
				+  input_mat = np.load(FLAGS.counts_file)
			
 
				+  if FLAGS.input_is_counts:
			
 
				+    counts_mat = input_mat
			
 
				+  else:
			
 
				+    # In this case, the input is the raw predictions. Transform
			
 
				+    num_teachers, n = input_mat.shape
			
 
				+    counts_mat = np.zeros((n, 10)).astype(np.int32)
			
 
				+    for i in range(n):
			
 
				+      for j in range(num_teachers):
			
 
				+        counts_mat[i, input_mat[j, i]] += 1
			
 
				+  n = counts_mat.shape[0]
			
 
				+  num_examples = min(n, FLAGS.max_examples)
			
 
				+
			
 
				+  if not FLAGS.indices_file:
			
 
				+    indices = np.array(range(num_examples))
			
 
				+  else:
			
 
				+    index_list = np.load(FLAGS.indices_file)
			
 
				+    indices = index_list[:num_examples]
			
 
				+
			
 
				+  l_list = 1.0 + np.array(xrange(FLAGS.moments))
			
 
				+  beta = FLAGS.beta
			
 
				+  total_log_mgf_nm = np.array([0.0 for _ in l_list])
			
 
				+  total_ss_nm = np.array([0.0 for _ in l_list])
			
 
				+  noise_eps = FLAGS.noise_eps
			
 
				+  
			
 
				+  for i in indices:
			
 
				+    total_log_mgf_nm += np.array(
			
 
				+        [logmgf_from_counts(counts_mat[i], noise_eps, l)
			
 
				+         for l in l_list])
			
 
				+    total_ss_nm += np.array(
			
 
				+        [smoothed_sens(counts_mat[i], noise_eps, l, beta)
			
 
				+         for l in l_list])
			
 
				+  delta = FLAGS.delta
			
 
				+
			
 
				+  # We want delta = exp(alpha - eps l).
			
 
				+  # Solving gives eps = (alpha - ln (delta))/l
			
 
				+  eps_list_nm = (total_log_mgf_nm - math.log(delta)) / l_list
			
 
				+
			
 
				+  print "Epsilons (Noisy Max): " + str(eps_list_nm)
			
 
				+  print "Smoothed sensitivities (Noisy Max): " + str(total_ss_nm / l_list)
			
 
				+
			
 
				+  # If beta < eps / 2 ln (1/delta), then adding noise Lap(1) * 2 SS/eps
			
 
				+  # is eps,delta DP
			
 
				+  # Also if beta < eps / 2(gamma +1), then adding noise 2(gamma+1) SS eta / eps
			
 
				+  # where eta has density proportional to 1 / (1+|z|^gamma) is eps-DP
			
 
				+  # Both from Corolloary 2.4 in
			
 
				+  # http://www.cse.psu.edu/~ads22/pubs/NRS07/NRS07-full-draft-v1.pdf
			
 
				+  # Print the first one's scale
			
 
				+  ss_eps = 2.0 * beta * math.log(1/delta)
			
 
				+  ss_scale = 2.0 / ss_eps
			
 
				+  print "To get an " + str(ss_eps) + "-DP estimate of epsilon, "
			
 
				+  print "..add noise ~ " + str(ss_scale)
			
 
				+  print "... times " + str(total_ss_nm / l_list)
			
 
				+  print "Epsilon = " + str(min(eps_list_nm)) + "."
			
 
				+  if min(eps_list_nm) == eps_list_nm[-1]:
			
 
				+    print "Warning: May not have used enough values of l"
			
 
				+
			
 
				+  # Data indpendent bound, as mechanism is
			
 
				+  # 2*noise_eps DP.
			
 
				+  data_ind_log_mgf = np.array([0.0 for _ in l_list])
			
 
				+  data_ind_log_mgf += num_examples * np.array(
			
 
				+      [logmgf_exact(1.0, 2.0 * noise_eps, l) for l in l_list])
			
 
				+
			
 
				+  data_ind_eps_list = (data_ind_log_mgf - math.log(delta)) / l_list
			
 
				+  print "Data independent bound = " + str(min(data_ind_eps_list)) + "."
			
 
				+
			
 
				+  return
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+  tf.app.run()
			
--- a/privacy/input.py
+++ b/privacy/input.py
@@ -62,6 +62,10 @@ def maybe_download(file_urls, directory):
 
				     # Extract filename
			
 
				     filename = file_url.split('/')[-1]
			
 
				 
			
 
				+    # If downloading from GitHub, remove suffix ?raw=True from local filename
			
 
				+    if filename.endswith("?raw=true"):
			
 
				+      filename = filename[:-9]
			
 
				+
			
 
				     # Deduce local file url
			
 
				     #filepath = os.path.join(directory, filename)
			
 
				     filepath = directory + '/' + filename