  1. """
  2. This script computes bounds on the privacy cost of training the
  3. student model from noisy aggregation of labels predicted by teachers.
  4. It should be used only after training the student (and therefore the
  5. teachers as well). We however include the label files required to
  6. reproduce key results from our paper (https://arxiv.org/abs/1610.05755):
  7. the epsilon bounds for MNIST and SVHN students.
  8. The command that computes the epsilon bound associated
  9. with the training of the MNIST student model (100 label queries
  10. with a (1/20)*2=0.1 epsilon bound each) is:
  11. python analysis.py
  12. --counts_file=mnist_250_teachers_labels.npy
  13. --indices_file=mnist_250_teachers_100_indices_used_by_student.npy
  14. The command that computes the epsilon bound associated
  15. with the training of the SVHN student model (1000 label queries
  16. with a (1/20)*2=0.1 epsilon bound each) is:
  17. python analysis.py
  18. --counts_file=svhn_250_teachers_labels.npy
  19. --max_examples=1000
  20. --delta=1e-6
  21. """
import os
import math

import numpy as np
import tensorflow as tf

from input import maybe_download

# These parameters can be changed to compute bounds for different failure rates
# or different model predictions.

tf.flags.DEFINE_integer("moments", 8, "Number of moments")
tf.flags.DEFINE_float("noise_eps", 0.1, "Eps value for each call to noisymax.")
tf.flags.DEFINE_float("delta", 1e-5, "Target value of delta.")
tf.flags.DEFINE_float("beta", 0.09, "Value of beta for smooth sensitivity")
tf.flags.DEFINE_string("counts_file", "", "Numpy matrix with raw counts")
tf.flags.DEFINE_string("indices_file", "",
                       "File containing a numpy matrix with indices used."
                       " Optional. Use the first max_examples indices if this"
                       " is not provided.")
tf.flags.DEFINE_integer("max_examples", 1000,
                        "Number of examples to use. We will use the first"
                        " max_examples many examples from the counts_file"
                        " or indices_file to do the privacy cost estimate")
tf.flags.DEFINE_float("too_small", 1e-10, "Small threshold to avoid log of 0")
tf.flags.DEFINE_bool("input_is_counts", False, "False if labels, True if counts")

FLAGS = tf.flags.FLAGS
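
# Example invocation (hypothetical file name): if per-query vote counts are
# already aggregated into a matrix saved as my_counts.npy (one row of
# per-class counts per student query), the script can be run on it directly:
#   python analysis.py --counts_file=my_counts.npy --input_is_counts=True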


def compute_q_noisy_max(counts, noise_eps):
  """returns ~ Pr[outcome != winner].

  Args:
    counts: a list of scores
    noise_eps: privacy parameter for noisy_max

  Returns:
    q: the probability that the outcome is different from the true winner.
  """
  # For noisy max, we only get an upper bound:
  # Pr[ j beats i* ] \leq (2 + gap(j,i*)) / (4 exp(gap(j,i*)))
  # proof at http://mathoverflow.net/questions/66763/
  # tight-bounds-on-probability-of-sum-of-laplace-random-variables
  winner = np.argmax(counts)
  counts_normalized = noise_eps * (counts - counts[winner])
  counts_rest = np.array(
      [counts_normalized[i] for i in xrange(len(counts)) if i != winner])
  q = 0.0
  for c in counts_rest:
    gap = -c
    q += (gap + 2.0) / (4.0 * math.exp(gap))
  return min(q, 1.0 - (1.0 / len(counts)))
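
# Worked example (hypothetical numbers): with counts = [240, 5, 5] from 250
# teachers and noise_eps = 0.1, each losing label has normalized gap
# 0.1 * 235 = 23.5 and contributes (23.5 + 2) / (4 * exp(23.5)) to q, so q is
# vanishingly small: a near-unanimous vote is almost never flipped by the noise.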


def compute_q_noisy_max_approx(counts, noise_eps):
  """returns ~ Pr[outcome != winner].

  Args:
    counts: a list of scores
    noise_eps: privacy parameter for noisy_max

  Returns:
    q: the probability that the outcome is different from the true winner.
  """
  # For noisy max, we only get an upper bound:
  # Pr[ j beats i* ] \leq (2 + gap(j,i*)) / (4 exp(gap(j,i*)))
  # proof at http://mathoverflow.net/questions/66763/
  # tight-bounds-on-probability-of-sum-of-laplace-random-variables
  # This code uses an approximation that is faster and easier
  # to get a local sensitivity bound on.
  winner = np.argmax(counts)
  counts_normalized = noise_eps * (counts - counts[winner])
  counts_rest = np.array(
      [counts_normalized[i] for i in xrange(len(counts)) if i != winner])
  gap = -max(counts_rest)
  q = (len(counts) - 1) * (gap + 2.0) / (4.0 * math.exp(gap))
  return min(q, 1.0 - (1.0 / len(counts)))
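
# Design note: the approximation above bounds the sum over losing labels by
# (len(counts) - 1) times its largest term, the one with the smallest gap.
# Since (gap + 2) / (4 * exp(gap)) is decreasing in the gap, this is still a
# valid upper bound on q, and it depends on the counts only through the
# runner-up gap, which makes its sensitivity easier to analyze.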


def logmgf_exact(q, priv_eps, l):
  """Computes the logmgf value given q and privacy eps.

  The bound used is the min of three terms. The first term is from
  https://arxiv.org/pdf/1605.02065.pdf.
  The second term is based on the fact that when an event has probability
  (1-q) for q close to zero, q can only change by a factor of exp(eps), which
  corresponds to a much smaller multiplicative change in (1-q).
  The third term comes directly from the privacy guarantee.

  Args:
    q: pr of non-optimal outcome
    priv_eps: eps parameter for DP
    l: moment to compute.

  Returns:
    Upper bound on logmgf
  """
  if q < 0.5:
    t_one = (1 - q) * math.pow((1 - q) / (1 - math.exp(priv_eps) * q), l)
    t_two = q * math.exp(priv_eps * l)
    t = t_one + t_two
    try:
      log_t = math.log(t)
    except ValueError:
      print "Got ValueError in math.log for values: " + str((q, priv_eps, l, t))
      log_t = priv_eps * l
  else:
    log_t = priv_eps * l
  return min(0.5 * priv_eps * priv_eps * l * (l + 1), log_t, priv_eps * l)
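
# Worked example: with q = 0 (a unanimous teacher vote) the first branch gives
# t = 1 and log_t = 0, so the query adds nothing to the moments bound. With
# priv_eps = 0.2 and l = 8 the other two terms evaluate to
# 0.5 * 0.2**2 * 8 * 9 = 1.44 and 0.2 * 8 = 1.6, so confident queries are what
# keep the data-dependent bound far below the worst case.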


def logmgf_from_counts(counts, noise_eps, l):
  """
  The ReportNoisyMax mechanism with parameter noise_eps is 2*noise_eps-DP
  in our setting, where one count can go up by 1 and another
  can go down by 1.
  """
  q = compute_q_noisy_max(counts, noise_eps)
  return logmgf_exact(q, 2.0 * noise_eps, l)


def sens_at_k(counts, noise_eps, l, k):
  """Return sensitivity at distance k.

  Args:
    counts: an array of scores
    noise_eps: noise parameter used
    l: moment whose sensitivity is being computed
    k: distance

  Returns:
    sensitivity: at distance k
  """
  counts_sorted = sorted(counts, reverse=True)
  if 0.5 * noise_eps * l > 1:
    print "l too large to compute sensitivity"
    return 0
  # Now we can assume that at distance k, the gap remains positive,
  # or we have reached the point where logmgf_exact is
  # determined by the first term and independent of q.
  if counts[0] < counts[1] + k:
    return 0
  counts_sorted[0] -= k
  counts_sorted[1] += k
  val = logmgf_from_counts(counts_sorted, noise_eps, l)
  counts_sorted[0] -= 1
  counts_sorted[1] += 1
  val_changed = logmgf_from_counts(counts_sorted, noise_eps, l)
  return val_changed - val
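
# Note: sens_at_k moves k votes from the winner to the runner-up (the
# worst-case neighbor at distance k) and measures how much one further such
# step changes the moment, i.e. the local sensitivity of logmgf_from_counts
# at distance k from the observed counts.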


def smoothed_sens(counts, noise_eps, l, beta):
  """Compute beta-smooth sensitivity.

  Args:
    counts: array of scores
    noise_eps: noise parameter
    l: moment of interest
    beta: smoothness parameter

  Returns:
    smooth_sensitivity: a beta-smooth upper bound
  """
  k = 0
  smoothed_sensitivity = sens_at_k(counts, noise_eps, l, k)
  while k < max(counts):
    k += 1
    sensitivity_at_k = sens_at_k(counts, noise_eps, l, k)
    smoothed_sensitivity = max(
        smoothed_sensitivity,
        math.exp(-beta * k) * sensitivity_at_k)
    if sensitivity_at_k == 0.0:
      break
  return smoothed_sensitivity
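
# The smooth-sensitivity framework (Nissim, Raskhodnikova, and Smith, 2007)
# lets us release the data-dependent epsilon estimate itself with noise
# calibrated to smoothed_sens. A minimal sketch of that release, assuming the
# Laplace-based variant quoted in main() below; the helper name is ours, not
# part of the original script:
def _release_eps_with_smooth_sensitivity(eps_estimate, smooth_sens, ss_eps):
  # Adds Laplace noise of scale 2 * smooth_sens / ss_eps, which is
  # (ss_eps, delta)-DP when beta < ss_eps / (2 * ln(1/delta)).
  return eps_estimate + np.random.laplace(scale=2.0 * smooth_sens / ss_eps)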


def main(unused_argv):
  ##################################################################
  # If we are reproducing results from paper https://arxiv.org/abs/1610.05755,
  # download the required binaries with label information.
  ##################################################################

  # Binaries for MNIST results
  paper_binaries_mnist = \
      ["https://github.com/npapernot/multiple-teachers-for-privacy/blob/master/mnist_250_teachers_labels.npy?raw=true",
       "https://github.com/npapernot/multiple-teachers-for-privacy/blob/master/mnist_250_teachers_100_indices_used_by_student.npy?raw=true"]
  if FLAGS.counts_file == "mnist_250_teachers_labels.npy" \
      or FLAGS.indices_file == "mnist_250_teachers_100_indices_used_by_student.npy":
    maybe_download(paper_binaries_mnist, os.getcwd())

  # Binaries for SVHN results
  paper_binaries_svhn = ["https://github.com/npapernot/multiple-teachers-for-privacy/blob/master/svhn_250_teachers_labels.npy?raw=true"]
  if FLAGS.counts_file == "svhn_250_teachers_labels.npy":
    maybe_download(paper_binaries_svhn, os.getcwd())

  input_mat = np.load(FLAGS.counts_file)
  if FLAGS.input_is_counts:
    counts_mat = input_mat
  else:
    # In this case, the input is the raw predictions. Transform them into
    # per-class vote counts for each example.
    num_teachers, n = input_mat.shape
    counts_mat = np.zeros((n, 10)).astype(np.int32)
    for i in range(n):
      for j in range(num_teachers):
        counts_mat[i, input_mat[j, i]] += 1
  n = counts_mat.shape[0]
  num_examples = min(n, FLAGS.max_examples)

  if not FLAGS.indices_file:
    indices = np.array(range(num_examples))
  else:
    index_list = np.load(FLAGS.indices_file)
    indices = index_list[:num_examples]

  l_list = 1.0 + np.array(xrange(FLAGS.moments))
  beta = FLAGS.beta
  total_log_mgf_nm = np.array([0.0 for _ in l_list])
  total_ss_nm = np.array([0.0 for _ in l_list])
  noise_eps = FLAGS.noise_eps

  for i in indices:
    total_log_mgf_nm += np.array(
        [logmgf_from_counts(counts_mat[i], noise_eps, l)
         for l in l_list])
    total_ss_nm += np.array(
        [smoothed_sens(counts_mat[i], noise_eps, l, beta)
         for l in l_list])
  delta = FLAGS.delta

  # We want delta = exp(alpha - eps * l).
  # Solving gives eps = (alpha - ln(delta)) / l.
  eps_list_nm = (total_log_mgf_nm - math.log(delta)) / l_list
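  # Worked example: if every one of 100 queries were answered unanimously,
  # total_log_mgf_nm would be 0 for every moment and, with delta = 1e-5 and
  # l = 8, the bound would be (0 + ln(1e5)) / 8, roughly 1.44.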

  print "Epsilons (Noisy Max): " + str(eps_list_nm)
  print "Smoothed sensitivities (Noisy Max): " + str(total_ss_nm / l_list)

  # If beta < eps / (2 * ln(1/delta)), then adding noise Lap(1) * 2 * SS / eps
  # is (eps, delta)-DP.
  # Also, if beta < eps / (2 * (gamma + 1)), then adding noise
  # 2 * (gamma + 1) * SS * eta / eps, where eta has density proportional to
  # 1 / (1 + |z|^gamma), is eps-DP.
  # Both from Corollary 2.4 in
  # http://www.cse.psu.edu/~ads22/pubs/NRS07/NRS07-full-draft-v1.pdf
  # Print the scale for the first one.
  ss_eps = 2.0 * beta * math.log(1 / delta)
  ss_scale = 2.0 / ss_eps
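  # Worked example with the default flags: beta = 0.09 and delta = 1e-5 give
  # ss_eps = 2 * 0.09 * ln(1e5) ~= 2.07 and ss_scale = 2 / ss_eps ~= 0.97.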

  print "To get an " + str(ss_eps) + "-DP estimate of epsilon, "
  print "..add noise ~ " + str(ss_scale)
  print "... times " + str(total_ss_nm / l_list)
  print "Epsilon = " + str(min(eps_list_nm)) + "."
  if min(eps_list_nm) == eps_list_nm[-1]:
    print "Warning: May not have used enough values of l"

  # Data-independent bound, as the mechanism is 2*noise_eps-DP.
  data_ind_log_mgf = np.array([0.0 for _ in l_list])
  data_ind_log_mgf += num_examples * np.array(
      [logmgf_exact(1.0, 2.0 * noise_eps, l) for l in l_list])
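  # Note: passing q = 1.0 makes logmgf_exact fall back to its worst-case
  # terms, so with the default noise_eps = 0.1 each query contributes
  # min(0.02 * l * (l + 1), 0.2 * l) to the log-mgf at moment l.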
  data_ind_eps_list = (data_ind_log_mgf - math.log(delta)) / l_list
  print "Data independent bound = " + str(min(data_ind_eps_list)) + "."

  return


if __name__ == "__main__":
  tf.app.run()