analysis.py

  1. """
  2. This script computes bounds on the privacy cost of training the
  3. student model from noisy aggregation of labels predicted by teachers.
  4. It should be used only after training the student (and therefore the
  5. teachers as well). We however include the label files required to
  6. reproduce key results from our paper (https://arxiv.org/abs/1610.05755):
  7. the epsilon bounds for MNIST and SVHN students.
  8. The command that computes the epsilon bound associated
  9. with the training of the MNIST student model (100 label queries
  10. with a (1/20)*2=0.1 epsilon bound each) is:
  11. python analysis.py
  12. --counts_file=mnist_250_teachers_labels.npy
  13. --indices_file=mnist_250_teachers_100_indices_used_by_student.npy
  14. The command that computes the epsilon bound associated
  15. with the training of the SVHN student model (1000 label queries
  16. with a (1/20)*2=0.1 epsilon bound each) is:
  17. python analysis.py
  18. --counts_file=svhn_250_teachers_labels.npy
  19. --max_examples=1000
  20. --delta=1e-6
  21. """
import math

import numpy as np
import tensorflow as tf

# These parameters can be changed to compute bounds for different failure rates
# or different model predictions.
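# Note: tf.flags and tf.app.run (used below) are TensorFlow 1.x APIs; under
# TensorFlow 2.x they are available as tf.compat.v1.flags and
# tf.compat.v1.app.run.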
tf.flags.DEFINE_integer("moments", 8, "Number of moments")
tf.flags.DEFINE_float("noise_eps", 0.1, "Eps value for each call to noisymax.")
tf.flags.DEFINE_float("delta", 1e-5, "Target value of delta.")
tf.flags.DEFINE_float("beta", 0.09, "Value of beta for smooth sensitivity")
tf.flags.DEFINE_string("counts_file", "", "Numpy matrix with raw counts")
tf.flags.DEFINE_string("indices_file", "",
                       "File containing a numpy matrix with indices used."
                       " Optional. Use the first max_examples indices if this"
                       " is not provided.")
tf.flags.DEFINE_integer("max_examples", 1000,
                        "Number of examples to use. We will use the first"
                        " max_examples many examples from the counts_file"
                        " or indices_file to do the privacy cost estimate")
tf.flags.DEFINE_float("too_small", 1e-10, "Small threshold to avoid log of 0")
tf.flags.DEFINE_bool("input_is_counts", False, "False if labels, True if counts")

FLAGS = tf.flags.FLAGS
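# Note on inputs (see main() below): counts_file may hold either raw teacher
# predictions, one row per teacher (shape num_teachers x n), or precomputed
# vote counts (shape n x num_classes); --input_is_counts selects between the
# two formats.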
def compute_q_noisy_max(counts, noise_eps):
  """Returns ~ Pr[outcome != winner].

  Args:
    counts: a list of scores
    noise_eps: privacy parameter for noisy_max
  Returns:
    q: the probability that outcome is different from true winner.
  """
  # For noisy max, we only get an upper bound:
  # Pr[j beats i*] \leq (2 + gap(j, i*)) / (4 * exp(gap(j, i*)))
  # proof at http://mathoverflow.net/questions/66763/
  # tight-bounds-on-probability-of-sum-of-laplace-random-variables
  winner = np.argmax(counts)
  counts_normalized = noise_eps * (counts - counts[winner])
  counts_rest = np.array(
      [counts_normalized[i] for i in range(len(counts)) if i != winner])
  q = 0.0
  for c in counts_rest:
    gap = -c
    q += (gap + 2.0) / (4.0 * math.exp(gap))
  return min(q, 1.0 - (1.0 / len(counts)))
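# Illustrative example (toy values): for counts = [50, 30, 20] and
# noise_eps = 0.1, the scaled gaps to the winner are 2 and 3, so
# q <= (2+2)/(4*e^2) + (3+2)/(4*e^3) ~= 0.135 + 0.062 ~= 0.20.
# Summing the per-label bounds over all non-winning labels is a union
# bound on the probability that any of them beats the true winner.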
def compute_q_noisy_max_approx(counts, noise_eps):
  """Returns ~ Pr[outcome != winner].

  Args:
    counts: a list of scores
    noise_eps: privacy parameter for noisy_max
  Returns:
    q: the probability that outcome is different from true winner.
  """
  # For noisy max, we only get an upper bound:
  # Pr[j beats i*] \leq (2 + gap(j, i*)) / (4 * exp(gap(j, i*)))
  # proof at http://mathoverflow.net/questions/66763/
  # tight-bounds-on-probability-of-sum-of-laplace-random-variables
  # This code uses an approximation that is faster and easier
  # to get a local sensitivity bound on.
  winner = np.argmax(counts)
  counts_normalized = noise_eps * (counts - counts[winner])
  counts_rest = np.array(
      [counts_normalized[i] for i in range(len(counts)) if i != winner])
  gap = -max(counts_rest)
  q = (len(counts) - 1) * (gap + 2.0) / (4.0 * math.exp(gap))
  return min(q, 1.0 - (1.0 / len(counts)))
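# The approximation above bounds every non-winning label by the runner-up:
# it replaces the per-label sum with (len(counts) - 1) times the bound for
# the smallest gap, so q depends on the counts only through the top-two gap.
# For the toy counts [50, 30, 20] with noise_eps = 0.1, this gives
# q <= 2 * (2+2)/(4*e^2) ~= 0.27, slightly looser than the exact sum.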
def logmgf_exact(q, priv_eps, l):
  """Computes the logmgf value given q and privacy eps.

  The bound used is the min of three terms. The first term is from
  https://arxiv.org/pdf/1605.02065.pdf.
  The second term is based on the fact that when an event has probability
  (1-q) for q close to zero, q can only change by a factor of exp(eps),
  which corresponds to a much smaller multiplicative change in (1-q).
  The third term comes directly from the privacy guarantee.
  Args:
    q: pr of non-optimal outcome
    priv_eps: eps parameter for DP
    l: moment to compute.
  Returns:
    Upper bound on logmgf
  """
  if q < 0.5:
    t_one = (1 - q) * math.pow((1 - q) / (1 - math.exp(priv_eps) * q), l)
    t_two = q * math.exp(priv_eps * l)
    t = t_one + t_two
    try:
      log_t = math.log(t)
    except ValueError:
      print("Got ValueError in math.log for values: " +
            str((q, priv_eps, l, t)))
      log_t = priv_eps * l
  else:
    log_t = priv_eps * l

  return min(0.5 * priv_eps * priv_eps * l * (l + 1), log_t, priv_eps * l)
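# Note: the three terms inside the min above are, in order, the
# data-independent moment bound 0.5 * eps^2 * l * (l + 1), the data-dependent
# bound log(t) computed from q, and the generic bound eps * l implied
# directly by eps-DP.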
def logmgf_from_counts(counts, noise_eps, l):
  """
  The ReportNoisyMax mechanism with parameter noise_eps is 2*noise_eps-DP
  in our setting, where one count can go up by 1 and another
  can go down by 1.
  """
  q = compute_q_noisy_max(counts, noise_eps)
  return logmgf_exact(q, 2.0 * noise_eps, l)
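# Illustrative usage (toy values):
#   counts = np.array([50, 30, 20])
#   alpha_l = logmgf_from_counts(counts, noise_eps=0.1, l=4.0)
# alpha_l bounds the log moment generating function of this single query's
# privacy loss at moment l; main() sums these over all answered queries.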
def sens_at_k(counts, noise_eps, l, k):
  """Return sensitivity at distance k.

  Args:
    counts: an array of scores
    noise_eps: noise parameter used
    l: moment whose sensitivity is being computed
    k: distance
  Returns:
    sensitivity: at distance k
  """
  counts_sorted = sorted(counts, reverse=True)
  if 0.5 * noise_eps * l > 1:
    print("l too large to compute sensitivity")
    return 0
  # Now we can assume that at k, the gap remains positive,
  # or we have reached the point where logmgf_exact is
  # determined by the first term and independent of q.
  if counts_sorted[0] < counts_sorted[1] + k:
    return 0
  counts_sorted[0] -= k
  counts_sorted[1] += k
  val = logmgf_from_counts(counts_sorted, noise_eps, l)
  counts_sorted[0] -= 1
  counts_sorted[1] += 1
  val_changed = logmgf_from_counts(counts_sorted, noise_eps, l)
  return val_changed - val
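# At distance k, the perturbation above shrinks the gap between the top two
# counts by 2k before measuring how much one further unit step moves the
# log-mgf; this is the local sensitivity used by smoothed_sens below.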
def smoothed_sens(counts, noise_eps, l, beta):
  """Compute beta-smooth sensitivity.

  Args:
    counts: array of scores
    noise_eps: noise parameter
    l: moment of interest
    beta: smoothness parameter
  Returns:
    smooth_sensitivity: a beta smooth upper bound
  """
  k = 0
  smoothed_sensitivity = sens_at_k(counts, noise_eps, l, k)
  while k < max(counts):
    k += 1
    sensitivity_at_k = sens_at_k(counts, noise_eps, l, k)
    smoothed_sensitivity = max(
        smoothed_sensitivity,
        math.exp(-beta * k) * sensitivity_at_k)
    if sensitivity_at_k == 0.0:
      break
  return smoothed_sensitivity
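# This implements beta-smooth sensitivity,
#   SS_beta(x) = max_k exp(-beta * k) * (local sensitivity at distance k),
# from Nissim, Raskhodnikova, and Smith (the NRS07 reference cited in
# main() below); the loop stops early once the sensitivity at distance k
# reaches 0, i.e. once the top-two gap is exhausted.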
def main(unused_argv):
  input_mat = np.load(FLAGS.counts_file)
  if FLAGS.input_is_counts:
    counts_mat = input_mat
  else:
    # In this case, the input is the raw predictions. Transform them
    # into per-example vote counts.
    num_teachers, n = input_mat.shape
    counts_mat = np.zeros((n, 10)).astype(np.int32)
    for i in range(n):
      for j in range(num_teachers):
        counts_mat[i, input_mat[j, i]] += 1
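  # Example of the transform above: with 3 teachers and
  # input_mat[:, 0] == [3, 3, 5], row 0 of counts_mat becomes
  # [0, 0, 0, 2, 0, 1, 0, 0, 0, 0]. Note the number of classes is
  # hardcoded to 10 (as for MNIST and SVHN).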
  n = counts_mat.shape[0]
  num_examples = min(n, FLAGS.max_examples)

  if not FLAGS.indices_file:
    indices = np.array(range(num_examples))
  else:
    index_list = np.load(FLAGS.indices_file)
    indices = index_list[:num_examples]

  l_list = 1.0 + np.array(range(FLAGS.moments))
  beta = FLAGS.beta
  total_log_mgf_nm = np.array([0.0 for _ in l_list])
  total_ss_nm = np.array([0.0 for _ in l_list])
  noise_eps = FLAGS.noise_eps

  for i in indices:
    total_log_mgf_nm += np.array(
        [logmgf_from_counts(counts_mat[i], noise_eps, l)
         for l in l_list])
    total_ss_nm += np.array(
        [smoothed_sens(counts_mat[i], noise_eps, l, beta)
         for l in l_list])
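  # By the composition property of the moments accountant
  # (https://arxiv.org/pdf/1605.02065.pdf), per-query log-mgfs add up, so
  # total_log_mgf_nm[j] bounds the log-mgf of the total privacy loss at
  # moment l_list[j] over all answered queries.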
  delta = FLAGS.delta

  # We want delta = exp(alpha - eps * l).
  # Solving gives eps = (alpha - ln(delta)) / l
  eps_list_nm = (total_log_mgf_nm - math.log(delta)) / l_list

  print("Epsilons (Noisy Max): " + str(eps_list_nm))
  print("Smoothed sensitivities (Noisy Max): " + str(total_ss_nm / l_list))
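  # Worked example of the conversion: if the summed alpha at l = 8 were 2.0
  # and delta = 1e-5, then eps = (2.0 - ln(1e-5)) / 8 ~= (2.0 + 11.5) / 8
  # ~= 1.7.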
  # If beta < eps / (2 * ln(1/delta)), then adding noise Lap(1) * 2 * SS / eps
  # is (eps, delta)-DP.
  # Also, if beta < eps / (2 * (gamma + 1)), then adding noise
  # 2 * (gamma + 1) * SS * eta / eps, where eta has density proportional to
  # 1 / (1 + |z|^gamma), is eps-DP.
  # Both from Corollary 2.4 in
  # http://www.cse.psu.edu/~ads22/pubs/NRS07/NRS07-full-draft-v1.pdf
  # Print the first one's scale.
  ss_eps = 2.0 * beta * math.log(1 / delta)
  ss_scale = 2.0 / ss_eps
  print("To get a " + str(ss_eps) + "-DP estimate of epsilon, ")
  print("... add noise ~ " + str(ss_scale))
  print("... times " + str(total_ss_nm / l_list))
  print("Epsilon = " + str(min(eps_list_nm)) + ".")
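  # With the default flags (beta = 0.09, delta = 1e-5) this gives
  # ss_eps = 2 * 0.09 * ln(1e5) ~= 2.07 and ss_scale ~= 0.97, i.e. the
  # epsilon estimate can itself be released by adding Laplace noise of scale
  # roughly 0.97 times the smoothed sensitivity.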
  if min(eps_list_nm) == eps_list_nm[-1]:
    print("Warning: May not have used enough values of l")

  # Data-independent bound, as the mechanism is 2*noise_eps-DP.
  data_ind_log_mgf = np.array([0.0 for _ in l_list])
  data_ind_log_mgf += num_examples * np.array(
      [logmgf_exact(1.0, 2.0 * noise_eps, l) for l in l_list])

  data_ind_eps_list = (data_ind_log_mgf - math.log(delta)) / l_list
  print("Data independent bound = " + str(min(data_ind_eps_list)) + ".")

  return
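# Note on the data-independent bound in main(): passing q = 1.0 makes
# logmgf_exact skip the data-dependent branch (since q >= 0.5), so each query
# contributes min(2 * noise_eps^2 * l * (l + 1), 2 * noise_eps * l), the
# generic moment bound for a 2*noise_eps-DP mechanism, regardless of the
# actual vote counts.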
if __name__ == "__main__":
  tf.app.run()