devs_parallel.py 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138
  1. from collections import defaultdict
  2. import sys
  3. import numpy
  4. from labours.modes.devs import hdbscan_cluster_routed_series, order_commits
  5. from labours.objects import ParallelDevData
  6. from labours.plotting import deploy_plot, import_pyplot
  7. def load_devs_parallel(ownership, couples, devs, max_people):
  8. from seriate import seriate
  9. try:
  10. from hdbscan import HDBSCAN
  11. except ImportError as e:
  12. print("Cannot import ortools: %s\nInstall it from "
  13. "https://developers.google.com/optimization/install/python/" % e)
  14. sys.exit(1)
  15. people, owned = ownership
  16. _, cmatrix = couples
  17. _, days = devs
  18. print("calculating - commits")
  19. commits = defaultdict(int)
  20. for day, devs in days.items():
  21. for dev, stats in devs.items():
  22. commits[people[dev]] += stats.Commits
  23. chosen = [k for v, k in sorted(((v, k) for k, v in commits.items()),
  24. reverse=True)[:max_people]]
  25. result = {k: ParallelDevData() for k in chosen}
  26. for k, v in result.items():
  27. v.commits_rank = chosen.index(k)
  28. v.commits = commits[k]
  29. print("calculating - lines")
  30. lines = defaultdict(int)
  31. for day, devs in days.items():
  32. for dev, stats in devs.items():
  33. lines[people[dev]] += stats.Added + stats.Removed + stats.Changed
  34. lines_index = {k: i for i, (_, k) in enumerate(sorted(
  35. ((v, k) for k, v in lines.items() if k in chosen), reverse=True))}
  36. for k, v in result.items():
  37. v.lines_rank = lines_index[k]
  38. v.lines = lines[k]
  39. print("calculating - ownership")
  40. owned_index = {k: i for i, (_, k) in enumerate(sorted(
  41. ((owned[k][-1].sum(), k) for k in chosen), reverse=True))}
  42. for k, v in result.items():
  43. v.ownership_rank = owned_index[k]
  44. v.ownership = owned[k][-1].sum()
  45. print("calculating - couples")
  46. embeddings = numpy.genfromtxt(fname="couples_people_data.tsv", delimiter="\t")[
  47. [people.index(k) for k in chosen]]
  48. embeddings /= numpy.linalg.norm(embeddings, axis=1)[:, None]
  49. cos = embeddings.dot(embeddings.T)
  50. cos[cos > 1] = 1 # tiny precision faults
  51. dists = numpy.arccos(cos)
  52. clusters = HDBSCAN(min_cluster_size=2, metric="precomputed").fit_predict(dists)
  53. for k, v in result.items():
  54. v.couples_cluster = clusters[chosen.index(k)]
  55. couples_order = seriate(dists)
  56. roll_options = []
  57. for i in range(len(couples_order)):
  58. loss = 0
  59. for k, v in result.items():
  60. loss += abs(
  61. v.ownership_rank - (couples_order.index(chosen.index(k)) + i) % len(chosen))
  62. roll_options.append(loss)
  63. best_roll = numpy.argmin(roll_options)
  64. couples_order = list(numpy.roll(couples_order, best_roll))
  65. for k, v in result.items():
  66. v.couples_index = couples_order.index(chosen.index(k))
  67. print("calculating - commit series")
  68. dists, devseries, _, orig_route = order_commits(chosen, days, people)
  69. keys = list(devseries.keys())
  70. route = [keys[node] for node in orig_route]
  71. for roll in range(len(route)):
  72. loss = 0
  73. for k, v in result.items():
  74. i = route.index(people.index(k))
  75. loss += abs(v.couples_index - ((i + roll) % len(route)))
  76. roll_options[roll] = loss
  77. best_roll = numpy.argmin(roll_options)
  78. route = list(numpy.roll(route, best_roll))
  79. orig_route = list(numpy.roll(orig_route, best_roll))
  80. clusters = hdbscan_cluster_routed_series(dists, orig_route)
  81. for k, v in result.items():
  82. v.commit_coocc_index = route.index(people.index(k))
  83. v.commit_coocc_cluster = clusters[v.commit_coocc_index]
  84. return result
  85. def show_devs_parallel(args, name, start_date, end_date, devs):
  86. matplotlib, pyplot = import_pyplot(args.backend, args.style)
  87. from matplotlib.collections import LineCollection
  88. def solve_equations(x1, y1, x2, y2):
  89. xcube = (x1 - x2) ** 3
  90. a = 2 * (y2 - y1) / xcube
  91. b = 3 * (y1 - y2) * (x1 + x2) / xcube
  92. c = 6 * (y2 - y1) * x1 * x2 / xcube
  93. d = y1 - a * x1 ** 3 - b * x1 ** 2 - c * x1
  94. return a, b, c, d
  95. # biggest = {k: max(getattr(d, k) for d in devs.values())
  96. # for k in ("commits", "lines", "ownership")}
  97. for k, dev in devs.items():
  98. points = numpy.array([
  99. (1, dev.commits_rank),
  100. (2, dev.lines_rank),
  101. (3, dev.ownership_rank),
  102. (4, dev.couples_index),
  103. (5, dev.commit_coocc_index)],
  104. dtype=float)
  105. points[:, 1] = points[:, 1] / len(devs)
  106. splines = []
  107. for i in range(len(points) - 1):
  108. a, b, c, d = solve_equations(*points[i], *points[i + 1])
  109. x = numpy.linspace(i + 1, i + 2, 100)
  110. smooth_points = numpy.array(
  111. [x, a * x ** 3 + b * x ** 2 + c * x + d]).T.reshape(-1, 1, 2)
  112. splines.append(smooth_points)
  113. points = numpy.concatenate(splines)
  114. segments = numpy.concatenate([points[:-1], points[1:]], axis=1)
  115. lc = LineCollection(segments)
  116. lc.set_array(numpy.linspace(0, 0.1, segments.shape[0]))
  117. pyplot.gca().add_collection(lc)
  118. pyplot.xlim(0, 6)
  119. pyplot.ylim(-0.1, 1.1)
  120. deploy_plot("Developers", args.output, args.background)