devs_parallel.py 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167
  1. from collections import defaultdict
  2. import sys
  3. from typing import Any, Dict, List, Tuple
  4. import numpy
  5. from scipy.sparse.csr import csr_matrix
  6. from labours.modes.devs import hdbscan_cluster_routed_series, order_commits
  7. from labours.objects import DevDay, ParallelDevData
  8. from labours.plotting import deploy_plot, import_pyplot
  9. def load_devs_parallel(
  10. ownership: Tuple[List[Any], Dict[Any, Any]],
  11. couples: Tuple[List[str], csr_matrix],
  12. devs: Tuple[List[str], Dict[int, Dict[int, DevDay]]],
  13. max_people: int,
  14. ):
  15. from seriate import seriate
  16. try:
  17. from hdbscan import HDBSCAN
  18. except ImportError as e:
  19. print(
  20. "Cannot import ortools: %s\nInstall it from "
  21. "https://developers.google.com/optimization/install/python/" % e
  22. )
  23. sys.exit(1)
  24. people, owned = ownership
  25. _, cmatrix = couples
  26. _, days = devs
  27. print("calculating - commits")
  28. commits = defaultdict(int)
  29. for day, devs in days.items():
  30. for dev, stats in devs.items():
  31. commits[people[dev]] += stats.Commits
  32. chosen = [
  33. k
  34. for v, k in sorted(((v, k) for k, v in commits.items()), reverse=True)[
  35. :max_people
  36. ]
  37. ]
  38. result = {k: ParallelDevData() for k in chosen}
  39. for k, v in result.items():
  40. v.commits_rank = chosen.index(k)
  41. v.commits = commits[k]
  42. print("calculating - lines")
  43. lines = defaultdict(int)
  44. for day, devs in days.items():
  45. for dev, stats in devs.items():
  46. lines[people[dev]] += stats.Added + stats.Removed + stats.Changed
  47. lines_index = {
  48. k: i
  49. for i, (_, k) in enumerate(
  50. sorted(((v, k) for k, v in lines.items() if k in chosen), reverse=True)
  51. )
  52. }
  53. for k, v in result.items():
  54. v.lines_rank = lines_index[k]
  55. v.lines = lines[k]
  56. print("calculating - ownership")
  57. owned_index = {
  58. k: i
  59. for i, (_, k) in enumerate(
  60. sorted(((owned[k][-1].sum(), k) for k in chosen), reverse=True)
  61. )
  62. }
  63. for k, v in result.items():
  64. v.ownership_rank = owned_index[k]
  65. v.ownership = owned[k][-1].sum()
  66. print("calculating - couples")
  67. embeddings = numpy.genfromtxt(fname="couples_people_data.tsv", delimiter="\t")[
  68. [people.index(k) for k in chosen]
  69. ]
  70. embeddings /= numpy.linalg.norm(embeddings, axis=1)[:, None]
  71. cos = embeddings.dot(embeddings.T)
  72. cos[cos > 1] = 1 # tiny precision faults
  73. dists = numpy.arccos(cos)
  74. clusters = HDBSCAN(min_cluster_size=2, metric="precomputed").fit_predict(dists)
  75. for k, v in result.items():
  76. v.couples_cluster = clusters[chosen.index(k)]
  77. couples_order = seriate(dists)
  78. roll_options = []
  79. for i in range(len(couples_order)):
  80. loss = 0
  81. for k, v in result.items():
  82. loss += abs(
  83. v.ownership_rank
  84. - (couples_order.index(chosen.index(k)) + i) % len(chosen)
  85. )
  86. roll_options.append(loss)
  87. best_roll = numpy.argmin(roll_options)
  88. couples_order = list(numpy.roll(couples_order, best_roll))
  89. for k, v in result.items():
  90. v.couples_index = couples_order.index(chosen.index(k))
  91. print("calculating - commit series")
  92. dists, devseries, _, orig_route = order_commits(chosen, days, people)
  93. keys = list(devseries.keys())
  94. route = [keys[node] for node in orig_route]
  95. for roll in range(len(route)):
  96. loss = 0
  97. for k, v in result.items():
  98. i = route.index(people.index(k))
  99. loss += abs(v.couples_index - ((i + roll) % len(route)))
  100. roll_options[roll] = loss
  101. best_roll = numpy.argmin(roll_options)
  102. route = list(numpy.roll(route, best_roll))
  103. orig_route = list(numpy.roll(orig_route, best_roll))
  104. clusters = hdbscan_cluster_routed_series(dists, orig_route)
  105. for k, v in result.items():
  106. v.commit_coocc_index = route.index(people.index(k))
  107. v.commit_coocc_cluster = clusters[v.commit_coocc_index]
  108. return result
  109. def show_devs_parallel(args, name, start_date, end_date, devs):
  110. matplotlib, pyplot = import_pyplot(args.backend, args.style)
  111. from matplotlib.collections import LineCollection
  112. def solve_equations(x1, y1, x2, y2):
  113. xcube = (x1 - x2) ** 3
  114. a = 2 * (y2 - y1) / xcube
  115. b = 3 * (y1 - y2) * (x1 + x2) / xcube
  116. c = 6 * (y2 - y1) * x1 * x2 / xcube
  117. d = y1 - a * x1 ** 3 - b * x1 ** 2 - c * x1
  118. return a, b, c, d
  119. # biggest = {k: max(getattr(d, k) for d in devs.values())
  120. # for k in ("commits", "lines", "ownership")}
  121. for k, dev in devs.items():
  122. points = numpy.array(
  123. [
  124. (1, dev.commits_rank),
  125. (2, dev.lines_rank),
  126. (3, dev.ownership_rank),
  127. (4, dev.couples_index),
  128. (5, dev.commit_coocc_index),
  129. ],
  130. dtype=float,
  131. )
  132. points[:, 1] = points[:, 1] / len(devs)
  133. splines = []
  134. for i in range(len(points) - 1):
  135. a, b, c, d = solve_equations(*points[i], *points[i + 1])
  136. x = numpy.linspace(i + 1, i + 2, 100)
  137. smooth_points = numpy.array(
  138. [x, a * x ** 3 + b * x ** 2 + c * x + d]
  139. ).T.reshape(-1, 1, 2)
  140. splines.append(smooth_points)
  141. points = numpy.concatenate(splines)
  142. segments = numpy.concatenate([points[:-1], points[1:]], axis=1)
  143. lc = LineCollection(segments)
  144. lc.set_array(numpy.linspace(0, 0.1, segments.shape[0]))
  145. pyplot.gca().add_collection(lc)
  146. pyplot.xlim(0, 6)
  147. pyplot.ylim(-0.1, 1.1)
  148. deploy_plot("Developers", args.output, args.background)