devs_parallel.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145
  1. from collections import defaultdict
  2. import sys
  3. from typing import Any, Dict, List, Tuple
  4. import numpy
  5. from scipy.sparse.csr import csr_matrix
  6. from labours.modes.devs import hdbscan_cluster_routed_series, order_commits
  7. from labours.plotting import deploy_plot, import_pyplot
  8. from labours.objects import DevDay, ParallelDevData
  9. def load_devs_parallel(
  10. ownership: Tuple[List[Any], Dict[Any, Any]],
  11. couples: Tuple[List[str], csr_matrix],
  12. devs: Tuple[List[str], Dict[int, Dict[int, DevDay]]],
  13. max_people: int
  14. ):
  15. from seriate import seriate
  16. try:
  17. from hdbscan import HDBSCAN
  18. except ImportError as e:
  19. print("Cannot import ortools: %s\nInstall it from "
  20. "https://developers.google.com/optimization/install/python/" % e)
  21. sys.exit(1)
  22. people, owned = ownership
  23. _, cmatrix = couples
  24. _, days = devs
  25. print("calculating - commits")
  26. commits = defaultdict(int)
  27. for day, devs in days.items():
  28. for dev, stats in devs.items():
  29. commits[people[dev]] += stats.Commits
  30. chosen = [k for v, k in sorted(((v, k) for k, v in commits.items()),
  31. reverse=True)[:max_people]]
  32. result = {k: ParallelDevData() for k in chosen}
  33. for k, v in result.items():
  34. v.commits_rank = chosen.index(k)
  35. v.commits = commits[k]
  36. print("calculating - lines")
  37. lines = defaultdict(int)
  38. for day, devs in days.items():
  39. for dev, stats in devs.items():
  40. lines[people[dev]] += stats.Added + stats.Removed + stats.Changed
  41. lines_index = {k: i for i, (_, k) in enumerate(sorted(
  42. ((v, k) for k, v in lines.items() if k in chosen), reverse=True))}
  43. for k, v in result.items():
  44. v.lines_rank = lines_index[k]
  45. v.lines = lines[k]
  46. print("calculating - ownership")
  47. owned_index = {k: i for i, (_, k) in enumerate(sorted(
  48. ((owned[k][-1].sum(), k) for k in chosen), reverse=True))}
  49. for k, v in result.items():
  50. v.ownership_rank = owned_index[k]
  51. v.ownership = owned[k][-1].sum()
  52. print("calculating - couples")
  53. embeddings = numpy.genfromtxt(fname="couples_people_data.tsv", delimiter="\t")[
  54. [people.index(k) for k in chosen]]
  55. embeddings /= numpy.linalg.norm(embeddings, axis=1)[:, None]
  56. cos = embeddings.dot(embeddings.T)
  57. cos[cos > 1] = 1 # tiny precision faults
  58. dists = numpy.arccos(cos)
  59. clusters = HDBSCAN(min_cluster_size=2, metric="precomputed").fit_predict(dists)
  60. for k, v in result.items():
  61. v.couples_cluster = clusters[chosen.index(k)]
  62. couples_order = seriate(dists)
  63. roll_options = []
  64. for i in range(len(couples_order)):
  65. loss = 0
  66. for k, v in result.items():
  67. loss += abs(
  68. v.ownership_rank - (couples_order.index(chosen.index(k)) + i) % len(chosen))
  69. roll_options.append(loss)
  70. best_roll = numpy.argmin(roll_options)
  71. couples_order = list(numpy.roll(couples_order, best_roll))
  72. for k, v in result.items():
  73. v.couples_index = couples_order.index(chosen.index(k))
  74. print("calculating - commit series")
  75. dists, devseries, _, orig_route = order_commits(chosen, days, people)
  76. keys = list(devseries.keys())
  77. route = [keys[node] for node in orig_route]
  78. for roll in range(len(route)):
  79. loss = 0
  80. for k, v in result.items():
  81. i = route.index(people.index(k))
  82. loss += abs(v.couples_index - ((i + roll) % len(route)))
  83. roll_options[roll] = loss
  84. best_roll = numpy.argmin(roll_options)
  85. route = list(numpy.roll(route, best_roll))
  86. orig_route = list(numpy.roll(orig_route, best_roll))
  87. clusters = hdbscan_cluster_routed_series(dists, orig_route)
  88. for k, v in result.items():
  89. v.commit_coocc_index = route.index(people.index(k))
  90. v.commit_coocc_cluster = clusters[v.commit_coocc_index]
  91. return result
  92. def show_devs_parallel(args, name, start_date, end_date, devs):
  93. matplotlib, pyplot = import_pyplot(args.backend, args.style)
  94. from matplotlib.collections import LineCollection
  95. def solve_equations(x1, y1, x2, y2):
  96. xcube = (x1 - x2) ** 3
  97. a = 2 * (y2 - y1) / xcube
  98. b = 3 * (y1 - y2) * (x1 + x2) / xcube
  99. c = 6 * (y2 - y1) * x1 * x2 / xcube
  100. d = y1 - a * x1 ** 3 - b * x1 ** 2 - c * x1
  101. return a, b, c, d
  102. # biggest = {k: max(getattr(d, k) for d in devs.values())
  103. # for k in ("commits", "lines", "ownership")}
  104. for k, dev in devs.items():
  105. points = numpy.array([
  106. (1, dev.commits_rank),
  107. (2, dev.lines_rank),
  108. (3, dev.ownership_rank),
  109. (4, dev.couples_index),
  110. (5, dev.commit_coocc_index)],
  111. dtype=float)
  112. points[:, 1] = points[:, 1] / len(devs)
  113. splines = []
  114. for i in range(len(points) - 1):
  115. a, b, c, d = solve_equations(*points[i], *points[i + 1])
  116. x = numpy.linspace(i + 1, i + 2, 100)
  117. smooth_points = numpy.array(
  118. [x, a * x ** 3 + b * x ** 2 + c * x + d]).T.reshape(-1, 1, 2)
  119. splines.append(smooth_points)
  120. points = numpy.concatenate(splines)
  121. segments = numpy.concatenate([points[:-1], points[1:]], axis=1)
  122. lc = LineCollection(segments)
  123. lc.set_array(numpy.linspace(0, 0.1, segments.shape[0]))
  124. pyplot.gca().add_collection(lc)
  125. pyplot.xlim(0, 6)
  126. pyplot.ylim(-0.1, 1.1)
  127. deploy_plot("Developers", args.output, args.background)