devs.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240
  1. from collections import defaultdict
  2. from datetime import datetime, timedelta
  3. import sys
  4. import numpy
  5. import tqdm
  6. from labours.objects import DevDay
  7. from labours.plotting import apply_plot_style, deploy_plot, get_plot_path, import_pyplot
  8. from labours.utils import _format_number
  9. def show_devs(args, name, start_date, end_date, people, days, max_people=50):
  10. from scipy.signal import convolve, slepian
  11. if len(people) > max_people:
  12. print("Picking top %s developers by commit count" % max_people)
  13. # pick top N developers by commit count
  14. commits = defaultdict(int)
  15. for devs in days.values():
  16. for dev, stats in devs.items():
  17. commits[dev] += stats.Commits
  18. commits = sorted(((v, k) for k, v in commits.items()), reverse=True)
  19. chosen_people = {people[k] for _, k in commits[:max_people]}
  20. else:
  21. chosen_people = set(people)
  22. dists, devseries, devstats, route = order_commits(chosen_people, days, people)
  23. route_map = {v: i for i, v in enumerate(route)}
  24. # determine clusters
  25. clusters = hdbscan_cluster_routed_series(dists, route)
  26. keys = list(devseries.keys())
  27. route = [keys[node] for node in route]
  28. print("Plotting")
  29. # smooth time series
  30. start_date = datetime.fromtimestamp(start_date)
  31. start_date = datetime(start_date.year, start_date.month, start_date.day)
  32. end_date = datetime.fromtimestamp(end_date)
  33. end_date = datetime(end_date.year, end_date.month, end_date.day)
  34. size = (end_date - start_date).days + 1
  35. plot_x = [start_date + timedelta(days=i) for i in range(size)]
  36. resolution = 64
  37. window = slepian(size // resolution, 0.5)
  38. final = numpy.zeros((len(devseries), size), dtype=numpy.float32)
  39. for i, s in enumerate(devseries.values()):
  40. arr = numpy.array(s).transpose()
  41. full_history = numpy.zeros(size, dtype=numpy.float32)
  42. mask = arr[0] < size
  43. full_history[arr[0][mask]] = arr[1][mask]
  44. final[route_map[i]] = convolve(full_history, window, "same")
  45. matplotlib, pyplot = import_pyplot(args.backend, args.style)
  46. pyplot.rcParams["figure.figsize"] = (32, 16)
  47. pyplot.rcParams["font.size"] = args.font_size
  48. prop_cycle = pyplot.rcParams["axes.prop_cycle"]
  49. colors = prop_cycle.by_key()["color"]
  50. fig, axes = pyplot.subplots(final.shape[0], 1)
  51. backgrounds = ("#C4FFDB", "#FFD0CD") if args.background == "white" else ("#05401C", "#40110E")
  52. max_cluster = numpy.max(clusters)
  53. for ax, series, cluster, dev_i in zip(axes, final, clusters, route):
  54. if cluster >= 0:
  55. color = colors[cluster % len(colors)]
  56. i = 1
  57. while color == "#777777":
  58. color = colors[(max_cluster + i) % len(colors)]
  59. i += 1
  60. else:
  61. # outlier
  62. color = "#777777"
  63. ax.fill_between(plot_x, series, color=color)
  64. ax.set_axis_off()
  65. author = people[dev_i]
  66. ax.text(0.03, 0.5, author[:36] + (author[36:] and "..."),
  67. horizontalalignment="right", verticalalignment="center",
  68. transform=ax.transAxes, fontsize=args.font_size,
  69. color="black" if args.background == "white" else "white")
  70. ds = devstats[dev_i]
  71. stats = "%5d %8s %8s" % (ds[0], _format_number(ds[1] - ds[2]), _format_number(ds[3]))
  72. ax.text(0.97, 0.5, stats,
  73. horizontalalignment="left", verticalalignment="center",
  74. transform=ax.transAxes, fontsize=args.font_size, family="monospace",
  75. backgroundcolor=backgrounds[ds[1] <= ds[2]],
  76. color="black" if args.background == "white" else "white")
  77. axes[0].text(0.97, 1.75, " cmts delta changed",
  78. horizontalalignment="left", verticalalignment="center",
  79. transform=axes[0].transAxes, fontsize=args.font_size, family="monospace",
  80. color="black" if args.background == "white" else "white")
  81. axes[-1].set_axis_on()
  82. target_num_labels = 12
  83. num_months = (end_date.year - start_date.year) * 12 + end_date.month - start_date.month
  84. interval = int(numpy.ceil(num_months / target_num_labels))
  85. if interval >= 8:
  86. interval = int(numpy.ceil(num_months / (12 * target_num_labels)))
  87. axes[-1].xaxis.set_major_locator(matplotlib.dates.YearLocator(base=max(1, interval // 12)))
  88. axes[-1].xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y"))
  89. else:
  90. axes[-1].xaxis.set_major_locator(matplotlib.dates.MonthLocator(interval=interval))
  91. axes[-1].xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y-%m"))
  92. for tick in axes[-1].xaxis.get_major_ticks():
  93. tick.label.set_fontsize(args.font_size)
  94. axes[-1].spines["left"].set_visible(False)
  95. axes[-1].spines["right"].set_visible(False)
  96. axes[-1].spines["top"].set_visible(False)
  97. axes[-1].get_yaxis().set_visible(False)
  98. axes[-1].set_facecolor((1.0,) * 3 + (0.0,))
  99. title = ("%s commits" % name) if not args.output else ""
  100. if args.mode == "all" and args.output:
  101. output = get_plot_path(args.output, "time_series")
  102. else:
  103. output = args.output
  104. deploy_plot(title, output, args.background)
  105. def order_commits(chosen_people, days, people):
  106. from seriate import seriate
  107. try:
  108. from fastdtw import fastdtw
  109. except ImportError as e:
  110. print("Cannot import fastdtw: %s\nInstall it from https://github.com/slaypni/fastdtw" % e)
  111. sys.exit(1)
  112. # FIXME(vmarkovtsev): remove once https://github.com/slaypni/fastdtw/pull/28 is merged&released
  113. try:
  114. sys.modules["fastdtw.fastdtw"].__norm = lambda p: lambda a, b: numpy.linalg.norm(
  115. numpy.atleast_1d(a) - numpy.atleast_1d(b), p)
  116. except KeyError:
  117. # the native extension does not have this bug
  118. pass
  119. devseries = defaultdict(list)
  120. devstats = defaultdict(lambda: DevDay(0, 0, 0, 0, {}))
  121. for day, devs in sorted(days.items()):
  122. for dev, stats in devs.items():
  123. if people[dev] in chosen_people:
  124. devseries[dev].append((day, stats.Commits))
  125. devstats[dev] = devstats[dev].add(stats)
  126. print("Calculating the distance matrix")
  127. # max-normalize the time series using a sliding window
  128. series = list(devseries.values())
  129. for i, s in enumerate(series):
  130. arr = numpy.array(s).transpose().astype(numpy.float32)
  131. arr[1] /= arr[1].sum()
  132. series[i] = arr.transpose()
  133. # calculate the distance matrix using dynamic time warping
  134. dists = numpy.full((len(series),) * 2, -100500, dtype=numpy.float32)
  135. # TODO: what's the total for this progress bar?
  136. with tqdm.tqdm() as pb:
  137. for x, serx in enumerate(series):
  138. dists[x, x] = 0
  139. for y, sery in enumerate(series[x + 1:], start=x + 1):
  140. min_day = int(min(serx[0][0], sery[0][0]))
  141. max_day = int(max(serx[-1][0], sery[-1][0]))
  142. arrx = numpy.zeros(max_day - min_day + 1, dtype=numpy.float32)
  143. arry = numpy.zeros_like(arrx)
  144. arrx[serx[:, 0].astype(int) - min_day] = serx[:, 1]
  145. arry[sery[:, 0].astype(int) - min_day] = sery[:, 1]
  146. # L1 norm
  147. dist, _ = fastdtw(arrx, arry, radius=5, dist=1)
  148. dists[x, y] = dists[y, x] = dist
  149. pb.update()
  150. print("Ordering the series")
  151. route = seriate(dists)
  152. return dists, devseries, devstats, route
  153. def hdbscan_cluster_routed_series(dists, route):
  154. try:
  155. from hdbscan import HDBSCAN
  156. except ImportError as e:
  157. print("Cannot import hdbscan: %s" % e)
  158. sys.exit(1)
  159. opt_dist_chain = numpy.cumsum(numpy.array(
  160. [0] + [dists[route[i], route[i + 1]] for i in range(len(route) - 1)]))
  161. clusters = HDBSCAN(min_cluster_size=2).fit_predict(opt_dist_chain[:, numpy.newaxis])
  162. return clusters
  163. def show_devs_efforts(args, name, start_date, end_date, people, days, max_people):
  164. from scipy.signal import convolve, slepian
  165. start_date = datetime.fromtimestamp(start_date)
  166. start_date = datetime(start_date.year, start_date.month, start_date.day)
  167. end_date = datetime.fromtimestamp(end_date)
  168. end_date = datetime(end_date.year, end_date.month, end_date.day)
  169. efforts_by_dev = defaultdict(int)
  170. for day, devs in days.items():
  171. for dev, stats in devs.items():
  172. efforts_by_dev[dev] += stats.Added + stats.Removed + stats.Changed
  173. if len(efforts_by_dev) > max_people:
  174. chosen = {v for k, v in sorted(
  175. ((v, k) for k, v in efforts_by_dev.items()), reverse=True)[:max_people]}
  176. print("Warning: truncated people to the most active %d" % max_people)
  177. else:
  178. chosen = set(efforts_by_dev)
  179. chosen_efforts = sorted(((efforts_by_dev[k], k) for k in chosen), reverse=True)
  180. chosen_order = {k: i for i, (_, k) in enumerate(chosen_efforts)}
  181. efforts = numpy.zeros((len(chosen) + 1, (end_date - start_date).days + 1), dtype=numpy.float32)
  182. for day, devs in days.items():
  183. if day < efforts.shape[1]:
  184. for dev, stats in devs.items():
  185. dev = chosen_order.get(dev, len(chosen_order))
  186. efforts[dev][day] += stats.Added + stats.Removed + stats.Changed
  187. efforts_cum = numpy.cumsum(efforts, axis=1)
  188. window = slepian(10, 0.5)
  189. window /= window.sum()
  190. for e in (efforts, efforts_cum):
  191. for i in range(e.shape[0]):
  192. ending = e[i][-len(window) * 2:].copy()
  193. e[i] = convolve(e[i], window, "same")
  194. e[i][-len(ending):] = ending
  195. matplotlib, pyplot = import_pyplot(args.backend, args.style)
  196. plot_x = [start_date + timedelta(days=i) for i in range(efforts.shape[1])]
  197. people = [people[k] for _, k in chosen_efforts] + ["others"]
  198. for i, name in enumerate(people):
  199. if len(name) > 40:
  200. people[i] = name[:37] + "..."
  201. polys = pyplot.stackplot(plot_x, efforts_cum, labels=people)
  202. if len(polys) == max_people + 1:
  203. polys[-1].set_hatch("/")
  204. polys = pyplot.stackplot(plot_x, -efforts * efforts_cum.max() / efforts.max())
  205. if len(polys) == max_people + 1:
  206. polys[-1].set_hatch("/")
  207. yticks = []
  208. for tick in pyplot.gca().yaxis.iter_ticks():
  209. if tick[1] >= 0:
  210. yticks.append(tick[1])
  211. pyplot.gca().yaxis.set_ticks(yticks)
  212. legend = pyplot.legend(loc=2, ncol=2, fontsize=args.font_size)
  213. apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.background,
  214. args.font_size, args.size or "16,10")
  215. if args.mode == "all" and args.output:
  216. output = get_plot_path(args.output, "efforts")
  217. else:
  218. output = args.output
  219. deploy_plot("Efforts through time (changed lines of code)", output, args.background)