devs.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323
  1. from argparse import Namespace
  2. from collections import defaultdict
  3. from datetime import datetime, timedelta
  4. import sys
  5. from typing import Dict, List, Set, Tuple
  6. import numpy
  7. import tqdm
  8. from labours.objects import DevDay
  9. from labours.plotting import apply_plot_style, deploy_plot, get_plot_path, import_pyplot
  10. from labours.utils import _format_number
  11. def show_devs(
  12. args: Namespace,
  13. name: str,
  14. start_date: int,
  15. end_date: int,
  16. people: List[str],
  17. days: Dict[int, Dict[int, DevDay]],
  18. max_people: int = 50,
  19. ) -> None:
  20. from scipy.signal import convolve, slepian
  21. if len(people) > max_people:
  22. print("Picking top %s developers by commit count" % max_people)
  23. # pick top N developers by commit count
  24. commits = defaultdict(int)
  25. for devs in days.values():
  26. for dev, stats in devs.items():
  27. commits[dev] += stats.Commits
  28. commits = sorted(((v, k) for k, v in commits.items()), reverse=True)
  29. chosen_people = {people[k] for _, k in commits[:max_people]}
  30. else:
  31. chosen_people = set(people)
  32. dists, devseries, devstats, route = order_commits(chosen_people, days, people)
  33. route_map = {v: i for i, v in enumerate(route)}
  34. # determine clusters
  35. clusters = hdbscan_cluster_routed_series(dists, route)
  36. keys = list(devseries.keys())
  37. route = [keys[node] for node in route]
  38. print("Plotting")
  39. # smooth time series
  40. start_date = datetime.fromtimestamp(start_date)
  41. start_date = datetime(start_date.year, start_date.month, start_date.day)
  42. end_date = datetime.fromtimestamp(end_date)
  43. end_date = datetime(end_date.year, end_date.month, end_date.day)
  44. size = (end_date - start_date).days + 1
  45. plot_x = [start_date + timedelta(days=i) for i in range(size)]
  46. resolution = 64
  47. window = slepian(size // resolution, 0.5)
  48. final = numpy.zeros((len(devseries), size), dtype=numpy.float32)
  49. for i, s in enumerate(devseries.values()):
  50. arr = numpy.array(s).transpose()
  51. full_history = numpy.zeros(size, dtype=numpy.float32)
  52. mask = arr[0] < size
  53. full_history[arr[0][mask]] = arr[1][mask]
  54. final[route_map[i]] = convolve(full_history, window, "same")
  55. matplotlib, pyplot = import_pyplot(args.backend, args.style)
  56. pyplot.rcParams["figure.figsize"] = (32, 16)
  57. pyplot.rcParams["font.size"] = args.font_size
  58. prop_cycle = pyplot.rcParams["axes.prop_cycle"]
  59. colors = prop_cycle.by_key()["color"]
  60. fig, axes = pyplot.subplots(final.shape[0], 1)
  61. try:
  62. axes = tuple(axes)
  63. except TypeError:
  64. axes = axes,
  65. backgrounds = (
  66. ("#C4FFDB", "#FFD0CD") if args.background == "white" else ("#05401C", "#40110E")
  67. )
  68. max_cluster = numpy.max(clusters)
  69. for ax, series, cluster, dev_i in zip(axes, final, clusters, route):
  70. if cluster >= 0:
  71. color = colors[cluster % len(colors)]
  72. i = 1
  73. while color == "#777777":
  74. color = colors[(max_cluster + i) % len(colors)]
  75. i += 1
  76. else:
  77. # outlier
  78. color = "#777777"
  79. ax.fill_between(plot_x, series, color=color)
  80. ax.set_axis_off()
  81. author = people[dev_i]
  82. ax.text(
  83. 0.03,
  84. 0.5,
  85. author[:36] + (author[36:] and "..."),
  86. horizontalalignment="right",
  87. verticalalignment="center",
  88. transform=ax.transAxes,
  89. fontsize=args.font_size,
  90. color="black" if args.background == "white" else "white",
  91. )
  92. ds = devstats[dev_i]
  93. stats = "%5d %8s %8s" % (
  94. ds[0],
  95. _format_number(ds[1] - ds[2]),
  96. _format_number(ds[3]),
  97. )
  98. ax.text(
  99. 0.97,
  100. 0.5,
  101. stats,
  102. horizontalalignment="left",
  103. verticalalignment="center",
  104. transform=ax.transAxes,
  105. fontsize=args.font_size,
  106. family="monospace",
  107. backgroundcolor=backgrounds[ds[1] <= ds[2]],
  108. color="black" if args.background == "white" else "white",
  109. )
  110. axes[0].text(
  111. 0.97,
  112. 1.75,
  113. " cmts delta changed",
  114. horizontalalignment="left",
  115. verticalalignment="center",
  116. transform=axes[0].transAxes,
  117. fontsize=args.font_size,
  118. family="monospace",
  119. color="black" if args.background == "white" else "white",
  120. )
  121. axes[-1].set_axis_on()
  122. target_num_labels = 12
  123. num_months = (
  124. (end_date.year - start_date.year) * 12 + end_date.month - start_date.month
  125. )
  126. interval = int(numpy.ceil(num_months / target_num_labels))
  127. if interval >= 8:
  128. interval = int(numpy.ceil(num_months / (12 * target_num_labels)))
  129. axes[-1].xaxis.set_major_locator(
  130. matplotlib.dates.YearLocator(base=max(1, interval // 12))
  131. )
  132. axes[-1].xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y"))
  133. else:
  134. axes[-1].xaxis.set_major_locator(
  135. matplotlib.dates.MonthLocator(interval=interval)
  136. )
  137. axes[-1].xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y-%m"))
  138. for tick in axes[-1].xaxis.get_major_ticks():
  139. tick.label.set_fontsize(args.font_size)
  140. axes[-1].spines["left"].set_visible(False)
  141. axes[-1].spines["right"].set_visible(False)
  142. axes[-1].spines["top"].set_visible(False)
  143. axes[-1].get_yaxis().set_visible(False)
  144. axes[-1].set_facecolor((1.0,) * 3 + (0.0,))
  145. title = ("%s commits" % name) if not args.output else ""
  146. if args.mode == "all" and args.output:
  147. output = get_plot_path(args.output, "time_series")
  148. else:
  149. output = args.output
  150. deploy_plot(title, output, args.background)
  151. def order_commits(
  152. chosen_people: Set[str], days: Dict[int, Dict[int, DevDay]], people: List[str]
  153. ) -> Tuple[numpy.ndarray, defaultdict, defaultdict, List[int]]:
  154. from seriate import seriate
  155. try:
  156. from fastdtw import fastdtw
  157. except ImportError as e:
  158. print(
  159. "Cannot import fastdtw: %s\nInstall it from https://github.com/slaypni/fastdtw"
  160. % e
  161. )
  162. sys.exit(1)
  163. # FIXME(vmarkovtsev): remove once https://github.com/slaypni/fastdtw/pull/28 is merged&released
  164. try:
  165. sys.modules[
  166. "fastdtw.fastdtw"
  167. ].__norm = lambda p: lambda a, b: numpy.linalg.norm(
  168. numpy.atleast_1d(a) - numpy.atleast_1d(b), p
  169. )
  170. except KeyError:
  171. # the native extension does not have this bug
  172. pass
  173. devseries = defaultdict(list)
  174. devstats = defaultdict(lambda: DevDay(0, 0, 0, 0, {}))
  175. for day, devs in sorted(days.items()):
  176. for dev, stats in devs.items():
  177. if people[dev] in chosen_people:
  178. devseries[dev].append((day, stats.Commits))
  179. devstats[dev] = devstats[dev].add(stats)
  180. print("Calculating the distance matrix")
  181. # max-normalize the time series using a sliding window
  182. series = list(devseries.values())
  183. for i, s in enumerate(series):
  184. arr = numpy.array(s).transpose().astype(numpy.float32)
  185. arr[1] /= arr[1].sum()
  186. series[i] = arr.transpose()
  187. # calculate the distance matrix using dynamic time warping
  188. dists = numpy.full((len(series),) * 2, -100500, dtype=numpy.float32)
  189. # TODO: what's the total for this progress bar?
  190. with tqdm.tqdm() as pb:
  191. for x, serx in enumerate(series):
  192. dists[x, x] = 0
  193. for y, sery in enumerate(series[x + 1 :], start=x + 1):
  194. min_day = int(min(serx[0][0], sery[0][0]))
  195. max_day = int(max(serx[-1][0], sery[-1][0]))
  196. arrx = numpy.zeros(max_day - min_day + 1, dtype=numpy.float32)
  197. arry = numpy.zeros_like(arrx)
  198. arrx[serx[:, 0].astype(int) - min_day] = serx[:, 1]
  199. arry[sery[:, 0].astype(int) - min_day] = sery[:, 1]
  200. # L1 norm
  201. dist, _ = fastdtw(arrx, arry, radius=5, dist=1)
  202. dists[x, y] = dists[y, x] = dist
  203. pb.update()
  204. print("Ordering the series")
  205. route = seriate(dists)
  206. return dists, devseries, devstats, route
  207. def hdbscan_cluster_routed_series(
  208. dists: numpy.ndarray, route: List[int]
  209. ) -> numpy.ndarray:
  210. try:
  211. from hdbscan import HDBSCAN
  212. except ImportError as e:
  213. print("Cannot import hdbscan: %s" % e)
  214. sys.exit(1)
  215. opt_dist_chain = numpy.cumsum(
  216. numpy.array(
  217. [0] + [dists[route[i], route[i + 1]] for i in range(len(route) - 1)]
  218. )
  219. )
  220. if len(route) < 2:
  221. clusters = numpy.zeros(len(route), dtype=int)
  222. else:
  223. clusters = HDBSCAN(min_cluster_size=2).fit_predict(opt_dist_chain[:, numpy.newaxis])
  224. return clusters
  225. def show_devs_efforts(
  226. args: Namespace,
  227. name: str,
  228. start_date: int,
  229. end_date: int,
  230. people: List[str],
  231. days: Dict[int, Dict[int, DevDay]],
  232. max_people: int,
  233. ) -> None:
  234. from scipy.signal import convolve, slepian
  235. start_date = datetime.fromtimestamp(start_date)
  236. start_date = datetime(start_date.year, start_date.month, start_date.day)
  237. end_date = datetime.fromtimestamp(end_date)
  238. end_date = datetime(end_date.year, end_date.month, end_date.day)
  239. efforts_by_dev = defaultdict(int)
  240. for day, devs in days.items():
  241. for dev, stats in devs.items():
  242. efforts_by_dev[dev] += stats.Added + stats.Removed + stats.Changed
  243. if len(efforts_by_dev) > max_people:
  244. chosen = {
  245. v
  246. for k, v in sorted(
  247. ((v, k) for k, v in efforts_by_dev.items()), reverse=True
  248. )[:max_people]
  249. }
  250. print("Warning: truncated people to the most active %d" % max_people)
  251. else:
  252. chosen = set(efforts_by_dev)
  253. chosen_efforts = sorted(((efforts_by_dev[k], k) for k in chosen), reverse=True)
  254. chosen_order = {k: i for i, (_, k) in enumerate(chosen_efforts)}
  255. efforts = numpy.zeros(
  256. (len(chosen) + 1, (end_date - start_date).days + 1), dtype=numpy.float32
  257. )
  258. for day, devs in days.items():
  259. if day < efforts.shape[1]:
  260. for dev, stats in devs.items():
  261. dev = chosen_order.get(dev, len(chosen_order))
  262. efforts[dev][day] += stats.Added + stats.Removed + stats.Changed
  263. efforts_cum = numpy.cumsum(efforts, axis=1)
  264. window = slepian(10, 0.5)
  265. window /= window.sum()
  266. for e in (efforts, efforts_cum):
  267. for i in range(e.shape[0]):
  268. ending = e[i][-len(window) * 2 :].copy()
  269. e[i] = convolve(e[i], window, "same")
  270. e[i][-len(ending) :] = ending
  271. matplotlib, pyplot = import_pyplot(args.backend, args.style)
  272. plot_x = [start_date + timedelta(days=i) for i in range(efforts.shape[1])]
  273. people = [people[k] for _, k in chosen_efforts] + ["others"]
  274. for i, name in enumerate(people):
  275. if len(name) > 40:
  276. people[i] = name[:37] + "..."
  277. polys = pyplot.stackplot(plot_x, efforts_cum, labels=people)
  278. if len(polys) == max_people + 1:
  279. polys[-1].set_hatch("/")
  280. polys = pyplot.stackplot(plot_x, -efforts * efforts_cum.max() / efforts.max())
  281. if len(polys) == max_people + 1:
  282. polys[-1].set_hatch("/")
  283. yticks = []
  284. for tick in pyplot.gca().yaxis.iter_ticks():
  285. if tick[1] >= 0:
  286. yticks.append(tick[1])
  287. pyplot.gca().yaxis.set_ticks(yticks)
  288. legend = pyplot.legend(loc=2, ncol=2, fontsize=args.font_size)
  289. apply_plot_style(
  290. pyplot.gcf(),
  291. pyplot.gca(),
  292. legend,
  293. args.background,
  294. args.font_size,
  295. args.size or "16,10",
  296. )
  297. if args.mode == "all" and args.output:
  298. output = get_plot_path(args.output, "efforts")
  299. else:
  300. output = args.output
  301. deploy_plot("Efforts through time (changed lines of code)", output, args.background)