devs.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316
  1. from argparse import Namespace
  2. from collections import defaultdict
  3. from datetime import datetime, timedelta
  4. import sys
  5. from typing import Dict, List, Set, Tuple
  6. import numpy
  7. import tqdm
  8. from labours.objects import DevDay
  9. from labours.plotting import apply_plot_style, deploy_plot, get_plot_path, import_pyplot
  10. from labours.utils import _format_number
  11. def show_devs(
  12. args: Namespace,
  13. name: str,
  14. start_date: int,
  15. end_date: int,
  16. people: List[str],
  17. days: Dict[int, Dict[int, DevDay]],
  18. max_people: int = 50,
  19. ) -> None:
  20. from scipy.signal import convolve, slepian
  21. if len(people) > max_people:
  22. print("Picking top %s developers by commit count" % max_people)
  23. # pick top N developers by commit count
  24. commits = defaultdict(int)
  25. for devs in days.values():
  26. for dev, stats in devs.items():
  27. commits[dev] += stats.Commits
  28. commits = sorted(((v, k) for k, v in commits.items()), reverse=True)
  29. chosen_people = {people[k] for _, k in commits[:max_people]}
  30. else:
  31. chosen_people = set(people)
  32. dists, devseries, devstats, route = order_commits(chosen_people, days, people)
  33. route_map = {v: i for i, v in enumerate(route)}
  34. # determine clusters
  35. clusters = hdbscan_cluster_routed_series(dists, route)
  36. keys = list(devseries.keys())
  37. route = [keys[node] for node in route]
  38. print("Plotting")
  39. # smooth time series
  40. start_date = datetime.fromtimestamp(start_date)
  41. start_date = datetime(start_date.year, start_date.month, start_date.day)
  42. end_date = datetime.fromtimestamp(end_date)
  43. end_date = datetime(end_date.year, end_date.month, end_date.day)
  44. size = (end_date - start_date).days + 1
  45. plot_x = [start_date + timedelta(days=i) for i in range(size)]
  46. resolution = 64
  47. window = slepian(size // resolution, 0.5)
  48. final = numpy.zeros((len(devseries), size), dtype=numpy.float32)
  49. for i, s in enumerate(devseries.values()):
  50. arr = numpy.array(s).transpose()
  51. full_history = numpy.zeros(size, dtype=numpy.float32)
  52. mask = arr[0] < size
  53. full_history[arr[0][mask]] = arr[1][mask]
  54. final[route_map[i]] = convolve(full_history, window, "same")
  55. matplotlib, pyplot = import_pyplot(args.backend, args.style)
  56. pyplot.rcParams["figure.figsize"] = (32, 16)
  57. pyplot.rcParams["font.size"] = args.font_size
  58. prop_cycle = pyplot.rcParams["axes.prop_cycle"]
  59. colors = prop_cycle.by_key()["color"]
  60. fig, axes = pyplot.subplots(final.shape[0], 1)
  61. backgrounds = (
  62. ("#C4FFDB", "#FFD0CD") if args.background == "white" else ("#05401C", "#40110E")
  63. )
  64. max_cluster = numpy.max(clusters)
  65. for ax, series, cluster, dev_i in zip(axes, final, clusters, route):
  66. if cluster >= 0:
  67. color = colors[cluster % len(colors)]
  68. i = 1
  69. while color == "#777777":
  70. color = colors[(max_cluster + i) % len(colors)]
  71. i += 1
  72. else:
  73. # outlier
  74. color = "#777777"
  75. ax.fill_between(plot_x, series, color=color)
  76. ax.set_axis_off()
  77. author = people[dev_i]
  78. ax.text(
  79. 0.03,
  80. 0.5,
  81. author[:36] + (author[36:] and "..."),
  82. horizontalalignment="right",
  83. verticalalignment="center",
  84. transform=ax.transAxes,
  85. fontsize=args.font_size,
  86. color="black" if args.background == "white" else "white",
  87. )
  88. ds = devstats[dev_i]
  89. stats = "%5d %8s %8s" % (
  90. ds[0],
  91. _format_number(ds[1] - ds[2]),
  92. _format_number(ds[3]),
  93. )
  94. ax.text(
  95. 0.97,
  96. 0.5,
  97. stats,
  98. horizontalalignment="left",
  99. verticalalignment="center",
  100. transform=ax.transAxes,
  101. fontsize=args.font_size,
  102. family="monospace",
  103. backgroundcolor=backgrounds[ds[1] <= ds[2]],
  104. color="black" if args.background == "white" else "white",
  105. )
  106. axes[0].text(
  107. 0.97,
  108. 1.75,
  109. " cmts delta changed",
  110. horizontalalignment="left",
  111. verticalalignment="center",
  112. transform=axes[0].transAxes,
  113. fontsize=args.font_size,
  114. family="monospace",
  115. color="black" if args.background == "white" else "white",
  116. )
  117. axes[-1].set_axis_on()
  118. target_num_labels = 12
  119. num_months = (
  120. (end_date.year - start_date.year) * 12 + end_date.month - start_date.month
  121. )
  122. interval = int(numpy.ceil(num_months / target_num_labels))
  123. if interval >= 8:
  124. interval = int(numpy.ceil(num_months / (12 * target_num_labels)))
  125. axes[-1].xaxis.set_major_locator(
  126. matplotlib.dates.YearLocator(base=max(1, interval // 12))
  127. )
  128. axes[-1].xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y"))
  129. else:
  130. axes[-1].xaxis.set_major_locator(
  131. matplotlib.dates.MonthLocator(interval=interval)
  132. )
  133. axes[-1].xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y-%m"))
  134. for tick in axes[-1].xaxis.get_major_ticks():
  135. tick.label.set_fontsize(args.font_size)
  136. axes[-1].spines["left"].set_visible(False)
  137. axes[-1].spines["right"].set_visible(False)
  138. axes[-1].spines["top"].set_visible(False)
  139. axes[-1].get_yaxis().set_visible(False)
  140. axes[-1].set_facecolor((1.0,) * 3 + (0.0,))
  141. title = ("%s commits" % name) if not args.output else ""
  142. if args.mode == "all" and args.output:
  143. output = get_plot_path(args.output, "time_series")
  144. else:
  145. output = args.output
  146. deploy_plot(title, output, args.background)
  147. def order_commits(
  148. chosen_people: Set[str], days: Dict[int, Dict[int, DevDay]], people: List[str]
  149. ) -> Tuple[numpy.ndarray, defaultdict, defaultdict, List[int]]:
  150. from seriate import seriate
  151. try:
  152. from fastdtw import fastdtw
  153. except ImportError as e:
  154. print(
  155. "Cannot import fastdtw: %s\nInstall it from https://github.com/slaypni/fastdtw"
  156. % e
  157. )
  158. sys.exit(1)
  159. # FIXME(vmarkovtsev): remove once https://github.com/slaypni/fastdtw/pull/28 is merged&released
  160. try:
  161. sys.modules[
  162. "fastdtw.fastdtw"
  163. ].__norm = lambda p: lambda a, b: numpy.linalg.norm(
  164. numpy.atleast_1d(a) - numpy.atleast_1d(b), p
  165. )
  166. except KeyError:
  167. # the native extension does not have this bug
  168. pass
  169. devseries = defaultdict(list)
  170. devstats = defaultdict(lambda: DevDay(0, 0, 0, 0, {}))
  171. for day, devs in sorted(days.items()):
  172. for dev, stats in devs.items():
  173. if people[dev] in chosen_people:
  174. devseries[dev].append((day, stats.Commits))
  175. devstats[dev] = devstats[dev].add(stats)
  176. print("Calculating the distance matrix")
  177. # max-normalize the time series using a sliding window
  178. series = list(devseries.values())
  179. for i, s in enumerate(series):
  180. arr = numpy.array(s).transpose().astype(numpy.float32)
  181. arr[1] /= arr[1].sum()
  182. series[i] = arr.transpose()
  183. # calculate the distance matrix using dynamic time warping
  184. dists = numpy.full((len(series),) * 2, -100500, dtype=numpy.float32)
  185. # TODO: what's the total for this progress bar?
  186. with tqdm.tqdm() as pb:
  187. for x, serx in enumerate(series):
  188. dists[x, x] = 0
  189. for y, sery in enumerate(series[x + 1 :], start=x + 1):
  190. min_day = int(min(serx[0][0], sery[0][0]))
  191. max_day = int(max(serx[-1][0], sery[-1][0]))
  192. arrx = numpy.zeros(max_day - min_day + 1, dtype=numpy.float32)
  193. arry = numpy.zeros_like(arrx)
  194. arrx[serx[:, 0].astype(int) - min_day] = serx[:, 1]
  195. arry[sery[:, 0].astype(int) - min_day] = sery[:, 1]
  196. # L1 norm
  197. dist, _ = fastdtw(arrx, arry, radius=5, dist=1)
  198. dists[x, y] = dists[y, x] = dist
  199. pb.update()
  200. print("Ordering the series")
  201. route = seriate(dists)
  202. return dists, devseries, devstats, route
  203. def hdbscan_cluster_routed_series(
  204. dists: numpy.ndarray, route: List[int]
  205. ) -> numpy.ndarray:
  206. try:
  207. from hdbscan import HDBSCAN
  208. except ImportError as e:
  209. print("Cannot import hdbscan: %s" % e)
  210. sys.exit(1)
  211. opt_dist_chain = numpy.cumsum(
  212. numpy.array(
  213. [0] + [dists[route[i], route[i + 1]] for i in range(len(route) - 1)]
  214. )
  215. )
  216. clusters = HDBSCAN(min_cluster_size=2).fit_predict(opt_dist_chain[:, numpy.newaxis])
  217. return clusters
  218. def show_devs_efforts(
  219. args: Namespace,
  220. name: str,
  221. start_date: int,
  222. end_date: int,
  223. people: List[str],
  224. days: Dict[int, Dict[int, DevDay]],
  225. max_people: int,
  226. ) -> None:
  227. from scipy.signal import convolve, slepian
  228. start_date = datetime.fromtimestamp(start_date)
  229. start_date = datetime(start_date.year, start_date.month, start_date.day)
  230. end_date = datetime.fromtimestamp(end_date)
  231. end_date = datetime(end_date.year, end_date.month, end_date.day)
  232. efforts_by_dev = defaultdict(int)
  233. for day, devs in days.items():
  234. for dev, stats in devs.items():
  235. efforts_by_dev[dev] += stats.Added + stats.Removed + stats.Changed
  236. if len(efforts_by_dev) > max_people:
  237. chosen = {
  238. v
  239. for k, v in sorted(
  240. ((v, k) for k, v in efforts_by_dev.items()), reverse=True
  241. )[:max_people]
  242. }
  243. print("Warning: truncated people to the most active %d" % max_people)
  244. else:
  245. chosen = set(efforts_by_dev)
  246. chosen_efforts = sorted(((efforts_by_dev[k], k) for k in chosen), reverse=True)
  247. chosen_order = {k: i for i, (_, k) in enumerate(chosen_efforts)}
  248. efforts = numpy.zeros(
  249. (len(chosen) + 1, (end_date - start_date).days + 1), dtype=numpy.float32
  250. )
  251. for day, devs in days.items():
  252. if day < efforts.shape[1]:
  253. for dev, stats in devs.items():
  254. dev = chosen_order.get(dev, len(chosen_order))
  255. efforts[dev][day] += stats.Added + stats.Removed + stats.Changed
  256. efforts_cum = numpy.cumsum(efforts, axis=1)
  257. window = slepian(10, 0.5)
  258. window /= window.sum()
  259. for e in (efforts, efforts_cum):
  260. for i in range(e.shape[0]):
  261. ending = e[i][-len(window) * 2 :].copy()
  262. e[i] = convolve(e[i], window, "same")
  263. e[i][-len(ending) :] = ending
  264. matplotlib, pyplot = import_pyplot(args.backend, args.style)
  265. plot_x = [start_date + timedelta(days=i) for i in range(efforts.shape[1])]
  266. people = [people[k] for _, k in chosen_efforts] + ["others"]
  267. for i, name in enumerate(people):
  268. if len(name) > 40:
  269. people[i] = name[:37] + "..."
  270. polys = pyplot.stackplot(plot_x, efforts_cum, labels=people)
  271. if len(polys) == max_people + 1:
  272. polys[-1].set_hatch("/")
  273. polys = pyplot.stackplot(plot_x, -efforts * efforts_cum.max() / efforts.max())
  274. if len(polys) == max_people + 1:
  275. polys[-1].set_hatch("/")
  276. yticks = []
  277. for tick in pyplot.gca().yaxis.iter_ticks():
  278. if tick[1] >= 0:
  279. yticks.append(tick[1])
  280. pyplot.gca().yaxis.set_ticks(yticks)
  281. legend = pyplot.legend(loc=2, ncol=2, fontsize=args.font_size)
  282. apply_plot_style(
  283. pyplot.gcf(),
  284. pyplot.gca(),
  285. legend,
  286. args.background,
  287. args.font_size,
  288. args.size or "16,10",
  289. )
  290. if args.mode == "all" and args.output:
  291. output = get_plot_path(args.output, "efforts")
  292. else:
  293. output = args.output
  294. deploy_plot("Efforts through time (changed lines of code)", output, args.background)