devs.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262
  1. from argparse import Namespace
  2. from collections import defaultdict
  3. from datetime import datetime, timedelta
  4. import sys
  5. from typing import Dict, List, Set, Tuple
  6. import numpy
  7. import tqdm
  8. from labours.objects import DevDay
  9. from labours.plotting import apply_plot_style, deploy_plot, get_plot_path, import_pyplot
  10. from labours.utils import _format_number
  11. def show_devs(
  12. args: Namespace,
  13. name: str,
  14. start_date: int,
  15. end_date: int,
  16. people: List[str],
  17. days: Dict[int, Dict[int, DevDay]],
  18. max_people: int = 50
  19. ) -> None:
  20. from scipy.signal import convolve, slepian
  21. if len(people) > max_people:
  22. print("Picking top %s developers by commit count" % max_people)
  23. # pick top N developers by commit count
  24. commits = defaultdict(int)
  25. for devs in days.values():
  26. for dev, stats in devs.items():
  27. commits[dev] += stats.Commits
  28. commits = sorted(((v, k) for k, v in commits.items()), reverse=True)
  29. chosen_people = {people[k] for _, k in commits[:max_people]}
  30. else:
  31. chosen_people = set(people)
  32. dists, devseries, devstats, route = order_commits(chosen_people, days, people)
  33. route_map = {v: i for i, v in enumerate(route)}
  34. # determine clusters
  35. clusters = hdbscan_cluster_routed_series(dists, route)
  36. keys = list(devseries.keys())
  37. route = [keys[node] for node in route]
  38. print("Plotting")
  39. # smooth time series
  40. start_date = datetime.fromtimestamp(start_date)
  41. start_date = datetime(start_date.year, start_date.month, start_date.day)
  42. end_date = datetime.fromtimestamp(end_date)
  43. end_date = datetime(end_date.year, end_date.month, end_date.day)
  44. size = (end_date - start_date).days + 1
  45. plot_x = [start_date + timedelta(days=i) for i in range(size)]
  46. resolution = 64
  47. window = slepian(size // resolution, 0.5)
  48. final = numpy.zeros((len(devseries), size), dtype=numpy.float32)
  49. for i, s in enumerate(devseries.values()):
  50. arr = numpy.array(s).transpose()
  51. full_history = numpy.zeros(size, dtype=numpy.float32)
  52. mask = arr[0] < size
  53. full_history[arr[0][mask]] = arr[1][mask]
  54. final[route_map[i]] = convolve(full_history, window, "same")
  55. matplotlib, pyplot = import_pyplot(args.backend, args.style)
  56. pyplot.rcParams["figure.figsize"] = (32, 16)
  57. pyplot.rcParams["font.size"] = args.font_size
  58. prop_cycle = pyplot.rcParams["axes.prop_cycle"]
  59. colors = prop_cycle.by_key()["color"]
  60. fig, axes = pyplot.subplots(final.shape[0], 1)
  61. backgrounds = ("#C4FFDB", "#FFD0CD") if args.background == "white" else ("#05401C", "#40110E")
  62. max_cluster = numpy.max(clusters)
  63. for ax, series, cluster, dev_i in zip(axes, final, clusters, route):
  64. if cluster >= 0:
  65. color = colors[cluster % len(colors)]
  66. i = 1
  67. while color == "#777777":
  68. color = colors[(max_cluster + i) % len(colors)]
  69. i += 1
  70. else:
  71. # outlier
  72. color = "#777777"
  73. ax.fill_between(plot_x, series, color=color)
  74. ax.set_axis_off()
  75. author = people[dev_i]
  76. ax.text(0.03, 0.5, author[:36] + (author[36:] and "..."),
  77. horizontalalignment="right", verticalalignment="center",
  78. transform=ax.transAxes, fontsize=args.font_size,
  79. color="black" if args.background == "white" else "white")
  80. ds = devstats[dev_i]
  81. stats = "%5d %8s %8s" % (ds[0], _format_number(ds[1] - ds[2]), _format_number(ds[3]))
  82. ax.text(0.97, 0.5, stats,
  83. horizontalalignment="left", verticalalignment="center",
  84. transform=ax.transAxes, fontsize=args.font_size, family="monospace",
  85. backgroundcolor=backgrounds[ds[1] <= ds[2]],
  86. color="black" if args.background == "white" else "white")
  87. axes[0].text(0.97, 1.75, " cmts delta changed",
  88. horizontalalignment="left", verticalalignment="center",
  89. transform=axes[0].transAxes, fontsize=args.font_size, family="monospace",
  90. color="black" if args.background == "white" else "white")
  91. axes[-1].set_axis_on()
  92. target_num_labels = 12
  93. num_months = (end_date.year - start_date.year) * 12 + end_date.month - start_date.month
  94. interval = int(numpy.ceil(num_months / target_num_labels))
  95. if interval >= 8:
  96. interval = int(numpy.ceil(num_months / (12 * target_num_labels)))
  97. axes[-1].xaxis.set_major_locator(matplotlib.dates.YearLocator(base=max(1, interval // 12)))
  98. axes[-1].xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y"))
  99. else:
  100. axes[-1].xaxis.set_major_locator(matplotlib.dates.MonthLocator(interval=interval))
  101. axes[-1].xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y-%m"))
  102. for tick in axes[-1].xaxis.get_major_ticks():
  103. tick.label.set_fontsize(args.font_size)
  104. axes[-1].spines["left"].set_visible(False)
  105. axes[-1].spines["right"].set_visible(False)
  106. axes[-1].spines["top"].set_visible(False)
  107. axes[-1].get_yaxis().set_visible(False)
  108. axes[-1].set_facecolor((1.0,) * 3 + (0.0,))
  109. title = ("%s commits" % name) if not args.output else ""
  110. if args.mode == "all" and args.output:
  111. output = get_plot_path(args.output, "time_series")
  112. else:
  113. output = args.output
  114. deploy_plot(title, output, args.background)
  115. def order_commits(
  116. chosen_people: Set[str],
  117. days: Dict[int, Dict[int, DevDay]],
  118. people: List[str]
  119. ) -> Tuple[numpy.ndarray, defaultdict, defaultdict, List[int]]:
  120. from seriate import seriate
  121. try:
  122. from fastdtw import fastdtw
  123. except ImportError as e:
  124. print("Cannot import fastdtw: %s\nInstall it from https://github.com/slaypni/fastdtw" % e)
  125. sys.exit(1)
  126. # FIXME(vmarkovtsev): remove once https://github.com/slaypni/fastdtw/pull/28 is merged&released
  127. try:
  128. sys.modules["fastdtw.fastdtw"].__norm = lambda p: lambda a, b: numpy.linalg.norm(
  129. numpy.atleast_1d(a) - numpy.atleast_1d(b), p)
  130. except KeyError:
  131. # the native extension does not have this bug
  132. pass
  133. devseries = defaultdict(list)
  134. devstats = defaultdict(lambda: DevDay(0, 0, 0, 0, {}))
  135. for day, devs in sorted(days.items()):
  136. for dev, stats in devs.items():
  137. if people[dev] in chosen_people:
  138. devseries[dev].append((day, stats.Commits))
  139. devstats[dev] = devstats[dev].add(stats)
  140. print("Calculating the distance matrix")
  141. # max-normalize the time series using a sliding window
  142. series = list(devseries.values())
  143. for i, s in enumerate(series):
  144. arr = numpy.array(s).transpose().astype(numpy.float32)
  145. arr[1] /= arr[1].sum()
  146. series[i] = arr.transpose()
  147. # calculate the distance matrix using dynamic time warping
  148. dists = numpy.full((len(series),) * 2, -100500, dtype=numpy.float32)
  149. # TODO: what's the total for this progress bar?
  150. with tqdm.tqdm() as pb:
  151. for x, serx in enumerate(series):
  152. dists[x, x] = 0
  153. for y, sery in enumerate(series[x + 1:], start=x + 1):
  154. min_day = int(min(serx[0][0], sery[0][0]))
  155. max_day = int(max(serx[-1][0], sery[-1][0]))
  156. arrx = numpy.zeros(max_day - min_day + 1, dtype=numpy.float32)
  157. arry = numpy.zeros_like(arrx)
  158. arrx[serx[:, 0].astype(int) - min_day] = serx[:, 1]
  159. arry[sery[:, 0].astype(int) - min_day] = sery[:, 1]
  160. # L1 norm
  161. dist, _ = fastdtw(arrx, arry, radius=5, dist=1)
  162. dists[x, y] = dists[y, x] = dist
  163. pb.update()
  164. print("Ordering the series")
  165. route = seriate(dists)
  166. return dists, devseries, devstats, route
  167. def hdbscan_cluster_routed_series(dists: numpy.ndarray, route: List[int]) -> numpy.ndarray:
  168. try:
  169. from hdbscan import HDBSCAN
  170. except ImportError as e:
  171. print("Cannot import hdbscan: %s" % e)
  172. sys.exit(1)
  173. opt_dist_chain = numpy.cumsum(numpy.array(
  174. [0] + [dists[route[i], route[i + 1]] for i in range(len(route) - 1)]))
  175. clusters = HDBSCAN(min_cluster_size=2).fit_predict(opt_dist_chain[:, numpy.newaxis])
  176. return clusters
  177. def show_devs_efforts(
  178. args: Namespace,
  179. name: str,
  180. start_date: int,
  181. end_date: int,
  182. people: List[str],
  183. days: Dict[int, Dict[int, DevDay]],
  184. max_people: int
  185. ) -> None:
  186. from scipy.signal import convolve, slepian
  187. start_date = datetime.fromtimestamp(start_date)
  188. start_date = datetime(start_date.year, start_date.month, start_date.day)
  189. end_date = datetime.fromtimestamp(end_date)
  190. end_date = datetime(end_date.year, end_date.month, end_date.day)
  191. efforts_by_dev = defaultdict(int)
  192. for day, devs in days.items():
  193. for dev, stats in devs.items():
  194. efforts_by_dev[dev] += stats.Added + stats.Removed + stats.Changed
  195. if len(efforts_by_dev) > max_people:
  196. chosen = {v for k, v in sorted(
  197. ((v, k) for k, v in efforts_by_dev.items()), reverse=True)[:max_people]}
  198. print("Warning: truncated people to the most active %d" % max_people)
  199. else:
  200. chosen = set(efforts_by_dev)
  201. chosen_efforts = sorted(((efforts_by_dev[k], k) for k in chosen), reverse=True)
  202. chosen_order = {k: i for i, (_, k) in enumerate(chosen_efforts)}
  203. efforts = numpy.zeros((len(chosen) + 1, (end_date - start_date).days + 1), dtype=numpy.float32)
  204. for day, devs in days.items():
  205. if day < efforts.shape[1]:
  206. for dev, stats in devs.items():
  207. dev = chosen_order.get(dev, len(chosen_order))
  208. efforts[dev][day] += stats.Added + stats.Removed + stats.Changed
  209. efforts_cum = numpy.cumsum(efforts, axis=1)
  210. window = slepian(10, 0.5)
  211. window /= window.sum()
  212. for e in (efforts, efforts_cum):
  213. for i in range(e.shape[0]):
  214. ending = e[i][-len(window) * 2:].copy()
  215. e[i] = convolve(e[i], window, "same")
  216. e[i][-len(ending):] = ending
  217. matplotlib, pyplot = import_pyplot(args.backend, args.style)
  218. plot_x = [start_date + timedelta(days=i) for i in range(efforts.shape[1])]
  219. people = [people[k] for _, k in chosen_efforts] + ["others"]
  220. for i, name in enumerate(people):
  221. if len(name) > 40:
  222. people[i] = name[:37] + "..."
  223. polys = pyplot.stackplot(plot_x, efforts_cum, labels=people)
  224. if len(polys) == max_people + 1:
  225. polys[-1].set_hatch("/")
  226. polys = pyplot.stackplot(plot_x, -efforts * efforts_cum.max() / efforts.max())
  227. if len(polys) == max_people + 1:
  228. polys[-1].set_hatch("/")
  229. yticks = []
  230. for tick in pyplot.gca().yaxis.iter_ticks():
  231. if tick[1] >= 0:
  232. yticks.append(tick[1])
  233. pyplot.gca().yaxis.set_ticks(yticks)
  234. legend = pyplot.legend(loc=2, ncol=2, fontsize=args.font_size)
  235. apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.background,
  236. args.font_size, args.size or "16,10")
  237. if args.mode == "all" and args.output:
  238. output = get_plot_path(args.output, "efforts")
  239. else:
  240. output = args.output
  241. deploy_plot("Efforts through time (changed lines of code)", output, args.background)