labours.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417
  1. import argparse
  2. from datetime import datetime, timedelta
  3. import io
  4. import os
  5. import sys
  6. import warnings
  7. try:
  8. from clint.textui import progress
  9. except ImportError:
  10. print("Warning: clint is not installed, no fancy progressbars in the terminal for you.")
  11. progress = None
  12. import numpy
  13. import yaml
# Python 2 compatibility shim: when the interpreter's major version is below 3,
# rebind the name `input` to `raw_input`.
# NOTE(review): `input` is not called anywhere in the visible part of this file.
if sys.version_info[0] < 3:
    # OK, ancients, I will support Python 2, but you owe me a beer
    input = raw_input
  17. def parse_args():
  18. parser = argparse.ArgumentParser()
  19. parser.add_argument("-o", "--output", default="",
  20. help="Path to the output file/directory (empty for display).")
  21. parser.add_argument("-i", "--input", default="-",
  22. help="Path to the input file (- for stdin).")
  23. parser.add_argument("--text-size", default=12, type=int,
  24. help="Size of the labels and legend.")
  25. parser.add_argument("--backend", help="Matplotlib backend to use.")
  26. parser.add_argument("--style", choices=["black", "white"], default="black",
  27. help="Plot's general color scheme.")
  28. parser.add_argument("--relative", action="store_true",
  29. help="Occupy 100%% height for every measurement.")
  30. parser.add_argument("-m", "--mode",
  31. choices=["project", "file", "person", "matrix", "people", "all"],
  32. default="project", help="What to plot.")
  33. parser.add_argument(
  34. "--resample", default="year",
  35. help="The way to resample the time series. Possible values are: "
  36. "\"month\", \"year\", \"no\", \"raw\" and pandas offset aliases ("
  37. "http://pandas.pydata.org/pandas-docs/stable/timeseries.html"
  38. "#offset-aliases).")
  39. args = parser.parse_args()
  40. return args
  41. def read_input(args):
  42. if args.input != "-":
  43. with open(args.input) as fin:
  44. data = yaml.load(fin)
  45. else:
  46. data = yaml.load(sys.stdin)
  47. return data["burndown"], data["project"], data.get("files"), data.get("people_sequence"), \
  48. data.get("people"), data.get("people_interaction")
  49. def calculate_average_lifetime(matrix):
  50. lifetimes = numpy.zeros(matrix.shape[1] - 1)
  51. for band in matrix:
  52. start = 0
  53. for i, line in enumerate(band):
  54. if i == 0 or band[i - 1] == 0:
  55. start += 1
  56. continue
  57. lifetimes[i - start] = band[i - 1] - line
  58. lifetimes[i - start] = band[i - 1]
  59. return (lifetimes.dot(numpy.arange(1, matrix.shape[1], 1))
  60. / (lifetimes.sum() * matrix.shape[1]))
def load_main(header, name, matrix, resample):
    """Parse one textual burndown matrix and prepare it for plotting.

    Args:
        header: mapping providing "begin", "end", "granularity" and
            "sampling". All four are passed through ``int()``; "begin" and
            "end" are then interpreted as UNIX timestamps (local time, via
            ``datetime.fromtimestamp``).
        name: label of the matrix; printed next to the lifetime index and
            returned unchanged.
        matrix: text with one row per line and space separated integers.
            The parsed array is transposed before any further use.
        resample: "no" or "raw" keep the parsed matrix as is; "year" and
            "month" are translated to the pandas aliases "A" and "M"; any
            other value is handed to ``pandas.date_range`` as ``freq``.

    Returns:
        ``(name, matrix, date_range_sampling, labels, granularity, sampling,
        resample)`` where ``matrix`` is either the transposed input or the
        resampled float32 matrix, ``date_range_sampling`` is the x axis and
        ``labels`` has one entry per matrix row.
    """
    import pandas
    start = header["begin"]
    last = header["end"]
    granularity = header["granularity"]
    sampling = header["sampling"]
    start = datetime.fromtimestamp(int(start))
    last = datetime.fromtimestamp(int(last))
    granularity = int(granularity)
    sampling = int(sampling)
    # presumably rows of the transposed matrix are age bands and columns are
    # samples in time - TODO confirm against the producer of the data
    matrix = numpy.array([numpy.fromstring(line, dtype=int, sep=" ")
                          for line in matrix.split("\n")]).T
    print(name, "lifetime index:", calculate_average_lifetime(matrix))
    # End of the time axis: one `sampling`-day step per matrix column.
    finish = start + timedelta(days=matrix.shape[1] * sampling)
    if resample not in ("no", "raw"):
        # Interpolate the day x day matrix.
        # Each day brings equal weight in the granularity.
        # Sampling's interpolation is linear.
        daily_matrix = numpy.zeros(
            (matrix.shape[0] * granularity, matrix.shape[1] * sampling),
            dtype=numpy.float32)
        # Fractions 0, 1/sampling, ... used for the linear interpolation
        # between the previous and the current sample.
        epsrange = numpy.arange(0, 1, 1.0 / sampling)
        for y in range(matrix.shape[0]):
            for x in range(matrix.shape[1]):
                previous = matrix[y, x - 1] if x > 0 else 0
                value = ((previous + (matrix[y, x] - previous) * epsrange)
                         / granularity)[numpy.newaxis, :]
                if (y + 1) * granularity <= x * sampling:
                    # The whole block of rows ends no later than this
                    # column block starts: fill it with interpolated values.
                    daily_matrix[y * granularity:(y + 1) * granularity,
                                 x * sampling:(x + 1) * sampling] = value
                elif y * granularity <= (x + 1) * sampling:
                    # Row block and column block overlap: each daily row is
                    # filled only from its own day index onwards, with the
                    # non-interpolated value.
                    for suby in range(y * granularity, (y + 1) * granularity):
                        for subx in range(suby, (x + 1) * sampling):
                            daily_matrix[suby, subx] = matrix[
                                y, x] / granularity
        # Rows at or beyond the number of days between "begin" and "end"
        # are cleared.
        daily_matrix[(last - start).days:] = 0
        # Resample the bands
        aliases = {
            "year": "A",
            "month": "M"
        }
        resample = aliases.get(resample, resample)
        # Grow the period count one at a time until the last generated
        # period boundary reaches `finish`.
        periods = 0
        date_granularity_sampling = [start]
        while date_granularity_sampling[-1] < finish:
            periods += 1
            date_granularity_sampling = pandas.date_range(
                start, periods=periods, freq=resample)
        # Daily x axis from the first period boundary up to `finish`.
        date_range_sampling = pandas.date_range(
            date_granularity_sampling[0],
            periods=(finish - date_granularity_sampling[0]).days,
            freq="1D")
        # Fill the new square matrix
        matrix = numpy.zeros(
            (len(date_granularity_sampling), len(date_range_sampling)),
            dtype=numpy.float32)
        for i, gdt in enumerate(date_granularity_sampling):
            # Daily rows [istart, ifinish) are summed into output row i.
            istart = (date_granularity_sampling[i - 1] - start).days \
                if i > 0 else 0
            ifinish = (gdt - start).days
            # Find the first x position whose day offset is >= istart;
            # `j` and `sdt` are intentionally used after the loop.
            for j, sdt in enumerate(date_range_sampling):
                if (sdt - start).days >= istart:
                    break
            matrix[i, j:] = \
                daily_matrix[istart:ifinish, (sdt - start).days:].sum(axis=0)
        # Hardcode some cases to improve the labels' readability
        if resample in ("year", "A"):
            labels = [dt.year for dt in date_granularity_sampling]
        elif resample in ("month", "M"):
            labels = [dt.strftime("%Y %B") for dt in date_granularity_sampling]
        else:
            labels = [dt.date() for dt in date_granularity_sampling]
    else:
        # No resampling: one "from - to" label per matrix row, each row
        # covering `granularity` days.
        labels = [
            "%s - %s" % ((start + timedelta(days=i * granularity)).date(),
                         (
                             start + timedelta(days=(i + 1) * granularity)).date())
            for i in range(matrix.shape[0])]
        if len(labels) > 18:
            warnings.warn("Too many labels - consider resampling.")
        resample = "M"  # fake resampling type is checked while plotting
        # One x position per column, spaced `sampling` days apart and
        # starting `sampling` days after "begin".
        date_range_sampling = pandas.date_range(
            start + timedelta(days=sampling), periods=matrix.shape[1],
            freq="%dD" % sampling)
    return name, matrix, date_range_sampling, labels, granularity, sampling, resample
  146. def load_matrix(contents):
  147. matrix = numpy.array([numpy.fromstring(line, dtype=int, sep=" ")
  148. for line in contents.split("\n")])
  149. return matrix
  150. def load_people(header, sequence, contents):
  151. import pandas
  152. start = header["begin"]
  153. last = header["end"]
  154. sampling = header["sampling"]
  155. start = datetime.fromtimestamp(int(start))
  156. last = datetime.fromtimestamp(int(last))
  157. sampling = int(sampling)
  158. people = []
  159. for name in sequence:
  160. people.append(numpy.array([numpy.fromstring(line, dtype=int, sep=" ")
  161. for line in contents[name].split("\n")]).sum(axis=1))
  162. people = numpy.array(people)
  163. date_range_sampling = pandas.date_range(
  164. start + timedelta(days=sampling), periods=people[0].shape[0],
  165. freq="%dD" % sampling)
  166. return sequence, people, date_range_sampling, last
  167. def apply_plot_style(figure, axes, legend, style, text_size):
  168. figure.set_size_inches(12, 9)
  169. for side in ("bottom", "top", "left", "right"):
  170. axes.spines[side].set_color(style)
  171. for axis in (axes.xaxis, axes.yaxis):
  172. axis.label.update(dict(fontsize=text_size, color=style))
  173. for axis in ("x", "y"):
  174. axes.tick_params(axis=axis, colors=style, labelsize=text_size)
  175. if legend is not None:
  176. frame = legend.get_frame()
  177. for setter in (frame.set_facecolor, frame.set_edgecolor):
  178. setter("black" if style == "white" else "white")
  179. for text in legend.get_texts():
  180. text.set_color(style)
  181. def get_plot_path(base, name):
  182. root, ext = os.path.splitext(base)
  183. if not ext:
  184. ext = ".png"
  185. output = os.path.join(root, name + ext)
  186. os.makedirs(os.path.dirname(output), exist_ok=True)
  187. return output
  188. def deploy_plot(title, output, style):
  189. import matplotlib.pyplot as pyplot
  190. if not output:
  191. pyplot.gcf().canvas.set_window_title(title)
  192. pyplot.show()
  193. else:
  194. if title:
  195. pyplot.title(title, color=style)
  196. pyplot.tight_layout()
  197. pyplot.savefig(output, transparent=True)
  198. pyplot.clf()
  199. def plot_burndown(args, target, name, matrix, date_range_sampling, labels, granularity,
  200. sampling, resample):
  201. import matplotlib
  202. if args.backend:
  203. matplotlib.use(args.backend)
  204. import matplotlib.pyplot as pyplot
  205. pyplot.stackplot(date_range_sampling, matrix, labels=labels)
  206. if args.relative:
  207. for i in range(matrix.shape[1]):
  208. matrix[:, i] /= matrix[:, i].sum()
  209. pyplot.ylim(0, 1)
  210. legend_loc = 3
  211. else:
  212. legend_loc = 2
  213. legend = pyplot.legend(loc=legend_loc, fontsize=args.text_size)
  214. pyplot.ylabel("Lines of code")
  215. pyplot.xlabel("Time")
  216. apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.style, args.text_size)
  217. pyplot.xlim(date_range_sampling[0], date_range_sampling[-1])
  218. locator = pyplot.gca().xaxis.get_major_locator()
  219. # set the optimal xticks locator
  220. if "M" not in resample:
  221. pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
  222. locs = pyplot.gca().get_xticks().tolist()
  223. if len(locs) >= 16:
  224. pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
  225. locs = pyplot.gca().get_xticks().tolist()
  226. if len(locs) >= 16:
  227. pyplot.gca().xaxis.set_major_locator(locator)
  228. if locs[0] < pyplot.xlim()[0]:
  229. del locs[0]
  230. endindex = -1
  231. if len(locs) >= 2 and \
  232. pyplot.xlim()[1] - locs[-1] > (locs[-1] - locs[-2]) / 2:
  233. locs.append(pyplot.xlim()[1])
  234. endindex = len(locs) - 1
  235. startindex = -1
  236. if len(locs) >= 2 and \
  237. locs[0] - pyplot.xlim()[0] > (locs[1] - locs[0]) / 2:
  238. locs.append(pyplot.xlim()[0])
  239. startindex = len(locs) - 1
  240. pyplot.gca().set_xticks(locs)
  241. # hacking time!
  242. labels = pyplot.gca().get_xticklabels()
  243. if startindex >= 0:
  244. labels[startindex].set_text(date_range_sampling[0].date())
  245. labels[startindex].set_text = lambda _: None
  246. labels[startindex].set_rotation(30)
  247. labels[startindex].set_ha("right")
  248. if endindex >= 0:
  249. labels[endindex].set_text(date_range_sampling[-1].date())
  250. labels[endindex].set_text = lambda _: None
  251. labels[endindex].set_rotation(30)
  252. labels[endindex].set_ha("right")
  253. title = "%s %d x %d (granularity %d, sampling %d)" % \
  254. ((name,) + matrix.shape + (granularity, sampling))
  255. output = args.output
  256. if output:
  257. if args.mode == "project" and target == "project":
  258. output = args.output
  259. else:
  260. if target == "project":
  261. name = "project"
  262. output = get_plot_path(args.output, name)
  263. deploy_plot(title, output, args.style)
  264. def plot_many(args, target, header, parts):
  265. if not args.output:
  266. print("Warning: output not set, showing %d plots." % len(parts))
  267. itercnt = progress.bar(parts.items(), expected_size=len(parts)) \
  268. if progress is not None else parts.items()
  269. stdout = io.StringIO()
  270. for name, matrix in itercnt:
  271. backup = sys.stdout
  272. sys.stdout = stdout
  273. plot_burndown(args, target, *load_main(header, name, matrix, args.resample))
  274. sys.stdout = backup
  275. sys.stdout.write(stdout.getvalue())
  276. def plot_matrix(args, repo, people, matrix):
  277. matrix = matrix.astype(float)
  278. zeros = matrix[:, 0] == 0
  279. matrix[zeros, :] = 1
  280. matrix /= matrix[:, 0][:, None]
  281. matrix = -matrix[:, 1:]
  282. matrix[zeros, :] = 0
  283. import matplotlib
  284. if args.backend:
  285. matplotlib.use(args.backend)
  286. import matplotlib.pyplot as pyplot
  287. s = 4 + matrix.shape[1] * 0.3
  288. fig = pyplot.figure(figsize=(s, s))
  289. ax = fig.add_subplot(111)
  290. ax.xaxis.set_label_position("top")
  291. ax.matshow(matrix, cmap=pyplot.cm.OrRd)
  292. ax.set_xticks(numpy.arange(0, matrix.shape[1]))
  293. ax.set_yticks(numpy.arange(0, matrix.shape[0]))
  294. ax.set_xticklabels(["Unidentified"] + people, rotation=90, ha="center")
  295. ax.set_yticklabels(people, va="center")
  296. ax.set_xticks(numpy.arange(0.5, matrix.shape[1] + 0.5), minor=True)
  297. ax.set_yticks(numpy.arange(0.5, matrix.shape[0] + 0.5), minor=True)
  298. ax.grid(which="minor")
  299. apply_plot_style(fig, ax, None, args.style, args.text_size)
  300. if not args.output:
  301. pos1 = ax.get_position()
  302. pos2 = (pos1.x0 + 0.245, pos1.y0 - 0.1, pos1.width * 0.9, pos1.height * 0.9)
  303. ax.set_position(pos2)
  304. if args.mode == "all":
  305. output = get_plot_path(args.output, "matrix")
  306. else:
  307. output = args.output
  308. title = "%s %d developers overwrite" % (repo, matrix.shape[0])
  309. if args.output:
  310. # FIXME(vmarkovtsev): otherwise the title is screwed in savefig()
  311. title = ""
  312. deploy_plot(title, output, args.style)
  313. def plot_people(args, repo, names, people, date_range, last):
  314. import matplotlib
  315. if args.backend:
  316. matplotlib.use(args.backend)
  317. import matplotlib.pyplot as pyplot
  318. pyplot.stackplot(date_range, people, labels=names)
  319. pyplot.xlim(date_range[0], last)
  320. if args.relative:
  321. for i in range(people.shape[1]):
  322. people[:, i] /= people[:, i].sum()
  323. pyplot.ylim(0, 1)
  324. legend_loc = 3
  325. else:
  326. legend_loc = 2
  327. legend = pyplot.legend(loc=legend_loc, fontsize=args.text_size)
  328. apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.style, args.text_size)
  329. if args.mode == "all":
  330. output = get_plot_path(args.output, "people")
  331. else:
  332. output = args.output
  333. deploy_plot("%s code ratio through time" % repo, output, args.style)
  334. def main():
  335. args = parse_args()
  336. header, main_contents, files_contents, people_sequence, people_contents, people_matrix = \
  337. read_input(args)
  338. name = next(iter(main_contents))
  339. files_warning = "Files stats were not collected. Re-run hercules with -files."
  340. people_warning = "People stats were not collected. Re-run hercules with -people."
  341. if args.mode == "project":
  342. plot_burndown(args, "project",
  343. *load_main(header, name, main_contents[name], args.resample))
  344. elif args.mode == "file":
  345. if not files_contents:
  346. print(files_warning)
  347. return
  348. plot_many(args, "file", header, files_contents)
  349. elif args.mode == "person":
  350. if not people_contents:
  351. print(people_warning)
  352. return
  353. plot_many(args, "person", header, people_contents)
  354. elif args.mode == "matrix":
  355. if not people_contents:
  356. print(people_warning)
  357. return
  358. plot_matrix(args, name, people_sequence, load_matrix(people_matrix))
  359. elif args.mode == "people":
  360. if not people_contents:
  361. print(people_warning)
  362. return
  363. plot_people(args, name, *load_people(header, people_sequence, people_contents))
  364. elif args.mode == "all":
  365. plot_burndown(args, "project",
  366. *load_main(header, name, main_contents[name], args.resample))
  367. if files_contents:
  368. plot_many(args, "file", header, files_contents)
  369. if people_contents:
  370. plot_many(args, "person", header, people_contents)
  371. plot_matrix(args, name, people_sequence, load_matrix(people_matrix))
  372. plot_people(args, name, *load_people(header, people_sequence, people_contents))
# Run the command line tool only when the file is executed as a script.
# main() returns None on every path, which sys.exit() turns into status 0.
if __name__ == "__main__":
    sys.exit(main())