# labours.py (13 KB)

import argparse
from datetime import datetime, timedelta
import io
import os
import sys
import warnings

# clint is an optional dependency: when it cannot be imported, `progress` is
# set to None and plot_many() iterates over its input without a progress bar.
try:
    from clint.textui import progress
except ImportError:
    print("Warning: clint is not installed, no fancy progressbars in the terminal for you.")
    progress = None
import numpy


# On Python 2 the name `input` is rebound to `raw_input`, so that
# read_input() gets the header line back as plain text on both major versions.
if sys.version_info[0] < 3:
    # OK, ancients, I will support Python 2, but you owe me a beer
    input = raw_input
  16. def parse_args():
  17. parser = argparse.ArgumentParser()
  18. parser.add_argument("-o", "--output", default="",
  19. help="Path to the output file/directory (empty for display).")
  20. parser.add_argument("-i", "--input", default="-",
  21. help="Path to the input file (- for stdin).")
  22. parser.add_argument("--text-size", default=12, type=int,
  23. help="Size of the labels and legend.")
  24. parser.add_argument("--backend", help="Matplotlib backend to use.")
  25. parser.add_argument("--style", choices=["black", "white"], default="black",
  26. help="Plot's general color scheme.")
  27. parser.add_argument("--relative", action="store_true",
  28. help="Occupy 100%% height for every measurement.")
  29. parser.add_argument("-m", "--mode", choices=["project", "file", "person", "matrix"],
  30. default="project", help="What to plot.")
  31. parser.add_argument(
  32. "--resample", default="year",
  33. help="The way to resample the time series. Possible values are: "
  34. "\"month\", \"year\", \"no\", \"raw\" and pandas offset aliases ("
  35. "http://pandas.pydata.org/pandas-docs/stable/timeseries.html"
  36. "#offset-aliases).")
  37. args = parser.parse_args()
  38. return args
  39. def read_input(args):
  40. main_contents = []
  41. files_contents = []
  42. people_contents = []
  43. if args.input != "-":
  44. with open(args.input) as fin:
  45. header = fin.readline()[:-1]
  46. contents = fin.readlines()
  47. else:
  48. header = input()
  49. contents = sys.stdin.readlines()
  50. for i, line in enumerate(contents):
  51. if line not in ("files\n", "people\n"):
  52. main_contents.append(line)
  53. else:
  54. break
  55. if i < len(contents) and contents[i] == "files\n":
  56. i += 1
  57. while i < len(contents) and contents[i] != "people\n":
  58. files_contents.append(contents[i:i + len(main_contents)])
  59. i += len(main_contents)
  60. if i < len(contents) and contents[i] == "people\n":
  61. i += 2
  62. while contents[i] != "\n":
  63. people_contents.append(contents[i:i + len(main_contents)])
  64. i += len(main_contents)
  65. people_contents.append(contents[i + 1:])
  66. return header, main_contents, files_contents, people_contents
  67. def calculate_average_lifetime(matrix):
  68. lifetimes = numpy.zeros(matrix.shape[1] - 1)
  69. for band in matrix:
  70. start = 0
  71. for i, line in enumerate(band):
  72. if i == 0 or band[i - 1] == 0:
  73. start += 1
  74. continue
  75. lifetimes[i - start] = band[i - 1] - line
  76. lifetimes[i - start] = band[i - 1]
  77. return (lifetimes.dot(numpy.arange(1, matrix.shape[1], 1))
  78. / (lifetimes.sum() * matrix.shape[1]))
def load_main(header, contents, resample):
    """Parse one burndown block and prepare everything needed to plot it.

    Args:
        header: string with four whitespace-separated integers: the start
            and last Unix timestamps, the granularity and the sampling
            (the latter two are used as numbers of days).
        contents: list of text lines; the first one is the name (its last
            character is dropped), every other line is a row of
            space-separated integers.
        resample: "no" or "raw" to keep the matrix as it was read;
            otherwise "year", "month" or any other frequency string that
            pandas.date_range() accepts.

    Returns:
        Tuple ``(name, matrix, date_range_sampling, labels, granularity,
        sampling, resample)``.  The returned ``resample`` is the pandas
        alias that was used, or "M" when no resampling was performed.

    Side effects:
        Prints the name together with calculate_average_lifetime(matrix).
    """
    import pandas
    start, last, granularity, sampling = header.split()
    start = datetime.fromtimestamp(int(start))
    last = datetime.fromtimestamp(int(last))
    granularity = int(granularity)
    sampling = int(sampling)
    name = contents[0][:-1]
    # The rows that were read become columns after the transpose.
    matrix = numpy.array([numpy.fromstring(line, dtype=int, sep=" ")
                          for line in contents[1:]]).T
    print(name, "lifetime index:", calculate_average_lifetime(matrix))
    # One column of the matrix spans `sampling` days.
    finish = start + timedelta(days=matrix.shape[1] * sampling)
    if resample not in ("no", "raw"):
        # Interpolate the day x day matrix.
        # Each day brings equal weight in the granularity.
        # Sampling's interpolation is linear.
        daily_matrix = numpy.zeros(
            (matrix.shape[0] * granularity, matrix.shape[1] * sampling),
            dtype=numpy.float32)
        epsrange = numpy.arange(0, 1, 1.0 / sampling)
        for y in range(matrix.shape[0]):
            for x in range(matrix.shape[1]):
                previous = matrix[y, x - 1] if x > 0 else 0
                # Linear ramp from the previous sample to the current one,
                # divided evenly between the days of the band.
                value = ((previous + (matrix[y, x] - previous) * epsrange)
                         / granularity)[numpy.newaxis, :]
                if (y + 1) * granularity <= x * sampling:
                    # The band ends no later than this sample starts: the
                    # whole daily block receives the interpolated ramp.
                    daily_matrix[y * granularity:(y + 1) * granularity,
                                 x * sampling:(x + 1) * sampling] = value
                elif y * granularity <= (x + 1) * sampling:
                    # The band starts no later than this sample ends: only
                    # the days from the diagonal (subx >= suby) onwards are
                    # filled, with a flat per-day share of the sample.
                    for suby in range(y * granularity, (y + 1) * granularity):
                        for subx in range(suby, (x + 1) * sampling):
                            daily_matrix[suby, subx] = matrix[
                                y, x] / granularity
        # Clear every daily row whose index is at least the number of days
        # between `start` and `last`.
        daily_matrix[(last - start).days:] = 0
        # Resample the bands
        aliases = {
            "year": "A",
            "month": "M"
        }
        resample = aliases.get(resample, resample)
        # Extend the range one period at a time until its last date is no
        # longer before `finish`.
        periods = 0
        date_granularity_sampling = [start]
        while date_granularity_sampling[-1] < finish:
            periods += 1
            date_granularity_sampling = pandas.date_range(
                start, periods=periods, freq=resample)
        # Daily time axis from the first resampled date up to `finish`.
        date_range_sampling = pandas.date_range(
            date_granularity_sampling[0],
            periods=(finish - date_granularity_sampling[0]).days,
            freq="1D")
        # Fill the new square matrix
        matrix = numpy.zeros(
            (len(date_granularity_sampling), len(date_range_sampling)),
            dtype=numpy.float32)
        for i, gdt in enumerate(date_granularity_sampling):
            # Daily rows [istart, ifinish) are summed into resampled band i.
            istart = (date_granularity_sampling[i - 1] - start).days \
                if i > 0 else 0
            ifinish = (gdt - start).days
            # Find the first daily column that is not before `istart`;
            # `j` and `sdt` keep their values after the loop.
            for j, sdt in enumerate(date_range_sampling):
                if (sdt - start).days >= istart:
                    break
            matrix[i, j:] = \
                daily_matrix[istart:ifinish, (sdt - start).days:].sum(axis=0)
        # Hardcode some cases to improve the labels' readability
        if resample in ("year", "A"):
            labels = [dt.year for dt in date_granularity_sampling]
        elif resample in ("month", "M"):
            labels = [dt.strftime("%Y %B") for dt in date_granularity_sampling]
        else:
            labels = [dt.date() for dt in date_granularity_sampling]
    else:
        # No resampling: one label per band, spelling out its date interval.
        labels = [
            "%s - %s" % ((start + timedelta(days=i * granularity)).date(),
                         (
                             start + timedelta(days=(i + 1) * granularity)).date())
            for i in range(matrix.shape[0])]
        if len(labels) > 18:
            warnings.warn("Too many labels - consider resampling.")
        resample = "M"  # fake resampling type is checked while plotting
        # One timestamp per column, `sampling` days apart.
        date_range_sampling = pandas.date_range(
            start + timedelta(days=sampling), periods=matrix.shape[1],
            freq="%dD" % sampling)
    return name, matrix, date_range_sampling, labels, granularity, sampling, resample
  162. def load_matrix(contents):
  163. size = len(contents) - 1
  164. people = []
  165. for i, block in enumerate(contents[:-1]):
  166. people.append(block[0].split(": ", 1)[1])
  167. matrix = numpy.array([[int(p) for p in l[:-1].split()]
  168. for l in contents[-1][-size - 1:] if l[:-1]],
  169. dtype=int)
  170. return matrix, people
  171. def plot_project(args, name, matrix, date_range_sampling, labels, granularity,
  172. sampling, resample):
  173. import matplotlib
  174. if args.backend:
  175. matplotlib.use(args.backend)
  176. import matplotlib.pyplot as pyplot
  177. if args.style == "white":
  178. pyplot.gca().spines["bottom"].set_color("white")
  179. pyplot.gca().spines["top"].set_color("white")
  180. pyplot.gca().spines["left"].set_color("white")
  181. pyplot.gca().spines["right"].set_color("white")
  182. pyplot.gca().xaxis.label.set_color("white")
  183. pyplot.gca().yaxis.label.set_color("white")
  184. pyplot.gca().tick_params(axis="x", colors="white")
  185. pyplot.gca().tick_params(axis="y", colors="white")
  186. if args.relative:
  187. for i in range(matrix.shape[1]):
  188. matrix[:, i] /= matrix[:, i].sum()
  189. pyplot.ylim(0, 1)
  190. legend_loc = 3
  191. else:
  192. legend_loc = 2
  193. pyplot.stackplot(date_range_sampling, matrix, labels=labels)
  194. legend = pyplot.legend(loc=legend_loc, fontsize=args.text_size)
  195. frame = legend.get_frame()
  196. frame.set_facecolor("black" if args.style == "white" else "white")
  197. frame.set_edgecolor("black" if args.style == "white" else "white")
  198. for text in legend.get_texts():
  199. text.set_color(args.style)
  200. pyplot.ylabel("Lines of code", fontsize=args.text_size)
  201. pyplot.xlabel("Time", fontsize=args.text_size)
  202. pyplot.tick_params(labelsize=args.text_size)
  203. pyplot.xlim(date_range_sampling[0], date_range_sampling[-1])
  204. pyplot.gcf().set_size_inches(12, 9)
  205. locator = pyplot.gca().xaxis.get_major_locator()
  206. # set the optimal xticks locator
  207. if "M" not in resample:
  208. pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
  209. locs = pyplot.gca().get_xticks().tolist()
  210. if len(locs) >= 16:
  211. pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
  212. locs = pyplot.gca().get_xticks().tolist()
  213. if len(locs) >= 16:
  214. pyplot.gca().xaxis.set_major_locator(locator)
  215. if locs[0] < pyplot.xlim()[0]:
  216. del locs[0]
  217. endindex = -1
  218. if len(locs) >= 2 and \
  219. pyplot.xlim()[1] - locs[-1] > (locs[-1] - locs[-2]) / 2:
  220. locs.append(pyplot.xlim()[1])
  221. endindex = len(locs) - 1
  222. startindex = -1
  223. if len(locs) >= 2 and \
  224. locs[0] - pyplot.xlim()[0] > (locs[1] - locs[0]) / 2:
  225. locs.append(pyplot.xlim()[0])
  226. startindex = len(locs) - 1
  227. pyplot.gca().set_xticks(locs)
  228. # hacking time!
  229. labels = pyplot.gca().get_xticklabels()
  230. if startindex >= 0:
  231. labels[startindex].set_text(date_range_sampling[0].date())
  232. labels[startindex].set_text = lambda _: None
  233. labels[startindex].set_rotation(30)
  234. labels[startindex].set_ha("right")
  235. if endindex >= 0:
  236. labels[endindex].set_text(date_range_sampling[-1].date())
  237. labels[endindex].set_text = lambda _: None
  238. labels[endindex].set_rotation(30)
  239. labels[endindex].set_ha("right")
  240. if not args.output:
  241. pyplot.gcf().canvas.set_window_title(
  242. "%s %d x %d (granularity %d, sampling %d)" %
  243. ((name,) + matrix.shape + (granularity, sampling)))
  244. pyplot.show()
  245. else:
  246. pyplot.tight_layout()
  247. if args.mode == "project":
  248. output = args.output
  249. else:
  250. root, ext = os.path.splitext(args.output)
  251. if not ext:
  252. ext = ".png"
  253. output = os.path.join(root, name + ext)
  254. os.makedirs(os.path.dirname(output), exist_ok=True)
  255. pyplot.savefig(output, transparent=True)
  256. pyplot.clf()
  257. def plot_many(args, header, parts):
  258. if not args.output:
  259. print("Warning: output not set, showing %d plots." % len(parts))
  260. itercnt = progress.bar(parts, expected_size=len(parts)) \
  261. if progress is not None else parts
  262. stdout = io.StringIO()
  263. for fc in itercnt:
  264. backup = sys.stdout
  265. sys.stdout = stdout
  266. plot_project(args, *load_main(header, fc, args.resample))
  267. sys.stdout = backup
  268. sys.stdout.write(stdout.getvalue())
  269. def plot_matrix(args, matrix, people):
  270. matrix = matrix.astype(float)
  271. zeros = matrix[:, 0] == 0
  272. matrix[zeros, :] = 1
  273. matrix /= matrix[:, 0][:, None]
  274. matrix = -matrix[:, 1:]
  275. matrix[zeros, :] = 0
  276. import matplotlib
  277. if args.backend:
  278. matplotlib.use(args.backend)
  279. import matplotlib.pyplot as pyplot
  280. s = 4 + matrix.shape[1] * 0.3
  281. fig = pyplot.figure(figsize=(s, s))
  282. ax = fig.add_subplot(111)
  283. ax.xaxis.set_label_position("top")
  284. ax.matshow(matrix, cmap=pyplot.cm.OrRd)
  285. ax.set_xticks(numpy.arange(0, matrix.shape[1]))
  286. ax.set_yticks(numpy.arange(0, matrix.shape[0]))
  287. ax.set_xticklabels(["Unidentified"] + people, rotation=90, ha="center")
  288. ax.set_yticklabels(people, va="center")
  289. ax.set_xticks(numpy.arange(0.5, matrix.shape[1] + 0.5), minor=True)
  290. ax.set_yticks(numpy.arange(0.5, matrix.shape[0] + 0.5), minor=True)
  291. ax.grid(which="minor")
  292. if not args.output:
  293. pos1 = ax.get_position()
  294. pos2 = (pos1.x0 + 0.15, pos1.y0 - 0.1, pos1.width * 0.9, pos1.height * 0.9)
  295. ax.set_position(pos2)
  296. pyplot.gcf().canvas.set_window_title(
  297. "Hercules %d developers overwrite" % matrix.shape[0])
  298. pyplot.show()
  299. else:
  300. pyplot.tight_layout()
  301. pyplot.savefig(args.output, transparent=True)
  302. def main():
  303. args = parse_args()
  304. header, main_contents, files_contents, people_contents = read_input(args)
  305. if args.mode == "project":
  306. plot_project(args, *load_main(header, main_contents, args.resample))
  307. elif args.mode == "file":
  308. plot_many(args, header, files_contents)
  309. elif args.mode == "person":
  310. plot_many(args, header, people_contents[:-1])
  311. elif args.mode == "matrix":
  312. plot_matrix(args, *load_matrix(people_contents))
# Run the tool only when the file is executed as a script.  main() has no
# return statement with a value, so sys.exit() receives None.
if __name__ == "__main__":
    sys.exit(main())