#!/usr/bin/env python3
import argparse
from datetime import datetime, timedelta
from importlib import import_module
import io
import json
import os
import re
import shutil
import sys
import tempfile
import threading
import time
import warnings

try:
    from clint.textui import progress
except ImportError:
    print("Warning: clint is not installed, no fancy progressbars in the terminal for you.")
    progress = None
import numpy
import yaml

if sys.version_info[0] < 3:
    # OK, ancients, I will support Python 2, but you owe me a beer
    input = raw_input

PB_MESSAGES = {
    "Burndown": "pb.pb_pb2.BurndownAnalysisResults",
    "Couples": "pb.pb_pb2.CouplesAnalysisResults",
}


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--output", default="",
                        help="Path to the output file/directory (empty for display). "
                             "If the extension is JSON, the data is saved instead of "
                             "the real image.")
    parser.add_argument("-i", "--input", default="-",
                        help="Path to the input file (- for stdin).")
    parser.add_argument("-f", "--input-format", default="auto", choices=["yaml", "pb", "auto"])
    parser.add_argument("--text-size", default=12, type=int,
                        help="Size of the labels and legend.")
    parser.add_argument("--backend", help="Matplotlib backend to use.")
    parser.add_argument("--style", choices=["black", "white"], default="black",
                        help="Plot's general color scheme.")
    parser.add_argument("--size", help="Axes' size in inches, for example \"12,9\"")
    parser.add_argument("--relative", action="store_true",
                        help="Occupy 100%% height for every measurement.")
    parser.add_argument("--couples-tmp-dir", help="Temporary directory to work with couples.")
    parser.add_argument("-m", "--mode",
                        choices=["project", "file", "person", "churn_matrix", "ownership",
                                 "couples", "all"],
                        help="What to plot.")
    parser.add_argument(
        "--resample", default="year",
        help="The way to resample the time series. Possible values are: "
             "\"month\", \"year\", \"no\", \"raw\" and pandas offset aliases ("
             "http://pandas.pydata.org/pandas-docs/stable/timeseries.html"
             "#offset-aliases).")
    parser.add_argument("--disable-projector", action="store_true",
                        help="Do not run Tensorflow Projector on couples.")
    parser.add_argument("--max-people", default=20, type=int,
                        help="Maximum number of developers in churn matrix and people plots.")
    args = parser.parse_args()
    return args
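
# Illustrative invocation of the CLI defined above (paths are hypothetical;
# the hercules flags match the warnings printed in main()):
#   hercules -burndown -couples https://github.com/src-d/hercules > result.yaml
#   python3 labours.py -i result.yaml -m project --resample month -o burndown.png
# An output path ending in ".json" dumps the plotted data instead of an image.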


class Reader(object):
    def read(self, file):
        raise NotImplementedError

    def get_name(self):
        raise NotImplementedError

    def get_header(self):
        raise NotImplementedError

    def get_burndown_parameters(self):
        raise NotImplementedError

    def get_project_burndown(self):
        raise NotImplementedError

    def get_files_burndown(self):
        raise NotImplementedError

    def get_people_burndown(self):
        raise NotImplementedError

    def get_ownership_burndown(self):
        raise NotImplementedError

    def get_people_interaction(self):
        raise NotImplementedError

    def get_files_coocc(self):
        raise NotImplementedError

    def get_people_coocc(self):
        raise NotImplementedError


class YamlReader(Reader):
    def read(self, file):
        yaml.reader.Reader.NON_PRINTABLE = re.compile(r"(?!x)x")
        try:
            loader = yaml.CLoader
        except AttributeError:
            print("Warning: failed to import yaml.CLoader, falling back to slow yaml.Loader")
            loader = yaml.Loader
        try:
            if file != "-":
                with open(file) as fin:
                    data = yaml.load(fin, Loader=loader)
            else:
                data = yaml.load(sys.stdin, Loader=loader)
        except (UnicodeEncodeError, yaml.reader.ReaderError) as e:
            print("\nInvalid unicode in the input: %s\nPlease filter it through "
                  "fix_yaml_unicode.py" % e)
            sys.exit(1)
        self.data = data

    def get_name(self):
        return next(iter(self.data["Burndown"]["project"]))

    def get_header(self):
        header = self.data["hercules"]
        return header["begin_unix_time"], header["end_unix_time"]

    def get_burndown_parameters(self):
        header = self.data["Burndown"]
        return header["sampling"], header["granularity"]

    def get_project_burndown(self):
        return self.data["hercules"]["repository"], \
            self._parse_burndown_matrix(self.data["Burndown"]["project"]).T

    def get_files_burndown(self):
        return [(p[0], self._parse_burndown_matrix(p[1]).T)
                for p in self.data["Burndown"]["files"].items()]

    def get_people_burndown(self):
        return [(p[0], self._parse_burndown_matrix(p[1]).T)
                for p in self.data["Burndown"]["people"].items()]

    def get_ownership_burndown(self):
        return self.data["Burndown"]["people_sequence"].copy(), \
            {p[0]: self._parse_burndown_matrix(p[1])
             for p in self.data["Burndown"]["people"].items()}

    def get_people_interaction(self):
        return self.data["Burndown"]["people_sequence"].copy(), \
            self._parse_burndown_matrix(self.data["Burndown"]["people_interaction"])

    def get_files_coocc(self):
        coocc = self.data["Couples"]["files_coocc"]
        return coocc["index"], self._parse_coocc_matrix(coocc["matrix"])

    def get_people_coocc(self):
        coocc = self.data["Couples"]["people_coocc"]
        return coocc["index"], self._parse_coocc_matrix(coocc["matrix"])

    def _parse_burndown_matrix(self, matrix):
        return numpy.array([numpy.fromstring(line, dtype=int, sep=" ")
                            for line in matrix.split("\n")])

    def _parse_coocc_matrix(self, matrix):
        from scipy.sparse import csr_matrix

        data = []
        indices = []
        indptr = [0]
        for row in matrix:
            for k, v in sorted(row.items()):
                data.append(v)
                indices.append(k)
            indptr.append(indptr[-1] + len(row))
        return csr_matrix((data, indices, indptr), shape=(len(matrix),) * 2)
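
# A minimal sketch of the CSR assembly in _parse_coocc_matrix, assuming a
# two-row YAML matrix [{0: 2, 1: 1}, {1: 3}]: it yields data=[2, 1, 3],
# indices=[0, 1, 1], indptr=[0, 2, 3], i.e. the dense matrix
#   [[2, 1],
#    [0, 3]]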


class ProtobufReader(Reader):
    def read(self, file):
        from pb.pb_pb2 import AnalysisResults

        self.data = AnalysisResults()
        if file != "-":
            with open(file, "rb") as fin:
                self.data.ParseFromString(fin.read())
        else:
            self.data.ParseFromString(sys.stdin.buffer.read())
        self.contents = {}
        for key, val in self.data.contents.items():
            try:
                mod, name = PB_MESSAGES[key].rsplit(".", 1)
            except KeyError:
                sys.stderr.write("Warning: there is no registered PB decoder for %s\n" % key)
                continue
            cls = getattr(import_module(mod), name)
            self.contents[key] = msg = cls()
            msg.ParseFromString(val)

    def get_name(self):
        return self.data.header.repository

    def get_header(self):
        header = self.data.header
        return header.begin_unix_time, header.end_unix_time

    def get_burndown_parameters(self):
        burndown = self.contents["Burndown"]
        return burndown.sampling, burndown.granularity

    def get_project_burndown(self):
        return self._parse_burndown_matrix(self.contents["Burndown"].project)

    def get_files_burndown(self):
        return [self._parse_burndown_matrix(i) for i in self.contents["Burndown"].files]

    def get_people_burndown(self):
        return [self._parse_burndown_matrix(i) for i in self.contents["Burndown"].people]

    def get_ownership_burndown(self):
        people = self.get_people_burndown()
        return [p[0] for p in people], {p[0]: p[1].T for p in people}

    def get_people_interaction(self):
        burndown = self.contents["Burndown"]
        return [i.name for i in burndown.people], \
            self._parse_sparse_matrix(burndown.people_interaction).toarray()

    def get_files_coocc(self):
        node = self.contents["Couples"].file_couples
        return list(node.index), self._parse_sparse_matrix(node.matrix)

    def get_people_coocc(self):
        node = self.contents["Couples"].people_couples
        return list(node.index), self._parse_sparse_matrix(node.matrix)

    def _parse_burndown_matrix(self, matrix):
        dense = numpy.zeros((matrix.number_of_rows, matrix.number_of_columns), dtype=int)
        for y, row in enumerate(matrix.rows):
            for x, col in enumerate(row.columns):
                dense[y, x] = col
        return matrix.name, dense.T

    def _parse_sparse_matrix(self, matrix):
        from scipy.sparse import csr_matrix

        return csr_matrix((list(matrix.data), list(matrix.indices), list(matrix.indptr)),
                          shape=(matrix.number_of_rows, matrix.number_of_columns))


READERS = {"yaml": YamlReader, "yml": YamlReader, "pb": ProtobufReader}


def read_input(args):
    sys.stdout.write("Reading the input... ")
    sys.stdout.flush()
    if args.input != "-":
        if args.input_format == "auto":
            args.input_format = args.input.rsplit(".", 1)[1]
    elif args.input_format == "auto":
        args.input_format = "yaml"
    reader = READERS[args.input_format]()
    reader.read(args.input)
    print("done")
    return reader
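
# Input format auto-detection above is purely extension-based (hypothetical
# paths): "-i stats.yaml" or "-i stats.yml" picks YamlReader, "-i stats.pb"
# picks ProtobufReader, and "-i -" (stdin) falls back to YAML.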


def calculate_average_lifetime(matrix):
    lifetimes = numpy.zeros(matrix.shape[1] - 1)
    for band in matrix:
        start = 0
        for i, line in enumerate(band):
            if i == 0 or band[i - 1] == 0:
                start += 1
                continue
            lifetimes[i - start] = band[i - 1] - line
        # the lines still alive at the last sample are credited once per band
        lifetimes[i - start] = band[i - 1]
    return (lifetimes.dot(numpy.arange(1, matrix.shape[1], 1))
            / (lifetimes.sum() * matrix.shape[1]))
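
# Reading the formula above: lifetimes[d] accumulates the lines that vanished
# d samples after their band appeared, the dot product with arange(1, width)
# weights every bucket by its age, and dividing by lifetimes.sum() * width
# normalizes the index so matrices of different sizes stay comparable.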


def interpolate_burndown_matrix(matrix, granularity, sampling):
    daily = numpy.zeros(
        (matrix.shape[0] * granularity, matrix.shape[1] * sampling),
        dtype=numpy.float32)
    """
    ----------> samples, x
    |
    |
    |
    v
    bands, y
    """
    for y in range(matrix.shape[0]):
        for x in range(matrix.shape[1]):
            if y * granularity > (x + 1) * sampling:
                # the future is zeros
                continue

            def decay(start_index: int, start_val: float):
                if start_val == 0:
                    return
                k = matrix[y][x] / start_val  # <= 1
                scale = (x + 1) * sampling - start_index
                for i in range(y * granularity, (y + 1) * granularity):
                    initial = daily[i][start_index - 1]
                    for j in range(start_index, (x + 1) * sampling):
                        daily[i][j] = initial * (
                            1 + (k - 1) * (j - start_index + 1) / scale)

            def grow(finish_index: int, finish_val: float):
                initial = matrix[y][x - 1] if x > 0 else 0
                start_index = x * sampling
                if start_index < y * granularity:
                    start_index = y * granularity
                if finish_index == start_index:
                    return
                avg = (finish_val - initial) / (finish_index - start_index)
                for j in range(x * sampling, finish_index):
                    for i in range(start_index, j + 1):
                        daily[i][j] = avg
                # copy [x*g..y*s)
                for j in range(x * sampling, finish_index):
                    for i in range(y * granularity, x * sampling):
                        daily[i][j] = daily[i][j - 1]

            if (y + 1) * granularity >= (x + 1) * sampling:
                # x*granularity <= (y+1)*sampling
                # 1. x*granularity <= y*sampling
                #    y*sampling..(y+1)sampling
                #
                #         x+1
                #          /
                #         /
                #        / y+1  -|
                #       /        |
                #      / y      -|
                #     /
                #    / x
                #
                # 2. x*granularity > y*sampling
                #    x*granularity..(y+1)sampling
                #
                #         x+1
                #          /
                #         /
                #        / y+1  -|
                #       /        |
                #      / x      -|
                #     /
                #    / y
                if y * granularity <= x * sampling:
                    grow((x + 1) * sampling, matrix[y][x])
                elif (x + 1) * sampling > y * granularity:
                    grow((x + 1) * sampling, matrix[y][x])
                    avg = matrix[y][x] / ((x + 1) * sampling - y * granularity)
                    for j in range(y * granularity, (x + 1) * sampling):
                        for i in range(y * granularity, j + 1):
                            daily[i][j] = avg
            elif (y + 1) * granularity >= x * sampling:
                # y*sampling <= (x+1)*granularity < (y+1)sampling
                # y*sampling..(x+1)*granularity
                # (x+1)*granularity..(y+1)sampling
                #
                #           x+1
                #           /\
                #          /  \
                #         /    \
                #        /    y+1
                #       /
                #      y
                v1 = matrix[y][x - 1]
                v2 = matrix[y][x]
                delta = (y + 1) * granularity - x * sampling
                previous = 0
                if x > 0 and (x - 1) * sampling >= y * granularity:
                    # x*g <= (y-1)*s <= y*s <= (x+1)*g <= (y+1)*s
                    #           |________|.......^
                    if x > 1:
                        previous = matrix[y][x - 2]
                    scale = sampling
                else:
                    # (y-1)*s < x*g <= y*s <= (x+1)*g <= (y+1)*s
                    #            |______|.......^
                    scale = sampling if x == 0 else x * sampling - y * granularity
                peak = v1 + (v1 - previous) / scale * delta
                if v2 > peak:
                    # we need to adjust the peak, it may not be less than the decayed value
                    if x < matrix.shape[1] - 1:
                        # y*s <= (x+1)*g <= (y+1)*s < (y+2)*s
                        #  ^.........|_________|
                        k = (v2 - matrix[y][x + 1]) / sampling  # > 0
                        peak = matrix[y][x] + k * ((x + 1) * sampling - (y + 1) * granularity)
                        # peak > v2 > v1
                    else:
                        peak = v2
                        # not enough data to interpolate; this is at least not restricted
                grow((y + 1) * granularity, peak)
                decay((y + 1) * granularity, peak)
            else:
                # (x+1)*granularity < y*sampling
                # y*sampling..(y+1)sampling
                decay(x * sampling, matrix[y][x - 1])
    return daily
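
# Shape contract of the interpolation, by example (numbers are made up): with
# granularity=30, sampling=7 and matrix.shape == (12, 52), the returned daily
# matrix has shape (12 * 30, 52 * 7); daily[i][j] estimates how many lines of
# the day-i cohort still existed on day j.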


def load_burndown(header, name, matrix, resample):
    import pandas

    start, last, sampling, granularity = header
    assert sampling > 0
    assert granularity >= sampling
    start = datetime.fromtimestamp(start)
    last = datetime.fromtimestamp(last)
    print(name, "lifetime index:", calculate_average_lifetime(matrix))
    finish = start + timedelta(days=matrix.shape[1] * sampling)
    if resample not in ("no", "raw"):
        # Interpolate the day x day matrix.
        # Each day brings equal weight in the granularity.
        # Sampling's interpolation is linear.
        daily = interpolate_burndown_matrix(matrix, granularity, sampling)
        daily[(last - start).days:] = 0
        # Resample the bands
        aliases = {
            "year": "A",
            "month": "M"
        }
        resample = aliases.get(resample, resample)
        periods = 0
        date_granularity_sampling = [start]
        while date_granularity_sampling[-1] < finish:
            periods += 1
            date_granularity_sampling = pandas.date_range(
                start, periods=periods, freq=resample)
        date_range_sampling = pandas.date_range(
            date_granularity_sampling[0],
            periods=(finish - date_granularity_sampling[0]).days,
            freq="1D")
        # Fill the new square matrix
        matrix = numpy.zeros(
            (len(date_granularity_sampling), len(date_range_sampling)),
            dtype=numpy.float32)
        for i, gdt in enumerate(date_granularity_sampling):
            istart = (date_granularity_sampling[i - 1] - start).days \
                if i > 0 else 0
            ifinish = (gdt - start).days
            for j, sdt in enumerate(date_range_sampling):
                if (sdt - start).days >= istart:
                    break
            # accumulate band [istart, ifinish) starting from the j-th sample
            matrix[i, j:] = \
                daily[istart:ifinish, (sdt - start).days:].sum(axis=0)
        # Hardcode some cases to improve labels' readability
        if resample in ("year", "A"):
            labels = [dt.year for dt in date_granularity_sampling]
        elif resample in ("month", "M"):
            labels = [dt.strftime("%Y %B") for dt in date_granularity_sampling]
        else:
            labels = [dt.date() for dt in date_granularity_sampling]
    else:
        labels = [
            "%s - %s" % ((start + timedelta(days=i * granularity)).date(),
                         (start + timedelta(days=(i + 1) * granularity)).date())
            for i in range(matrix.shape[0])]
        if len(labels) > 18:
            warnings.warn("Too many labels - consider resampling.")
        resample = "M"  # fake resampling type is checked while plotting
        date_range_sampling = pandas.date_range(
            start + timedelta(days=sampling), periods=matrix.shape[1],
            freq="%dD" % sampling)
    return name, matrix, date_range_sampling, labels, granularity, sampling, resample
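
# Resampling sketch for load_burndown (exact band edges are whatever pandas
# generates): --resample year maps to freq "A", so date_granularity_sampling
# becomes year-end boundaries and every original band is re-bucketed into
# calendar years; --resample no/raw skips interpolation and keeps one label
# per granularity-sized band.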


def load_ownership(header, sequence, contents, max_people):
    import pandas

    start, last, sampling, _ = header
    start = datetime.fromtimestamp(start)
    last = datetime.fromtimestamp(last)
    people = []
    for name in sequence:
        people.append(contents[name].sum(axis=1))
    people = numpy.array(people)
    date_range_sampling = pandas.date_range(
        start + timedelta(days=sampling), periods=people[0].shape[0],
        freq="%dD" % sampling)
    if people.shape[0] > max_people:
        order = numpy.argsort(-people.sum(axis=1))
        people = people[order[:max_people]]
        sequence = [sequence[i] for i in order[:max_people]]
        print("Warning: truncated people to most owning %d" % max_people)
    for i, name in enumerate(sequence):
        if len(name) > 40:
            sequence[i] = name[:37] + "..."
    return sequence, people, date_range_sampling, last


def load_churn_matrix(people, matrix, max_people):
    matrix = matrix.astype(float)
    if matrix.shape[0] > max_people:
        order = numpy.argsort(-matrix[:, 0])
        matrix = matrix[order[:max_people]][:, [0, 1] + list(2 + order[:max_people])]
        people = [people[i] for i in order[:max_people]]
        print("Warning: truncated people to most productive %d" % max_people)
    zeros = matrix[:, 0] == 0
    matrix[zeros, :] = 1
    matrix /= matrix[:, 0][:, None]
    matrix = -matrix[:, 1:]
    matrix[zeros, :] = 0
    for i, name in enumerate(people):
        if len(name) > 40:
            people[i] = name[:37] + "..."
    return people, matrix
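
# Interpretation of the normalization above, hedged to match the axis labels
# in plot_churn_matrix: column 0 counts the lines a developer wrote, column 1
# relates to unidentified authors, and the remaining columns to each kept
# developer. After dividing rows by column 0, dropping it and negating, cell
# [k][m] is the share of developer k's code attributed to developer m; rows
# with zero written lines are forced to 0 instead of NaN.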


def apply_plot_style(figure, axes, legend, style, text_size, axes_size):
    if axes_size is None:
        axes_size = (12, 9)
    else:
        axes_size = tuple(float(p) for p in axes_size.split(","))
    figure.set_size_inches(*axes_size)
    for side in ("bottom", "top", "left", "right"):
        axes.spines[side].set_color(style)
    for axis in (axes.xaxis, axes.yaxis):
        axis.label.update(dict(fontsize=text_size, color=style))
    for axis in ("x", "y"):
        axes.tick_params(axis=axis, colors=style, labelsize=text_size)
    if legend is not None:
        frame = legend.get_frame()
        for setter in (frame.set_facecolor, frame.set_edgecolor):
            setter("black" if style == "white" else "white")
        for text in legend.get_texts():
            text.set_color(style)


def get_plot_path(base, name):
    root, ext = os.path.splitext(base)
    if not ext:
        ext = ".png"
    output = os.path.join(root, name + ext)
    os.makedirs(os.path.dirname(output), exist_ok=True)
    return output
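
# Example (hypothetical arguments): get_plot_path("plots.png", "project")
# returns "plots/project.png" and creates the "plots" directory on demand;
# a base without an extension defaults to ".png".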


def deploy_plot(title, output, style):
    import matplotlib.pyplot as pyplot

    if not output:
        pyplot.gcf().canvas.set_window_title(title)
        pyplot.show()
    else:
        if title:
            pyplot.title(title, color=style)
        try:
            pyplot.tight_layout()
        except Exception:
            print("Warning: failed to set the tight layout")
        pyplot.savefig(output, transparent=True)
    pyplot.clf()


def default_json(x):
    if hasattr(x, "tolist"):
        return x.tolist()
    if hasattr(x, "isoformat"):
        return x.isoformat()
    return x


def plot_burndown(args, target, name, matrix, date_range_sampling, labels, granularity,
                  sampling, resample):
    if args.output and args.output.endswith(".json"):
        data = locals().copy()
        del data["args"]
        data["type"] = "burndown"
        if args.mode == "project" and target == "project":
            output = args.output
        else:
            if target == "project":
                name = "project"
            output = get_plot_path(args.output, name)
        with open(output, "w") as fout:
            json.dump(data, fout, sort_keys=True, default=default_json)
        return

    import matplotlib
    if args.backend:
        matplotlib.use(args.backend)
    import matplotlib.pyplot as pyplot

    if args.relative:
        # normalize before plotting, otherwise the rendered stack ignores --relative
        for i in range(matrix.shape[1]):
            matrix[:, i] /= matrix[:, i].sum()
        pyplot.ylim(0, 1)
        legend_loc = 3
    else:
        legend_loc = 2
    pyplot.stackplot(date_range_sampling, matrix, labels=labels)
    legend = pyplot.legend(loc=legend_loc, fontsize=args.text_size)
    pyplot.ylabel("Lines of code")
    pyplot.xlabel("Time")
    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.style, args.text_size, args.size)
    pyplot.xlim(date_range_sampling[0], date_range_sampling[-1])
    locator = pyplot.gca().xaxis.get_major_locator()
    # set the optimal xticks locator
    if "M" not in resample:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
    locs = pyplot.gca().get_xticks().tolist()
    if len(locs) >= 16:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
        locs = pyplot.gca().get_xticks().tolist()
        if len(locs) >= 16:
            pyplot.gca().xaxis.set_major_locator(locator)
    if locs[0] < pyplot.xlim()[0]:
        del locs[0]
    endindex = -1
    if len(locs) >= 2 and \
            pyplot.xlim()[1] - locs[-1] > (locs[-1] - locs[-2]) / 2:
        locs.append(pyplot.xlim()[1])
        endindex = len(locs) - 1
    startindex = -1
    if len(locs) >= 2 and \
            locs[0] - pyplot.xlim()[0] > (locs[1] - locs[0]) / 2:
        locs.append(pyplot.xlim()[0])
        startindex = len(locs) - 1
    pyplot.gca().set_xticks(locs)
    # hacking time!
    labels = pyplot.gca().get_xticklabels()
    if startindex >= 0:
        labels[startindex].set_text(date_range_sampling[0].date())
        labels[startindex].set_text = lambda _: None
        labels[startindex].set_rotation(30)
        labels[startindex].set_ha("right")
    if endindex >= 0:
        labels[endindex].set_text(date_range_sampling[-1].date())
        labels[endindex].set_text = lambda _: None
        labels[endindex].set_rotation(30)
        labels[endindex].set_ha("right")
    title = "%s %d x %d (granularity %d, sampling %d)" % \
        ((name,) + matrix.shape + (granularity, sampling))
    output = args.output
    if output:
        if args.mode == "project" and target == "project":
            output = args.output
        else:
            if target == "project":
                name = "project"
            output = get_plot_path(args.output, name)
    deploy_plot(title, output, args.style)


def plot_many_burndown(args, target, header, parts):
    if not args.output:
        print("Warning: output not set, showing %d plots." % len(parts))
    itercnt = progress.bar(parts, expected_size=len(parts)) \
        if progress is not None else parts
    stdout = io.StringIO()
    for name, matrix in itercnt:
        backup = sys.stdout
        sys.stdout = stdout
        plot_burndown(args, target, *load_burndown(header, name, matrix, args.resample))
        sys.stdout = backup
    sys.stdout.write(stdout.getvalue())


def plot_churn_matrix(args, repo, people, matrix):
    if args.output and args.output.endswith(".json"):
        data = locals().copy()
        del data["args"]
        data["type"] = "churn_matrix"
        if args.mode == "all":
            output = get_plot_path(args.output, "matrix")
        else:
            output = args.output
        with open(output, "w") as fout:
            json.dump(data, fout, sort_keys=True, default=default_json)
        return

    import matplotlib
    if args.backend:
        matplotlib.use(args.backend)
    import matplotlib.pyplot as pyplot

    s = 4 + matrix.shape[1] * 0.3
    fig = pyplot.figure(figsize=(s, s))
    ax = fig.add_subplot(111)
    ax.xaxis.set_label_position("top")
    ax.matshow(matrix, cmap=pyplot.cm.OrRd)
    ax.set_xticks(numpy.arange(0, matrix.shape[1]))
    ax.set_yticks(numpy.arange(0, matrix.shape[0]))
    ax.set_xticklabels(["Unidentified"] + people, rotation=90, ha="center")
    ax.set_yticklabels(people, va="center")
    ax.set_xticks(numpy.arange(0.5, matrix.shape[1] + 0.5), minor=True)
    ax.set_yticks(numpy.arange(0.5, matrix.shape[0] + 0.5), minor=True)
    ax.grid(which="minor")
    apply_plot_style(fig, ax, None, args.style, args.text_size, args.size)
    if not args.output:
        pos1 = ax.get_position()
        pos2 = (pos1.x0 + 0.245, pos1.y0 - 0.1, pos1.width * 0.9, pos1.height * 0.9)
        ax.set_position(pos2)
    if args.mode == "all":
        output = get_plot_path(args.output, "matrix")
    else:
        output = args.output
    title = "%s %d developers overwrite" % (repo, matrix.shape[0])
    if args.output:
        # FIXME(vmarkovtsev): otherwise the title is screwed in savefig()
        title = ""
    deploy_plot(title, output, args.style)


def plot_ownership(args, repo, names, people, date_range, last):
    if args.output and args.output.endswith(".json"):
        data = locals().copy()
        del data["args"]
        data["type"] = "ownership"
        if args.mode == "all":
            output = get_plot_path(args.output, "people")
        else:
            output = args.output
        with open(output, "w") as fout:
            json.dump(data, fout, sort_keys=True, default=default_json)
        return

    import matplotlib
    if args.backend:
        matplotlib.use(args.backend)
    import matplotlib.pyplot as pyplot

    if args.relative:
        # normalize before plotting, otherwise the rendered stack ignores --relative
        for i in range(people.shape[1]):
            people[:, i] /= people[:, i].sum()
        pyplot.ylim(0, 1)
        legend_loc = 3
    else:
        legend_loc = 2
    pyplot.stackplot(date_range, people, labels=names)
    pyplot.xlim(date_range[0], last)
    legend = pyplot.legend(loc=legend_loc, fontsize=args.text_size)
    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.style, args.text_size, args.size)
    if args.mode == "all":
        output = get_plot_path(args.output, "people")
    else:
        output = args.output
    deploy_plot("%s code ownership through time" % repo, output, args.style)


IDEAL_SHARD_SIZE = 4096


def train_embeddings(index, matrix, tmpdir, shard_size=IDEAL_SHARD_SIZE):
    try:
        from . import swivel
    except (SystemError, ImportError):
        import swivel
    import tensorflow as tf

    assert matrix.shape[0] == matrix.shape[1]
    assert len(index) <= matrix.shape[0]
    outlier_threshold = numpy.percentile(matrix.data, 99)
    matrix.data[matrix.data > outlier_threshold] = outlier_threshold
    nshards = len(index) // shard_size
    if nshards * shard_size < len(index):
        nshards += 1
        shard_size = len(index) // nshards
        nshards = len(index) // shard_size
    remainder = len(index) - nshards * shard_size
    if remainder > 0:
        lengths = matrix.indptr[1:] - matrix.indptr[:-1]
        filtered = sorted(numpy.argsort(lengths)[remainder:])
    else:
        filtered = list(range(len(index)))
    if len(filtered) < matrix.shape[0]:
        print("Truncating the sparse matrix...")
        matrix = matrix[filtered, :][:, filtered]
    meta_index = []
    for i, j in enumerate(filtered):
        meta_index.append((index[j], matrix[i, i]))
    index = [mi[0] for mi in meta_index]
    with tempfile.TemporaryDirectory(prefix="hercules_labours_", dir=tmpdir or None) as tmproot:
        print("Writing Swivel metadata...")
        vocabulary = "\n".join(index)
        with open(os.path.join(tmproot, "row_vocab.txt"), "w") as out:
            out.write(vocabulary)
        with open(os.path.join(tmproot, "col_vocab.txt"), "w") as out:
            out.write(vocabulary)
        del vocabulary
        bool_sums = matrix.indptr[1:] - matrix.indptr[:-1]
        bool_sums_str = "\n".join(map(str, bool_sums.tolist()))
        with open(os.path.join(tmproot, "row_sums.txt"), "w") as out:
            out.write(bool_sums_str)
        with open(os.path.join(tmproot, "col_sums.txt"), "w") as out:
            out.write(bool_sums_str)
        del bool_sums_str
        reorder = numpy.argsort(-bool_sums)
        print("Writing Swivel shards...")
        for row in range(nshards):
            for col in range(nshards):
                def _int64s(xs):
                    return tf.train.Feature(
                        int64_list=tf.train.Int64List(value=list(xs)))

                def _floats(xs):
                    return tf.train.Feature(
                        float_list=tf.train.FloatList(value=list(xs)))

                indices_row = reorder[row::nshards]
                indices_col = reorder[col::nshards]
                shard = matrix[indices_row][:, indices_col].tocoo()
                example = tf.train.Example(features=tf.train.Features(feature={
                    "global_row": _int64s(indices_row),
                    "global_col": _int64s(indices_col),
                    "sparse_local_row": _int64s(shard.row),
                    "sparse_local_col": _int64s(shard.col),
                    "sparse_value": _floats(shard.data)}))
                with open(os.path.join(tmproot, "shard-%03d-%03d.pb" % (row, col)), "wb") as out:
                    out.write(example.SerializeToString())
        print("Training Swivel model...")
        swivel.FLAGS.submatrix_rows = shard_size
        swivel.FLAGS.submatrix_cols = shard_size
        if len(meta_index) <= IDEAL_SHARD_SIZE / 16:
            embedding_size = 50
            num_epochs = 20000
        elif len(meta_index) <= IDEAL_SHARD_SIZE:
            embedding_size = 50
            num_epochs = 10000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 2:
            embedding_size = 60
            num_epochs = 5000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 4:
            embedding_size = 70
            num_epochs = 4000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 10:
            embedding_size = 80
            num_epochs = 2500
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 25:
            embedding_size = 100
            num_epochs = 500
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 100:
            embedding_size = 200
            num_epochs = 300
        else:
            embedding_size = 300
            num_epochs = 200
        swivel.FLAGS.embedding_size = embedding_size
        swivel.FLAGS.input_base_path = tmproot
        swivel.FLAGS.output_base_path = tmproot
        swivel.FLAGS.loss_multiplier = 1.0 / shard_size
        swivel.FLAGS.num_epochs = num_epochs
        swivel.main(None)
        print("Reading Swivel embeddings...")
        embeddings = []
        with open(os.path.join(tmproot, "row_embedding.tsv")) as frow:
            with open(os.path.join(tmproot, "col_embedding.tsv")) as fcol:
                for i, (lrow, lcol) in enumerate(zip(frow, fcol)):
                    prow, pcol = (l.split("\t", 1) for l in (lrow, lcol))
                    assert prow[0] == pcol[0]
                    erow, ecol = \
                        (numpy.fromstring(p[1], dtype=numpy.float32, sep="\t")
                         for p in (prow, pcol))
                    embeddings.append((erow + ecol) / 2)
    return meta_index, embeddings
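
# Shard arithmetic above, by example: len(index) == 5001 with the default
# shard_size of 4096 gives nshards = 2 and shard_size = 2500; the remainder of
# 5001 - 2 * 2500 = 1 drops the single shortest co-occurrence row so that all
# Swivel shards are exactly equal-sized.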


class CORSWebServer(object):
    def __init__(self):
        self.thread = threading.Thread(target=self.serve)
        self.server = None

    def serve(self):
        outer = self
        try:
            from http.server import HTTPServer, SimpleHTTPRequestHandler, test
        except ImportError:  # Python 2
            from BaseHTTPServer import HTTPServer, test
            from SimpleHTTPServer import SimpleHTTPRequestHandler

        class ClojureServer(HTTPServer):
            def __init__(self, *args, **kwargs):
                HTTPServer.__init__(self, *args, **kwargs)
                outer.server = self

        class CORSRequestHandler(SimpleHTTPRequestHandler):
            def end_headers(self):
                self.send_header("Access-Control-Allow-Origin", "*")
                SimpleHTTPRequestHandler.end_headers(self)

        test(CORSRequestHandler, ClojureServer)

    def start(self):
        self.thread.start()

    def stop(self):
        if self.running:
            self.server.shutdown()
            self.thread.join()

    @property
    def running(self):
        return self.server is not None


web_server = CORSWebServer()


def write_embeddings(name, output, run_server, index, embeddings):
    print("Writing Tensorflow Projector files...")
    if not output:
        output = "couples_" + name
    if output.endswith(".json"):
        output = os.path.join(output[:-5], "couples")
        run_server = False
    metaf = "%s_%s_meta.tsv" % (output, name)
    with open(metaf, "w") as fout:
        fout.write("name\tcommits\n")
        for pair in index:
            fout.write("%s\t%s\n" % pair)
    print("Wrote", metaf)
    dataf = "%s_%s_data.tsv" % (output, name)
    with open(dataf, "w") as fout:
        for vec in embeddings:
            fout.write("\t".join(str(v) for v in vec))
            fout.write("\n")
    print("Wrote", dataf)
    jsonf = "%s_%s.json" % (output, name)
    with open(jsonf, "w") as fout:
        fout.write("""{
  "embeddings": [
    {
      "tensorName": "%s %s coupling",
      "tensorShape": [%s, %s],
      "tensorPath": "http://0.0.0.0:8000/%s",
      "metadataPath": "http://0.0.0.0:8000/%s"
    }
  ]
}
""" % (output, name, len(embeddings), len(embeddings[0]), dataf, metaf))
    print("Wrote %s" % jsonf)
    if run_server and not web_server.running:
        web_server.start()
    url = "http://projector.tensorflow.org/?config=http://0.0.0.0:8000/" + jsonf
    print(url)
    if run_server:
        if shutil.which("xdg-open") is not None:
            os.system("xdg-open " + url)
        else:
            browser = os.getenv("BROWSER", "")
            if browser:
                os.system(browser + " " + url)
            else:
                print("\t" + url)


def main():
    args = parse_args()
    reader = read_input(args)
    header = reader.get_header()
    name = reader.get_name()

    burndown_warning = "Burndown stats were not collected. Re-run hercules with -burndown."
    burndown_files_warning = \
        "Burndown stats for files were not collected. Re-run hercules with " \
        "-burndown -burndown-files."
    burndown_people_warning = \
        "Burndown stats for people were not collected. Re-run hercules with " \
        "-burndown -burndown-people."
    couples_warning = "Coupling stats were not collected. Re-run hercules with -couples."

    def project_burndown():
        try:
            full_header = header + reader.get_burndown_parameters()
        except KeyError:
            print("project: " + burndown_warning)
            return
        plot_burndown(args, "project",
                      *load_burndown(full_header, *reader.get_project_burndown(),
                                     resample=args.resample))

    def files_burndown():
        try:
            full_header = header + reader.get_burndown_parameters()
        except KeyError:
            print(burndown_warning)
            return
        try:
            plot_many_burndown(args, "file", full_header, reader.get_files_burndown())
        except KeyError:
            print("files: " + burndown_files_warning)

    def people_burndown():
        try:
            full_header = header + reader.get_burndown_parameters()
        except KeyError:
            print(burndown_warning)
            return
        try:
            plot_many_burndown(args, "person", full_header, reader.get_people_burndown())
        except KeyError:
            print("people: " + burndown_people_warning)

    def churn_matrix():
        try:
            plot_churn_matrix(args, name, *load_churn_matrix(
                *reader.get_people_interaction(), max_people=args.max_people))
        except KeyError:
            print("churn_matrix: " + burndown_people_warning)

    def ownership_burndown():
        try:
            full_header = header + reader.get_burndown_parameters()
        except KeyError:
            print(burndown_warning)
            return
        try:
            plot_ownership(args, name, *load_ownership(
                full_header, *reader.get_ownership_burndown(), max_people=args.max_people))
        except KeyError:
            print("ownership: " + burndown_people_warning)

    def couples():
        try:
            write_embeddings("files", args.output, not args.disable_projector,
                             *train_embeddings(*reader.get_files_coocc(),
                                               tmpdir=args.couples_tmp_dir))
            write_embeddings("people", args.output, not args.disable_projector,
                             *train_embeddings(*reader.get_people_coocc(),
                                               tmpdir=args.couples_tmp_dir))
        except KeyError:
            print(couples_warning)

    if args.mode == "project":
        project_burndown()
    elif args.mode == "file":
        files_burndown()
    elif args.mode == "person":
        people_burndown()
    elif args.mode == "churn_matrix":
        churn_matrix()
    elif args.mode == "ownership":
        ownership_burndown()
    elif args.mode == "couples":
        couples()
    elif args.mode == "all":
        project_burndown()
        files_burndown()
        people_burndown()
        churn_matrix()
        ownership_burndown()
        couples()

    if web_server.running:
        secs = int(os.getenv("COUPLES_SERVER_TIME", "60"))
        print("Sleeping for %d seconds, safe to Ctrl-C" % secs)
        sys.stdout.flush()
        try:
            time.sleep(secs)
        except KeyboardInterrupt:
            pass
        web_server.stop()


if __name__ == "__main__":
    sys.exit(main())