labours.py

import argparse
from datetime import datetime, timedelta
import io
import json
import os
import re
import sys
import tempfile
import threading
import time
import warnings

try:
    from clint.textui import progress
except ImportError:
    print("Warning: clint is not installed, no fancy progressbars in the terminal for you.")
    progress = None
import numpy
import yaml

if sys.version_info[0] < 3:
    # OK, ancients, I will support Python 2, but you owe me a beer
    input = raw_input  # noqa: F821


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--output", default="",
                        help="Path to the output file/directory (empty for display). "
                             "If the extension is JSON, the data is saved instead of "
                             "the real image.")
    parser.add_argument("-i", "--input", default="-",
                        help="Path to the input file (- for stdin).")
    parser.add_argument("-f", "--input-format", default="yaml", choices=["yaml", "pb"])
    parser.add_argument("--text-size", default=12, type=int,
                        help="Size of the labels and legend.")
    parser.add_argument("--backend", help="Matplotlib backend to use.")
    parser.add_argument("--style", choices=["black", "white"], default="black",
                        help="Plot's general color scheme.")
    parser.add_argument("--size", help="Axes' size in inches, for example \"12,9\"")
    parser.add_argument("--relative", action="store_true",
                        help="Occupy 100%% height for every measurement.")
    parser.add_argument("--couples-tmp-dir", help="Temporary directory to work with couples.")
    parser.add_argument("-m", "--mode",
                        choices=["project", "file", "person", "churn_matrix", "ownership",
                                 "couples", "all"],
                        default="project", help="What to plot.")
    parser.add_argument(
        "--resample", default="year",
        help="The way to resample the time series. Possible values are: "
             "\"month\", \"year\", \"no\", \"raw\" and pandas offset aliases ("
             "http://pandas.pydata.org/pandas-docs/stable/timeseries.html"
             "#offset-aliases).")
    parser.add_argument("--disable-projector", action="store_true",
                        help="Do not run Tensorflow Projector on couples.")
    parser.add_argument("--max-people", default=20, type=int,
                        help="Maximum number of developers in churn matrix and people plots.")
    args = parser.parse_args()
    return args
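
# Example invocation (hypothetical paths, shown for illustration only):
#   python labours.py -i analysis.yaml -f yaml -m project -o burndown.png
# reads a hercules YAML dump and renders the project burndown chart.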


class Reader(object):
    def read(self, file):
        raise NotImplementedError

    def get_name(self):
        raise NotImplementedError

    def get_header(self):
        raise NotImplementedError

    def get_project_burndown(self):
        raise NotImplementedError

    def get_files_burndown(self):
        raise NotImplementedError

    def get_people_burndown(self):
        raise NotImplementedError

    def get_ownership_burndown(self):
        raise NotImplementedError

    def get_people_interaction(self):
        raise NotImplementedError

    def get_files_coocc(self):
        raise NotImplementedError

    def get_people_coocc(self):
        raise NotImplementedError


class YamlReader(Reader):
    def read(self, file):
        yaml.reader.Reader.NON_PRINTABLE = re.compile(r"(?!x)x")
        try:
            loader = yaml.CLoader
        except AttributeError:
            print("Warning: failed to import yaml.CLoader, falling back to slow yaml.Loader")
            loader = yaml.Loader
        try:
            if file != "-":
                with open(file) as fin:
                    data = yaml.load(fin, Loader=loader)
            else:
                data = yaml.load(sys.stdin, Loader=loader)
        except (UnicodeEncodeError, yaml.reader.ReaderError) as e:
            print("\nInvalid unicode in the input: %s\nPlease filter it through "
                  "fix_yaml_unicode.py" % e)
            sys.exit(1)
        self.data = data

    def get_name(self):
        return next(iter(self.data["project"]))

    def get_header(self):
        header = self.data["burndown"]
        return header["begin"], header["end"], header["sampling"], header["granularity"]

    def get_project_burndown(self):
        name, matrix = next(iter(self.data["project"].items()))
        return name, self._parse_burndown_matrix(matrix).T

    def get_files_burndown(self):
        return [(p[0], self._parse_burndown_matrix(p[1]).T) for p in self.data["files"].items()]

    def get_people_burndown(self):
        return [(p[0], self._parse_burndown_matrix(p[1]).T) for p in self.data["people"].items()]

    def get_ownership_burndown(self):
        return self.data["people_sequence"], {p[0]: self._parse_burndown_matrix(p[1])
                                              for p in self.data["people"].items()}

    def get_people_interaction(self):
        return self.data["people_sequence"], \
            self._parse_burndown_matrix(self.data["people_interaction"])

    def get_files_coocc(self):
        coocc = self.data["files_coocc"]
        return coocc["index"], self._parse_coocc_matrix(coocc["matrix"])

    def get_people_coocc(self):
        coocc = self.data["people_coocc"]
        return coocc["index"], self._parse_coocc_matrix(coocc["matrix"])

    def _parse_burndown_matrix(self, matrix):
        return numpy.array([numpy.fromstring(line, dtype=int, sep=" ")
                            for line in matrix.split("\n")])

    def _parse_coocc_matrix(self, matrix):
        from scipy.sparse import csr_matrix
        data = []
        indices = []
        indptr = [0]
        for row in matrix:
            for k, v in sorted(row.items()):
                data.append(v)
                indices.append(k)
            indptr.append(indptr[-1] + len(row))
        return csr_matrix((data, indices, indptr), shape=(len(matrix),) * 2)
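
# A minimal sketch of what _parse_coocc_matrix consumes (values made up):
# each YAML row is a {column: count} dict, so
#   [{0: 4, 1: 1},   -> CSR row 0: data (4, 1) at columns (0, 1)
#    {1: 3}]         -> CSR row 1: data (3,)   at column  (1,)
# becomes the 2x2 dense matrix [[4, 1], [0, 3]].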


class ProtobufReader(Reader):
    def read(self, file):
        from pb.pb_pb2 import AnalysisResults
        self.data = AnalysisResults()
        if file != "-":
            with open(file, "rb") as fin:
                self.data.ParseFromString(fin.read())
        else:
            self.data.ParseFromString(sys.stdin.buffer.read())

    def get_name(self):
        return self.data.header.repository

    def get_header(self):
        header = self.data.header
        return header.begin_unix_time, header.end_unix_time, \
            header.sampling, header.granularity

    def get_project_burndown(self):
        return self._parse_burndown_matrix(self.data.burndown_project)

    def get_files_burndown(self):
        return [self._parse_burndown_matrix(i) for i in self.data.burndown_files]

    def get_people_burndown(self):
        return [self._parse_burndown_matrix(i) for i in self.data.burndown_developers]

    def get_ownership_burndown(self):
        people = self.get_people_burndown()
        return [p[0] for p in people], {p[0]: p[1].T for p in people}

    def get_people_interaction(self):
        return [i.name for i in self.data.burndown_developers], \
            self._parse_sparse_matrix(self.data.developers_interaction).toarray()

    def get_files_coocc(self):
        node = self.data.file_couples
        return list(node.index), self._parse_sparse_matrix(node.matrix)

    def get_people_coocc(self):
        node = self.data.developer_couples
        return list(node.index), self._parse_sparse_matrix(node.matrix)

    def _parse_burndown_matrix(self, matrix):
        dense = numpy.zeros((matrix.number_of_rows, matrix.number_of_columns), dtype=int)
        for y, row in enumerate(matrix.rows):
            for x, col in enumerate(row.columns):
                dense[y, x] = col
        return matrix.name, dense.T

    def _parse_sparse_matrix(self, matrix):
        from scipy.sparse import csr_matrix
        return csr_matrix((list(matrix.data), list(matrix.indices), list(matrix.indptr)),
                          shape=(matrix.number_of_rows, matrix.number_of_columns))


READERS = {"yaml": YamlReader, "pb": ProtobufReader}


def read_input(args):
    sys.stdout.write("Reading the input... ")
    sys.stdout.flush()
    reader = READERS[args.input_format]()
    reader.read(args.input)
    print("done")
    return reader


def calculate_average_lifetime(matrix):
    lifetimes = numpy.zeros(matrix.shape[1] - 1)
    for band in matrix:
        start = 0
        for i, line in enumerate(band):
            if i == 0 or band[i - 1] == 0:
                start += 1
                continue
            lifetimes[i - start] = band[i - 1] - line
        lifetimes[i - start] = band[i - 1]
    return (lifetimes.dot(numpy.arange(1, matrix.shape[1], 1))
            / (lifetimes.sum() * matrix.shape[1]))
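
# Reading of the loop above (an interpretation, not original commentary): each
# band tracks how many lines born in one granularity period still survive at
# every later sample, so band[i - 1] - line counts the lines which died between
# two consecutive samples, binned by age (i - start). The returned scalar is a
# weighted average age, normalised by the total number of samples.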


def load_burndown(header, name, matrix, resample):
    import pandas
    start, last, sampling, granularity = header
    start = datetime.fromtimestamp(start)
    last = datetime.fromtimestamp(last)
    print(name, "lifetime index:", calculate_average_lifetime(matrix))
    finish = start + timedelta(days=matrix.shape[1] * sampling)
    if resample not in ("no", "raw"):
        # Interpolate the day x day matrix.
        # Each day brings equal weight in the granularity.
        # Sampling's interpolation is linear.
        daily_matrix = numpy.zeros(
            (matrix.shape[0] * granularity, matrix.shape[1] * sampling),
            dtype=numpy.float32)
        epsrange = numpy.arange(0, 1, 1.0 / sampling)
        for y in range(matrix.shape[0]):
            for x in range(matrix.shape[1]):
                previous = matrix[y, x - 1] if x > 0 else 0
                value = ((previous + (matrix[y, x] - previous) * epsrange)
                         / granularity)[numpy.newaxis, :]
                if (y + 1) * granularity <= x * sampling:
                    daily_matrix[y * granularity:(y + 1) * granularity,
                                 x * sampling:(x + 1) * sampling] = value
                elif y * granularity <= (x + 1) * sampling:
                    for suby in range(y * granularity, (y + 1) * granularity):
                        for subx in range(suby, (x + 1) * sampling):
                            daily_matrix[suby, subx] = matrix[y, x] / granularity
        daily_matrix[(last - start).days:] = 0
        # Resample the bands
        aliases = {
            "year": "A",
            "month": "M"
        }
        resample = aliases.get(resample, resample)
        periods = 0
        date_granularity_sampling = [start]
        while date_granularity_sampling[-1] < finish:
            periods += 1
            date_granularity_sampling = pandas.date_range(
                start, periods=periods, freq=resample)
        date_range_sampling = pandas.date_range(
            date_granularity_sampling[0],
            periods=(finish - date_granularity_sampling[0]).days,
            freq="1D")
        # Fill the new square matrix
        matrix = numpy.zeros(
            (len(date_granularity_sampling), len(date_range_sampling)),
            dtype=numpy.float32)
        for i, gdt in enumerate(date_granularity_sampling):
            istart = (date_granularity_sampling[i - 1] - start).days \
                if i > 0 else 0
            ifinish = (gdt - start).days
            for j, sdt in enumerate(date_range_sampling):
                if (sdt - start).days >= istart:
                    break
            matrix[i, j:] = \
                daily_matrix[istart:ifinish, (sdt - start).days:].sum(axis=0)
        # Hardcode some cases to improve labels' readability
        if resample in ("year", "A"):
            labels = [dt.year for dt in date_granularity_sampling]
        elif resample in ("month", "M"):
            labels = [dt.strftime("%Y %B") for dt in date_granularity_sampling]
        else:
            labels = [dt.date() for dt in date_granularity_sampling]
    else:
        labels = [
            "%s - %s" % ((start + timedelta(days=i * granularity)).date(),
                         (start + timedelta(days=(i + 1) * granularity)).date())
            for i in range(matrix.shape[0])]
        if len(labels) > 18:
            warnings.warn("Too many labels - consider resampling.")
        resample = "M"  # fake resampling type is checked while plotting
        date_range_sampling = pandas.date_range(
            start + timedelta(days=sampling), periods=matrix.shape[1],
            freq="%dD" % sampling)
    return name, matrix, date_range_sampling, labels, granularity, sampling, resample
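
# How load_burndown reshapes the data (informal summary, not authoritative):
# the input matrix is bands x samples, where band y holds the lines added
# during days [y * granularity, (y + 1) * granularity) and column x is a
# snapshot taken every `sampling` days. The resampling branch above first
# expands this to a day x day matrix, then re-bins the bands onto calendar
# periods (pandas offset aliases such as "A" or "M") so that the plot labels
# follow real dates.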


def load_ownership(header, sequence, contents, max_people):
    import pandas
    start, last, sampling, _ = header
    start = datetime.fromtimestamp(start)
    last = datetime.fromtimestamp(last)
    people = []
    for name in sequence:
        people.append(contents[name].sum(axis=1))
    people = numpy.array(people)
    date_range_sampling = pandas.date_range(
        start + timedelta(days=sampling), periods=people[0].shape[0],
        freq="%dD" % sampling)
    if people.shape[0] > max_people:
        order = numpy.argsort(-people.sum(axis=1))
        people = people[order[:max_people]]
        sequence = [sequence[i] for i in order[:max_people]]
        print("Warning: truncated to the %d most owning people" % max_people)
    for i, name in enumerate(sequence):
        if len(name) > 40:
            sequence[i] = name[:37] + "..."
    return sequence, people, date_range_sampling, last


def load_churn_matrix(people, matrix, max_people):
    matrix = matrix.astype(float)
    if matrix.shape[0] > max_people:
        order = numpy.argsort(-matrix[:, 0])
        matrix = matrix[order[:max_people]][:, [0, 1] + list(2 + order[:max_people])]
        people = [people[i] for i in order[:max_people]]
        print("Warning: truncated to the %d most productive people" % max_people)
    zeros = matrix[:, 0] == 0
    matrix[zeros, :] = 1
    matrix /= matrix[:, 0][:, None]
    matrix = -matrix[:, 1:]
    matrix[zeros, :] = 0
    for i, name in enumerate(people):
        if len(name) > 40:
            people[i] = name[:37] + "..."
    return people, matrix
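
# Shape notes for load_churn_matrix (a reading of the code above): column 0 is
# each developer's own line count, column 1 the "Unidentified" bucket, and the
# remaining columns count lines overwritten per other developer. Dividing by
# column 0 turns the counts into fractions of the developer's own output;
# rows with zero output are masked first to avoid division by zero.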


def apply_plot_style(figure, axes, legend, style, text_size, axes_size):
    if axes_size is None:
        axes_size = (12, 9)
    else:
        axes_size = tuple(float(p) for p in axes_size.split(","))
    figure.set_size_inches(*axes_size)
    for side in ("bottom", "top", "left", "right"):
        axes.spines[side].set_color(style)
    for axis in (axes.xaxis, axes.yaxis):
        axis.label.update(dict(fontsize=text_size, color=style))
    for axis in ("x", "y"):
        axes.tick_params(axis=axis, colors=style, labelsize=text_size)
    if legend is not None:
        frame = legend.get_frame()
        for setter in (frame.set_facecolor, frame.set_edgecolor):
            setter("black" if style == "white" else "white")
        for text in legend.get_texts():
            text.set_color(style)


def get_plot_path(base, name):
    root, ext = os.path.splitext(base)
    if not ext:
        ext = ".png"
    output = os.path.join(root, name + ext)
    os.makedirs(os.path.dirname(output), exist_ok=True)
    return output
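
# For example, get_plot_path("plots/out.png", "project") returns
# "plots/out/project.png" (creating "plots/out/" if needed); a base without an
# extension defaults to ".png".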


def deploy_plot(title, output, style):
    import matplotlib.pyplot as pyplot
    if not output:
        pyplot.gcf().canvas.set_window_title(title)
        pyplot.show()
    else:
        if title:
            pyplot.title(title, color=style)
        try:
            pyplot.tight_layout()
        except Exception:  # a bare except would also swallow KeyboardInterrupt
            print("Warning: failed to set the tight layout")
        pyplot.savefig(output, transparent=True)
    pyplot.clf()


def default_json(x):
    if hasattr(x, "tolist"):
        return x.tolist()
    if hasattr(x, "isoformat"):
        return x.isoformat()
    return x


def plot_burndown(args, target, name, matrix, date_range_sampling, labels, granularity,
                  sampling, resample):
    if args.output and args.output.endswith(".json"):
        data = locals().copy()
        del data["args"]
        data["type"] = "burndown"
        if args.mode == "project" and target == "project":
            output = args.output
        else:
            if target == "project":
                name = "project"
            output = get_plot_path(args.output, name)
        with open(output, "w") as fout:
            json.dump(data, fout, sort_keys=True, default=default_json)
        return
    import matplotlib
    if args.backend:
        matplotlib.use(args.backend)
    import matplotlib.pyplot as pyplot
    if args.relative:
        # Normalize before plotting - dividing after stackplot() would not
        # change the already drawn stack. The cast avoids in-place integer
        # division errors on unresampled matrices.
        matrix = matrix.astype(numpy.float32)
        for i in range(matrix.shape[1]):
            matrix[:, i] /= matrix[:, i].sum()
        legend_loc = 3
    else:
        legend_loc = 2
    pyplot.stackplot(date_range_sampling, matrix, labels=labels)
    if args.relative:
        pyplot.ylim(0, 1)
    legend = pyplot.legend(loc=legend_loc, fontsize=args.text_size)
    pyplot.ylabel("Lines of code")
    pyplot.xlabel("Time")
    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.style, args.text_size, args.size)
    pyplot.xlim(date_range_sampling[0], date_range_sampling[-1])
    locator = pyplot.gca().xaxis.get_major_locator()
    # set the optimal xticks locator
    if "M" not in resample:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
    locs = pyplot.gca().get_xticks().tolist()
    if len(locs) >= 16:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
        locs = pyplot.gca().get_xticks().tolist()
        if len(locs) >= 16:
            pyplot.gca().xaxis.set_major_locator(locator)
    if locs[0] < pyplot.xlim()[0]:
        del locs[0]
    endindex = -1
    if len(locs) >= 2 and \
            pyplot.xlim()[1] - locs[-1] > (locs[-1] - locs[-2]) / 2:
        locs.append(pyplot.xlim()[1])
        endindex = len(locs) - 1
    startindex = -1
    if len(locs) >= 2 and \
            locs[0] - pyplot.xlim()[0] > (locs[1] - locs[0]) / 2:
        locs.append(pyplot.xlim()[0])
        startindex = len(locs) - 1
    pyplot.gca().set_xticks(locs)
    # hacking time!
    labels = pyplot.gca().get_xticklabels()
    if startindex >= 0:
        labels[startindex].set_text(date_range_sampling[0].date())
        labels[startindex].set_text = lambda _: None
        labels[startindex].set_rotation(30)
        labels[startindex].set_ha("right")
    if endindex >= 0:
        labels[endindex].set_text(date_range_sampling[-1].date())
        labels[endindex].set_text = lambda _: None
        labels[endindex].set_rotation(30)
        labels[endindex].set_ha("right")
    title = "%s %d x %d (granularity %d, sampling %d)" % \
        ((name,) + matrix.shape + (granularity, sampling))
    output = args.output
    if output:
        if args.mode == "project" and target == "project":
            output = args.output
        else:
            if target == "project":
                name = "project"
            output = get_plot_path(args.output, name)
    deploy_plot(title, output, args.style)


def plot_many_burndown(args, target, header, parts):
    if not args.output:
        print("Warning: output not set, showing %d plots." % len(parts))
    itercnt = progress.bar(parts, expected_size=len(parts)) \
        if progress is not None else parts
    stdout = io.StringIO()
    for name, matrix in itercnt:
        backup = sys.stdout
        sys.stdout = stdout
        plot_burndown(args, target, *load_burndown(header, name, matrix, args.resample))
        sys.stdout = backup
    sys.stdout.write(stdout.getvalue())


def plot_churn_matrix(args, repo, people, matrix):
    if args.output and args.output.endswith(".json"):
        data = locals().copy()
        del data["args"]
        data["type"] = "churn_matrix"
        if args.mode == "all":
            output = get_plot_path(args.output, "matrix")
        else:
            output = args.output
        with open(output, "w") as fout:
            json.dump(data, fout, sort_keys=True, default=default_json)
        return
    import matplotlib
    if args.backend:
        matplotlib.use(args.backend)
    import matplotlib.pyplot as pyplot
    s = 4 + matrix.shape[1] * 0.3
    fig = pyplot.figure(figsize=(s, s))
    ax = fig.add_subplot(111)
    ax.xaxis.set_label_position("top")
    ax.matshow(matrix, cmap=pyplot.cm.OrRd)
    ax.set_xticks(numpy.arange(0, matrix.shape[1]))
    ax.set_yticks(numpy.arange(0, matrix.shape[0]))
    ax.set_xticklabels(["Unidentified"] + people, rotation=90, ha="center")
    ax.set_yticklabels(people, va="center")
    ax.set_xticks(numpy.arange(0.5, matrix.shape[1] + 0.5), minor=True)
    ax.set_yticks(numpy.arange(0.5, matrix.shape[0] + 0.5), minor=True)
    ax.grid(which="minor")
    apply_plot_style(fig, ax, None, args.style, args.text_size, args.size)
    if not args.output:
        pos1 = ax.get_position()
        pos2 = (pos1.x0 + 0.245, pos1.y0 - 0.1, pos1.width * 0.9, pos1.height * 0.9)
        ax.set_position(pos2)
    if args.mode == "all":
        output = get_plot_path(args.output, "matrix")
    else:
        output = args.output
    title = "%s %d developers overwrite" % (repo, matrix.shape[0])
    if args.output:
        # FIXME(vmarkovtsev): otherwise the title is screwed in savefig()
        title = ""
    deploy_plot(title, output, args.style)


def plot_ownership(args, repo, names, people, date_range, last):
    if args.output and args.output.endswith(".json"):
        data = locals().copy()
        del data["args"]
        data["type"] = "ownership"
        if args.mode == "all":
            output = get_plot_path(args.output, "people")
        else:
            output = args.output
        with open(output, "w") as fout:
            json.dump(data, fout, sort_keys=True, default=default_json)
        return
    import matplotlib
    if args.backend:
        matplotlib.use(args.backend)
    import matplotlib.pyplot as pyplot
    if args.relative:
        # Normalize before plotting - dividing after stackplot() would not
        # change the already drawn stack. The cast avoids in-place integer
        # division errors.
        people = people.astype(numpy.float32)
        for i in range(people.shape[1]):
            people[:, i] /= people[:, i].sum()
        legend_loc = 3
    else:
        legend_loc = 2
    pyplot.stackplot(date_range, people, labels=names)
    pyplot.xlim(date_range[0], last)
    if args.relative:
        pyplot.ylim(0, 1)
    legend = pyplot.legend(loc=legend_loc, fontsize=args.text_size)
    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.style, args.text_size, args.size)
    if args.mode == "all":
        output = get_plot_path(args.output, "people")
    else:
        output = args.output
    deploy_plot("%s code ownership through time" % repo, output, args.style)


def train_embeddings(index, matrix, tmpdir, shard_size=4096):
    try:
        from . import swivel
    except (SystemError, ImportError):
        import swivel
    import tensorflow as tf

    assert matrix.shape[0] == matrix.shape[1]
    assert len(index) <= matrix.shape[0]
    nshards = len(index) // shard_size
    if nshards * shard_size < len(index):
        nshards += 1
        shard_size = len(index) // nshards
        nshards = len(index) // shard_size
    remainder = len(index) - nshards * shard_size
    if remainder > 0:
        lengths = matrix.indptr[1:] - matrix.indptr[:-1]
        filtered = sorted(numpy.argsort(lengths)[remainder:])
    else:
        filtered = list(range(len(index)))
    if len(filtered) < matrix.shape[0]:
        print("Truncating the sparse matrix...")
        matrix = matrix[filtered, :][:, filtered]
    meta_index = []
    for i, j in enumerate(filtered):
        meta_index.append((index[j], matrix[i, i]))
    index = [mi[0] for mi in meta_index]
    with tempfile.TemporaryDirectory(prefix="hercules_labours_", dir=tmpdir or None) as tmproot:
        print("Writing Swivel metadata...")
        vocabulary = "\n".join(index)
        with open(os.path.join(tmproot, "row_vocab.txt"), "w") as out:
            out.write(vocabulary)
        with open(os.path.join(tmproot, "col_vocab.txt"), "w") as out:
            out.write(vocabulary)
        del vocabulary
        bool_sums = matrix.indptr[1:] - matrix.indptr[:-1]
        bool_sums_str = "\n".join(map(str, bool_sums.tolist()))
        with open(os.path.join(tmproot, "row_sums.txt"), "w") as out:
            out.write(bool_sums_str)
        with open(os.path.join(tmproot, "col_sums.txt"), "w") as out:
            out.write(bool_sums_str)
        del bool_sums_str
        reorder = numpy.argsort(-bool_sums)
        print("Writing Swivel shards...")
        for row in range(nshards):
            for col in range(nshards):
                def _int64s(xs):
                    return tf.train.Feature(
                        int64_list=tf.train.Int64List(value=list(xs)))

                def _floats(xs):
                    return tf.train.Feature(
                        float_list=tf.train.FloatList(value=list(xs)))

                indices_row = reorder[row::nshards]
                indices_col = reorder[col::nshards]
                shard = matrix[indices_row][:, indices_col].tocoo()
                example = tf.train.Example(features=tf.train.Features(feature={
                    "global_row": _int64s(indices_row),
                    "global_col": _int64s(indices_col),
                    "sparse_local_row": _int64s(shard.row),
                    "sparse_local_col": _int64s(shard.col),
                    "sparse_value": _floats(shard.data)}))
                with open(os.path.join(tmproot, "shard-%03d-%03d.pb" % (row, col)), "wb") as out:
                    out.write(example.SerializeToString())
        print("Training Swivel model...")
        swivel.FLAGS.submatrix_rows = shard_size
        swivel.FLAGS.submatrix_cols = shard_size
        if len(meta_index) < 10000:
            embedding_size = 50
            num_epochs = 200
        elif len(meta_index) < 100000:
            embedding_size = 100
            num_epochs = 250
        elif len(meta_index) < 500000:
            embedding_size = 200
            num_epochs = 300
        else:
            embedding_size = 300
            num_epochs = 200
        swivel.FLAGS.embedding_size = embedding_size
        swivel.FLAGS.input_base_path = tmproot
        swivel.FLAGS.output_base_path = tmproot
        swivel.FLAGS.loss_multiplier = 1.0 / shard_size
        swivel.FLAGS.num_epochs = num_epochs
        swivel.main(None)
        print("Reading Swivel embeddings...")
        embeddings = []
        with open(os.path.join(tmproot, "row_embedding.tsv")) as frow:
            with open(os.path.join(tmproot, "col_embedding.tsv")) as fcol:
                for i, (lrow, lcol) in enumerate(zip(frow, fcol)):
                    prow, pcol = (l.split("\t", 1) for l in (lrow, lcol))
                    assert prow[0] == pcol[0]
                    erow, ecol = \
                        (numpy.fromstring(p[1], dtype=numpy.float32, sep="\t")
                         for p in (prow, pcol))
                    embeddings.append((erow + ecol) / 2)
    return meta_index, embeddings
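
# Shard layout note (an interpretation of the loop above): rows and columns
# are sorted by decreasing occupancy and dealt out in strides, so shard
# (row, col) holds the intersection of every nshards-th row starting at `row`
# with every nshards-th column starting at `col`; each shard is serialized as
# one tf.train.Example for Swivel to consume.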


class CORSWebServer(object):
    def __init__(self):
        self.thread = threading.Thread(target=self.serve)
        self.server = None

    def serve(self):
        outer = self
        try:
            from http.server import HTTPServer, SimpleHTTPRequestHandler, test
        except ImportError:  # Python 2
            from BaseHTTPServer import HTTPServer, test
            from SimpleHTTPServer import SimpleHTTPRequestHandler

        class ClojureServer(HTTPServer):
            def __init__(self, *args, **kwargs):
                HTTPServer.__init__(self, *args, **kwargs)
                outer.server = self

        class CORSRequestHandler(SimpleHTTPRequestHandler):
            def end_headers(self):
                self.send_header("Access-Control-Allow-Origin", "*")
                SimpleHTTPRequestHandler.end_headers(self)

        test(CORSRequestHandler, ClojureServer)

    def start(self):
        self.thread.start()

    def stop(self):
        if self.running:
            self.server.shutdown()
            self.thread.join()

    @property
    def running(self):
        return self.server is not None


web_server = CORSWebServer()
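
# Why the CORS server exists (a reading of the code, not original commentary):
# write_embeddings() points projector.tensorflow.org at config and data files
# served from http://0.0.0.0:8000, and the hosted Projector page can only
# fetch them if the local server answers with Access-Control-Allow-Origin: *,
# which the handler above adds to every response.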


def write_embeddings(name, output, run_server, index, embeddings):
    print("Writing Tensorflow Projector files...")
    if not output:
        output = "couples_" + name
    if output.endswith(".json"):
        output = os.path.join(output[:-5], "couples")
        run_server = False
    metaf = "%s_%s_meta.tsv" % (output, name)
    with open(metaf, "w") as fout:
        fout.write("name\tcommits\n")
        for pair in index:
            fout.write("%s\t%s\n" % pair)
    print("Wrote", metaf)
    dataf = "%s_%s_data.tsv" % (output, name)
    with open(dataf, "w") as fout:
        for vec in embeddings:
            fout.write("\t".join(str(v) for v in vec))
            fout.write("\n")
    print("Wrote", dataf)
    jsonf = "%s_%s.json" % (output, name)
    with open(jsonf, "w") as fout:
        fout.write("""{
  "embeddings": [
    {
      "tensorName": "%s %s coupling",
      "tensorShape": [%s, %s],
      "tensorPath": "http://0.0.0.0:8000/%s",
      "metadataPath": "http://0.0.0.0:8000/%s"
    }
  ]
}
""" % (output, name, len(embeddings), len(embeddings[0]), dataf, metaf))
    print("Wrote %s" % jsonf)
    if run_server and not web_server.running:
        web_server.start()
    url = "http://projector.tensorflow.org/?config=http://0.0.0.0:8000/" + jsonf
    print(url)
    if run_server:
        os.system("xdg-open " + url)


def main():
    args = parse_args()
    reader = read_input(args)
    header = reader.get_header()
    name = reader.get_name()

    files_warning = "Files stats were not collected. Re-run hercules with -files."
    people_warning = "People stats were not collected. Re-run hercules with -people."
    couples_warning = "Coupling stats were not collected. Re-run hercules with -couples."

    def project_burndown():
        plot_burndown(args, "project",
                      *load_burndown(header, *reader.get_project_burndown(), args.resample))

    def files_burndown():
        try:
            plot_many_burndown(args, "file", header, reader.get_files_burndown())
        except KeyError:
            print(files_warning)

    def people_burndown():
        try:
            plot_many_burndown(args, "person", header, reader.get_people_burndown())
        except KeyError:
            print(people_warning)

    def churn_matrix():
        try:
            plot_churn_matrix(args, name, *load_churn_matrix(
                *reader.get_people_interaction(), args.max_people))
        except KeyError:
            print(people_warning)

    def ownership_burndown():
        try:
            plot_ownership(args, name, *load_ownership(
                header, *reader.get_ownership_burndown(), args.max_people))
        except KeyError:
            print(people_warning)

    def couples():
        try:
            write_embeddings("files", args.output, not args.disable_projector,
                             *train_embeddings(*reader.get_files_coocc(), args.couples_tmp_dir))
            write_embeddings("people", args.output, not args.disable_projector,
                             *train_embeddings(*reader.get_people_coocc(), args.couples_tmp_dir))
        except KeyError:
            print(couples_warning)

    if args.mode == "project":
        project_burndown()
    elif args.mode == "file":
        files_burndown()
    elif args.mode == "person":
        people_burndown()
    elif args.mode == "churn_matrix":
        churn_matrix()
    elif args.mode == "ownership":
        ownership_burndown()
    elif args.mode == "couples":
        couples()
    elif args.mode == "all":
        project_burndown()
        files_burndown()
        people_burndown()
        churn_matrix()
        ownership_burndown()
        couples()

    if web_server.running:
        print("Sleeping for 60 seconds, safe to Ctrl-C")
        try:
            time.sleep(60)
        except KeyboardInterrupt:
            pass
        web_server.stop()


if __name__ == "__main__":
    sys.exit(main())