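"""Plot the analysis results produced by hercules.

Reads a YAML dump (protobuf input is declared but not implemented yet) and
renders burndown charts, code ownership stacks, churn matrices and
Swivel-based coupling embeddings for the Tensorflow Projector.
"""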

import argparse
from datetime import datetime, timedelta
import io
import os
import re
import sys
import tempfile
import threading
import time
import warnings

try:
    from clint.textui import progress
except ImportError:
    print("Warning: clint is not installed, no fancy progressbars in the terminal for you.")
    progress = None
import numpy
import yaml

if sys.version_info[0] < 3:
    # OK, ancients, I will support Python 2, but you owe me a beer
    input = raw_input


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--output", default="",
                        help="Path to the output file/directory (empty for display).")
    parser.add_argument("-i", "--input", default="-",
                        help="Path to the input file (- for stdin).")
    parser.add_argument("-f", "--input-format", default="yaml", choices=["yaml", "pb"])
    parser.add_argument("--text-size", default=12, type=int,
                        help="Size of the labels and legend.")
    parser.add_argument("--backend", help="Matplotlib backend to use.")
    parser.add_argument("--style", choices=["black", "white"], default="black",
                        help="Plot's general color scheme.")
    parser.add_argument("--size", help="Axes' size in inches, for example \"12,9\"")
    parser.add_argument("--relative", action="store_true",
                        help="Occupy 100%% height for every measurement.")
    parser.add_argument("--couples-tmp-dir", help="Temporary directory to work with couples.")
    parser.add_argument("-m", "--mode",
                        choices=["project", "file", "person", "churn_matrix", "people",
                                 "couples", "all"],
                        default="project", help="What to plot.")
    parser.add_argument(
        "--resample", default="year",
        help="The way to resample the time series. Possible values are: "
             "\"month\", \"year\", \"no\", \"raw\" and pandas offset aliases ("
             "http://pandas.pydata.org/pandas-docs/stable/timeseries.html"
             "#offset-aliases).")
    parser.add_argument("--disable-projector", action="store_true",
                        help="Do not run Tensorflow Projector on couples.")
    parser.add_argument("--max-people", default=20, type=int,
                        help="Maximum number of developers in churn matrix and people plots.")
    args = parser.parse_args()
    return args
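
# Example invocation (hypothetical file name):
#   python labours.py -i hercules_output.yaml -m project -o project_burndown.png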


class Reader(object):
    def read(self, file):
        raise NotImplementedError

    def get_name(self):
        raise NotImplementedError

    def get_header(self):
        raise NotImplementedError

    def get_project_burndown(self):
        raise NotImplementedError

    def get_files_burndown(self):
        raise NotImplementedError

    def get_people_burndown(self):
        raise NotImplementedError

    def get_ownership_burndown(self):
        raise NotImplementedError

    def get_people_interaction(self):
        raise NotImplementedError

    def get_files_coocc(self):
        raise NotImplementedError

    def get_people_coocc(self):
        raise NotImplementedError


class YamlReader(Reader):
    def read(self, file):
        # Disable the non-printable character check: this regex never matches.
        yaml.reader.Reader.NON_PRINTABLE = re.compile(r"(?!x)x")
        try:
            loader = yaml.CLoader
        except AttributeError:
            print("Warning: failed to import yaml.CLoader, falling back to slow yaml.Loader")
            loader = yaml.Loader
        try:
            if file != "-":
                with open(file) as fin:
                    data = yaml.load(fin, Loader=loader)
            else:
                data = yaml.load(sys.stdin, Loader=loader)
        except (UnicodeEncodeError, yaml.reader.ReaderError) as e:
            print("\nInvalid unicode in the input: %s\nPlease filter it through "
                  "fix_yaml_unicode.py" % e)
            sys.exit(1)
        print("done")
        self.data = data

    def get_name(self):
        return next(iter(self.data["project"]))

    def get_header(self):
        header = self.data["burndown"]
        return header["begin"], header["end"], header["sampling"], header["granularity"]

    def get_project_burndown(self):
        name, matrix = next(iter(self.data["project"].items()))
        return name, self._parse_burndown_matrix(matrix).T

    def get_files_burndown(self):
        return [(p[0], self._parse_burndown_matrix(p[1]).T) for p in self.data["files"].items()]

    def get_people_burndown(self):
        return [(p[0], self._parse_burndown_matrix(p[1]).T) for p in self.data["people"].items()]

    def get_ownership_burndown(self):
        return self.data["people_sequence"], {p[0]: self._parse_burndown_matrix(p[1])
                                              for p in self.data["people"].items()}

    def get_people_interaction(self):
        return self.data["people_sequence"], \
            self._parse_burndown_matrix(self.data["people_interaction"])

    def get_files_coocc(self):
        coocc = self.data["files_coocc"]
        return coocc["index"], self._parse_coocc_matrix(coocc["matrix"])

    def get_people_coocc(self):
        coocc = self.data["people_coocc"]
        return coocc["index"], self._parse_coocc_matrix(coocc["matrix"])

    def _parse_burndown_matrix(self, matrix):
        return numpy.array([numpy.fromstring(line, dtype=int, sep=" ")
                            for line in matrix.split("\n")])

    def _parse_coocc_matrix(self, matrix):
        # Build a CSR matrix directly from rows of {column: value} mappings.
        from scipy.sparse import csr_matrix
        data = []
        indices = []
        indptr = [0]
        for row in matrix:
            for k, v in sorted(row.items()):
                data.append(v)
                indices.append(k)
            indptr.append(indptr[-1] + len(row))
        return csr_matrix((data, indices, indptr), shape=(len(matrix),) * 2)
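
# Note on _parse_coocc_matrix: the YAML "matrix" is a list of {column: count}
# mappings, one per row, e.g. [{0: 4, 2: 1}, {1: 3}], which becomes a square
# CSR matrix of order len(matrix).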


class ProtobufReader(Reader):
    def read(self, file):
        pass


READERS = {"yaml": YamlReader, "pb": ProtobufReader}


def read_input(args):
    sys.stdout.write("Reading the input... ")
    sys.stdout.flush()
    reader = READERS[args.input_format]()
    reader.read(args.input)
    return reader


def calculate_average_lifetime(matrix):
    lifetimes = numpy.zeros(matrix.shape[1] - 1)
    for band in matrix:
        start = 0
        for i, line in enumerate(band):
            if i == 0 or band[i - 1] == 0:
                start += 1
                continue
            lifetimes[i - start] = band[i - 1] - line
        # After the inner loop, account for the band's last surviving value.
        lifetimes[i - start] = band[i - 1]
    return (lifetimes.dot(numpy.arange(1, matrix.shape[1], 1))
            / (lifetimes.sum() * matrix.shape[1]))


def load_burndown(header, name, matrix, resample):
    import pandas

    start, last, sampling, granularity = header
    start = datetime.fromtimestamp(start)
    last = datetime.fromtimestamp(last)
    print(name, "lifetime index:", calculate_average_lifetime(matrix))
    finish = start + timedelta(days=matrix.shape[1] * sampling)
    if resample not in ("no", "raw"):
        # Interpolate the day x day matrix.
        # Each day brings equal weight in the granularity.
        # Sampling's interpolation is linear.
        daily_matrix = numpy.zeros(
            (matrix.shape[0] * granularity, matrix.shape[1] * sampling),
            dtype=numpy.float32)
        epsrange = numpy.arange(0, 1, 1.0 / sampling)
        for y in range(matrix.shape[0]):
            for x in range(matrix.shape[1]):
                previous = matrix[y, x - 1] if x > 0 else 0
                value = ((previous + (matrix[y, x] - previous) * epsrange)
                         / granularity)[numpy.newaxis, :]
                if (y + 1) * granularity <= x * sampling:
                    daily_matrix[y * granularity:(y + 1) * granularity,
                                 x * sampling:(x + 1) * sampling] = value
                elif y * granularity <= (x + 1) * sampling:
                    for suby in range(y * granularity, (y + 1) * granularity):
                        for subx in range(suby, (x + 1) * sampling):
                            daily_matrix[suby, subx] = matrix[y, x] / granularity
        daily_matrix[(last - start).days:] = 0
        # Resample the bands
        aliases = {
            "year": "A",
            "month": "M"
        }
        resample = aliases.get(resample, resample)
        periods = 0
        date_granularity_sampling = [start]
        while date_granularity_sampling[-1] < finish:
            periods += 1
            date_granularity_sampling = pandas.date_range(
                start, periods=periods, freq=resample)
        date_range_sampling = pandas.date_range(
            date_granularity_sampling[0],
            periods=(finish - date_granularity_sampling[0]).days,
            freq="1D")
        # Fill the new square matrix
        matrix = numpy.zeros(
            (len(date_granularity_sampling), len(date_range_sampling)),
            dtype=numpy.float32)
        for i, gdt in enumerate(date_granularity_sampling):
            istart = (date_granularity_sampling[i - 1] - start).days \
                if i > 0 else 0
            ifinish = (gdt - start).days
            for j, sdt in enumerate(date_range_sampling):
                if (sdt - start).days >= istart:
                    break
            matrix[i, j:] = \
                daily_matrix[istart:ifinish, (sdt - start).days:].sum(axis=0)
        # Hardcode some cases to improve labels' readability
        if resample in ("year", "A"):
            labels = [dt.year for dt in date_granularity_sampling]
        elif resample in ("month", "M"):
            labels = [dt.strftime("%Y %B") for dt in date_granularity_sampling]
        else:
            labels = [dt.date() for dt in date_granularity_sampling]
    else:
        labels = [
            "%s - %s" % ((start + timedelta(days=i * granularity)).date(),
                         (start + timedelta(days=(i + 1) * granularity)).date())
            for i in range(matrix.shape[0])]
        if len(labels) > 18:
            warnings.warn("Too many labels - consider resampling.")
        resample = "M"  # fake resampling type is checked while plotting
        date_range_sampling = pandas.date_range(
            start + timedelta(days=sampling), periods=matrix.shape[1],
            freq="%dD" % sampling)
    return name, matrix, date_range_sampling, labels, granularity, sampling, resample
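
# Example of load_burndown (hypothetical numbers): with sampling=30,
# granularity=30 and resample="year", a 12x24 input matrix is interpolated to
# daily resolution and re-binned into one row per calendar year, labeled by year.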


def load_people(header, sequence, contents):
    import pandas

    start, last, sampling, _ = header
    start = datetime.fromtimestamp(start)
    last = datetime.fromtimestamp(last)
    people = []
    for name in sequence:
        people.append(contents[name].sum(axis=1))
    people = numpy.array(people)
    date_range_sampling = pandas.date_range(
        start + timedelta(days=sampling), periods=people[0].shape[0],
        freq="%dD" % sampling)
    return sequence, people, date_range_sampling, last


def apply_plot_style(figure, axes, legend, style, text_size, axes_size):
    if axes_size is None:
        axes_size = (12, 9)
    else:
        axes_size = tuple(float(p) for p in axes_size.split(","))
    figure.set_size_inches(*axes_size)
    for side in ("bottom", "top", "left", "right"):
        axes.spines[side].set_color(style)
    for axis in (axes.xaxis, axes.yaxis):
        axis.label.update(dict(fontsize=text_size, color=style))
    for axis in ("x", "y"):
        axes.tick_params(axis=axis, colors=style, labelsize=text_size)
    if legend is not None:
        frame = legend.get_frame()
        for setter in (frame.set_facecolor, frame.set_edgecolor):
            setter("black" if style == "white" else "white")
        for text in legend.get_texts():
            text.set_color(style)


def get_plot_path(base, name):
    root, ext = os.path.splitext(base)
    if not ext:
        ext = ".png"
    output = os.path.join(root, name + ext)
    os.makedirs(os.path.dirname(output), exist_ok=True)
    return output
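
# For example, get_plot_path("charts/out.png", "project") returns
# "charts/out/project.png", creating the "charts/out" directory if needed.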


def deploy_plot(title, output, style):
    import matplotlib.pyplot as pyplot

    if not output:
        pyplot.gcf().canvas.set_window_title(title)
        pyplot.show()
    else:
        if title:
            pyplot.title(title, color=style)
        try:
            pyplot.tight_layout()
        except Exception:
            print("Warning: failed to set the tight layout")
        pyplot.savefig(output, transparent=True)
    pyplot.clf()


def plot_burndown(args, target, name, matrix, date_range_sampling, labels, granularity,
                  sampling, resample):
    import matplotlib
    if args.backend:
        matplotlib.use(args.backend)
    import matplotlib.pyplot as pyplot

    if args.relative:
        # Normalize every sample (column) to 100% before plotting.
        for i in range(matrix.shape[1]):
            matrix[:, i] /= matrix[:, i].sum()
        pyplot.ylim(0, 1)
        legend_loc = 3
    else:
        legend_loc = 2
    pyplot.stackplot(date_range_sampling, matrix, labels=labels)
    legend = pyplot.legend(loc=legend_loc, fontsize=args.text_size)
    pyplot.ylabel("Lines of code")
    pyplot.xlabel("Time")
    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.style, args.text_size, args.size)
    pyplot.xlim(date_range_sampling[0], date_range_sampling[-1])
    locator = pyplot.gca().xaxis.get_major_locator()
    # set the optimal xticks locator
    if "M" not in resample:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
    locs = pyplot.gca().get_xticks().tolist()
    if len(locs) >= 16:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
        locs = pyplot.gca().get_xticks().tolist()
        if len(locs) >= 16:
            pyplot.gca().xaxis.set_major_locator(locator)
    if locs[0] < pyplot.xlim()[0]:
        del locs[0]
    endindex = -1
    if len(locs) >= 2 and \
            pyplot.xlim()[1] - locs[-1] > (locs[-1] - locs[-2]) / 2:
        locs.append(pyplot.xlim()[1])
        endindex = len(locs) - 1
    startindex = -1
    if len(locs) >= 2 and \
            locs[0] - pyplot.xlim()[0] > (locs[1] - locs[0]) / 2:
        locs.append(pyplot.xlim()[0])
        startindex = len(locs) - 1
    pyplot.gca().set_xticks(locs)
    # hacking time!
    labels = pyplot.gca().get_xticklabels()
    if startindex >= 0:
        labels[startindex].set_text(date_range_sampling[0].date())
        labels[startindex].set_text = lambda _: None
        labels[startindex].set_rotation(30)
        labels[startindex].set_ha("right")
    if endindex >= 0:
        labels[endindex].set_text(date_range_sampling[-1].date())
        labels[endindex].set_text = lambda _: None
        labels[endindex].set_rotation(30)
        labels[endindex].set_ha("right")
    title = "%s %d x %d (granularity %d, sampling %d)" % \
        ((name,) + matrix.shape + (granularity, sampling))
    output = args.output
    if output:
        if args.mode == "project" and target == "project":
            output = args.output
        else:
            if target == "project":
                name = "project"
            output = get_plot_path(args.output, name)
    deploy_plot(title, output, args.style)


def plot_many(args, target, header, parts):
    if not args.output:
        print("Warning: output not set, showing %d plots." % len(parts))
    itercnt = progress.bar(parts, expected_size=len(parts)) \
        if progress is not None else parts
    stdout = io.StringIO()
    for name, matrix in itercnt:
        backup = sys.stdout
        sys.stdout = stdout
        plot_burndown(args, target, *load_burndown(header, name, matrix, args.resample))
        sys.stdout = backup
    sys.stdout.write(stdout.getvalue())


def plot_churn_matrix(args, repo, people, matrix):
    matrix = matrix.astype(float)
    if matrix.shape[0] > args.max_people:
        order = numpy.argsort(-matrix[:, 0])
        matrix = matrix[order[:args.max_people]][:, [0, 1] + list(2 + order[:args.max_people])]
        people = [people[i] for i in order[:args.max_people]]
        print("Warning: truncated people to most productive %d" % args.max_people)
    zeros = matrix[:, 0] == 0
    matrix[zeros, :] = 1
    matrix /= matrix[:, 0][:, None]
    matrix = -matrix[:, 1:]
    matrix[zeros, :] = 0
    for i, name in enumerate(people):
        if len(name) > 40:
            people[i] = name[:37] + "..."
    import matplotlib
    if args.backend:
        matplotlib.use(args.backend)
    import matplotlib.pyplot as pyplot

    s = 4 + matrix.shape[1] * 0.3
    fig = pyplot.figure(figsize=(s, s))
    ax = fig.add_subplot(111)
    ax.xaxis.set_label_position("top")
    ax.matshow(matrix, cmap=pyplot.cm.OrRd)
    ax.set_xticks(numpy.arange(0, matrix.shape[1]))
    ax.set_yticks(numpy.arange(0, matrix.shape[0]))
    ax.set_xticklabels(["Unidentified"] + people, rotation=90, ha="center")
    ax.set_yticklabels(people, va="center")
    ax.set_xticks(numpy.arange(0.5, matrix.shape[1] + 0.5), minor=True)
    ax.set_yticks(numpy.arange(0.5, matrix.shape[0] + 0.5), minor=True)
    ax.grid(which="minor")
    apply_plot_style(fig, ax, None, args.style, args.text_size, args.size)
    if not args.output:
        pos1 = ax.get_position()
        pos2 = (pos1.x0 + 0.245, pos1.y0 - 0.1, pos1.width * 0.9, pos1.height * 0.9)
        ax.set_position(pos2)
    if args.mode == "all":
        output = get_plot_path(args.output, "matrix")
    else:
        output = args.output
    title = "%s %d developers overwrite" % (repo, matrix.shape[0])
    if args.output:
        # FIXME(vmarkovtsev): otherwise the title is screwed in savefig()
        title = ""
    deploy_plot(title, output, args.style)
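
# Reading the interaction matrix above: column 0 appears to hold each
# developer's own line count (it drives both the "most productive" ordering
# and the normalization), while the remaining columns, labeled "Unidentified"
# plus the (possibly truncated) developers, are plotted relative to it.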


def plot_people(args, repo, names, people, date_range, last):
    import matplotlib
    if args.backend:
        matplotlib.use(args.backend)
    import matplotlib.pyplot as pyplot

    if people.shape[0] > args.max_people:
        order = numpy.argsort(-people.sum(axis=1))
        people = people[order[:args.max_people]]
        names = [names[i] for i in order[:args.max_people]]
        print("Warning: truncated people to most owning %d" % args.max_people)
    for i, name in enumerate(names):
        if len(name) > 40:
            names[i] = name[:37] + "..."
    if args.relative:
        # Normalize every sample (column) to 100% before plotting.
        for i in range(people.shape[1]):
            people[:, i] /= people[:, i].sum()
        pyplot.ylim(0, 1)
        legend_loc = 3
    else:
        legend_loc = 2
    pyplot.stackplot(date_range, people, labels=names)
    pyplot.xlim(date_range[0], last)
    legend = pyplot.legend(loc=legend_loc, fontsize=args.text_size)
    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.style, args.text_size, args.size)
    if args.mode == "all":
        output = get_plot_path(args.output, "people")
    else:
        output = args.output
    deploy_plot("%s code ownership through time" % repo, output, args.style)


def train_embeddings(index, matrix, tmpdir, shard_size=4096):
    try:
        from . import swivel
    except (SystemError, ImportError):
        import swivel
    import tensorflow as tf

    assert matrix.shape[0] == matrix.shape[1]
    assert len(index) <= matrix.shape[0]
    nshards = len(index) // shard_size
    if nshards * shard_size < len(index):
        nshards += 1
        shard_size = len(index) // nshards
        nshards = len(index) // shard_size
    remainder = len(index) - nshards * shard_size
    if remainder > 0:
        lengths = matrix.indptr[1:] - matrix.indptr[:-1]
        filtered = sorted(numpy.argsort(lengths)[remainder:])
    else:
        filtered = list(range(len(index)))
    if len(filtered) < matrix.shape[0]:
        print("Truncating the sparse matrix...")
        matrix = matrix[filtered, :][:, filtered]
    meta_index = []
    for i, j in enumerate(filtered):
        meta_index.append((index[j], matrix[i, i]))
    index = [mi[0] for mi in meta_index]
    with tempfile.TemporaryDirectory(prefix="hercules_labours_", dir=tmpdir or None) as tmproot:
        print("Writing Swivel metadata...")
        vocabulary = "\n".join(index)
        with open(os.path.join(tmproot, "row_vocab.txt"), "w") as out:
            out.write(vocabulary)
        with open(os.path.join(tmproot, "col_vocab.txt"), "w") as out:
            out.write(vocabulary)
        del vocabulary
        bool_sums = matrix.indptr[1:] - matrix.indptr[:-1]
        bool_sums_str = "\n".join(map(str, bool_sums.tolist()))
        with open(os.path.join(tmproot, "row_sums.txt"), "w") as out:
            out.write(bool_sums_str)
        with open(os.path.join(tmproot, "col_sums.txt"), "w") as out:
            out.write(bool_sums_str)
        del bool_sums_str
        reorder = numpy.argsort(-bool_sums)
        print("Writing Swivel shards...")
        for row in range(nshards):
            for col in range(nshards):
                def _int64s(xs):
                    return tf.train.Feature(
                        int64_list=tf.train.Int64List(value=list(xs)))

                def _floats(xs):
                    return tf.train.Feature(
                        float_list=tf.train.FloatList(value=list(xs)))

                indices_row = reorder[row::nshards]
                indices_col = reorder[col::nshards]
                shard = matrix[indices_row][:, indices_col].tocoo()
                example = tf.train.Example(features=tf.train.Features(feature={
                    "global_row": _int64s(indices_row),
                    "global_col": _int64s(indices_col),
                    "sparse_local_row": _int64s(shard.row),
                    "sparse_local_col": _int64s(shard.col),
                    "sparse_value": _floats(shard.data)}))
                with open(os.path.join(tmproot, "shard-%03d-%03d.pb" % (row, col)), "wb") as out:
                    out.write(example.SerializeToString())
        print("Training Swivel model...")
        swivel.FLAGS.submatrix_rows = shard_size
        swivel.FLAGS.submatrix_cols = shard_size
        if len(meta_index) < 10000:
            embedding_size = 50
            num_epochs = 200
        elif len(meta_index) < 100000:
            embedding_size = 100
            num_epochs = 250
        elif len(meta_index) < 500000:
            embedding_size = 200
            num_epochs = 300
        else:
            embedding_size = 300
            num_epochs = 200
        swivel.FLAGS.embedding_size = embedding_size
        swivel.FLAGS.input_base_path = tmproot
        swivel.FLAGS.output_base_path = tmproot
        swivel.FLAGS.loss_multiplier = 1.0 / shard_size
        swivel.FLAGS.num_epochs = num_epochs
        swivel.main(None)
        print("Reading Swivel embeddings...")
        embeddings = []
        with open(os.path.join(tmproot, "row_embedding.tsv")) as frow:
            with open(os.path.join(tmproot, "col_embedding.tsv")) as fcol:
                for i, (lrow, lcol) in enumerate(zip(frow, fcol)):
                    prow, pcol = (l.split("\t", 1) for l in (lrow, lcol))
                    assert prow[0] == pcol[0]
                    erow, ecol = \
                        (numpy.fromstring(p[1], dtype=numpy.float32, sep="\t")
                         for p in (prow, pcol))
                    embeddings.append((erow + ecol) / 2)
    return meta_index, embeddings
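
# The shards written above follow Swivel's input layout: the vocabulary is
# ordered by decreasing row sum and shard (row, col) takes every nshards-th
# index, so each submatrix is shard_size x shard_size after the truncation.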


class CORSWebServer(object):
    def __init__(self):
        self.thread = threading.Thread(target=self.serve)
        self.server = None

    def serve(self):
        outer = self
        try:
            from http.server import HTTPServer, SimpleHTTPRequestHandler, test
        except ImportError:  # Python 2
            from BaseHTTPServer import HTTPServer, test
            from SimpleHTTPServer import SimpleHTTPRequestHandler

        class ClojureServer(HTTPServer):
            def __init__(self, *args, **kwargs):
                HTTPServer.__init__(self, *args, **kwargs)
                outer.server = self

        class CORSRequestHandler(SimpleHTTPRequestHandler):
            def end_headers(self):
                self.send_header("Access-Control-Allow-Origin", "*")
                SimpleHTTPRequestHandler.end_headers(self)

        test(CORSRequestHandler, ClojureServer)

    def start(self):
        self.thread.start()

    def stop(self):
        if self.running:
            self.server.shutdown()
            self.thread.join()

    @property
    def running(self):
        return self.server is not None


web_server = CORSWebServer()


def write_embeddings(name, output, run_server, index, embeddings):
    print("Writing Tensorflow Projector files...")
    if not output:
        output = "couples_" + name
    metaf = "%s_%s_meta.tsv" % (output, name)
    with open(metaf, "w") as fout:
        fout.write("name\tcommits\n")
        for pair in index:
            fout.write("%s\t%s\n" % pair)
    print("Wrote", metaf)
    dataf = "%s_%s_data.tsv" % (output, name)
    with open(dataf, "w") as fout:
        for vec in embeddings:
            fout.write("\t".join(str(v) for v in vec))
            fout.write("\n")
    print("Wrote", dataf)
    jsonf = "%s_%s.json" % (output, name)
    with open(jsonf, "w") as fout:
        fout.write("""{
  "embeddings": [
    {
      "tensorName": "%s %s coupling",
      "tensorShape": [%s, %s],
      "tensorPath": "http://0.0.0.0:8000/%s",
      "metadataPath": "http://0.0.0.0:8000/%s"
    }
  ]
}
""" % (output, name, len(embeddings), len(embeddings[0]), dataf, metaf))
    print("Wrote", jsonf)
    if run_server and not web_server.running:
        web_server.start()
    url = "http://projector.tensorflow.org/?config=http://0.0.0.0:8000/" + jsonf
    print(url)
    if run_server:
        os.system("xdg-open " + url)


def main():
    args = parse_args()
    reader = read_input(args)
    header = reader.get_header()
    name = reader.get_name()

    files_warning = "Files stats were not collected. Re-run hercules with -files."
    people_warning = "People stats were not collected. Re-run hercules with -people."
    couples_warning = "Coupling stats were not collected. Re-run hercules with -couples."

    def project_burndown():
        plot_burndown(args, "project",
                      *load_burndown(header, *reader.get_project_burndown(), args.resample))

    def files_burndown():
        try:
            plot_many(args, "file", header, reader.get_files_burndown())
        except KeyError:
            print(files_warning)

    def people_burndown():
        try:
            plot_many(args, "person", header, reader.get_people_burndown())
        except KeyError:
            print(people_warning)

    def churn_matrix():
        try:
            plot_churn_matrix(args, name, *reader.get_people_interaction())
        except KeyError:
            print(people_warning)

    def ownership_burndown():
        try:
            plot_people(args, name, *load_people(header, *reader.get_ownership_burndown()))
        except KeyError:
            print(people_warning)

    def couples():
        try:
            write_embeddings("files", args.output, not args.disable_projector,
                             *train_embeddings(*reader.get_files_coocc(), args.couples_tmp_dir))
            write_embeddings("people", args.output, not args.disable_projector,
                             *train_embeddings(*reader.get_people_coocc(), args.couples_tmp_dir))
        except KeyError:
            print(couples_warning)

    if args.mode == "project":
        project_burndown()
    elif args.mode == "file":
        files_burndown()
    elif args.mode == "person":
        people_burndown()
    elif args.mode == "churn_matrix":
        churn_matrix()
    elif args.mode == "people":
        ownership_burndown()
    elif args.mode == "couples":
        couples()
    elif args.mode == "all":
        project_burndown()
        files_burndown()
        people_burndown()
        churn_matrix()
        ownership_burndown()
        couples()
    if web_server.running:
        print("Sleeping for 60 seconds, safe to Ctrl-C")
        try:
            time.sleep(60)
        except KeyboardInterrupt:
            pass
        web_server.stop()


if __name__ == "__main__":
    sys.exit(main())