#!/usr/bin/env python3
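"""Plot the analysis results produced by Hercules.

Reads a Hercules report in YAML or Protocol Buffers format from a file or
stdin and renders the requested plot: project/file/person burndown charts,
the developer churn matrix, code ownership, coupling embeddings (Tensorflow
Projector), structural hotness, sentiment, developer activity and run times.

Typical usage, assuming a Hercules report is piped in:

    hercules --burndown <repository> | python3 labours.py -m burndown-project
"""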
import argparse
from collections import defaultdict, namedtuple
from datetime import datetime, timedelta
from importlib import import_module
import io
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
import threading
import time
import warnings

try:
    from clint.textui import progress
except ImportError:
    print("Warning: clint is not installed, no fancy progressbars in the terminal for you.")
    progress = None
import numpy
import yaml

if sys.version_info[0] < 3:
    # OK, ancients, I will support Python 2, but you owe me a beer
    input = raw_input  # noqa: F821


def list_matplotlib_styles():
    script = "import sys; from matplotlib import pyplot; " \
             "sys.stdout.write(repr(pyplot.style.available))"
    styles = eval(subprocess.check_output([sys.executable, "-c", script]))
    styles.remove("classic")
    return ["default", "classic"] + styles


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--output", default="",
                        help="Path to the output file/directory (empty for display). "
                             "If the extension is JSON, the data is saved instead of "
                             "the real image.")
    parser.add_argument("-i", "--input", default="-",
                        help="Path to the input file (- for stdin).")
    parser.add_argument("-f", "--input-format", default="auto", choices=["yaml", "pb", "auto"])
    parser.add_argument("--font-size", default=12, type=int,
                        help="Size of the labels and legend.")
    parser.add_argument("--style", default="ggplot", choices=list_matplotlib_styles(),
                        help="Plot style to use.")
    parser.add_argument("--backend", help="Matplotlib backend to use.")
    parser.add_argument("--background", choices=["black", "white"], default="white",
                        help="Plot's general color scheme.")
    parser.add_argument("--size", help="Axes' size in inches, for example \"12,9\"")
    parser.add_argument("--relative", action="store_true",
                        help="Occupy 100%% height for every measurement.")
    parser.add_argument("--couples-tmp-dir", help="Temporary directory to work with couples.")
    parser.add_argument("-m", "--mode",
                        choices=["burndown-project", "burndown-file", "burndown-person",
                                 "churn-matrix", "ownership", "couples", "shotness", "sentiment",
                                 "devs", "all", "run-times"],
                        help="What to plot.")
    parser.add_argument(
        "--resample", default="year",
        help="The way to resample the time series. Possible values are: "
             "\"month\", \"year\", \"no\", \"raw\" and pandas offset aliases ("
             "http://pandas.pydata.org/pandas-docs/stable/timeseries.html"
             "#offset-aliases).")
    parser.add_argument("--disable-projector", action="store_true",
                        help="Do not run Tensorflow Projector on couples.")
    parser.add_argument("--max-people", default=20, type=int,
                        help="Maximum number of developers in churn matrix and people plots.")
    args = parser.parse_args()
    return args


class Reader(object):
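    """Base interface for the report readers: one getter per analysis,
    implemented by the format-specific subclasses below."""
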
    def read(self, file):
        raise NotImplementedError

    def get_name(self):
        raise NotImplementedError

    def get_header(self):
        raise NotImplementedError

    def get_burndown_parameters(self):
        raise NotImplementedError

    def get_project_burndown(self):
        raise NotImplementedError

    def get_files_burndown(self):
        raise NotImplementedError

    def get_people_burndown(self):
        raise NotImplementedError

    def get_ownership_burndown(self):
        raise NotImplementedError

    def get_people_interaction(self):
        raise NotImplementedError

    def get_files_coocc(self):
        raise NotImplementedError

    def get_people_coocc(self):
        raise NotImplementedError

    def get_shotness_coocc(self):
        raise NotImplementedError

    def get_shotness(self):
        raise NotImplementedError

    def get_sentiment(self):
        raise NotImplementedError

    def get_devs(self):
        raise NotImplementedError


class YamlReader(Reader):
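    """Read a Hercules report in YAML format.

    The expected top-level keys, inferred from the accessors below: "hercules"
    (repository name and begin/end timestamps) plus one section per analysis,
    e.g. "Burndown", "Couples", "Shotness", "Sentiment" and "Devs".
    """
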
    def read(self, file):
        yaml.reader.Reader.NON_PRINTABLE = re.compile(r"(?!x)x")
        try:
            loader = yaml.CLoader
        except AttributeError:
            print("Warning: failed to import yaml.CLoader, falling back to slow yaml.Loader")
            loader = yaml.Loader
        try:
            if file != "-":
                with open(file) as fin:
                    data = yaml.load(fin, Loader=loader)
            else:
                data = yaml.load(sys.stdin, Loader=loader)
        except (UnicodeEncodeError, yaml.reader.ReaderError) as e:
            print("\nInvalid unicode in the input: %s\nPlease filter it through "
                  "fix_yaml_unicode.py" % e)
            sys.exit(1)
        if data is None:
            print("\nNo data has been read - has Hercules crashed?")
            sys.exit(1)
        self.data = data

    def get_run_times(self):
        return {}

    def get_name(self):
        return self.data["hercules"]["repository"]

    def get_header(self):
        header = self.data["hercules"]
        return header["begin_unix_time"], header["end_unix_time"]

    def get_burndown_parameters(self):
        header = self.data["Burndown"]
        return header["sampling"], header["granularity"]

    def get_project_burndown(self):
        return self.data["hercules"]["repository"], \
            self._parse_burndown_matrix(self.data["Burndown"]["project"]).T

    def get_files_burndown(self):
        return [(p[0], self._parse_burndown_matrix(p[1]).T)
                for p in self.data["Burndown"]["files"].items()]

    def get_people_burndown(self):
        return [(p[0], self._parse_burndown_matrix(p[1]).T)
                for p in self.data["Burndown"]["people"].items()]

    def get_ownership_burndown(self):
        return self.data["Burndown"]["people_sequence"].copy(), \
            {p[0]: self._parse_burndown_matrix(p[1])
             for p in self.data["Burndown"]["people"].items()}

    def get_people_interaction(self):
        return self.data["Burndown"]["people_sequence"].copy(), \
            self._parse_burndown_matrix(self.data["Burndown"]["people_interaction"])

    def get_files_coocc(self):
        coocc = self.data["Couples"]["files_coocc"]
        return coocc["index"], self._parse_coocc_matrix(coocc["matrix"])

    def get_people_coocc(self):
        coocc = self.data["Couples"]["people_coocc"]
        return coocc["index"], self._parse_coocc_matrix(coocc["matrix"])

    def get_shotness_coocc(self):
        shotness = self.data["Shotness"]
        index = ["%s:%s" % (i["file"], i["name"]) for i in shotness]
        indptr = numpy.zeros(len(shotness) + 1, dtype=numpy.int64)
        indices = []
        data = []
        for i, record in enumerate(shotness):
            pairs = [(int(k), v) for k, v in record["counters"].items()]
            pairs.sort()
            indptr[i + 1] = indptr[i] + len(pairs)
            for k, v in pairs:
                indices.append(k)
                data.append(v)
        indices = numpy.array(indices, dtype=numpy.int32)
        data = numpy.array(data, dtype=numpy.int32)
        from scipy.sparse import csr_matrix
        return index, csr_matrix((data, indices, indptr), shape=(len(shotness),) * 2)

    def get_shotness(self):
        from munch import munchify
        obj = munchify(self.data["Shotness"])
        # turn strings into ints
        for item in obj:
            item.counters = {int(k): v for k, v in item.counters.items()}
        if len(obj) == 0:
            raise KeyError
        return obj

    def get_sentiment(self):
        from munch import munchify
        return munchify({int(key): {
            "Comments": vals[2].split("|"),
            "Commits": vals[1],
            "Value": float(vals[0])
        } for key, vals in self.data["Sentiment"].items()})

    def get_devs(self):
        people = self.data["Devs"]["people"]
        days = {int(d): {int(dev): DevDay(*(int(x) for x in day)) for dev, day in devs.items()}
                for d, devs in self.data["Devs"]["days"].items()}
        return days, people

    def _parse_burndown_matrix(self, matrix):
        return numpy.array([numpy.fromstring(line, dtype=int, sep=" ")
                            for line in matrix.split("\n")])

    def _parse_coocc_matrix(self, matrix):
        from scipy.sparse import csr_matrix
        data = []
        indices = []
        indptr = [0]
        for row in matrix:
            for k, v in sorted(row.items()):
                data.append(v)
                indices.append(k)
            indptr.append(indptr[-1] + len(row))
        return csr_matrix((data, indices, indptr), shape=(len(matrix),) * 2)


class ProtobufReader(Reader):
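    """Read a Hercules report in Protocol Buffers format.

    Requires the generated bindings in internal/pb/pb_pb2.py; the payload of
    every known analysis is decoded through the PB_MESSAGES registry below.
    """
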
    def read(self, file):
        try:
            from internal.pb.pb_pb2 import AnalysisResults
        except ImportError as e:
            print("\n\n>>> You need to generate internal/pb/pb_pb2.py - run \"make\"\n",
                  file=sys.stderr)
            raise e from None
        self.data = AnalysisResults()
        if file != "-":
            with open(file, "rb") as fin:
                self.data.ParseFromString(fin.read())
        else:
            self.data.ParseFromString(sys.stdin.buffer.read())
        self.contents = {}
        for key, val in self.data.contents.items():
            try:
                mod, name = PB_MESSAGES[key].rsplit(".", 1)
            except KeyError:
                sys.stderr.write("Warning: there is no registered PB decoder for %s\n" % key)
                continue
            cls = getattr(import_module(mod), name)
            self.contents[key] = msg = cls()
            msg.ParseFromString(val)

    def get_run_times(self):
        return {key: val for key, val in self.data.header.run_time_per_item.items()}

    def get_name(self):
        return self.data.header.repository

    def get_header(self):
        header = self.data.header
        return header.begin_unix_time, header.end_unix_time

    def get_burndown_parameters(self):
        burndown = self.contents["Burndown"]
        return burndown.sampling, burndown.granularity

    def get_project_burndown(self):
        return self._parse_burndown_matrix(self.contents["Burndown"].project)

    def get_files_burndown(self):
        return [self._parse_burndown_matrix(i) for i in self.contents["Burndown"].files]

    def get_people_burndown(self):
        return [self._parse_burndown_matrix(i) for i in self.contents["Burndown"].people]

    def get_ownership_burndown(self):
        people = self.get_people_burndown()
        return [p[0] for p in people], {p[0]: p[1].T for p in people}

    def get_people_interaction(self):
        burndown = self.contents["Burndown"]
        return [i.name for i in burndown.people], \
            self._parse_sparse_matrix(burndown.people_interaction).toarray()

    def get_files_coocc(self):
        node = self.contents["Couples"].file_couples
        return list(node.index), self._parse_sparse_matrix(node.matrix)

    def get_people_coocc(self):
        node = self.contents["Couples"].people_couples
        return list(node.index), self._parse_sparse_matrix(node.matrix)

    def get_shotness_coocc(self):
        shotness = self.get_shotness()
        index = ["%s:%s" % (i.file, i.name) for i in shotness]
        indptr = numpy.zeros(len(shotness) + 1, dtype=numpy.int32)
        indices = []
        data = []
        for i, record in enumerate(shotness):
            pairs = list(record.counters.items())
            pairs.sort()
            indptr[i + 1] = indptr[i] + len(pairs)
            for k, v in pairs:
                indices.append(k)
                data.append(v)
        indices = numpy.array(indices, dtype=numpy.int32)
        data = numpy.array(data, dtype=numpy.int32)
        from scipy.sparse import csr_matrix
        return index, csr_matrix((data, indices, indptr), shape=(len(shotness),) * 2)

    def get_shotness(self):
        records = self.contents["Shotness"].records
        if len(records) == 0:
            raise KeyError
        return records

    def get_sentiment(self):
        byday = self.contents["Sentiment"].SentimentByDay
        if len(byday) == 0:
            raise KeyError
        return byday

    def get_devs(self):
        people = list(self.contents["Devs"].dev_index)
        days = {d: {dev: DevDay(stats.commits, stats.added, stats.removed, stats.changed)
                    for dev, stats in day.devs.items()}
                for d, day in self.contents["Devs"].days.items()}
        return days, people

    def _parse_burndown_matrix(self, matrix):
        dense = numpy.zeros((matrix.number_of_rows, matrix.number_of_columns), dtype=int)
        for y, row in enumerate(matrix.rows):
            for x, col in enumerate(row.columns):
                dense[y, x] = col
        return matrix.name, dense.T

    def _parse_sparse_matrix(self, matrix):
        from scipy.sparse import csr_matrix
        return csr_matrix((list(matrix.data), list(matrix.indices), list(matrix.indptr)),
                          shape=(matrix.number_of_rows, matrix.number_of_columns))


READERS = {"yaml": YamlReader, "yml": YamlReader, "pb": ProtobufReader}
PB_MESSAGES = {
    "Burndown": "internal.pb.pb_pb2.BurndownAnalysisResults",
    "Couples": "internal.pb.pb_pb2.CouplesAnalysisResults",
    "Shotness": "internal.pb.pb_pb2.ShotnessAnalysisResults",
    "Devs": "internal.pb.pb_pb2.DevsAnalysisResults",
}


def read_input(args):
    sys.stdout.write("Reading the input... ")
    sys.stdout.flush()
    if args.input != "-":
        if args.input_format == "auto":
            args.input_format = args.input.rsplit(".", 1)[1]
    elif args.input_format == "auto":
        args.input_format = "yaml"
    reader = READERS[args.input_format]()
    reader.read(args.input)
    print("done")
    return reader


DevDay = namedtuple("DevDay", ("Commits", "Added", "Removed", "Changed"))
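# A DevDay holds one developer's stats for a single day: the number of commits
# and the numbers of lines added, removed and changed.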


def calculate_average_lifetime(matrix):
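    """Estimate how long the lines in a burndown matrix live before they die.

    `matrix` contains bands (rows) over samples (columns). The result is a
    weighted average age normalized by the number of samples - the "lifetime
    index" printed by load_burndown() - or NaN if there is nothing to measure.
    """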
    lifetimes = numpy.zeros(matrix.shape[1] - 1)
    for band in matrix:
        start = 0
        for i, line in enumerate(band):
            if i == 0 or band[i - 1] == 0:
                start += 1
                continue
            lifetimes[i - start] = band[i - 1] - line
        lifetimes[i - start] = band[i - 1]
    lsum = lifetimes.sum()
    if lsum != 0:
        return (lifetimes.dot(numpy.arange(1, matrix.shape[1], 1))
                / (lsum * matrix.shape[1]))
    return numpy.nan


def interpolate_burndown_matrix(matrix, granularity, sampling):
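    """Explode a burndown matrix into daily resolution.

    Bands (rows) are `granularity` days wide, samples (columns) are `sampling`
    days wide; the result has shape (bands * granularity, samples * sampling).
    Band growth and decay inside each cell are interpolated linearly by the
    nested grow() and decay() helpers.
    """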
    daily = numpy.zeros(
        (matrix.shape[0] * granularity, matrix.shape[1] * sampling),
        dtype=numpy.float32)
    """
    ----------> samples, x
    |
    |
    |
    bands, y
    """
    for y in range(matrix.shape[0]):
        for x in range(matrix.shape[1]):
            if y * granularity > (x + 1) * sampling:
                # the future is zeros
                continue

            def decay(start_index: int, start_val: float):
                if start_val == 0:
                    return
                k = matrix[y][x] / start_val  # <= 1
                scale = (x + 1) * sampling - start_index
                for i in range(y * granularity, (y + 1) * granularity):
                    initial = daily[i][start_index - 1]
                    for j in range(start_index, (x + 1) * sampling):
                        daily[i][j] = initial * (
                            1 + (k - 1) * (j - start_index + 1) / scale)

            def grow(finish_index: int, finish_val: float):
                initial = matrix[y][x - 1] if x > 0 else 0
                start_index = x * sampling
                if start_index < y * granularity:
                    start_index = y * granularity
                if finish_index == start_index:
                    return
                avg = (finish_val - initial) / (finish_index - start_index)
                for j in range(x * sampling, finish_index):
                    for i in range(start_index, j + 1):
                        daily[i][j] = avg
                # copy [x*g..y*s)
                for j in range(x * sampling, finish_index):
                    for i in range(y * granularity, x * sampling):
                        daily[i][j] = daily[i][j - 1]

            if (y + 1) * granularity >= (x + 1) * sampling:
                # x*granularity <= (y+1)*sampling
                # 1. x*granularity <= y*sampling
                #    y*sampling..(y+1)sampling
                #
                #       x+1
                #        /
                #       /
                #      / y+1  -|
                #     /        |
                #    / y      -|
                #   /
                #  / x
                #
                # 2. x*granularity > y*sampling
                #    x*granularity..(y+1)sampling
                #
                #       x+1
                #        /
                #       /
                #      / y+1  -|
                #     /        |
                #    / x      -|
                #   /
                #  / y
                if y * granularity <= x * sampling:
                    grow((x + 1) * sampling, matrix[y][x])
                elif (x + 1) * sampling > y * granularity:
                    grow((x + 1) * sampling, matrix[y][x])
                    avg = matrix[y][x] / ((x + 1) * sampling - y * granularity)
                    for j in range(y * granularity, (x + 1) * sampling):
                        for i in range(y * granularity, j + 1):
                            daily[i][j] = avg
            elif (y + 1) * granularity >= x * sampling:
                # y*sampling <= (x+1)*granularity < (y+1)sampling
                # y*sampling..(x+1)*granularity
                # (x+1)*granularity..(y+1)sampling
                #        x+1
                #         /\
                #        /  \
                #       /    \
                #      /    y+1
                #     /
                #    y
                v1 = matrix[y][x - 1]
                v2 = matrix[y][x]
                delta = (y + 1) * granularity - x * sampling
                previous = 0
                if x > 0 and (x - 1) * sampling >= y * granularity:
                    # x*g <= (y-1)*s <= y*s <= (x+1)*g <= (y+1)*s
                    #           |________|.......^
                    if x > 1:
                        previous = matrix[y][x - 2]
                    scale = sampling
                else:
                    # (y-1)*s < x*g <= y*s <= (x+1)*g <= (y+1)*s
                    #            |______|.......^
                    scale = sampling if x == 0 else x * sampling - y * granularity
                peak = v1 + (v1 - previous) / scale * delta
                if v2 > peak:
                    # we need to adjust the peak, it may not be less than the decayed value
                    if x < matrix.shape[1] - 1:
                        # y*s <= (x+1)*g <= (y+1)*s < (y+2)*s
                        #           ^.........|_________|
                        k = (v2 - matrix[y][x + 1]) / sampling  # > 0
                        peak = matrix[y][x] + k * ((x + 1) * sampling - (y + 1) * granularity)
                        # peak > v2 > v1
                    else:
                        peak = v2
                        # not enough data to interpolate; this is at least not restricted
                grow((y + 1) * granularity, peak)
                decay((y + 1) * granularity, peak)
            else:
                # (x+1)*granularity < y*sampling
                # y*sampling..(y+1)sampling
                decay(x * sampling, matrix[y][x - 1])
    return daily


def load_burndown(header, name, matrix, resample):
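    """Prepare a burndown matrix for plotting.

    `header` carries (start, last, sampling, granularity) as assembled in
    main(). Unless `resample` is "no" or "raw", the matrix is interpolated to
    daily resolution and re-aggregated to the requested period ("year",
    "month" or a pandas offset alias). Returns the arguments expected by
    plot_burndown().
    """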
    import pandas

    start, last, sampling, granularity = header
    assert sampling > 0
    assert granularity >= sampling
    start = datetime.fromtimestamp(start)
    last = datetime.fromtimestamp(last)
    print(name, "lifetime index:", calculate_average_lifetime(matrix))
    finish = start + timedelta(days=matrix.shape[1] * sampling)
    if resample not in ("no", "raw"):
        print("resampling to %s, please wait..." % resample)
        # Interpolate the day x day matrix.
        # Each day brings equal weight in the granularity.
        # Sampling's interpolation is linear.
        daily = interpolate_burndown_matrix(matrix, granularity, sampling)
        daily[(last - start).days:] = 0
        # Resample the bands
        aliases = {
            "year": "A",
            "month": "M"
        }
        resample = aliases.get(resample, resample)
        periods = 0
        date_granularity_sampling = [start]
        while date_granularity_sampling[-1] < finish:
            periods += 1
            date_granularity_sampling = pandas.date_range(
                start, periods=periods, freq=resample)
        date_range_sampling = pandas.date_range(
            date_granularity_sampling[0],
            periods=(finish - date_granularity_sampling[0]).days,
            freq="1D")
        # Fill the new square matrix
        matrix = numpy.zeros(
            (len(date_granularity_sampling), len(date_range_sampling)),
            dtype=numpy.float32)
        for i, gdt in enumerate(date_granularity_sampling):
            istart = (date_granularity_sampling[i - 1] - start).days \
                if i > 0 else 0
            ifinish = (gdt - start).days
            for j, sdt in enumerate(date_range_sampling):
                if (sdt - start).days >= istart:
                    break
            matrix[i, j:] = \
                daily[istart:ifinish, (sdt - start).days:].sum(axis=0)
        # Hardcode some cases to improve labels' readability
        if resample in ("year", "A"):
            labels = [dt.year for dt in date_granularity_sampling]
        elif resample in ("month", "M"):
            labels = [dt.strftime("%Y %B") for dt in date_granularity_sampling]
        else:
            labels = [dt.date() for dt in date_granularity_sampling]
    else:
        labels = [
            "%s - %s" % ((start + timedelta(days=i * granularity)).date(),
                         (start + timedelta(days=(i + 1) * granularity)).date())
            for i in range(matrix.shape[0])]
        if len(labels) > 18:
            warnings.warn("Too many labels - consider resampling.")
        resample = "M"  # fake resampling type is checked while plotting
        date_range_sampling = pandas.date_range(
            start + timedelta(days=sampling), periods=matrix.shape[1],
            freq="%dD" % sampling)
    return name, matrix, date_range_sampling, labels, granularity, sampling, resample


def load_ownership(header, sequence, contents, max_people):
    import pandas

    start, last, sampling, _ = header
    start = datetime.fromtimestamp(start)
    last = datetime.fromtimestamp(last)
    people = []
    for name in sequence:
        people.append(contents[name].sum(axis=1))
    people = numpy.array(people)
    date_range_sampling = pandas.date_range(
        start + timedelta(days=sampling), periods=people[0].shape[0],
        freq="%dD" % sampling)
    if people.shape[0] > max_people:
        order = numpy.argsort(-people.sum(axis=1))
        people = people[order[:max_people]]
        sequence = [sequence[i] for i in order[:max_people]]
        print("Warning: truncated the people list to the %d biggest owners" % max_people)
    for i, name in enumerate(sequence):
        if len(name) > 40:
            sequence[i] = name[:37] + "..."
    return sequence, people, date_range_sampling, last


def load_churn_matrix(people, matrix, max_people):
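    """Normalize the people interaction matrix for the churn plot.

    Column 0 appears to hold the number of lines each developer wrote and the
    remaining columns how many of those lines were overwritten by
    "Unidentified" and by every other developer; each row is divided by its
    column 0, so the plot shows overwrite ratios.
    """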
    matrix = matrix.astype(float)
    if matrix.shape[0] > max_people:
        order = numpy.argsort(-matrix[:, 0])
        matrix = matrix[order[:max_people]][:, [0, 1] + list(2 + order[:max_people])]
        people = [people[i] for i in order[:max_people]]
        print("Warning: truncated the people list to the %d most productive" % max_people)
    zeros = matrix[:, 0] == 0
    matrix[zeros, :] = 1
    matrix /= matrix[:, 0][:, None]
    matrix = -matrix[:, 1:]
    matrix[zeros, :] = 0
    for i, name in enumerate(people):
        if len(name) > 40:
            people[i] = name[:37] + "..."
    return people, matrix


def import_pyplot(backend, style):
    import matplotlib
    if backend:
        matplotlib.use(backend)
    from matplotlib import pyplot
    pyplot.style.use(style)
    return matplotlib, pyplot


def apply_plot_style(figure, axes, legend, background, font_size, axes_size):
    foreground = "black" if background == "white" else "white"
    if axes_size is None:
        axes_size = (12, 9)
    else:
        axes_size = tuple(float(p) for p in axes_size.split(","))
    figure.set_size_inches(*axes_size)
    for side in ("bottom", "top", "left", "right"):
        axes.spines[side].set_color(foreground)
    for axis in (axes.xaxis, axes.yaxis):
        axis.label.update(dict(fontsize=font_size, color=foreground))
    for axis in ("x", "y"):
        getattr(axes, axis + "axis").get_offset_text().set_size(font_size)
        axes.tick_params(axis=axis, colors=foreground, labelsize=font_size)
    try:
        axes.ticklabel_format(axis="y", style="sci", scilimits=(0, 3))
    except AttributeError:
        pass
    figure.patch.set_facecolor(background)
    axes.set_facecolor(background)
    if legend is not None:
        frame = legend.get_frame()
        for setter in (frame.set_facecolor, frame.set_edgecolor):
            setter(background)
        for text in legend.get_texts():
            text.set_color(foreground)


def get_plot_path(base, name):
    root, ext = os.path.splitext(base)
    if not ext:
        ext = ".png"
    output = os.path.join(root, name + ext)
    os.makedirs(os.path.dirname(output), exist_ok=True)
    return output


def deploy_plot(title, output, background):
    import matplotlib.pyplot as pyplot

    if not output:
        pyplot.gcf().canvas.set_window_title(title)
        pyplot.show()
    else:
        if title:
            pyplot.title(title, color="black" if background == "white" else "white")
        try:
            pyplot.tight_layout()
        except:  # noqa: E722
            print("Warning: failed to set the tight layout")
        pyplot.savefig(output, transparent=True)
    pyplot.clf()


def default_json(x):
    if hasattr(x, "tolist"):
        return x.tolist()
    if hasattr(x, "isoformat"):
        return x.isoformat()
    return x


def plot_burndown(args, target, name, matrix, date_range_sampling, labels, granularity,
                  sampling, resample):
    if args.output and args.output.endswith(".json"):
        data = locals().copy()
        del data["args"]
        data["type"] = "burndown"
        if args.mode == "project" and target == "project":
            output = args.output
        else:
            if target == "project":
                name = "project"
            output = get_plot_path(args.output, name)
        with open(output, "w") as fout:
            json.dump(data, fout, sort_keys=True, default=default_json)
        return

    matplotlib, pyplot = import_pyplot(args.backend, args.style)

    if args.relative:
        # normalize each sample to 100% before the stacks are drawn
        for i in range(matrix.shape[1]):
            matrix[:, i] /= matrix[:, i].sum()
        pyplot.ylim(0, 1)
        legend_loc = 3
    else:
        legend_loc = 2
    pyplot.stackplot(date_range_sampling, matrix, labels=labels)
    legend = pyplot.legend(loc=legend_loc, fontsize=args.font_size)
    pyplot.ylabel("Lines of code")
    pyplot.xlabel("Time")
    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.background,
                     args.font_size, args.size)
    pyplot.xlim(date_range_sampling[0], date_range_sampling[-1])
    locator = pyplot.gca().xaxis.get_major_locator()
    # set the optimal xticks locator
    if "M" not in resample:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
    locs = pyplot.gca().get_xticks().tolist()
    if len(locs) >= 16:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
        locs = pyplot.gca().get_xticks().tolist()
        if len(locs) >= 16:
            pyplot.gca().xaxis.set_major_locator(locator)
    if locs[0] < pyplot.xlim()[0]:
        del locs[0]
    endindex = -1
    if len(locs) >= 2 and pyplot.xlim()[1] - locs[-1] > (locs[-1] - locs[-2]) / 2:
        locs.append(pyplot.xlim()[1])
        endindex = len(locs) - 1
    startindex = -1
    if len(locs) >= 2 and locs[0] - pyplot.xlim()[0] > (locs[1] - locs[0]) / 2:
        locs.append(pyplot.xlim()[0])
        startindex = len(locs) - 1
    pyplot.gca().set_xticks(locs)
    # hacking time!
    labels = pyplot.gca().get_xticklabels()
    if startindex >= 0:
        labels[startindex].set_text(date_range_sampling[0].date())
        labels[startindex].set_text = lambda _: None
        labels[startindex].set_rotation(30)
        labels[startindex].set_ha("right")
    if endindex >= 0:
        labels[endindex].set_text(date_range_sampling[-1].date())
        labels[endindex].set_text = lambda _: None
        labels[endindex].set_rotation(30)
        labels[endindex].set_ha("right")
    title = "%s %d x %d (granularity %d, sampling %d)" % \
        ((name,) + matrix.shape + (granularity, sampling))
    output = args.output
    if output:
        if args.mode == "project" and target == "project":
            output = args.output
        else:
            if target == "project":
                name = "project"
            output = get_plot_path(args.output, name)
    deploy_plot(title, output, args.background)


def plot_many_burndown(args, target, header, parts):
    if not args.output:
        print("Warning: output not set, showing %d plots." % len(parts))
    itercnt = progress.bar(parts, expected_size=len(parts)) \
        if progress is not None else parts
    stdout = io.StringIO()
    for name, matrix in itercnt:
        backup = sys.stdout
        sys.stdout = stdout
        plot_burndown(args, target, *load_burndown(header, name, matrix, args.resample))
        sys.stdout = backup
    sys.stdout.write(stdout.getvalue())


def plot_churn_matrix(args, repo, people, matrix):
    if args.output and args.output.endswith(".json"):
        data = locals().copy()
        del data["args"]
        data["type"] = "churn_matrix"
        if args.mode == "all":
            output = get_plot_path(args.output, "matrix")
        else:
            output = args.output
        with open(output, "w") as fout:
            json.dump(data, fout, sort_keys=True, default=default_json)
        return

    matplotlib, pyplot = import_pyplot(args.backend, args.style)

    s = 4 + matrix.shape[1] * 0.3
    fig = pyplot.figure(figsize=(s, s))
    ax = fig.add_subplot(111)
    ax.xaxis.set_label_position("top")
    ax.matshow(matrix, cmap=pyplot.cm.OrRd)
    ax.set_xticks(numpy.arange(0, matrix.shape[1]))
    ax.set_yticks(numpy.arange(0, matrix.shape[0]))
    ax.set_yticklabels(people, va="center")
    ax.set_xticks(numpy.arange(0.5, matrix.shape[1] + 0.5), minor=True)
    ax.set_xticklabels(["Unidentified"] + people, rotation=45, ha="left",
                       va="bottom", rotation_mode="anchor")
    ax.set_yticks(numpy.arange(0.5, matrix.shape[0] + 0.5), minor=True)
    ax.grid(which="minor")
    apply_plot_style(fig, ax, None, args.background, args.font_size, args.size)
    if not args.output:
        pos1 = ax.get_position()
        pos2 = (pos1.x0 + 0.15, pos1.y0 - 0.1, pos1.width * 0.9, pos1.height * 0.9)
        ax.set_position(pos2)
    if args.mode == "all":
        output = get_plot_path(args.output, "matrix")
    else:
        output = args.output
    title = "%s %d developers overwrite" % (repo, matrix.shape[0])
    if args.output:
        # FIXME(vmarkovtsev): otherwise the title is screwed in savefig()
        title = ""
    deploy_plot(title, output, args.background)


def plot_ownership(args, repo, names, people, date_range, last):
    if args.output and args.output.endswith(".json"):
        data = locals().copy()
        del data["args"]
        data["type"] = "ownership"
        if args.mode == "all":
            output = get_plot_path(args.output, "people")
        else:
            output = args.output
        with open(output, "w") as fout:
            json.dump(data, fout, sort_keys=True, default=default_json)
        return

    matplotlib, pyplot = import_pyplot(args.backend, args.style)

    if args.relative:
        # normalize each sample to 100% before the stacks are drawn
        for i in range(people.shape[1]):
            people[:, i] /= people[:, i].sum()
        pyplot.ylim(0, 1)
        legend_loc = 3
    else:
        legend_loc = 2
    pyplot.stackplot(date_range, people, labels=names)
    pyplot.xlim(date_range[0], last)
    legend = pyplot.legend(loc=legend_loc, fontsize=args.font_size)
    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.background,
                     args.font_size, args.size)
    if args.mode == "all":
        output = get_plot_path(args.output, "people")
    else:
        output = args.output
    deploy_plot("%s code ownership through time" % repo, output, args.background)


IDEAL_SHARD_SIZE = 4096


def train_embeddings(index, matrix, tmpdir, shard_size=IDEAL_SHARD_SIZE):
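    """Train Swivel embeddings on a co-occurrence matrix.

    Clips outliers at the 99th percentile, splits the matrix into roughly
    equal shards, serializes them as tf.train.Example protobufs and runs the
    bundled Swivel trainer on them. Returns (meta_index, embeddings), where
    meta_index pairs every item with its diagonal co-occurrence count.
    """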
    try:
        from . import swivel
    except (SystemError, ImportError):
        import swivel
    import tensorflow as tf

    assert matrix.shape[0] == matrix.shape[1]
    assert len(index) <= matrix.shape[0]
    outlier_threshold = numpy.percentile(matrix.data, 99)
    matrix.data[matrix.data > outlier_threshold] = outlier_threshold
    nshards = len(index) // shard_size
    if nshards * shard_size < len(index):
        nshards += 1
        shard_size = len(index) // nshards
        nshards = len(index) // shard_size
    remainder = len(index) - nshards * shard_size
    if remainder > 0:
        lengths = matrix.indptr[1:] - matrix.indptr[:-1]
        filtered = sorted(numpy.argsort(lengths)[remainder:])
    else:
        filtered = list(range(len(index)))
    if len(filtered) < matrix.shape[0]:
        print("Truncating the sparse matrix...")
        matrix = matrix[filtered, :][:, filtered]
    meta_index = []
    for i, j in enumerate(filtered):
        meta_index.append((index[j], matrix[i, i]))
    index = [mi[0] for mi in meta_index]
    with tempfile.TemporaryDirectory(prefix="hercules_labours_", dir=tmpdir or None) as tmproot:
        print("Writing Swivel metadata...")
        vocabulary = "\n".join(index)
        with open(os.path.join(tmproot, "row_vocab.txt"), "w") as out:
            out.write(vocabulary)
        with open(os.path.join(tmproot, "col_vocab.txt"), "w") as out:
            out.write(vocabulary)
        del vocabulary
        bool_sums = matrix.indptr[1:] - matrix.indptr[:-1]
        bool_sums_str = "\n".join(map(str, bool_sums.tolist()))
        with open(os.path.join(tmproot, "row_sums.txt"), "w") as out:
            out.write(bool_sums_str)
        with open(os.path.join(tmproot, "col_sums.txt"), "w") as out:
            out.write(bool_sums_str)
        del bool_sums_str
        reorder = numpy.argsort(-bool_sums)

        print("Writing Swivel shards...")
        for row in range(nshards):
            for col in range(nshards):
                def _int64s(xs):
                    return tf.train.Feature(
                        int64_list=tf.train.Int64List(value=list(xs)))

                def _floats(xs):
                    return tf.train.Feature(
                        float_list=tf.train.FloatList(value=list(xs)))

                indices_row = reorder[row::nshards]
                indices_col = reorder[col::nshards]
                shard = matrix[indices_row][:, indices_col].tocoo()

                example = tf.train.Example(features=tf.train.Features(feature={
                    "global_row": _int64s(indices_row),
                    "global_col": _int64s(indices_col),
                    "sparse_local_row": _int64s(shard.row),
                    "sparse_local_col": _int64s(shard.col),
                    "sparse_value": _floats(shard.data)}))

                with open(os.path.join(tmproot, "shard-%03d-%03d.pb" % (row, col)), "wb") as out:
                    out.write(example.SerializeToString())

        print("Training Swivel model...")
        swivel.FLAGS.submatrix_rows = shard_size
        swivel.FLAGS.submatrix_cols = shard_size
        if len(meta_index) <= IDEAL_SHARD_SIZE / 16:
            embedding_size = 50
            num_epochs = 100000
        elif len(meta_index) <= IDEAL_SHARD_SIZE:
            embedding_size = 50
            num_epochs = 50000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 2:
            embedding_size = 60
            num_epochs = 10000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 4:
            embedding_size = 70
            num_epochs = 8000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 10:
            embedding_size = 80
            num_epochs = 5000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 25:
            embedding_size = 100
            num_epochs = 1000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 100:
            embedding_size = 200
            num_epochs = 600
        else:
            embedding_size = 300
            num_epochs = 300
        if os.getenv("CI"):
            # Travis, AppVeyor etc. during the integration tests
            num_epochs /= 10
        swivel.FLAGS.embedding_size = embedding_size
        swivel.FLAGS.input_base_path = tmproot
        swivel.FLAGS.output_base_path = tmproot
        swivel.FLAGS.loss_multiplier = 1.0 / shard_size
        swivel.FLAGS.num_epochs = num_epochs
        # Tensorflow 1.5 parses sys.argv unconditionally *applause*
        argv_backup = sys.argv[1:]
        del sys.argv[1:]
        swivel.main(None)
        sys.argv.extend(argv_backup)

        print("Reading Swivel embeddings...")
        embeddings = []
        with open(os.path.join(tmproot, "row_embedding.tsv")) as frow:
            with open(os.path.join(tmproot, "col_embedding.tsv")) as fcol:
                for i, (lrow, lcol) in enumerate(zip(frow, fcol)):
                    prow, pcol = (l.split("\t", 1) for l in (lrow, lcol))
                    assert prow[0] == pcol[0]
                    erow, ecol = \
                        (numpy.fromstring(p[1], dtype=numpy.float32, sep="\t")
                         for p in (prow, pcol))
                    embeddings.append((erow + ecol) / 2)
    return meta_index, embeddings


class CORSWebServer(object):
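    """Serve the current directory over HTTP in a background thread, with CORS.

    Tensorflow Projector fetches the embedding files cross-origin, so the
    handler adds Access-Control-Allow-Origin: * to every response.
    """
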
    def __init__(self):
        self.thread = threading.Thread(target=self.serve)
        self.server = None

    def serve(self):
        outer = self

        try:
            from http.server import HTTPServer, SimpleHTTPRequestHandler, test
        except ImportError:  # Python 2
            from BaseHTTPServer import HTTPServer, test
            from SimpleHTTPServer import SimpleHTTPRequestHandler

        class ClojureServer(HTTPServer):
            def __init__(self, *args, **kwargs):
                HTTPServer.__init__(self, *args, **kwargs)
                outer.server = self

        class CORSRequestHandler(SimpleHTTPRequestHandler):
            def end_headers(self):
                self.send_header("Access-Control-Allow-Origin", "*")
                SimpleHTTPRequestHandler.end_headers(self)

        test(CORSRequestHandler, ClojureServer)

    def start(self):
        self.thread.start()

    def stop(self):
        if self.running:
            self.server.shutdown()
            self.thread.join()

    @property
    def running(self):
        return self.server is not None


web_server = CORSWebServer()


def write_embeddings(name, output, run_server, index, embeddings):
    print("Writing Tensorflow Projector files...")
    if not output:
        output = "couples_" + name
    if output.endswith(".json"):
        output = os.path.join(output[:-5], "couples")
        run_server = False
    metaf = "%s_%s_meta.tsv" % (output, name)
    with open(metaf, "w") as fout:
        fout.write("name\tcommits\n")
        for pair in index:
            fout.write("%s\t%s\n" % pair)
    print("Wrote", metaf)
    dataf = "%s_%s_data.tsv" % (output, name)
    with open(dataf, "w") as fout:
        for vec in embeddings:
            fout.write("\t".join(str(v) for v in vec))
            fout.write("\n")
    print("Wrote", dataf)
    jsonf = "%s_%s.json" % (output, name)
    with open(jsonf, "w") as fout:
        fout.write("""{
  "embeddings": [
    {
      "tensorName": "%s %s coupling",
      "tensorShape": [%s, %s],
      "tensorPath": "http://0.0.0.0:8000/%s",
      "metadataPath": "http://0.0.0.0:8000/%s"
    }
  ]
}
""" % (output, name, len(embeddings), len(embeddings[0]), dataf, metaf))
    print("Wrote %s" % jsonf)
    if run_server and not web_server.running:
        web_server.start()
    url = "http://projector.tensorflow.org/?config=http://0.0.0.0:8000/" + jsonf
    print(url)
    if run_server:
        if shutil.which("xdg-open") is not None:
            os.system("xdg-open " + url)
        else:
            browser = os.getenv("BROWSER", "")
            if browser:
                os.system(browser + " " + url)
            else:
                print("\t" + url)


def show_shotness_stats(data):
    top = sorted(((r.counters[i], i) for i, r in enumerate(data)), reverse=True)
    for count, i in top:
        r = data[i]
        print("%8d %s:%s [%s]" % (count, r.file, r.name, r.internal_role))


def show_sentiment_stats(args, name, resample, start_date, data):
    matplotlib, pyplot = import_pyplot(args.backend, args.style)

    start_date = datetime.fromtimestamp(start_date)
    data = sorted(data.items())
    xdates = [start_date + timedelta(days=d[0]) for d in data]
    xpos = []
    ypos = []
    xneg = []
    yneg = []
    for x, (_, y) in zip(xdates, data):
        y = 0.5 - y.Value
        if y > 0:
            xpos.append(x)
            ypos.append(y)
        else:
            xneg.append(x)
            yneg.append(y)
    pyplot.bar(xpos, ypos, color="g", label="Positive")
    pyplot.bar(xneg, yneg, color="r", label="Negative")
    legend = pyplot.legend(loc=1, fontsize=args.font_size)
    pyplot.ylabel("Lines of code")
    pyplot.xlabel("Time")
    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.background,
                     args.font_size, args.size)
    pyplot.xlim(xdates[0], xdates[-1])
    locator = pyplot.gca().xaxis.get_major_locator()
    # set the optimal xticks locator
    if "M" not in resample:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
    locs = pyplot.gca().get_xticks().tolist()
    if len(locs) >= 16:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
        locs = pyplot.gca().get_xticks().tolist()
        if len(locs) >= 16:
            pyplot.gca().xaxis.set_major_locator(locator)
    if locs[0] < pyplot.xlim()[0]:
        del locs[0]
    endindex = -1
    if len(locs) >= 2 and pyplot.xlim()[1] - locs[-1] > (locs[-1] - locs[-2]) / 2:
        locs.append(pyplot.xlim()[1])
        endindex = len(locs) - 1
    startindex = -1
    if len(locs) >= 2 and locs[0] - pyplot.xlim()[0] > (locs[1] - locs[0]) / 2:
        locs.append(pyplot.xlim()[0])
        startindex = len(locs) - 1
    pyplot.gca().set_xticks(locs)
    # hacking time!
    labels = pyplot.gca().get_xticklabels()
    if startindex >= 0:
        labels[startindex].set_text(xdates[0].date())
        labels[startindex].set_text = lambda _: None
        labels[startindex].set_rotation(30)
        labels[startindex].set_ha("right")
    if endindex >= 0:
        labels[endindex].set_text(xdates[-1].date())
        labels[endindex].set_text = lambda _: None
        labels[endindex].set_rotation(30)
        labels[endindex].set_ha("right")
    overall_pos = sum(2 * (0.5 - d[1].Value) for d in data if d[1].Value < 0.5)
    overall_neg = sum(2 * (d[1].Value - 0.5) for d in data if d[1].Value > 0.5)
    title = "%s sentiment +%.1f -%.1f δ=%.1f" % (
        name, overall_pos, overall_neg, overall_pos - overall_neg)
    deploy_plot(title, args.output, args.background)


def show_devs(args, name, start_date, end_date, data):
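    """Plot every developer's commit activity as a stack of smoothed series.

    The series are max-normalized with a sliding window, compared pairwise
    with dynamic time warping (fastdtw), ordered by solving a TSP over the
    distance matrix (ortools) and colored by HDBSCAN clusters.
    """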
    try:
        from fastdtw import fastdtw
    except ImportError as e:
        print("Cannot import fastdtw: %s\nInstall it from https://github.com/slaypni/fastdtw" % e)
        sys.exit(1)
    try:
        from ortools.constraint_solver import pywrapcp, routing_enums_pb2
    except ImportError as e:
        print("Cannot import ortools: %s\nInstall it from "
              "https://developers.google.com/optimization/install/python/" % e)
        sys.exit(1)
    try:
        from hdbscan import HDBSCAN
    except ImportError as e:
        print("Cannot import hdbscan: %s\nInstall it with \"pip install hdbscan\"." % e)
        sys.exit(1)
    from scipy.signal import convolve, slepian

    days, people = data
    max_people = 50
    if len(people) > max_people:
        print("Picking the top %d developers by commit count" % max_people)
        # pick top N developers by commit count
        commits = defaultdict(int)
        for devs in days.values():
            for dev, stats in devs.items():
                commits[dev] += stats.Commits
        commits = sorted(((v, k) for k, v in commits.items()), reverse=True)
        chosen_people = {people[k] for _, k in commits[:max_people]}
    else:
        chosen_people = set(people)
    devseries = defaultdict(list)
    for day, devs in sorted(days.items()):
        for dev, stats in devs.items():
            if people[dev] in chosen_people:
                devseries[dev].append((day, stats.Commits))
    print("Calculating the distance matrix")
    # max-normalize the time series using a sliding window
    keys = list(devseries.keys())
    series = list(devseries.values())
    for i, s in enumerate(series):
        arr = numpy.array(s).transpose().astype(numpy.float32)
        commits = arr[1]
        if len(commits) < 7:
            commits /= commits.max()
        else:
            # 4 is sizeof(float32)
            windows = numpy.lib.stride_tricks.as_strided(commits, [len(commits) - 6, 7], [4, 4])
            commits = numpy.concatenate((
                [windows[0, 0] / windows[0].max(),
                 windows[0, 1] / windows[0].max(),
                 windows[0, 2] / windows[0].max()],
                windows[:, 3] / windows.max(axis=1),
                [windows[-1, 4] / windows[-1].max(),
                 windows[-1, 5] / windows[-1].max(),
                 windows[-1, 6] / windows[-1].max()]
            ))
        arr[1] = commits * 7  # 7 is a pure heuristic here and is not related to the window size
        series[i] = list(arr.transpose())
    # calculate the distance matrix using the dynamic time warping metric
    dists = numpy.full((len(series) + 1, len(series) + 1), -100500, dtype=numpy.float32)
    for x in range(len(series)):
        dists[x, x] = 0
        for y in range(x + 1, len(series)):
            # L1 norm
            dist, _ = fastdtw(series[x], series[y], radius=5, dist=1)
            dists[x, y] = dists[y, x] = dist
    # preparation for seriation ordering
    dists[len(series), :] = 0
    dists[:, len(series)] = 0
    assert (dists >= 0).all()
    print("Ordering the series")
    # solve the TSP on the distance matrix
    routing = pywrapcp.RoutingModel(dists.shape[0], 1, len(series))

    def dist_callback(x, y):
        # ortools wants integers, so we approximate here
        return int(dists[x][y] * 1000)

    routing.SetArcCostEvaluatorOfAllVehicles(dist_callback)
    search_parameters = pywrapcp.RoutingModel.DefaultSearchParameters()
    search_parameters.local_search_metaheuristic = (
        routing_enums_pb2.LocalSearchMetaheuristic.GUIDED_LOCAL_SEARCH)
    search_parameters.time_limit_ms = 2000
    assignment = routing.SolveWithParameters(search_parameters)
    index = routing.Start(0)
    route = []
    while not routing.IsEnd(index):
        node = routing.IndexToNode(index)
        if node < len(keys):
            route.append(node)
        index = assignment.Value(routing.NextVar(index))
    route_map = {v: i for i, v in enumerate(route)}
    # determine the clusters
    opt_dist_chain = numpy.cumsum(numpy.array(
        [0] + [dists[route[i], route[i + 1]] for i in range(len(route) - 1)]))
    clusters = HDBSCAN(min_cluster_size=2).fit_predict(opt_dist_chain[:, numpy.newaxis])
    route = [keys[node] for node in route]
    print("Plotting")
    # smooth the time series
    start_date = datetime.fromtimestamp(start_date)
    start_date = datetime(start_date.year, start_date.month, start_date.day)
    end_date = datetime.fromtimestamp(end_date)
    end_date = datetime(end_date.year, end_date.month, end_date.day)
    size = (end_date - start_date).days + 1
    plot_x = [start_date + timedelta(days=i) for i in range(size)]
    resolution = 64
    window = slepian(size // resolution, 0.5)
    series = list(devseries.values())
    final = numpy.zeros((len(devseries), size), dtype=numpy.float32)
    for i, s in enumerate(series):
        arr = numpy.array(s).transpose()
        full_history = numpy.zeros(size, dtype=numpy.float32)
        full_history[arr[0]] = arr[1]
        final[route_map[i]] = convolve(full_history, window, "same")

    matplotlib, pyplot = import_pyplot(args.backend, args.style)
    prop_cycle = pyplot.rcParams["axes.prop_cycle"]
    colors = prop_cycle.by_key()["color"]
    fig, axes = pyplot.subplots(final.shape[0], 1)
    for ax, series, cluster, dev_i in zip(axes, final, clusters, route):
        if cluster >= 0:
            color = colors[cluster % len(colors)]
        else:
            # outlier
            color = "grey"
        ax.plot(plot_x, series, color=color)
        ax.set_axis_off()
        author = people[dev_i]
        ax.text(0.03, 0.5, author[:36] + (author[36:] and "..."),
                horizontalalignment="right", verticalalignment="center",
                transform=ax.transAxes, fontsize=14)
        ax.text(0.97, 0.5, sum(p[1] for p in devseries[dev_i]),
                horizontalalignment="left", verticalalignment="center",
                transform=ax.transAxes, fontsize=14)
    axes[-1].set_axis_on()
    target_num_labels = 12
    num_months = (end_date.year - start_date.year) * 12 + end_date.month - start_date.month
    interval = int(numpy.ceil(num_months / target_num_labels))
    if interval >= 8:
        interval = int(numpy.ceil(num_months / (12 * target_num_labels)))
        axes[-1].xaxis.set_major_locator(matplotlib.dates.YearLocator(interval=interval))
        axes[-1].xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y"))
    else:
        axes[-1].xaxis.set_major_locator(matplotlib.dates.MonthLocator(interval=interval))
        axes[-1].xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y-%m"))
    for tick in axes[-1].xaxis.get_major_ticks():
        tick.label.set_fontsize(16)
    axes[-1].spines["left"].set_visible(False)
    axes[-1].spines["right"].set_visible(False)
    axes[-1].spines["top"].set_visible(False)
    axes[-1].get_yaxis().set_visible(False)
    axes[-1].set_facecolor((1.0,) * 3 + (0.0,))

    title = "%s commits" % name
    deploy_plot(title, args.output, args.background)


def main():
    args = parse_args()
    reader = read_input(args)
    header = reader.get_header()
    name = reader.get_name()

    burndown_warning = "Burndown stats were not collected. Re-run hercules with --burndown."
    burndown_files_warning = \
        "Burndown stats for files were not collected. Re-run hercules with " \
        "--burndown --burndown-files."
    burndown_people_warning = \
        "Burndown stats for people were not collected. Re-run hercules with " \
        "--burndown --burndown-people."
    couples_warning = "Coupling stats were not collected. Re-run hercules with --couples."
    shotness_warning = "Structural hotness stats were not collected. Re-run hercules with " \
                       "--shotness. Also check --languages - the output may be empty."
    sentiment_warning = "Sentiment stats were not collected. Re-run hercules with --sentiment."
    devs_warning = "Devs stats were not collected. Re-run hercules with --devs."

    def run_times():
        rt = reader.get_run_times()
        import pandas
        series = pandas.to_timedelta(pandas.Series(rt).sort_values(ascending=False), unit="s")
        df = pandas.concat([series, series / series.sum()], axis=1)
        df.columns = ["time", "ratio"]
        print(df)

    def project_burndown():
        try:
            full_header = header + reader.get_burndown_parameters()
        except KeyError:
            print("project: " + burndown_warning)
            return
        plot_burndown(args, "project",
                      *load_burndown(full_header, *reader.get_project_burndown(),
                                     resample=args.resample))

    def files_burndown():
        try:
            full_header = header + reader.get_burndown_parameters()
        except KeyError:
            print(burndown_warning)
            return
        try:
            plot_many_burndown(args, "file", full_header, reader.get_files_burndown())
        except KeyError:
            print("files: " + burndown_files_warning)

    def people_burndown():
        try:
            full_header = header + reader.get_burndown_parameters()
        except KeyError:
            print(burndown_warning)
            return
        try:
            plot_many_burndown(args, "person", full_header, reader.get_people_burndown())
        except KeyError:
            print("people: " + burndown_people_warning)

    def churn_matrix():
        try:
            plot_churn_matrix(args, name, *load_churn_matrix(
                *reader.get_people_interaction(), max_people=args.max_people))
        except KeyError:
            print("churn_matrix: " + burndown_people_warning)

    def ownership_burndown():
        try:
            full_header = header + reader.get_burndown_parameters()
        except KeyError:
            print(burndown_warning)
            return
        try:
            plot_ownership(args, name, *load_ownership(
                full_header, *reader.get_ownership_burndown(), max_people=args.max_people))
        except KeyError:
            print("ownership: " + burndown_people_warning)

    def couples():
        try:
            write_embeddings("files", args.output, not args.disable_projector,
                             *train_embeddings(*reader.get_files_coocc(),
                                               tmpdir=args.couples_tmp_dir))
            write_embeddings("people", args.output, not args.disable_projector,
                             *train_embeddings(*reader.get_people_coocc(),
                                               tmpdir=args.couples_tmp_dir))
        except KeyError:
            print(couples_warning)
        try:
            write_embeddings("shotness", args.output, not args.disable_projector,
                             *train_embeddings(*reader.get_shotness_coocc(),
                                               tmpdir=args.couples_tmp_dir))
        except KeyError:
            print(shotness_warning)

    def shotness():
        try:
            data = reader.get_shotness()
        except KeyError:
            print(shotness_warning)
            return
        show_shotness_stats(data)

    def sentiment():
        try:
            data = reader.get_sentiment()
        except KeyError:
            print(sentiment_warning)
            return
        show_sentiment_stats(args, reader.get_name(), args.resample, reader.get_header()[0], data)

    def devs():
        try:
            data = reader.get_devs()
        except KeyError:
            print(devs_warning)
            return
        show_devs(args, reader.get_name(), *reader.get_header(), data)

    modes = {
        "run-times": run_times,
        "burndown-project": project_burndown,
        "burndown-file": files_burndown,
        "burndown-person": people_burndown,
        "churn-matrix": churn_matrix,
        "ownership": ownership_burndown,
        "couples": couples,
        "shotness": shotness,
        "sentiment": sentiment,
        "devs": devs,
    }
    try:
        modes[args.mode]()
    except KeyError:
        assert args.mode == "all"
        project_burndown()
        files_burndown()
        people_burndown()
        churn_matrix()
        ownership_burndown()
        couples()
        shotness()
        sentiment()
        devs()

    if web_server.running:
        secs = int(os.getenv("COUPLES_SERVER_TIME", "60"))
        print("Sleeping for %d seconds, safe to Ctrl-C" % secs)
        sys.stdout.flush()
        try:
            time.sleep(secs)
        except KeyboardInterrupt:
            pass
        web_server.stop()


if __name__ == "__main__":
    sys.exit(main())