labours.py

#!/usr/bin/env python3
import argparse
from collections import defaultdict, namedtuple
from datetime import datetime, timedelta
from importlib import import_module
import io
from itertools import chain
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
import threading
import time
import warnings

try:
    from clint.textui import progress
except ImportError:
    print("Warning: clint is not installed, no fancy progressbars in the terminal for you.")
    progress = None
import numpy
import yaml

if sys.version_info[0] < 3:
    # OK, ancients, I will support Python 2, but you owe me a beer
    input = raw_input  # noqa: F821


def list_matplotlib_styles():
    script = "import sys; from matplotlib import pyplot; " \
             "sys.stdout.write(repr(pyplot.style.available))"
    styles = eval(subprocess.check_output([sys.executable, "-c", script]))
    styles.remove("classic")
    return ["default", "classic"] + styles

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--output", default="",
                        help="Path to the output file/directory (empty for display). "
                             "If the extension is JSON, the data is saved instead of "
                             "the real image.")
    parser.add_argument("-i", "--input", default="-",
                        help="Path to the input file (- for stdin).")
    parser.add_argument("-f", "--input-format", default="auto", choices=["yaml", "pb", "auto"])
    parser.add_argument("--font-size", default=12, type=int,
                        help="Size of the labels and legend.")
    parser.add_argument("--style", default="ggplot", choices=list_matplotlib_styles(),
                        help="Plot style to use.")
    parser.add_argument("--backend", help="Matplotlib backend to use.")
    parser.add_argument("--background", choices=["black", "white"], default="white",
                        help="Plot's general color scheme.")
    parser.add_argument("--size", help="Axes' size in inches, for example \"12,9\"")
    parser.add_argument("--relative", action="store_true",
                        help="Occupy 100%% height for every measurement.")
    parser.add_argument("--couples-tmp-dir", help="Temporary directory to work with couples.")
    parser.add_argument("-m", "--mode",
                        choices=["burndown-project", "burndown-file", "burndown-person",
                                 "churn-matrix", "ownership", "couples-files", "couples-people",
                                 "couples-shotness", "shotness", "sentiment",
                                 "devs", "old-vs-new", "all", "run-times", "languages"],
                        help="What to plot.")
    parser.add_argument(
        "--resample", default="year",
        help="The way to resample the time series. Possible values are: "
             "\"month\", \"year\", \"no\", \"raw\" and pandas offset aliases ("
             "http://pandas.pydata.org/pandas-docs/stable/timeseries.html"
             "#offset-aliases).")
    dateutil_url = "https://dateutil.readthedocs.io/en/stable/parser.html#dateutil.parser.parse"
    parser.add_argument("--start-date",
                        help="Start date of time-based plots. Any format is accepted which is "
                             "supported by %s" % dateutil_url)
    parser.add_argument("--end-date",
                        help="End date of time-based plots. Any format is accepted which is "
                             "supported by %s" % dateutil_url)
    parser.add_argument("--disable-projector", action="store_true",
                        help="Do not run Tensorflow Projector on couples.")
    parser.add_argument("--max-people", default=20, type=int,
                        help="Maximum number of developers in churn matrix and people plots.")
    args = parser.parse_args()
    return args

class Reader(object):
    def read(self, file):
        raise NotImplementedError

    def get_name(self):
        raise NotImplementedError

    def get_header(self):
        raise NotImplementedError

    def get_burndown_parameters(self):
        raise NotImplementedError

    def get_project_burndown(self):
        raise NotImplementedError

    def get_files_burndown(self):
        raise NotImplementedError

    def get_people_burndown(self):
        raise NotImplementedError

    def get_ownership_burndown(self):
        raise NotImplementedError

    def get_people_interaction(self):
        raise NotImplementedError

    def get_files_coocc(self):
        raise NotImplementedError

    def get_people_coocc(self):
        raise NotImplementedError

    def get_shotness_coocc(self):
        raise NotImplementedError

    def get_shotness(self):
        raise NotImplementedError

    def get_sentiment(self):
        raise NotImplementedError

    def get_devs(self):
        raise NotImplementedError

class YamlReader(Reader):
    def read(self, file):
        yaml.reader.Reader.NON_PRINTABLE = re.compile(r"(?!x)x")
        try:
            loader = yaml.CLoader
        except AttributeError:
            print("Warning: failed to import yaml.CLoader, falling back to slow yaml.Loader")
            loader = yaml.Loader
        try:
            if file != "-":
                with open(file) as fin:
                    data = yaml.load(fin, Loader=loader)
            else:
                data = yaml.load(sys.stdin, Loader=loader)
        except (UnicodeEncodeError, yaml.reader.ReaderError) as e:
            print("\nInvalid unicode in the input: %s\nPlease filter it through "
                  "fix_yaml_unicode.py" % e)
            sys.exit(1)
        if data is None:
            print("\nNo data has been read - has Hercules crashed?")
            sys.exit(1)
        self.data = data

    def get_run_times(self):
        return {}

    def get_name(self):
        return self.data["hercules"]["repository"]

    def get_header(self):
        header = self.data["hercules"]
        return header["begin_unix_time"], header["end_unix_time"]

    def get_burndown_parameters(self):
        header = self.data["Burndown"]
        return header["sampling"], header["granularity"]

    def get_project_burndown(self):
        return self.data["hercules"]["repository"], \
            self._parse_burndown_matrix(self.data["Burndown"]["project"]).T

    def get_files_burndown(self):
        return [(p[0], self._parse_burndown_matrix(p[1]).T)
                for p in self.data["Burndown"]["files"].items()]

    def get_people_burndown(self):
        return [(p[0], self._parse_burndown_matrix(p[1]).T)
                for p in self.data["Burndown"]["people"].items()]

    def get_ownership_burndown(self):
        return self.data["Burndown"]["people_sequence"].copy(), \
            {p[0]: self._parse_burndown_matrix(p[1])
             for p in self.data["Burndown"]["people"].items()}

    def get_people_interaction(self):
        return self.data["Burndown"]["people_sequence"].copy(), \
            self._parse_burndown_matrix(self.data["Burndown"]["people_interaction"])

    def get_files_coocc(self):
        coocc = self.data["Couples"]["files_coocc"]
        return coocc["index"], self._parse_coocc_matrix(coocc["matrix"])

    def get_people_coocc(self):
        coocc = self.data["Couples"]["people_coocc"]
        return coocc["index"], self._parse_coocc_matrix(coocc["matrix"])

    def get_shotness_coocc(self):
        shotness = self.data["Shotness"]
        index = ["%s:%s" % (i["file"], i["name"]) for i in shotness]
        indptr = numpy.zeros(len(shotness) + 1, dtype=numpy.int64)
        indices = []
        data = []
        for i, record in enumerate(shotness):
            pairs = [(int(k), v) for k, v in record["counters"].items()]
            pairs.sort()
            indptr[i + 1] = indptr[i] + len(pairs)
            for k, v in pairs:
                indices.append(k)
                data.append(v)
        indices = numpy.array(indices, dtype=numpy.int32)
        data = numpy.array(data, dtype=numpy.int32)
        from scipy.sparse import csr_matrix
        return index, csr_matrix((data, indices, indptr), shape=(len(shotness),) * 2)

    def get_shotness(self):
        from munch import munchify
        obj = munchify(self.data["Shotness"])
        # turn strings into ints
        for item in obj:
            item.counters = {int(k): v for k, v in item.counters.items()}
        if len(obj) == 0:
            raise KeyError
        return obj

    def get_sentiment(self):
        from munch import munchify
        return munchify({int(key): {
            "Comments": vals[2].split("|"),
            "Commits": vals[1],
            "Value": float(vals[0])
        } for key, vals in self.data["Sentiment"].items()})

    def get_devs(self):
        people = self.data["Devs"]["people"]
        days = {int(d): {int(dev): DevDay(*(int(x) for x in day[:-1]), day[-1])
                         for dev, day in devs.items()}
                for d, devs in self.data["Devs"]["days"].items()}
        return days, people

    def _parse_burndown_matrix(self, matrix):
        return numpy.array([numpy.fromstring(line, dtype=int, sep=" ")
                            for line in matrix.split("\n")])
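
    # The YAML co-occurrence matrix arrives as a list of {column: count} mappings,
    # one per row; the loop below flattens it into CSR (data, indices, indptr)
    # triplets so a scipy.sparse matrix is built without a dense intermediate.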
    def _parse_coocc_matrix(self, matrix):
        from scipy.sparse import csr_matrix
        data = []
        indices = []
        indptr = [0]
        for row in matrix:
            for k, v in sorted(row.items()):
                data.append(v)
                indices.append(k)
            indptr.append(indptr[-1] + len(row))
        return csr_matrix((data, indices, indptr), shape=(len(matrix),) * 2)

class ProtobufReader(Reader):
    def read(self, file):
        try:
            from internal.pb.pb_pb2 import AnalysisResults
        except ImportError as e:
            print("\n\n>>> You need to generate internal/pb/pb_pb2.py - run \"make\"\n",
                  file=sys.stderr)
            raise e from None
        self.data = AnalysisResults()
        if file != "-":
            with open(file, "rb") as fin:
                bytes = fin.read()
        else:
            bytes = sys.stdin.buffer.read()
        if not bytes:
            raise ValueError("empty input")
        self.data.ParseFromString(bytes)
        self.contents = {}
        for key, val in self.data.contents.items():
            try:
                mod, name = PB_MESSAGES[key].rsplit(".", 1)
            except KeyError:
                sys.stderr.write("Warning: there is no registered PB decoder for %s\n" % key)
                continue
            cls = getattr(import_module(mod), name)
            self.contents[key] = msg = cls()
            msg.ParseFromString(val)

    def get_run_times(self):
        return {key: val for key, val in self.data.header.run_time_per_item.items()}

    def get_name(self):
        return self.data.header.repository

    def get_header(self):
        header = self.data.header
        return header.begin_unix_time, header.end_unix_time

    def get_burndown_parameters(self):
        burndown = self.contents["Burndown"]
        return burndown.sampling, burndown.granularity

    def get_project_burndown(self):
        return self._parse_burndown_matrix(self.contents["Burndown"].project)

    def get_files_burndown(self):
        return [self._parse_burndown_matrix(i) for i in self.contents["Burndown"].files]

    def get_people_burndown(self):
        return [self._parse_burndown_matrix(i) for i in self.contents["Burndown"].people]

    def get_ownership_burndown(self):
        people = self.get_people_burndown()
        return [p[0] for p in people], {p[0]: p[1].T for p in people}

    def get_people_interaction(self):
        burndown = self.contents["Burndown"]
        return [i.name for i in burndown.people], \
            self._parse_sparse_matrix(burndown.people_interaction).toarray()

    def get_files_coocc(self):
        node = self.contents["Couples"].file_couples
        return list(node.index), self._parse_sparse_matrix(node.matrix)

    def get_people_coocc(self):
        node = self.contents["Couples"].people_couples
        return list(node.index), self._parse_sparse_matrix(node.matrix)

    def get_shotness_coocc(self):
        shotness = self.get_shotness()
        index = ["%s:%s" % (i.file, i.name) for i in shotness]
        indptr = numpy.zeros(len(shotness) + 1, dtype=numpy.int32)
        indices = []
        data = []
        for i, record in enumerate(shotness):
            pairs = list(record.counters.items())
            pairs.sort()
            indptr[i + 1] = indptr[i] + len(pairs)
            for k, v in pairs:
                indices.append(k)
                data.append(v)
        indices = numpy.array(indices, dtype=numpy.int32)
        data = numpy.array(data, dtype=numpy.int32)
        from scipy.sparse import csr_matrix
        return index, csr_matrix((data, indices, indptr), shape=(len(shotness),) * 2)

    def get_shotness(self):
        records = self.contents["Shotness"].records
        if len(records) == 0:
            raise KeyError
        return records

    def get_sentiment(self):
        byday = self.contents["Sentiment"].SentimentByDay
        if len(byday) == 0:
            raise KeyError
        return byday

    def get_devs(self):
        people = list(self.contents["Devs"].dev_index)
        days = {d: {dev: DevDay(stats.commits, stats.stats.added, stats.stats.removed,
                                stats.stats.changed, {k: [v.added, v.removed, v.changed]
                                                      for k, v in stats.languages.items()})
                    for dev, stats in day.devs.items()}
                for d, day in self.contents["Devs"].days.items()}
        return days, people

    def _parse_burndown_matrix(self, matrix):
        dense = numpy.zeros((matrix.number_of_rows, matrix.number_of_columns), dtype=int)
        for y, row in enumerate(matrix.rows):
            for x, col in enumerate(row.columns):
                dense[y, x] = col
        return matrix.name, dense.T

    def _parse_sparse_matrix(self, matrix):
        from scipy.sparse import csr_matrix
        return csr_matrix((list(matrix.data), list(matrix.indices), list(matrix.indptr)),
                          shape=(matrix.number_of_rows, matrix.number_of_columns))


READERS = {"yaml": YamlReader, "yml": YamlReader, "pb": ProtobufReader}
PB_MESSAGES = {
    "Burndown": "internal.pb.pb_pb2.BurndownAnalysisResults",
    "Couples": "internal.pb.pb_pb2.CouplesAnalysisResults",
    "Shotness": "internal.pb.pb_pb2.ShotnessAnalysisResults",
    "Devs": "internal.pb.pb_pb2.DevsAnalysisResults",
}
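

# Pick the reader by file extension when possible; otherwise sniff the content:
# input that reads as valid text is treated as YAML, binary input as protobuf.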
def read_input(args):
    sys.stdout.write("Reading the input... ")
    sys.stdout.flush()
    if args.input != "-":
        if args.input_format == "auto":
            try:
                args.input_format = args.input.rsplit(".", 1)[1]
            except IndexError:
                try:
                    with open(args.input) as f:
                        f.read(1 << 16)
                    args.input_format = "yaml"
                except UnicodeDecodeError:
                    args.input_format = "pb"
    elif args.input_format == "auto":
        args.input_format = "yaml"
    reader = READERS[args.input_format]()
    reader.read(args.input)
    print("done")
    return reader
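

# One developer's daily totals: commits, added/removed/changed lines, and a
# {language: [added, removed, changed]} breakdown; add() merges two DevDay records.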
class DevDay(namedtuple("DevDay", ("Commits", "Added", "Removed", "Changed", "Languages"))):
    def add(self, dd):
        langs = defaultdict(lambda: [0] * 3)
        for key, val in self.Languages.items():
            for i in range(3):
                langs[key][i] += val[i]
        for key, val in dd.Languages.items():
            for i in range(3):
                langs[key][i] += val[i]
        return DevDay(Commits=self.Commits + dd.Commits,
                      Added=self.Added + dd.Added,
                      Removed=self.Removed + dd.Removed,
                      Changed=self.Changed + dd.Changed,
                      Languages=dict(langs))
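

# Compute the "lifetime index" printed for each burndown matrix: an estimate of how
# long lines survive, derived from the decay of the burndown bands and normalized by
# the number of samples.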
def calculate_average_lifetime(matrix):
    lifetimes = numpy.zeros(matrix.shape[1] - 1)
    for band in matrix:
        start = 0
        for i, line in enumerate(band):
            if i == 0 or band[i - 1] == 0:
                start += 1
                continue
            lifetimes[i - start] = band[i - 1] - line
        lifetimes[i - start] = band[i - 1]
    lsum = lifetimes.sum()
    if lsum != 0:
        total = lifetimes.dot(numpy.arange(1, matrix.shape[1], 1))
        return total / (lsum * matrix.shape[1])
    return numpy.nan
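

# Upsample the bands x samples burndown matrix to day x day resolution. Growth within
# a sample is spread linearly by grow(); shrinkage decays linearly via decay(); the
# ASCII diagrams below sketch the geometric cases of band/sample overlap.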
def interpolate_burndown_matrix(matrix, granularity, sampling):
    daily = numpy.zeros(
        (matrix.shape[0] * granularity, matrix.shape[1] * sampling),
        dtype=numpy.float32)
    """
    ----------> samples, x
    |
    |
    |
    bands, y
    """
    for y in range(matrix.shape[0]):
        for x in range(matrix.shape[1]):
            if y * granularity > (x + 1) * sampling:
                # the future is zeros
                continue

            def decay(start_index: int, start_val: float):
                if start_val == 0:
                    return
                k = matrix[y][x] / start_val  # <= 1
                scale = (x + 1) * sampling - start_index
                for i in range(y * granularity, (y + 1) * granularity):
                    initial = daily[i][start_index - 1]
                    for j in range(start_index, (x + 1) * sampling):
                        daily[i][j] = initial * (
                            1 + (k - 1) * (j - start_index + 1) / scale)

            def grow(finish_index: int, finish_val: float):
                initial = matrix[y][x - 1] if x > 0 else 0
                start_index = x * sampling
                if start_index < y * granularity:
                    start_index = y * granularity
                if finish_index == start_index:
                    return
                avg = (finish_val - initial) / (finish_index - start_index)
                for j in range(x * sampling, finish_index):
                    for i in range(start_index, j + 1):
                        daily[i][j] = avg
                # copy [x*g..y*s)
                for j in range(x * sampling, finish_index):
                    for i in range(y * granularity, x * sampling):
                        daily[i][j] = daily[i][j - 1]

            if (y + 1) * granularity >= (x + 1) * sampling:
                # x*granularity <= (y+1)*sampling
                # 1. x*granularity <= y*sampling
                #    y*sampling..(y+1)sampling
                #
                #       x+1
                #        /
                #       /
                #      / y+1  -|
                #     /        |
                #    / y      -|
                #   /
                #  / x
                #
                # 2. x*granularity > y*sampling
                #    x*granularity..(y+1)sampling
                #
                #       x+1
                #        /
                #       /
                #      / y+1  -|
                #     /        |
                #    / x      -|
                #   /
                #  / y
                if y * granularity <= x * sampling:
                    grow((x + 1) * sampling, matrix[y][x])
                elif (x + 1) * sampling > y * granularity:
                    grow((x + 1) * sampling, matrix[y][x])
                    avg = matrix[y][x] / ((x + 1) * sampling - y * granularity)
                    for j in range(y * granularity, (x + 1) * sampling):
                        for i in range(y * granularity, j + 1):
                            daily[i][j] = avg
            elif (y + 1) * granularity >= x * sampling:
                # y*sampling <= (x+1)*granularity < (y+1)sampling
                # y*sampling..(x+1)*granularity
                # (x+1)*granularity..(y+1)sampling
                #   x+1
                #   /\
                #  /  \
                # /    \
                #       y+1
                #
                # y
                v1 = matrix[y][x - 1]
                v2 = matrix[y][x]
                delta = (y + 1) * granularity - x * sampling
                previous = 0
                if x > 0 and (x - 1) * sampling >= y * granularity:
                    # x*g <= (y-1)*s <= y*s <= (x+1)*g <= (y+1)*s
                    #           |________|.......^
                    if x > 1:
                        previous = matrix[y][x - 2]
                    scale = sampling
                else:
                    # (y-1)*s < x*g <= y*s <= (x+1)*g <= (y+1)*s
                    #            |______|.......^
                    scale = sampling if x == 0 else x * sampling - y * granularity
                peak = v1 + (v1 - previous) / scale * delta
                if v2 > peak:
                    # we need to adjust the peak, it may not be less than the decayed value
                    if x < matrix.shape[1] - 1:
                        # y*s <= (x+1)*g <= (y+1)*s < (y+2)*s
                        #           ^.........|_________|
                        k = (v2 - matrix[y][x + 1]) / sampling  # > 0
                        peak = matrix[y][x] + k * ((x + 1) * sampling - (y + 1) * granularity)
                        # peak > v2 > v1
                    else:
                        peak = v2
                        # not enough data to interpolate; this is at least not restricted
                grow((y + 1) * granularity, peak)
                decay((y + 1) * granularity, peak)
            else:
                # (x+1)*granularity < y*sampling
                # y*sampling..(y+1)sampling
                decay(x * sampling, matrix[y][x - 1])
    return daily

def import_pandas():
    import pandas
    try:
        from pandas.plotting import register_matplotlib_converters
        register_matplotlib_converters()
    except ImportError:
        pass
    return pandas
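

# Turn a raw burndown matrix into plottable series: interpolate to daily resolution,
# re-aggregate into bands at the requested resampling frequency (year, month, or any
# pandas offset alias), and generate matching legend labels.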
def load_burndown(header, name, matrix, resample):
    pandas = import_pandas()

    start, last, sampling, granularity = header
    assert sampling > 0
    assert granularity >= sampling
    start = datetime.fromtimestamp(start)
    last = datetime.fromtimestamp(last)
    print(name, "lifetime index:", calculate_average_lifetime(matrix))
    finish = start + timedelta(days=matrix.shape[1] * sampling)
    if resample not in ("no", "raw"):
        print("resampling to %s, please wait..." % resample)
        # Interpolate the day x day matrix.
        # Each day brings equal weight in the granularity.
        # Sampling's interpolation is linear.
        daily = interpolate_burndown_matrix(matrix, granularity, sampling)
        daily[(last - start).days:] = 0
        # Resample the bands
        aliases = {
            "year": "A",
            "month": "M"
        }
        resample = aliases.get(resample, resample)
        periods = 0
        date_granularity_sampling = [start]
        while date_granularity_sampling[-1] < finish:
            periods += 1
            date_granularity_sampling = pandas.date_range(
                start, periods=periods, freq=resample)
        date_range_sampling = pandas.date_range(
            date_granularity_sampling[0],
            periods=(finish - date_granularity_sampling[0]).days,
            freq="1D")
        # Fill the new square matrix
        matrix = numpy.zeros(
            (len(date_granularity_sampling), len(date_range_sampling)),
            dtype=numpy.float32)
        for i, gdt in enumerate(date_granularity_sampling):
            istart = (date_granularity_sampling[i - 1] - start).days \
                if i > 0 else 0
            ifinish = (gdt - start).days

            for j, sdt in enumerate(date_range_sampling):
                if (sdt - start).days >= istart:
                    break

            matrix[i, j:] = \
                daily[istart:ifinish, (sdt - start).days:].sum(axis=0)
        # Hardcode some cases to improve labels' readability
        if resample in ("year", "A"):
            labels = [dt.year for dt in date_granularity_sampling]
        elif resample in ("month", "M"):
            labels = [dt.strftime("%Y %B") for dt in date_granularity_sampling]
        else:
            labels = [dt.date() for dt in date_granularity_sampling]
    else:
        labels = [
            "%s - %s" % ((start + timedelta(days=i * granularity)).date(),
                         (
                             start + timedelta(days=(i + 1) * granularity)).date())
            for i in range(matrix.shape[0])]
        if len(labels) > 18:
            warnings.warn("Too many labels - consider resampling.")
        resample = "M"  # fake resampling type is checked while plotting
        date_range_sampling = pandas.date_range(
            start + timedelta(days=sampling), periods=matrix.shape[1],
            freq="%dD" % sampling)
    return name, matrix, date_range_sampling, labels, granularity, sampling, resample
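

# Reduce each person's burndown matrix to a single ownership time series and keep
# only the top max_people owners, truncating overly long names for the legend.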
def load_ownership(header, sequence, contents, max_people):
    pandas = import_pandas()

    start, last, sampling, _ = header
    start = datetime.fromtimestamp(start)
    last = datetime.fromtimestamp(last)
    people = []
    for name in sequence:
        people.append(contents[name].sum(axis=1))
    people = numpy.array(people)
    date_range_sampling = pandas.date_range(
        start + timedelta(days=sampling), periods=people[0].shape[0],
        freq="%dD" % sampling)
    if people.shape[0] > max_people:
        order = numpy.argsort(-people.sum(axis=1))
        people = people[order[:max_people]]
        sequence = [sequence[i] for i in order[:max_people]]
        print("Warning: truncated people to most owning %d" % max_people)
    for i, name in enumerate(sequence):
        if len(name) > 40:
            sequence[i] = name[:37] + "..."
    return sequence, people, date_range_sampling, last
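

# Prepare the churn (overwrite) matrix for plotting: column 0 holds each developer's
# own number of lines, so rows are divided by it to make them comparable; rows whose
# own count is zero are zeroed out instead of dividing by zero.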
def load_churn_matrix(people, matrix, max_people):
    matrix = matrix.astype(float)
    if matrix.shape[0] > max_people:
        order = numpy.argsort(-matrix[:, 0])
        matrix = matrix[order[:max_people]][:, [0, 1] + list(2 + order[:max_people])]
        people = [people[i] for i in order[:max_people]]
        print("Warning: truncated people to most productive %d" % max_people)
    zeros = matrix[:, 0] == 0
    matrix[zeros, :] = 1
    matrix /= matrix[:, 0][:, None]
    matrix = -matrix[:, 1:]
    matrix[zeros, :] = 0
    for i, name in enumerate(people):
        if len(name) > 40:
            people[i] = name[:37] + "..."
    return people, matrix

def import_pyplot(backend, style):
    import matplotlib
    if backend:
        matplotlib.use(backend)
    from matplotlib import pyplot
    pyplot.style.use(style)
    return matplotlib, pyplot


def apply_plot_style(figure, axes, legend, background, font_size, axes_size):
    foreground = "black" if background == "white" else "white"
    if axes_size is None:
        axes_size = (12, 9)
    else:
        axes_size = tuple(float(p) for p in axes_size.split(","))
    figure.set_size_inches(*axes_size)
    for side in ("bottom", "top", "left", "right"):
        axes.spines[side].set_color(foreground)
    for axis in (axes.xaxis, axes.yaxis):
        axis.label.update(dict(fontsize=font_size, color=foreground))
    for axis in ("x", "y"):
        getattr(axes, axis + "axis").get_offset_text().set_size(font_size)
        axes.tick_params(axis=axis, colors=foreground, labelsize=font_size)
    try:
        axes.ticklabel_format(axis="y", style="sci", scilimits=(0, 3))
    except AttributeError:
        pass
    figure.patch.set_facecolor(background)
    axes.set_facecolor(background)
    if legend is not None:
        frame = legend.get_frame()
        for setter in (frame.set_facecolor, frame.set_edgecolor):
            setter(background)
        for text in legend.get_texts():
            text.set_color(foreground)


def get_plot_path(base, name):
    root, ext = os.path.splitext(base)
    if not ext:
        ext = ".png"
    output = os.path.join(root, name + ext)
    os.makedirs(os.path.dirname(output), exist_ok=True)
    return output


def deploy_plot(title, output, background):
    import matplotlib.pyplot as pyplot

    if not output:
        pyplot.gcf().canvas.set_window_title(title)
        pyplot.show()
    else:
        if title:
            pyplot.title(title, color="black" if background == "white" else "white")
        try:
            pyplot.tight_layout()
        except:  # noqa: E722
            print("Warning: failed to set the tight layout")
        pyplot.savefig(output, transparent=True)
    pyplot.clf()


def default_json(x):
    if hasattr(x, "tolist"):
        return x.tolist()
    if hasattr(x, "isoformat"):
        return x.isoformat()
    return x


def parse_date(text, default):
    if not text:
        return default
    from dateutil.parser import parse
    return parse(text)

def plot_burndown(args, target, name, matrix, date_range_sampling, labels, granularity,
                  sampling, resample):
    if args.output and args.output.endswith(".json"):
        data = locals().copy()
        del data["args"]
        data["type"] = "burndown"
        if args.mode == "project" and target == "project":
            output = args.output
        else:
            if target == "project":
                name = "project"
            output = get_plot_path(args.output, name)
        with open(output, "w") as fout:
            json.dump(data, fout, sort_keys=True, default=default_json)
        return

    matplotlib, pyplot = import_pyplot(args.backend, args.style)

    # normalize before stacking, otherwise --relative has no effect on the drawn plot
    if args.relative:
        for i in range(matrix.shape[1]):
            matrix[:, i] /= matrix[:, i].sum()
        pyplot.ylim(0, 1)
        legend_loc = 3
    else:
        legend_loc = 2
    pyplot.stackplot(date_range_sampling, matrix, labels=labels)
    legend = pyplot.legend(loc=legend_loc, fontsize=args.font_size)
    pyplot.ylabel("Lines of code")
    pyplot.xlabel("Time")
    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.background,
                     args.font_size, args.size)
    pyplot.xlim(parse_date(args.start_date, date_range_sampling[0]),
                parse_date(args.end_date, date_range_sampling[-1]))
    locator = pyplot.gca().xaxis.get_major_locator()
    # set the optimal xticks locator
    if "M" not in resample:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
    locs = pyplot.gca().get_xticks().tolist()
    if len(locs) >= 16:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
        locs = pyplot.gca().get_xticks().tolist()
        if len(locs) >= 16:
            pyplot.gca().xaxis.set_major_locator(locator)
    if locs[0] < pyplot.xlim()[0]:
        del locs[0]
    endindex = -1
    if len(locs) >= 2 and pyplot.xlim()[1] - locs[-1] > (locs[-1] - locs[-2]) / 2:
        locs.append(pyplot.xlim()[1])
        endindex = len(locs) - 1
    startindex = -1
    if len(locs) >= 2 and locs[0] - pyplot.xlim()[0] > (locs[1] - locs[0]) / 2:
        locs.append(pyplot.xlim()[0])
        startindex = len(locs) - 1
    pyplot.gca().set_xticks(locs)
    # hacking time!
    labels = pyplot.gca().get_xticklabels()
    if startindex >= 0:
        labels[startindex].set_text(date_range_sampling[0].date())
        labels[startindex].set_text = lambda _: None
        labels[startindex].set_rotation(30)
        labels[startindex].set_ha("right")
    if endindex >= 0:
        labels[endindex].set_text(date_range_sampling[-1].date())
        labels[endindex].set_text = lambda _: None
        labels[endindex].set_rotation(30)
        labels[endindex].set_ha("right")
    title = "%s %d x %d (granularity %d, sampling %d)" % \
        ((name,) + matrix.shape + (granularity, sampling))
    output = args.output
    if output:
        if args.mode == "project" and target == "project":
            output = args.output
        else:
            if target == "project":
                name = "project"
            output = get_plot_path(args.output, name)
    deploy_plot(title, output, args.background)

def plot_many_burndown(args, target, header, parts):
    if not args.output:
        print("Warning: output not set, showing %d plots." % len(parts))
    itercnt = progress.bar(parts, expected_size=len(parts)) \
        if progress is not None else parts
    stdout = io.StringIO()
    for name, matrix in itercnt:
        backup = sys.stdout
        sys.stdout = stdout
        plot_burndown(args, target, *load_burndown(header, name, matrix, args.resample))
        sys.stdout = backup
    sys.stdout.write(stdout.getvalue())

def plot_churn_matrix(args, repo, people, matrix):
    if args.output and args.output.endswith(".json"):
        data = locals().copy()
        del data["args"]
        data["type"] = "churn_matrix"
        if args.mode == "all":
            output = get_plot_path(args.output, "matrix")
        else:
            output = args.output
        with open(output, "w") as fout:
            json.dump(data, fout, sort_keys=True, default=default_json)
        return

    matplotlib, pyplot = import_pyplot(args.backend, args.style)

    s = 4 + matrix.shape[1] * 0.3
    fig = pyplot.figure(figsize=(s, s))
    ax = fig.add_subplot(111)
    ax.xaxis.set_label_position("top")
    ax.matshow(matrix, cmap=pyplot.cm.OrRd)
    ax.set_xticks(numpy.arange(0, matrix.shape[1]))
    ax.set_yticks(numpy.arange(0, matrix.shape[0]))
    ax.set_yticklabels(people, va="center")
    ax.set_xticks(numpy.arange(0.5, matrix.shape[1] + 0.5), minor=True)
    ax.set_xticklabels(["Unidentified"] + people, rotation=45, ha="left",
                       va="bottom", rotation_mode="anchor")
    ax.set_yticks(numpy.arange(0.5, matrix.shape[0] + 0.5), minor=True)
    ax.grid(False)
    ax.grid(which="minor")
    apply_plot_style(fig, ax, None, args.background, args.font_size, args.size)
    if not args.output:
        pos1 = ax.get_position()
        pos2 = (pos1.x0 + 0.15, pos1.y0 - 0.1, pos1.width * 0.9, pos1.height * 0.9)
        ax.set_position(pos2)
    if args.mode == "all" and args.output:
        output = get_plot_path(args.output, "matrix")
    else:
        output = args.output
    title = "%s %d developers overwrite" % (repo, matrix.shape[0])
    if args.output:
        # FIXME(vmarkovtsev): otherwise the title is screwed in savefig()
        title = ""
    deploy_plot(title, output, args.background)

def plot_ownership(args, repo, names, people, date_range, last):
    if args.output and args.output.endswith(".json"):
        data = locals().copy()
        del data["args"]
        data["type"] = "ownership"
        if args.mode == "all" and args.output:
            output = get_plot_path(args.output, "people")
        else:
            output = args.output
        with open(output, "w") as fout:
            json.dump(data, fout, sort_keys=True, default=default_json)
        return

    matplotlib, pyplot = import_pyplot(args.backend, args.style)

    # normalize before stacking, otherwise --relative has no effect on the drawn plot
    if args.relative:
        for i in range(people.shape[1]):
            people[:, i] /= people[:, i].sum()
        pyplot.ylim(0, 1)
        legend_loc = 3
    else:
        legend_loc = 2
    pyplot.stackplot(date_range, people, labels=names)
    pyplot.xlim(parse_date(args.start_date, date_range[0]), parse_date(args.end_date, last))
    legend = pyplot.legend(loc=legend_loc, fontsize=args.font_size)
    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.background,
                     args.font_size, args.size)
    if args.mode == "all" and args.output:
        output = get_plot_path(args.output, "people")
    else:
        output = args.output
    deploy_plot("%s code ownership through time" % repo, output, args.background)


IDEAL_SHARD_SIZE = 4096
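

# Train Swivel embeddings over a co-occurrence matrix: clip the top percentile of
# counts, split the matrix into roughly equal shards serialized as tf.train.Example
# protos, run the bundled swivel trainer on them, and average the resulting row and
# column embeddings.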
def train_embeddings(index, matrix, tmpdir, shard_size=IDEAL_SHARD_SIZE):
    try:
        from . import swivel
    except (SystemError, ImportError):
        import swivel
    import tensorflow as tf

    assert matrix.shape[0] == matrix.shape[1]
    assert len(index) <= matrix.shape[0]
    outlier_threshold = numpy.percentile(matrix.data, 99)
    matrix.data[matrix.data > outlier_threshold] = outlier_threshold
    nshards = len(index) // shard_size
    if nshards * shard_size < len(index):
        nshards += 1
        shard_size = len(index) // nshards
        nshards = len(index) // shard_size
    remainder = len(index) - nshards * shard_size
    if remainder > 0:
        lengths = matrix.indptr[1:] - matrix.indptr[:-1]
        filtered = sorted(numpy.argsort(lengths)[remainder:])
    else:
        filtered = list(range(len(index)))
    if len(filtered) < matrix.shape[0]:
        print("Truncating the sparse matrix...")
        matrix = matrix[filtered, :][:, filtered]
    meta_index = []
    for i, j in enumerate(filtered):
        meta_index.append((index[j], matrix[i, i]))
    index = [mi[0] for mi in meta_index]
    with tempfile.TemporaryDirectory(prefix="hercules_labours_", dir=tmpdir or None) as tmproot:
        print("Writing Swivel metadata...")
        vocabulary = "\n".join(index)
        with open(os.path.join(tmproot, "row_vocab.txt"), "w") as out:
            out.write(vocabulary)
        with open(os.path.join(tmproot, "col_vocab.txt"), "w") as out:
            out.write(vocabulary)
        del vocabulary
        bool_sums = matrix.indptr[1:] - matrix.indptr[:-1]
        bool_sums_str = "\n".join(map(str, bool_sums.tolist()))
        with open(os.path.join(tmproot, "row_sums.txt"), "w") as out:
            out.write(bool_sums_str)
        with open(os.path.join(tmproot, "col_sums.txt"), "w") as out:
            out.write(bool_sums_str)
        del bool_sums_str
        reorder = numpy.argsort(-bool_sums)

        print("Writing Swivel shards...")
        for row in range(nshards):
            for col in range(nshards):
                def _int64s(xs):
                    return tf.train.Feature(
                        int64_list=tf.train.Int64List(value=list(xs)))

                def _floats(xs):
                    return tf.train.Feature(
                        float_list=tf.train.FloatList(value=list(xs)))

                indices_row = reorder[row::nshards]
                indices_col = reorder[col::nshards]
                shard = matrix[indices_row][:, indices_col].tocoo()

                example = tf.train.Example(features=tf.train.Features(feature={
                    "global_row": _int64s(indices_row),
                    "global_col": _int64s(indices_col),
                    "sparse_local_row": _int64s(shard.row),
                    "sparse_local_col": _int64s(shard.col),
                    "sparse_value": _floats(shard.data)}))

                with open(os.path.join(tmproot, "shard-%03d-%03d.pb" % (row, col)), "wb") as out:
                    out.write(example.SerializeToString())
        print("Training Swivel model...")
        swivel.FLAGS.submatrix_rows = shard_size
        swivel.FLAGS.submatrix_cols = shard_size
        if len(meta_index) <= IDEAL_SHARD_SIZE / 16:
            embedding_size = 50
            num_epochs = 100000
        elif len(meta_index) <= IDEAL_SHARD_SIZE:
            embedding_size = 50
            num_epochs = 50000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 2:
            embedding_size = 60
            num_epochs = 10000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 4:
            embedding_size = 70
            num_epochs = 8000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 10:
            embedding_size = 80
            num_epochs = 5000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 25:
            embedding_size = 100
            num_epochs = 1000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 100:
            embedding_size = 200
            num_epochs = 600
        else:
            embedding_size = 300
            num_epochs = 300
        if os.getenv("CI"):
            # Travis, AppVeyor etc. during the integration tests
            num_epochs /= 10
        swivel.FLAGS.embedding_size = embedding_size
        swivel.FLAGS.input_base_path = tmproot
        swivel.FLAGS.output_base_path = tmproot
        swivel.FLAGS.loss_multiplier = 1.0 / shard_size
        swivel.FLAGS.num_epochs = num_epochs
        # Tensorflow 1.5 parses sys.argv unconditionally *applause*
        argv_backup = sys.argv[1:]
        del sys.argv[1:]
        swivel.main(None)
        sys.argv.extend(argv_backup)
        print("Reading Swivel embeddings...")
        embeddings = []
        with open(os.path.join(tmproot, "row_embedding.tsv")) as frow:
            with open(os.path.join(tmproot, "col_embedding.tsv")) as fcol:
                for i, (lrow, lcol) in enumerate(zip(frow, fcol)):
                    prow, pcol = (l.split("\t", 1) for l in (lrow, lcol))
                    assert prow[0] == pcol[0]
                    erow, ecol = \
                        (numpy.fromstring(p[1], dtype=numpy.float32, sep="\t")
                         for p in (prow, pcol))
                    embeddings.append((erow + ecol) / 2)
    return meta_index, embeddings
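

# A minimal static file server on a background thread that adds
# "Access-Control-Allow-Origin: *", so that projector.tensorflow.org is allowed to
# fetch the locally generated embedding files.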
class CORSWebServer(object):
    def __init__(self):
        self.thread = threading.Thread(target=self.serve)
        self.server = None

    def serve(self):
        outer = self
        try:
            from http.server import HTTPServer, SimpleHTTPRequestHandler, test
        except ImportError:  # Python 2
            from BaseHTTPServer import HTTPServer, test
            from SimpleHTTPServer import SimpleHTTPRequestHandler

        class ClojureServer(HTTPServer):
            def __init__(self, *args, **kwargs):
                HTTPServer.__init__(self, *args, **kwargs)
                outer.server = self

        class CORSRequestHandler(SimpleHTTPRequestHandler):
            def end_headers(self):
                self.send_header("Access-Control-Allow-Origin", "*")
                SimpleHTTPRequestHandler.end_headers(self)

        test(CORSRequestHandler, ClojureServer)

    def start(self):
        self.thread.start()

    def stop(self):
        if self.running:
            self.server.shutdown()
            self.thread.join()

    @property
    def running(self):
        return self.server is not None


web_server = CORSWebServer()

def write_embeddings(name, output, run_server, index, embeddings):
    print("Writing Tensorflow Projector files...")
    if not output:
        output = "couples_" + name
    if output.endswith(".json"):
        output = os.path.join(output[:-5], "couples")
        run_server = False
    metaf = "%s_%s_meta.tsv" % (output, name)
    with open(metaf, "w") as fout:
        fout.write("name\tcommits\n")
        for pair in index:
            fout.write("%s\t%s\n" % pair)
    print("Wrote", metaf)
    dataf = "%s_%s_data.tsv" % (output, name)
    with open(dataf, "w") as fout:
        for vec in embeddings:
            fout.write("\t".join(str(v) for v in vec))
            fout.write("\n")
    print("Wrote", dataf)
    jsonf = "%s_%s.json" % (output, name)
    with open(jsonf, "w") as fout:
        fout.write("""{
  "embeddings": [
    {
      "tensorName": "%s %s coupling",
      "tensorShape": [%s, %s],
      "tensorPath": "http://0.0.0.0:8000/%s",
      "metadataPath": "http://0.0.0.0:8000/%s"
    }
  ]
}
""" % (output, name, len(embeddings), len(embeddings[0]), dataf, metaf))
    print("Wrote %s" % jsonf)
    if run_server and not web_server.running:
        web_server.start()
    url = "http://projector.tensorflow.org/?config=http://0.0.0.0:8000/" + jsonf
    print(url)
    if run_server:
        if shutil.which("xdg-open") is not None:
            os.system("xdg-open " + url)
        else:
            browser = os.getenv("BROWSER", "")
            if browser:
                os.system(browser + " " + url)
            else:
                print("\t" + url)

def show_shotness_stats(data):
    top = sorted(((r.counters[i], i) for i, r in enumerate(data)), reverse=True)
    for count, i in top:
        r = data[i]
        print("%8d %s:%s [%s]" % (count, r.file, r.name, r.internal_role))

def show_sentiment_stats(args, name, resample, start_date, data):
    matplotlib, pyplot = import_pyplot(args.backend, args.style)

    start_date = datetime.fromtimestamp(start_date)
    data = sorted(data.items())
    xdates = [start_date + timedelta(days=d[0]) for d in data]
    xpos = []
    ypos = []
    xneg = []
    yneg = []
    for x, (_, y) in zip(xdates, data):
        y = 0.5 - y.Value
        if y > 0:
            xpos.append(x)
            ypos.append(y)
        else:
            xneg.append(x)
            yneg.append(y)
    pyplot.bar(xpos, ypos, color="g", label="Positive")
    pyplot.bar(xneg, yneg, color="r", label="Negative")
    legend = pyplot.legend(loc=1, fontsize=args.font_size)
    pyplot.ylabel("Lines of code")
    pyplot.xlabel("Time")
    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.background,
                     args.font_size, args.size)
    pyplot.xlim(parse_date(args.start_date, xdates[0]), parse_date(args.end_date, xdates[-1]))
    locator = pyplot.gca().xaxis.get_major_locator()
    # set the optimal xticks locator
    if "M" not in resample:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
    locs = pyplot.gca().get_xticks().tolist()
    if len(locs) >= 16:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
        locs = pyplot.gca().get_xticks().tolist()
        if len(locs) >= 16:
            pyplot.gca().xaxis.set_major_locator(locator)
    if locs[0] < pyplot.xlim()[0]:
        del locs[0]
    endindex = -1
    if len(locs) >= 2 and pyplot.xlim()[1] - locs[-1] > (locs[-1] - locs[-2]) / 2:
        locs.append(pyplot.xlim()[1])
        endindex = len(locs) - 1
    startindex = -1
    if len(locs) >= 2 and locs[0] - pyplot.xlim()[0] > (locs[1] - locs[0]) / 2:
        locs.append(pyplot.xlim()[0])
        startindex = len(locs) - 1
    pyplot.gca().set_xticks(locs)
    # hacking time!
    labels = pyplot.gca().get_xticklabels()
    if startindex >= 0:
        labels[startindex].set_text(xdates[0].date())
        labels[startindex].set_text = lambda _: None
        labels[startindex].set_rotation(30)
        labels[startindex].set_ha("right")
    if endindex >= 0:
        labels[endindex].set_text(xdates[-1].date())
        labels[endindex].set_text = lambda _: None
        labels[endindex].set_rotation(30)
        labels[endindex].set_ha("right")
    overall_pos = sum(2 * (0.5 - d[1].Value) for d in data if d[1].Value < 0.5)
    overall_neg = sum(2 * (d[1].Value - 0.5) for d in data if d[1].Value > 0.5)
    title = "%s sentiment +%.1f -%.1f δ=%.1f" % (
        name, overall_pos, overall_neg, overall_pos - overall_neg)
    deploy_plot(title, args.output, args.background)
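

# Cluster and plot per-developer commit activity: each series is max-normalized with
# a sliding 7-day window, compared pairwise with dynamic time warping (fastdtw),
# ordered by solving a TSP over the distance matrix (seriation via ortools), and
# grouped by HDBSCAN over the cumulative distances along the found route.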
  1086. def show_devs(args, name, start_date, end_date, data):
  1087. try:
  1088. from fastdtw import fastdtw
  1089. except ImportError as e:
  1090. print("Cannot import fastdtw: %s\nInstall it from https://github.com/slaypni/fastdtw" % e)
  1091. sys.exit(1)
  1092. try:
  1093. from ortools.constraint_solver import pywrapcp, routing_enums_pb2
  1094. except ImportError as e:
  1095. print("Cannot import ortools: %s\nInstall it from "
  1096. "https://developers.google.com/optimization/install/python/" % e)
  1097. sys.exit(1)
  1098. try:
  1099. from hdbscan import HDBSCAN
  1100. except ImportError as e:
  1101. print("Cannot import ortools: %s\nInstall it from "
  1102. "https://developers.google.com/optimization/install/python/" % e)
  1103. sys.exit(1)
  1104. from scipy.signal import convolve, slepian
  1105. days, people = data
  1106. max_people = 50
  1107. if len(people) > max_people:
  1108. print("Picking top 100 developers by commit count")
  1109. # pick top N developers by commit count
  1110. commits = defaultdict(int)
  1111. for devs in days.values():
  1112. for dev, stats in devs.items():
  1113. commits[dev] += stats.Commits
  1114. commits = sorted(((v, k) for k, v in commits.items()), reverse=True)
  1115. chosen_people = {people[k] for _, k in commits[:max_people]}
  1116. else:
  1117. chosen_people = set(people)
  1118. devseries = defaultdict(list)
  1119. devstats = defaultdict(lambda: DevDay(0, 0, 0, 0, {}))
  1120. for day, devs in sorted(days.items()):
  1121. for dev, stats in devs.items():
  1122. if people[dev] in chosen_people:
  1123. devseries[dev].append((day, stats.Commits))
  1124. devstats[dev] = devstats[dev].add(stats)
  1125. print("Calculating the distance matrix")
  1126. # max-normalize the time series using a sliding window
  1127. keys = list(devseries.keys())
  1128. series = list(devseries.values())
  1129. for i, s in enumerate(series):
  1130. arr = numpy.array(s).transpose().astype(numpy.float32)
  1131. commits = arr[1]
  1132. if len(commits) < 7:
  1133. commits /= commits.max()
  1134. else:
  1135. # 4 is sizeof(float32)
  1136. windows = numpy.lib.stride_tricks.as_strided(commits, [len(commits) - 6, 7], [4, 4])
  1137. commits = numpy.concatenate((
  1138. [windows[0, 0] / windows[0].max(),
  1139. windows[0, 1] / windows[0].max(),
  1140. windows[0, 2] / windows[0].max()],
  1141. windows[:, 3] / windows.max(axis=1),
  1142. [windows[-1, 4] / windows[-1].max(),
  1143. windows[-1, 5] / windows[-1].max(),
  1144. windows[-1, 6] / windows[-1].max()]
  1145. ))
  1146. arr[1] = commits * 7 # 7 is a pure heuristic here and is not related to window size
  1147. series[i] = list(arr.transpose())
  1148. # calculate the distance matrix using dynamic time warping metric
  1149. dists = numpy.full((len(series) + 1, len(series) + 1), -100500, dtype=numpy.float32)
  1150. for x in range(len(series)):
  1151. dists[x, x] = 0
  1152. for y in range(x + 1, len(series)):
  1153. # L1 norm
  1154. dist, _ = fastdtw(series[x], series[y], radius=5, dist=1)
  1155. dists[x, y] = dists[y, x] = dist
  1156. # preparation for seriation ordering
  1157. dists[len(series), :] = 0
  1158. dists[:, len(series)] = 0
  1159. assert (dists >= 0).all()
  1160. print("Ordering the series")
  1161. # solve the TSP on the distance matrix
  1162. routing = pywrapcp.RoutingModel(dists.shape[0], 1, len(series))
  1163. def dist_callback(x, y):
  1164. # ortools wants integers, so we approximate here
  1165. return int(dists[x][y] * 1000)
  1166. routing.SetArcCostEvaluatorOfAllVehicles(dist_callback)
  1167. search_parameters = pywrapcp.RoutingModel.DefaultSearchParameters()
  1168. search_parameters.local_search_metaheuristic = (
  1169. routing_enums_pb2.LocalSearchMetaheuristic.GUIDED_LOCAL_SEARCH)
  1170. search_parameters.time_limit_ms = 2000
  1171. assignment = routing.SolveWithParameters(search_parameters)
  1172. index = routing.Start(0)
  1173. route = []
  1174. while not routing.IsEnd(index):
  1175. node = routing.IndexToNode(index)
  1176. if node < len(keys):
  1177. route.append(node)
  1178. index = assignment.Value(routing.NextVar(index))
  1179. route_map = {v: i for i, v in enumerate(route)}
  1180. # determine clusters
  1181. opt_dist_chain = numpy.cumsum(numpy.array(
  1182. [0] + [dists[route[i], route[i + 1]] for i in range(len(route) - 1)]))
  1183. clusters = HDBSCAN(min_cluster_size=2).fit_predict(opt_dist_chain[:, numpy.newaxis])
  1184. route = [keys[node] for node in route]
  1185. print("Plotting")
  1186. # smooth time series
  1187. start_date = datetime.fromtimestamp(start_date)
  1188. start_date = datetime(start_date.year, start_date.month, start_date.day)
  1189. end_date = datetime.fromtimestamp(end_date)
  1190. end_date = datetime(end_date.year, end_date.month, end_date.day)
  1191. size = (end_date - start_date).days + 1
  1192. plot_x = [start_date + timedelta(days=i) for i in range(size)]
  1193. resolution = 64
  1194. window = slepian(size // resolution, 0.5)
  1195. final = numpy.zeros((len(devseries), size), dtype=numpy.float32)
  1196. for i, s in enumerate(devseries.values()):
  1197. arr = numpy.array(s).transpose()
  1198. full_history = numpy.zeros(size, dtype=numpy.float32)
  1199. mask = arr[0] < size
  1200. full_history[arr[0][mask]] = arr[1][mask]
  1201. final[route_map[i]] = convolve(full_history, window, "same")
    matplotlib, pyplot = import_pyplot(args.backend, args.style)
    pyplot.rcParams["figure.figsize"] = (32, 16)
    prop_cycle = pyplot.rcParams["axes.prop_cycle"]
    colors = prop_cycle.by_key()["color"]
    fig, axes = pyplot.subplots(final.shape[0], 1)
    backgrounds = ("#C4FFDB", "#FFD0CD") if args.background == "white" else ("#05401C", "#40110E")
    for ax, series, cluster, dev_i in zip(axes, final, clusters, route):
        if cluster >= 0:
            color = colors[cluster % len(colors)]
        else:
            # outlier
            color = "grey"
        ax.fill_between(plot_x, series, color=color)
        ax.set_axis_off()
        author = people[dev_i]
        ax.text(0.03, 0.5, author[:36] + (author[36:] and "..."),
                horizontalalignment="right", verticalalignment="center",
                transform=ax.transAxes, fontsize=14,
                color="black" if args.background == "white" else "white")
        ds = devstats[dev_i]
        stats = "%5d %8s %8s" % (ds[0], _format_number(ds[1] - ds[2]), _format_number(ds[3]))
        ax.text(0.97, 0.5, stats,
                horizontalalignment="left", verticalalignment="center",
                transform=ax.transAxes, fontsize=14, family="monospace",
                backgroundcolor=backgrounds[ds[1] <= ds[2]],
                color="black" if args.background == "white" else "white")
    axes[0].text(0.97, 1.75, " cmts    delta  changed",
                 horizontalalignment="left", verticalalignment="center",
                 transform=axes[0].transAxes, fontsize=14, family="monospace",
                 color="black" if args.background == "white" else "white")
    axes[-1].set_axis_on()
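    # Choose the date ticks for the shared bottom axis: aim for roughly
    # target_num_labels labels and fall back from monthly to yearly ticks
    # when monthly ones would be too dense.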
    target_num_labels = 12
    num_months = (end_date.year - start_date.year) * 12 + end_date.month - start_date.month
    interval = int(numpy.ceil(num_months / target_num_labels))
    if interval >= 8:
        interval = int(numpy.ceil(num_months / (12 * target_num_labels)))
        axes[-1].xaxis.set_major_locator(matplotlib.dates.YearLocator(base=max(1, interval // 12)))
        axes[-1].xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y"))
    else:
        axes[-1].xaxis.set_major_locator(matplotlib.dates.MonthLocator(interval=interval))
        axes[-1].xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y-%m"))
    for tick in axes[-1].xaxis.get_major_ticks():
        tick.label.set_fontsize(args.font_size)
    axes[-1].spines["left"].set_visible(False)
    axes[-1].spines["right"].set_visible(False)
    axes[-1].spines["top"].set_visible(False)
    axes[-1].get_yaxis().set_visible(False)
    axes[-1].set_facecolor((1.0,) * 3 + (0.0,))
    title = ("%s commits" % name) if not args.output else ""
    deploy_plot(title, args.output, args.style)


def show_old_vs_new(args, name, start_date, end_date, data):
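    """Plot the daily volume of added lines versus removed and changed lines,
    both smoothed with a Slepian window."""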
    from scipy.signal import convolve, slepian

    days, people = data
    start_date = datetime.fromtimestamp(start_date)
    start_date = datetime(start_date.year, start_date.month, start_date.day)
    end_date = datetime.fromtimestamp(end_date)
    end_date = datetime(end_date.year, end_date.month, end_date.day)
    new_lines = numpy.zeros((end_date - start_date).days + 1)
    old_lines = numpy.zeros_like(new_lines)
    for day, devs in days.items():
        for stats in devs.values():
            new_lines[day] += stats.Added
            old_lines[day] += stats.Removed + stats.Changed
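
    # Smooth both daily series with a Slepian window, as show_devs() does.
    # Note: scipy.signal.slepian was deprecated upstream and later removed;
    # scipy.signal.windows.dpss is the suggested replacement in newer SciPy.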
    resolution = 32
    window = slepian(len(new_lines) // resolution, 0.5)
    new_lines = convolve(new_lines, window, "same")
    old_lines = convolve(old_lines, window, "same")
    matplotlib, pyplot = import_pyplot(args.backend, args.style)
    plot_x = [start_date + timedelta(days=i) for i in range(len(new_lines))]
    pyplot.fill_between(plot_x, new_lines, color="#8DB843", label="Changed new lines")
    pyplot.fill_between(plot_x, old_lines, color="#E14C35", label="Changed existing lines")
    pyplot.legend(loc=2, fontsize=args.font_size)
    for tick in chain(pyplot.gca().xaxis.get_major_ticks(), pyplot.gca().yaxis.get_major_ticks()):
        tick.label.set_fontsize(args.font_size)
    deploy_plot("Additions vs changes", args.output, args.style)


def show_languages(args, name, start_date, end_date, data):
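    """Print, per developer, the languages they touched ordered by the total
    number of affected lines."""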
    days, people = data
    devlangs = defaultdict(lambda: defaultdict(lambda: numpy.zeros(3, dtype=int)))
    for day, devs in days.items():
        for dev, stats in devs.items():
            for lang, vals in stats.Languages.items():
                devlangs[dev][lang] += vals
    devlangs = sorted(devlangs.items(), key=lambda p: -sum(x.sum() for x in p[1].values()))
    for dev, ls in devlangs:
        print()
        print("#", people[dev])
        ls = sorted(((vals.sum(), lang) for lang, vals in ls.items()), reverse=True)
        for vals, lang in ls:
            if lang:
                print("%s: %d" % (lang, vals))


def _format_number(n):
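    """Format a number compactly with a "K" or "M" suffix.

    For example, 999 stays "999", 1500 becomes "1.5K", 1000 becomes "1K"
    and 2500000 becomes "2.5M".
    """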
    if n == 0:
        return "0"
    power = int(numpy.log10(abs(n)))
    if power >= 6:
        n = n / 1000000
        if n >= 10:
            n = str(int(n))
        else:
            n = "%.1f" % n
            if n.endswith("0"):
                n = n[:-2]
        suffix = "M"
    elif power >= 3:
        n = n / 1000
        if n >= 10:
            n = str(int(n))
        else:
            n = "%.1f" % n
            if n.endswith("0"):
                n = n[:-2]
        suffix = "K"
    else:
        n = str(n)
        suffix = ""
    return n + suffix


def main():
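    """Parse the command line arguments, read the input and dispatch to the
    handler selected by args.mode ("all" runs most handlers in sequence)."""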
    args = parse_args()
    reader = read_input(args)
    header = reader.get_header()
    name = reader.get_name()

    burndown_warning = "Burndown stats were not collected. Re-run hercules with --burndown."
    burndown_files_warning = \
        "Burndown stats for files were not collected. Re-run hercules with " \
        "--burndown --burndown-files."
    burndown_people_warning = \
        "Burndown stats for people were not collected. Re-run hercules with " \
        "--burndown --burndown-people."
    couples_warning = "Coupling stats were not collected. Re-run hercules with --couples."
    shotness_warning = "Structural hotness stats were not collected. Re-run hercules with " \
                       "--shotness. Also check --languages - the output may be empty."
    sentiment_warning = "Sentiment stats were not collected. Re-run hercules with --sentiment."
    devs_warning = "Devs stats were not collected. Re-run hercules with --devs."

    def run_times():
        rt = reader.get_run_times()
        pandas = import_pandas()
        series = pandas.to_timedelta(pandas.Series(rt).sort_values(ascending=False), unit="s")
        df = pandas.concat([series, series / series.sum()], axis=1)
        df.columns = ["time", "ratio"]
        print(df)

    def project_burndown():
        try:
            full_header = header + reader.get_burndown_parameters()
        except KeyError:
            print("project: " + burndown_warning)
            return
        plot_burndown(args, "project",
                      *load_burndown(full_header, *reader.get_project_burndown(),
                                     resample=args.resample))

    def files_burndown():
        try:
            full_header = header + reader.get_burndown_parameters()
        except KeyError:
            print(burndown_warning)
            return
        try:
            plot_many_burndown(args, "file", full_header, reader.get_files_burndown())
        except KeyError:
            print("files: " + burndown_files_warning)

    def people_burndown():
        try:
            full_header = header + reader.get_burndown_parameters()
        except KeyError:
            print(burndown_warning)
            return
        try:
            plot_many_burndown(args, "person", full_header, reader.get_people_burndown())
        except KeyError:
            print("people: " + burndown_people_warning)

    def churn_matrix():
        try:
            plot_churn_matrix(args, name, *load_churn_matrix(
                *reader.get_people_interaction(), max_people=args.max_people))
        except KeyError:
            print("churn_matrix: " + burndown_people_warning)

    def ownership_burndown():
        try:
            full_header = header + reader.get_burndown_parameters()
        except KeyError:
            print(burndown_warning)
            return
        try:
            plot_ownership(args, name, *load_ownership(
                full_header, *reader.get_ownership_burndown(), max_people=args.max_people))
        except KeyError:
            print("ownership: " + burndown_people_warning)

    def couples_files():
        try:
            write_embeddings("files", args.output, not args.disable_projector,
                             *train_embeddings(*reader.get_files_coocc(),
                                               tmpdir=args.couples_tmp_dir))
        except KeyError:
            print(couples_warning)

    def couples_people():
        try:
            write_embeddings("people", args.output, not args.disable_projector,
                             *train_embeddings(*reader.get_people_coocc(),
                                               tmpdir=args.couples_tmp_dir))
        except KeyError:
            print(couples_warning)

    def couples_shotness():
        try:
            write_embeddings("shotness", args.output, not args.disable_projector,
                             *train_embeddings(*reader.get_shotness_coocc(),
                                               tmpdir=args.couples_tmp_dir))
        except KeyError:
            print(shotness_warning)

    def shotness():
        try:
            data = reader.get_shotness()
        except KeyError:
            print(shotness_warning)
            return
        show_shotness_stats(data)

    def sentiment():
        try:
            data = reader.get_sentiment()
        except KeyError:
            print(sentiment_warning)
            return
        show_sentiment_stats(args, reader.get_name(), args.resample, reader.get_header()[0], data)

    def devs():
        try:
            data = reader.get_devs()
        except KeyError:
            print(devs_warning)
            return
        show_devs(args, reader.get_name(), *reader.get_header(), data)

    def old_vs_new():
        try:
            data = reader.get_devs()
        except KeyError:
            print(devs_warning)
            return
        show_old_vs_new(args, reader.get_name(), *reader.get_header(), data)

    def languages():
        try:
            data = reader.get_devs()
        except KeyError:
            print(devs_warning)
            return
        show_languages(args, reader.get_name(), *reader.get_header(), data)

    modes = {
        "run-times": run_times,
        "burndown-project": project_burndown,
        "burndown-file": files_burndown,
        "burndown-person": people_burndown,
        "churn-matrix": churn_matrix,
        "ownership": ownership_burndown,
        "couples-files": couples_files,
        "couples-people": couples_people,
        "couples-shotness": couples_shotness,
        "shotness": shotness,
        "sentiment": sentiment,
        "devs": devs,
        "old-vs-new": old_vs_new,
        "languages": languages,
    }
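
    # Look the handler up before calling it so that a KeyError raised inside
    # a handler is not mistaken for an unknown mode; "all" is deliberately
    # absent from the dict and is expanded below.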
    try:
        mode = modes[args.mode]
    except KeyError:
        assert args.mode == "all"
        project_burndown()
        files_burndown()
        people_burndown()
        churn_matrix()
        ownership_burndown()
        couples_files()
        couples_people()
        couples_shotness()
        shotness()
        sentiment()
        devs()
    else:
        mode()
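
    # If the couples/embeddings code started its local web server (e.g. to
    # feed the TensorFlow Projector), keep the process alive so the data
    # stays reachable; COUPLES_SERVER_TIME overrides the 60 second default.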
    if web_server.running:
        secs = int(os.getenv("COUPLES_SERVER_TIME", "60"))
        print("Sleeping for %d seconds, safe to Ctrl-C" % secs)
        sys.stdout.flush()
        try:
            time.sleep(secs)
        except KeyboardInterrupt:
            pass
        web_server.stop()


if __name__ == "__main__":
    sys.exit(main())