#!/usr/bin/env python3
import argparse
from collections import defaultdict, namedtuple
import contextlib
from datetime import datetime, timedelta
from importlib import import_module
import io
from itertools import chain
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
import threading
import time
import warnings

import numpy
import tqdm
import yaml


def list_matplotlib_styles():
    script = "import sys; from matplotlib import pyplot; " \
             "sys.stdout.write(repr(pyplot.style.available))"
    styles = eval(subprocess.check_output([sys.executable, "-c", script]))
    styles.remove("classic")
    return ["default", "classic"] + styles
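
# The style list is queried in a child process, presumably so that matplotlib
# is not imported into this process (and its backend fixed) before the
# --backend option from parse_args() can take effect.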


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--output", default="",
                        help="Path to the output file/directory (empty for display). "
                             "If the extension is JSON, the data is saved instead of "
                             "the real image.")
    parser.add_argument("-i", "--input", default="-",
                        help="Path to the input file (- for stdin).")
    parser.add_argument("-f", "--input-format", default="auto", choices=["yaml", "pb", "auto"])
    parser.add_argument("--font-size", default=12, type=int,
                        help="Size of the labels and legend.")
    parser.add_argument("--style", default="ggplot", choices=list_matplotlib_styles(),
                        help="Plot style to use.")
    parser.add_argument("--backend", help="Matplotlib backend to use.")
    parser.add_argument("--background", choices=["black", "white"], default="white",
                        help="Plot's general color scheme.")
    parser.add_argument("--size", help="Axes' size in inches, for example \"12,9\"")
    parser.add_argument("--relative", action="store_true",
                        help="Occupy 100%% height for every measurement.")
    parser.add_argument("--tmpdir", help="Temporary directory for intermediate files.")
    parser.add_argument("-m", "--mode", dest="modes", default=[], action="append",
                        choices=["burndown-project", "burndown-file", "burndown-person",
                                 "overwrites-matrix", "ownership", "couples-files",
                                 "couples-people", "couples-shotness", "shotness", "sentiment",
                                 "devs", "devs-efforts", "old-vs-new", "run-times",
                                 "languages", "devs-parallel", "all"],
                        help="What to plot. Can be repeated, e.g. "
                             "-m burndown-project -m run-times")
    parser.add_argument(
        "--resample", default="year",
        help="The way to resample the time series. Possible values are: "
             "\"month\", \"year\", \"no\", \"raw\" and pandas offset aliases ("
             "http://pandas.pydata.org/pandas-docs/stable/timeseries.html"
             "#offset-aliases).")
    dateutil_url = "https://dateutil.readthedocs.io/en/stable/parser.html#dateutil.parser.parse"
    parser.add_argument("--start-date",
                        help="Start date of time-based plots. Any format is accepted which is "
                             "supported by %s" % dateutil_url)
    parser.add_argument("--end-date",
                        help="End date of time-based plots. Any format is accepted which is "
                             "supported by %s" % dateutil_url)
    parser.add_argument("--disable-projector", action="store_true",
                        help="Do not run Tensorflow Projector on couples.")
    parser.add_argument("--max-people", default=20, type=int,
                        help="Maximum number of developers in overwrites matrix and people plots.")
    parser.add_argument("--order-ownership-by-time", action="store_true",
                        help="Sort developers in the ownership plot according to their first "
                             "appearance in the history. The default is sorting by the number of "
                             "commits.")
    args = parser.parse_args()
    return args


class Reader(object):
    def read(self, file):
        raise NotImplementedError

    def get_name(self):
        raise NotImplementedError

    def get_header(self):
        raise NotImplementedError

    def get_burndown_parameters(self):
        raise NotImplementedError

    def get_project_burndown(self):
        raise NotImplementedError

    def get_files_burndown(self):
        raise NotImplementedError

    def get_people_burndown(self):
        raise NotImplementedError

    def get_ownership_burndown(self):
        raise NotImplementedError

    def get_people_interaction(self):
        raise NotImplementedError

    def get_files_coocc(self):
        raise NotImplementedError

    def get_people_coocc(self):
        raise NotImplementedError

    def get_shotness_coocc(self):
        raise NotImplementedError

    def get_shotness(self):
        raise NotImplementedError

    def get_sentiment(self):
        raise NotImplementedError

    def get_devs(self):
        raise NotImplementedError


class YamlReader(Reader):
    def read(self, file):
        yaml.reader.Reader.NON_PRINTABLE = re.compile(r"(?!x)x")
        try:
            loader = yaml.CLoader
        except AttributeError:
            print("Warning: failed to import yaml.CLoader, falling back to slow yaml.Loader")
            loader = yaml.Loader
        try:
            if file != "-":
                with open(file) as fin:
                    data = yaml.load(fin, Loader=loader)
            else:
                data = yaml.load(sys.stdin, Loader=loader)
        except (UnicodeEncodeError, yaml.reader.ReaderError) as e:
            print("\nInvalid unicode in the input: %s\nPlease filter it through "
                  "fix_yaml_unicode.py" % e)
            sys.exit(1)
        if data is None:
            print("\nNo data has been read - has Hercules crashed?")
            sys.exit(1)
        self.data = data

    def get_run_times(self):
        return {}

    def get_name(self):
        return self.data["hercules"]["repository"]

    def get_header(self):
        header = self.data["hercules"]
        return header["begin_unix_time"], header["end_unix_time"]

    def get_burndown_parameters(self):
        header = self.data["Burndown"]
        return header["sampling"], header["granularity"], header["tick_size"]

    def get_project_burndown(self):
        return self.data["hercules"]["repository"], \
            self._parse_burndown_matrix(self.data["Burndown"]["project"]).T

    def get_files_burndown(self):
        return [(p[0], self._parse_burndown_matrix(p[1]).T)
                for p in self.data["Burndown"]["files"].items()]

    def get_people_burndown(self):
        return [(p[0], self._parse_burndown_matrix(p[1]).T)
                for p in self.data["Burndown"]["people"].items()]

    def get_ownership_burndown(self):
        return self.data["Burndown"]["people_sequence"].copy(), \
            {p[0]: self._parse_burndown_matrix(p[1])
             for p in self.data["Burndown"]["people"].items()}

    def get_people_interaction(self):
        return self.data["Burndown"]["people_sequence"].copy(), \
            self._parse_burndown_matrix(self.data["Burndown"]["people_interaction"])

    def get_files_coocc(self):
        coocc = self.data["Couples"]["files_coocc"]
        return coocc["index"], self._parse_coocc_matrix(coocc["matrix"])

    def get_people_coocc(self):
        coocc = self.data["Couples"]["people_coocc"]
        return coocc["index"], self._parse_coocc_matrix(coocc["matrix"])

    def get_shotness_coocc(self):
        shotness = self.data["Shotness"]
        index = ["%s:%s" % (i["file"], i["name"]) for i in shotness]
        indptr = numpy.zeros(len(shotness) + 1, dtype=numpy.int64)
        indices = []
        data = []
        for i, record in enumerate(shotness):
            pairs = [(int(k), v) for k, v in record["counters"].items()]
            pairs.sort()
            indptr[i + 1] = indptr[i] + len(pairs)
            for k, v in pairs:
                indices.append(k)
                data.append(v)
        indices = numpy.array(indices, dtype=numpy.int32)
        data = numpy.array(data, dtype=numpy.int32)
        from scipy.sparse import csr_matrix
        return index, csr_matrix((data, indices, indptr), shape=(len(shotness),) * 2)

    def get_shotness(self):
        from munch import munchify
        obj = munchify(self.data["Shotness"])
        # turn strings into ints
        for item in obj:
            item.counters = {int(k): v for k, v in item.counters.items()}
        if len(obj) == 0:
            raise KeyError
        return obj

    def get_sentiment(self):
        from munch import munchify
        return munchify({int(key): {
            "Comments": vals[2].split("|"),
            "Commits": vals[1],
            "Value": float(vals[0])
        } for key, vals in self.data["Sentiment"].items()})

    def get_devs(self):
        people = self.data["Devs"]["people"]
        days = {int(d): {int(dev): DevDay(*(int(x) for x in day[:-1]), day[-1])
                         for dev, day in devs.items()}
                for d, devs in self.data["Devs"]["ticks"].items()}
        return people, days

    def _parse_burndown_matrix(self, matrix):
        return numpy.array([numpy.fromstring(line, dtype=int, sep=" ")
                            for line in matrix.split("\n")])

    def _parse_coocc_matrix(self, matrix):
        from scipy.sparse import csr_matrix
        data = []
        indices = []
        indptr = [0]
        for row in matrix:
            for k, v in sorted(row.items()):
                data.append(v)
                indices.append(k)
            indptr.append(indptr[-1] + len(row))
        return csr_matrix((data, indices, indptr), shape=(len(matrix),) * 2)


class ProtobufReader(Reader):
    def read(self, file):
        try:
            from labours.pb_pb2 import AnalysisResults
        except ImportError as e:
            print("\n\n>>> You need to generate python/hercules/pb/pb_pb2.py - run \"make\"\n",
                  file=sys.stderr)
            raise e from None
        self.data = AnalysisResults()
        if file != "-":
            with open(file, "rb") as fin:
                bytes = fin.read()
        else:
            bytes = sys.stdin.buffer.read()
        if not bytes:
            raise ValueError("empty input")
        self.data.ParseFromString(bytes)
        self.contents = {}
        for key, val in self.data.contents.items():
            try:
                mod, name = PB_MESSAGES[key].rsplit(".", 1)
            except KeyError:
                sys.stderr.write("Warning: there is no registered PB decoder for %s\n" % key)
                continue
            cls = getattr(import_module(mod), name)
            self.contents[key] = msg = cls()
            msg.ParseFromString(val)

    def get_run_times(self):
        return {key: val for key, val in self.data.header.run_time_per_item.items()}

    def get_name(self):
        return self.data.header.repository

    def get_header(self):
        header = self.data.header
        return header.begin_unix_time, header.end_unix_time

    def get_burndown_parameters(self):
        burndown = self.contents["Burndown"]
        return burndown.sampling, burndown.granularity, burndown.tick_size / 1000000000

    def get_project_burndown(self):
        return self._parse_burndown_matrix(self.contents["Burndown"].project)

    def get_files_burndown(self):
        return [self._parse_burndown_matrix(i) for i in self.contents["Burndown"].files]

    def get_people_burndown(self):
        return [self._parse_burndown_matrix(i) for i in self.contents["Burndown"].people]

    def get_ownership_burndown(self):
        people = self.get_people_burndown()
        return [p[0] for p in people], {p[0]: p[1].T for p in people}

    def get_people_interaction(self):
        burndown = self.contents["Burndown"]
        return [i.name for i in burndown.people], \
            self._parse_sparse_matrix(burndown.people_interaction).toarray()

    def get_files_coocc(self):
        node = self.contents["Couples"].file_couples
        return list(node.index), self._parse_sparse_matrix(node.matrix)

    def get_people_coocc(self):
        node = self.contents["Couples"].people_couples
        return list(node.index), self._parse_sparse_matrix(node.matrix)

    def get_shotness_coocc(self):
        shotness = self.get_shotness()
        index = ["%s:%s" % (i.file, i.name) for i in shotness]
        indptr = numpy.zeros(len(shotness) + 1, dtype=numpy.int32)
        indices = []
        data = []
        for i, record in enumerate(shotness):
            pairs = list(record.counters.items())
            pairs.sort()
            indptr[i + 1] = indptr[i] + len(pairs)
            for k, v in pairs:
                indices.append(k)
                data.append(v)
        indices = numpy.array(indices, dtype=numpy.int32)
        data = numpy.array(data, dtype=numpy.int32)
        from scipy.sparse import csr_matrix
        return index, csr_matrix((data, indices, indptr), shape=(len(shotness),) * 2)

    def get_shotness(self):
        records = self.contents["Shotness"].records
        if len(records) == 0:
            raise KeyError
        return records

    def get_sentiment(self):
        byday = self.contents["Sentiment"].SentimentByDay
        if len(byday) == 0:
            raise KeyError
        return byday

    def get_devs(self):
        people = list(self.contents["Devs"].dev_index)
        days = {d: {dev: DevDay(stats.commits, stats.stats.added, stats.stats.removed,
                                stats.stats.changed,
                                {k: [v.added, v.removed, v.changed]
                                 for k, v in stats.languages.items()})
                    for dev, stats in day.devs.items()}
                for d, day in self.contents["Devs"].ticks.items()}
        return people, days

    def _parse_burndown_matrix(self, matrix):
        dense = numpy.zeros((matrix.number_of_rows, matrix.number_of_columns), dtype=int)
        for y, row in enumerate(matrix.rows):
            for x, col in enumerate(row.columns):
                dense[y, x] = col
        return matrix.name, dense.T

    def _parse_sparse_matrix(self, matrix):
        from scipy.sparse import csr_matrix
        return csr_matrix((list(matrix.data), list(matrix.indices), list(matrix.indptr)),
                          shape=(matrix.number_of_rows, matrix.number_of_columns))


READERS = {"yaml": YamlReader, "yml": YamlReader, "pb": ProtobufReader}
PB_MESSAGES = {
    "Burndown": "labours.pb_pb2.BurndownAnalysisResults",
    "Couples": "labours.pb_pb2.CouplesAnalysisResults",
    "Shotness": "labours.pb_pb2.ShotnessAnalysisResults",
    "Devs": "labours.pb_pb2.DevsAnalysisResults",
}
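
# ProtobufReader.read() resolves each key of the protobuf "contents" map
# through PB_MESSAGES to the fully qualified message class that decodes the
# value; keys without a registered decoder are skipped with a warning.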


def read_input(args):
    sys.stdout.write("Reading the input... ")
    sys.stdout.flush()
    if args.input != "-":
        if args.input_format == "auto":
            try:
                args.input_format = args.input.rsplit(".", 1)[1]
            except IndexError:
                try:
                    with open(args.input) as f:
                        f.read(1 << 16)
                    args.input_format = "yaml"
                except UnicodeDecodeError:
                    args.input_format = "pb"
    elif args.input_format == "auto":
        args.input_format = "yaml"
    reader = READERS[args.input_format]()
    reader.read(args.input)
    print("done")
    return reader


class DevDay(namedtuple("DevDay", ("Commits", "Added", "Removed", "Changed", "Languages"))):
    def add(self, dd):
        langs = defaultdict(lambda: [0] * 3)
        for key, val in self.Languages.items():
            for i in range(3):
                langs[key][i] += val[i]
        for key, val in dd.Languages.items():
            for i in range(3):
                langs[key][i] += val[i]
        return DevDay(Commits=self.Commits + dd.Commits,
                      Added=self.Added + dd.Added,
                      Removed=self.Removed + dd.Removed,
                      Changed=self.Changed + dd.Changed,
                      Languages=dict(langs))
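
# A small example of DevDay.add() with hypothetical values:
#   DevDay(1, 10, 2, 3, {"Go": [10, 2, 3]}).add(DevDay(2, 5, 1, 0, {"Go": [5, 1, 0]}))
# returns DevDay(Commits=3, Added=15, Removed=3, Changed=3,
#                Languages={"Go": [15, 3, 3]}).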


def fit_kaplan_meier(matrix):
    from lifelines import KaplanMeierFitter
    T = []
    W = []
    indexes = numpy.arange(matrix.shape[0], dtype=int)
    entries = numpy.zeros(matrix.shape[0], int)
    dead = set()
    for i in range(1, matrix.shape[1]):
        diff = matrix[:, i - 1] - matrix[:, i]
        entries[diff < 0] = i
        mask = diff > 0
        deaths = diff[mask]
        T.append(numpy.full(len(deaths), i) - entries[indexes[mask]])
        W.append(deaths)
        entered = entries > 0
        entered[0] = True
        dead = dead.union(set(numpy.where((matrix[:, i] == 0) & entered)[0]))
    # add the survivors as censored
    nnzind = entries != 0
    nnzind[0] = True
    nnzind[sorted(dead)] = False
    T.append(numpy.full(nnzind.sum(), matrix.shape[1]) - entries[nnzind])
    W.append(matrix[nnzind, -1])
    T = numpy.concatenate(T)
    E = numpy.ones(len(T), bool)
    E[-nnzind.sum():] = 0
    W = numpy.concatenate(W)
    if T.size == 0:
        return None
    kmf = KaplanMeierFitter().fit(T, E, weights=W)
    return kmf
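
# In fit_kaplan_meier() the rows of `matrix` are code bands and the columns
# are samples: a drop in a band between consecutive samples counts as that
# many line "deaths", while lines still alive in the last sample enter the fit
# as right-censored observations (E == 0), weighted by their line counts.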


def print_survival_function(kmf, sampling):
    sf = kmf.survival_function_
    sf.index = [timedelta(days=d) for d in sf.index * sampling]
    sf.columns = ["Ratio of survived lines"]
    try:
        print(sf[len(sf) // 6::len(sf) // 6].append(sf.tail(1)))
    except ValueError:
        pass


def interpolate_burndown_matrix(matrix, granularity, sampling, progress=False):
    daily = numpy.zeros(
        (matrix.shape[0] * granularity, matrix.shape[1] * sampling),
        dtype=numpy.float32)
    """
    ----------> samples, x
    |
    |
    |
    bands, y
    """
    for y in tqdm.tqdm(range(matrix.shape[0]), disable=(not progress)):
        for x in range(matrix.shape[1]):
            if y * granularity > (x + 1) * sampling:
                # the future is zeros
                continue

            def decay(start_index: int, start_val: float):
                if start_val == 0:
                    return
                k = matrix[y][x] / start_val  # <= 1
                scale = (x + 1) * sampling - start_index
                for i in range(y * granularity, (y + 1) * granularity):
                    initial = daily[i][start_index - 1]
                    for j in range(start_index, (x + 1) * sampling):
                        daily[i][j] = initial * (
                            1 + (k - 1) * (j - start_index + 1) / scale)

            def grow(finish_index: int, finish_val: float):
                initial = matrix[y][x - 1] if x > 0 else 0
                start_index = x * sampling
                if start_index < y * granularity:
                    start_index = y * granularity
                if finish_index == start_index:
                    return
                avg = (finish_val - initial) / (finish_index - start_index)
                for j in range(x * sampling, finish_index):
                    for i in range(start_index, j + 1):
                        daily[i][j] = avg
                # copy [x*g..y*s)
                for j in range(x * sampling, finish_index):
                    for i in range(y * granularity, x * sampling):
                        daily[i][j] = daily[i][j - 1]

            if (y + 1) * granularity >= (x + 1) * sampling:
                # x*granularity <= (y+1)*sampling
                # 1. x*granularity <= y*sampling
                #    y*sampling..(y+1)sampling
                #
                #       x+1
                #        /
                #       /
                #      / y+1  -|
                #     /        |
                #    / y      -|
                #   /
                #  / x
                #
                # 2. x*granularity > y*sampling
                #    x*granularity..(y+1)sampling
                #
                #       x+1
                #        /
                #       /
                #      / y+1  -|
                #     /        |
                #    / x      -|
                #   /
                #  / y
                if y * granularity <= x * sampling:
                    grow((x + 1) * sampling, matrix[y][x])
                elif (x + 1) * sampling > y * granularity:
                    grow((x + 1) * sampling, matrix[y][x])
                    avg = matrix[y][x] / ((x + 1) * sampling - y * granularity)
                    for j in range(y * granularity, (x + 1) * sampling):
                        for i in range(y * granularity, j + 1):
                            daily[i][j] = avg
            elif (y + 1) * granularity >= x * sampling:
                # y*sampling <= (x+1)*granularity < (y+1)sampling
                # y*sampling..(x+1)*granularity
                # (x+1)*granularity..(y+1)sampling
                #        x+1
                #         /\
                #        /  \
                #       /    \
                #      /    y+1
                #     /
                #    y
                v1 = matrix[y][x - 1]
                v2 = matrix[y][x]
                delta = (y + 1) * granularity - x * sampling
                previous = 0
                if x > 0 and (x - 1) * sampling >= y * granularity:
                    # x*g <= (y-1)*s <= y*s <= (x+1)*g <= (y+1)*s
                    #           |________|.......^
                    if x > 1:
                        previous = matrix[y][x - 2]
                    scale = sampling
                else:
                    # (y-1)*s < x*g <= y*s <= (x+1)*g <= (y+1)*s
                    #            |______|.......^
                    scale = sampling if x == 0 else x * sampling - y * granularity
                peak = v1 + (v1 - previous) / scale * delta
                if v2 > peak:
                    # we need to adjust the peak, it may not be less than the decayed value
                    if x < matrix.shape[1] - 1:
                        # y*s <= (x+1)*g <= (y+1)*s < (y+2)*s
                        #           ^.........|_________|
                        k = (v2 - matrix[y][x + 1]) / sampling  # > 0
                        peak = matrix[y][x] + k * ((x + 1) * sampling - (y + 1) * granularity)
                        # peak > v2 > v1
                    else:
                        peak = v2
                        # not enough data to interpolate; this is at least not restricted
                grow((y + 1) * granularity, peak)
                decay((y + 1) * granularity, peak)
            else:
                # (x+1)*granularity < y*sampling
                # y*sampling..(y+1)sampling
                decay(x * sampling, matrix[y][x - 1])
    return daily
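
# interpolate_burndown_matrix() expands a (bands, samples) matrix into a
# (bands * granularity, samples * sampling) day-by-day matrix, replacing the
# staircase between measurements with linear growth and decay ramps.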


def import_pandas():
    import pandas
    try:
        from pandas.plotting import register_matplotlib_converters
        register_matplotlib_converters()
    except ImportError:
        pass
    return pandas


def floor_datetime(dt, duration):
    return datetime.fromtimestamp(dt.timestamp() - dt.timestamp() % duration)
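
# Example with hypothetical values: with a one-day tick (86400 seconds),
# floor_datetime(datetime(2019, 1, 2, 15, 30), 86400) snaps the timestamp down
# to the nearest epoch-aligned day boundary.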


def load_burndown(
        header,
        name,
        matrix,
        resample,
        report_survival=True,
        interpolation_progress=False
):
    pandas = import_pandas()
    start, last, sampling, granularity, tick = header
    assert sampling > 0
    assert granularity > 0
    start = floor_datetime(datetime.fromtimestamp(start), tick)
    last = datetime.fromtimestamp(last)
    if report_survival:
        kmf = fit_kaplan_meier(matrix)
        if kmf is not None:
            print_survival_function(kmf, sampling)
    finish = start + timedelta(seconds=matrix.shape[1] * sampling * tick)
    if resample not in ("no", "raw"):
        print("resampling to %s, please wait..." % resample)
        # Interpolate the day x day matrix.
        # Each day brings equal weight in the granularity.
        # Sampling's interpolation is linear.
        daily = interpolate_burndown_matrix(
            matrix=matrix,
            granularity=granularity,
            sampling=sampling,
            progress=interpolation_progress,
        )
        daily[(last - start).days:] = 0
        # Resample the bands
        aliases = {
            "year": "A",
            "month": "M"
        }
        resample = aliases.get(resample, resample)
        periods = 0
        date_granularity_sampling = [start]
        while date_granularity_sampling[-1] < finish:
            periods += 1
            date_granularity_sampling = pandas.date_range(
                start, periods=periods, freq=resample)
        if date_granularity_sampling[0] > finish:
            if resample == "A":
                print("too loose resampling - by year, trying by month")
                return load_burndown(header, name, matrix, "month", report_survival=False)
            else:
                raise ValueError("Too loose resampling: %s. Try finer." % resample)
        date_range_sampling = pandas.date_range(
            date_granularity_sampling[0],
            periods=(finish - date_granularity_sampling[0]).days,
            freq="1D")
        # Fill the new square matrix
        matrix = numpy.zeros(
            (len(date_granularity_sampling), len(date_range_sampling)),
            dtype=numpy.float32)
        for i, gdt in enumerate(date_granularity_sampling):
            istart = (date_granularity_sampling[i - 1] - start).days \
                if i > 0 else 0
            ifinish = (gdt - start).days
            for j, sdt in enumerate(date_range_sampling):
                if (sdt - start).days >= istart:
                    break
            matrix[i, j:] = \
                daily[istart:ifinish, (sdt - start).days:].sum(axis=0)
        # Hardcode some cases to improve labels' readability
        if resample in ("year", "A"):
            labels = [dt.year for dt in date_granularity_sampling]
        elif resample in ("month", "M"):
            labels = [dt.strftime("%Y %B") for dt in date_granularity_sampling]
        else:
            labels = [dt.date() for dt in date_granularity_sampling]
    else:
        labels = [
            "%s - %s" % ((start + timedelta(seconds=i * granularity * tick)).date(),
                         (start + timedelta(seconds=(i + 1) * granularity * tick)).date())
            for i in range(matrix.shape[0])]
        if len(labels) > 18:
            warnings.warn("Too many labels - consider resampling.")
        resample = "M"  # fake resampling type is checked while plotting
        date_range_sampling = pandas.date_range(
            start + timedelta(seconds=sampling * tick), periods=matrix.shape[1],
            freq="%dD" % sampling)
    return name, matrix, date_range_sampling, labels, granularity, sampling, resample


def load_ownership(header, sequence, contents, max_people, order_by_time):
    pandas = import_pandas()
    start, last, sampling, _, tick = header
    start = datetime.fromtimestamp(start)
    start = floor_datetime(start, tick)
    last = datetime.fromtimestamp(last)
    people = []
    for name in sequence:
        people.append(contents[name].sum(axis=1))
    people = numpy.array(people)
    date_range_sampling = pandas.date_range(
        start + timedelta(seconds=sampling * tick), periods=people[0].shape[0],
        freq="%dD" % sampling)
    if people.shape[0] > max_people:
        chosen = numpy.argpartition(-numpy.sum(people, axis=1), max_people)
        others = people[chosen[max_people:]].sum(axis=0)
        people = people[chosen[:max_people + 1]]
        people[max_people] = others
        sequence = [sequence[i] for i in chosen[:max_people]] + ["others"]
        print("Warning: truncated people to the most owning %d" % max_people)
    if order_by_time:
        appearances = numpy.argmax(people > 0, axis=1)
        if people.shape[0] > max_people:
            appearances[-1] = people.shape[1]
    else:
        appearances = -people.sum(axis=1)
        if people.shape[0] > max_people:
            appearances[-1] = 0
    order = numpy.argsort(appearances)
    people = people[order]
    sequence = [sequence[i] for i in order]
    for i, name in enumerate(sequence):
        if len(name) > 40:
            sequence[i] = name[:37] + "..."
    return sequence, people, date_range_sampling, last


def load_overwrites_matrix(people, matrix, max_people, normalize=True):
    matrix = matrix.astype(float)
    if matrix.shape[0] > max_people:
        order = numpy.argsort(-matrix[:, 0])
        matrix = matrix[order[:max_people]][:, [0, 1] + list(2 + order[:max_people])]
        people = [people[i] for i in order[:max_people]]
        print("Warning: truncated people to most productive %d" % max_people)
    if normalize:
        zeros = matrix[:, 0] == 0
        matrix[zeros, :] = 1
        matrix /= matrix[:, 0][:, None]
        matrix[zeros, :] = 0
    matrix = -matrix[:, 1:]
    for i, name in enumerate(people):
        if len(name) > 40:
            people[i] = name[:37] + "..."
    return people, matrix
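
# In the returned overwrites matrix, row i belongs to developer i: the
# original column 0 (the developer's own line count) normalizes the row and is
# then dropped, so the remaining columns ("Unidentified" plus one per kept
# developer) hold the overwritten shares, negated in sign for the color map.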


def import_pyplot(backend, style):
    import matplotlib
    if backend:
        matplotlib.use(backend)
    from matplotlib import pyplot
    pyplot.style.use(style)
    print("matplotlib: backend is", matplotlib.get_backend())
    return matplotlib, pyplot


def apply_plot_style(figure, axes, legend, background, font_size, axes_size):
    foreground = "black" if background == "white" else "white"
    if axes_size is None:
        axes_size = (16, 12)
    else:
        axes_size = tuple(float(p) for p in axes_size.split(","))
    figure.set_size_inches(*axes_size)
    for side in ("bottom", "top", "left", "right"):
        axes.spines[side].set_color(foreground)
    for axis in (axes.xaxis, axes.yaxis):
        axis.label.update(dict(fontsize=font_size, color=foreground))
    for axis in ("x", "y"):
        getattr(axes, axis + "axis").get_offset_text().set_size(font_size)
        axes.tick_params(axis=axis, colors=foreground, labelsize=font_size)
    try:
        axes.ticklabel_format(axis="y", style="sci", scilimits=(0, 3))
    except AttributeError:
        pass
    figure.patch.set_facecolor(background)
    axes.set_facecolor(background)
    if legend is not None:
        frame = legend.get_frame()
        for setter in (frame.set_facecolor, frame.set_edgecolor):
            setter(background)
        for text in legend.get_texts():
            text.set_color(foreground)


def get_plot_path(base, name):
    root, ext = os.path.splitext(base)
    if not ext:
        ext = ".png"
    output = os.path.join(root, name + ext)
    os.makedirs(os.path.dirname(output), exist_ok=True)
    return output
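
# Example with hypothetical values: get_plot_path("out.png", "project") makes
# the "out" directory if needed and returns "out/project.png"; a base without
# an extension defaults to ".png".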


def deploy_plot(title, output, background, tight=True):
    import matplotlib.pyplot as pyplot
    if not output:
        pyplot.gcf().canvas.set_window_title(title)
        pyplot.show()
    else:
        if title:
            pyplot.title(title, color="black" if background == "white" else "white")
        if tight:
            try:
                pyplot.tight_layout()
            except:  # noqa: E722
                print("Warning: failed to set the tight layout")
        print("Writing plot to %s" % output)
        pyplot.savefig(output, transparent=True)
    pyplot.clf()


def default_json(x):
    if hasattr(x, "tolist"):
        return x.tolist()
    if hasattr(x, "isoformat"):
        return x.isoformat()
    return x
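
# default_json() is meant as the `default` hook of json.dump(): for instance,
# json.dumps(numpy.arange(3), default=default_json) serializes the array as
# [0, 1, 2]; objects it does not recognize are passed through as-is.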


def parse_date(text, default):
    if not text:
        return default
    from dateutil.parser import parse
    return parse(text)


def plot_burndown(args, target, name, matrix, date_range_sampling, labels, granularity,
                  sampling, resample):
    if args.output and args.output.endswith(".json"):
        data = locals().copy()
        del data["args"]
        data["type"] = "burndown"
        if args.modes == ["burndown-project"] and target == "project":
            output = args.output
        else:
            if target == "project":
                name = "project"
            output = get_plot_path(args.output, name)
        with open(output, "w") as fout:
            json.dump(data, fout, sort_keys=True, default=default_json)
        return
    matplotlib, pyplot = import_pyplot(args.backend, args.style)
    # normalize the columns before plotting, otherwise --relative has no effect
    if args.relative:
        for i in range(matrix.shape[1]):
            matrix[:, i] /= matrix[:, i].sum()
        pyplot.ylim(0, 1)
        legend_loc = 3
    else:
        legend_loc = 2
    pyplot.stackplot(date_range_sampling, matrix, labels=labels)
    legend = pyplot.legend(loc=legend_loc, fontsize=args.font_size)
    pyplot.ylabel("Lines of code")
    pyplot.xlabel("Time")
    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.background,
                     args.font_size, args.size)
    pyplot.xlim(parse_date(args.start_date, date_range_sampling[0]),
                parse_date(args.end_date, date_range_sampling[-1]))
    locator = pyplot.gca().xaxis.get_major_locator()
    # set the optimal xticks locator
    if "M" not in resample:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
    locs = pyplot.gca().get_xticks().tolist()
    if len(locs) >= 16:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
        locs = pyplot.gca().get_xticks().tolist()
        if len(locs) >= 16:
            pyplot.gca().xaxis.set_major_locator(locator)
    if locs[0] < pyplot.xlim()[0]:
        del locs[0]
    endindex = -1
    if len(locs) >= 2 and pyplot.xlim()[1] - locs[-1] > (locs[-1] - locs[-2]) / 2:
        locs.append(pyplot.xlim()[1])
        endindex = len(locs) - 1
    startindex = -1
    if len(locs) >= 2 and locs[0] - pyplot.xlim()[0] > (locs[1] - locs[0]) / 2:
        locs.append(pyplot.xlim()[0])
        startindex = len(locs) - 1
    pyplot.gca().set_xticks(locs)
    # hacking time!
    labels = pyplot.gca().get_xticklabels()
    if startindex >= 0:
        labels[startindex].set_text(date_range_sampling[0].date())
        labels[startindex].set_text = lambda _: None
        labels[startindex].set_rotation(30)
        labels[startindex].set_ha("right")
    if endindex >= 0:
        labels[endindex].set_text(date_range_sampling[-1].date())
        labels[endindex].set_text = lambda _: None
        labels[endindex].set_rotation(30)
        labels[endindex].set_ha("right")
    title = "%s %d x %d (granularity %d, sampling %d)" % \
        ((name,) + matrix.shape + (granularity, sampling))
    output = args.output
    if output:
        if args.modes == ["burndown-project"] and target == "project":
            output = args.output
        else:
            if target == "project":
                name = "project"
            output = get_plot_path(args.output, name)
    deploy_plot(title, output, args.background)


def plot_many_burndown(args, target, header, parts):
    if not args.output:
        print("Warning: output not set, showing %d plots." % len(parts))
    stdout = io.StringIO()
    for name, matrix in tqdm.tqdm(parts):
        with contextlib.redirect_stdout(stdout):
            plot_burndown(args, target, *load_burndown(header, name, matrix, args.resample))
    sys.stdout.write(stdout.getvalue())


def plot_overwrites_matrix(args, repo, people, matrix):
    if args.output and args.output.endswith(".json"):
        data = locals().copy()
        del data["args"]
        data["type"] = "overwrites_matrix"
        if "all" in args.modes:
            output = get_plot_path(args.output, "matrix")
        else:
            output = args.output
        with open(output, "w") as fout:
            json.dump(data, fout, sort_keys=True, default=default_json)
        return
    matplotlib, pyplot = import_pyplot(args.backend, args.style)
    s = 4 + matrix.shape[1] * 0.3
    fig = pyplot.figure(figsize=(s, s))
    ax = fig.add_subplot(111)
    ax.xaxis.set_label_position("top")
    ax.matshow(matrix, cmap=pyplot.cm.OrRd)
    ax.set_xticks(numpy.arange(0, matrix.shape[1]))
    ax.set_yticks(numpy.arange(0, matrix.shape[0]))
    ax.set_yticklabels(people, va="center")
    ax.set_xticks(numpy.arange(0.5, matrix.shape[1] + 0.5), minor=True)
    ax.set_xticklabels(["Unidentified"] + people, rotation=45, ha="left",
                       va="bottom", rotation_mode="anchor")
    ax.set_yticks(numpy.arange(0.5, matrix.shape[0] + 0.5), minor=True)
    ax.grid(False)
    ax.grid(which="minor")
    apply_plot_style(fig, ax, None, args.background, args.font_size, args.size)
    if not args.output:
        pos1 = ax.get_position()
        pos2 = (pos1.x0 + 0.15, pos1.y0 - 0.1, pos1.width * 0.9, pos1.height * 0.9)
        ax.set_position(pos2)
    if "all" in args.modes and args.output:
        output = get_plot_path(args.output, "matrix")
    else:
        output = args.output
    title = "%s %d developers overwrite" % (repo, matrix.shape[0])
    if args.output:
        # FIXME(vmarkovtsev): otherwise the title is screwed in savefig()
        title = ""
    deploy_plot(title, output, args.background)


def plot_ownership(args, repo, names, people, date_range, last):
    if args.output and args.output.endswith(".json"):
        data = locals().copy()
        del data["args"]
        data["type"] = "ownership"
        if "all" in args.modes and args.output:
            output = get_plot_path(args.output, "people")
        else:
            output = args.output
        with open(output, "w") as fout:
            json.dump(data, fout, sort_keys=True, default=default_json)
        return
    matplotlib, pyplot = import_pyplot(args.backend, args.style)
    # normalize the samples before plotting, otherwise --relative has no effect
    if args.relative:
        for i in range(people.shape[1]):
            people[:, i] /= people[:, i].sum()
        pyplot.ylim(0, 1)
        legend_loc = 3
    else:
        legend_loc = 2
    polys = pyplot.stackplot(date_range, people, labels=names)
    if names[-1] == "others":
        polys[-1].set_hatch("/")
    pyplot.xlim(parse_date(args.start_date, date_range[0]), parse_date(args.end_date, last))
    ncol = 1 if len(names) < 15 else 2
    legend = pyplot.legend(loc=legend_loc, fontsize=args.font_size, ncol=ncol)
    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.background,
                     args.font_size, args.size)
    if "all" in args.modes and args.output:
        output = get_plot_path(args.output, "people")
    else:
        output = args.output
    deploy_plot("%s code ownership through time" % repo, output, args.background)


IDEAL_SHARD_SIZE = 4096


def train_embeddings(index, matrix, tmpdir, shard_size=IDEAL_SHARD_SIZE):
    import tensorflow as tf
    try:
        from . import swivel
    except (SystemError, ImportError):
        import swivel
    assert matrix.shape[0] == matrix.shape[1]
    assert len(index) <= matrix.shape[0]
    outlier_threshold = numpy.percentile(matrix.data, 99)
    matrix.data[matrix.data > outlier_threshold] = outlier_threshold
    nshards = len(index) // shard_size
    if nshards * shard_size < len(index):
        nshards += 1
        shard_size = len(index) // nshards
        nshards = len(index) // shard_size
    remainder = len(index) - nshards * shard_size
    if remainder > 0:
        lengths = matrix.indptr[1:] - matrix.indptr[:-1]
        filtered = sorted(numpy.argsort(lengths)[remainder:])
    else:
        filtered = list(range(len(index)))
    if len(filtered) < matrix.shape[0]:
        print("Truncating the sparse matrix...")
        matrix = matrix[filtered, :][:, filtered]
    meta_index = []
    for i, j in enumerate(filtered):
        meta_index.append((index[j], matrix[i, i]))
    index = [mi[0] for mi in meta_index]
    with tempfile.TemporaryDirectory(prefix="hercules_labours_", dir=tmpdir or None) as tmproot:
        print("Writing Swivel metadata...")
        vocabulary = "\n".join(index)
        with open(os.path.join(tmproot, "row_vocab.txt"), "w") as out:
            out.write(vocabulary)
        with open(os.path.join(tmproot, "col_vocab.txt"), "w") as out:
            out.write(vocabulary)
        del vocabulary
        bool_sums = matrix.indptr[1:] - matrix.indptr[:-1]
        bool_sums_str = "\n".join(map(str, bool_sums.tolist()))
        with open(os.path.join(tmproot, "row_sums.txt"), "w") as out:
            out.write(bool_sums_str)
        with open(os.path.join(tmproot, "col_sums.txt"), "w") as out:
            out.write(bool_sums_str)
        del bool_sums_str
        reorder = numpy.argsort(-bool_sums)
        print("Writing Swivel shards...")
        for row in range(nshards):
            for col in range(nshards):
                def _int64s(xs):
                    return tf.train.Feature(
                        int64_list=tf.train.Int64List(value=list(xs)))

                def _floats(xs):
                    return tf.train.Feature(
                        float_list=tf.train.FloatList(value=list(xs)))

                indices_row = reorder[row::nshards]
                indices_col = reorder[col::nshards]
                shard = matrix[indices_row][:, indices_col].tocoo()
                example = tf.train.Example(features=tf.train.Features(feature={
                    "global_row": _int64s(indices_row),
                    "global_col": _int64s(indices_col),
                    "sparse_local_row": _int64s(shard.row),
                    "sparse_local_col": _int64s(shard.col),
                    "sparse_value": _floats(shard.data)}))
                with open(os.path.join(tmproot, "shard-%03d-%03d.pb" % (row, col)), "wb") as out:
                    out.write(example.SerializeToString())
        print("Training Swivel model...")
        swivel.FLAGS.submatrix_rows = shard_size
        swivel.FLAGS.submatrix_cols = shard_size
        if len(meta_index) <= IDEAL_SHARD_SIZE / 16:
            embedding_size = 50
            num_epochs = 100000
        elif len(meta_index) <= IDEAL_SHARD_SIZE:
            embedding_size = 50
            num_epochs = 50000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 2:
            embedding_size = 60
            num_epochs = 10000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 4:
            embedding_size = 70
            num_epochs = 8000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 10:
            embedding_size = 80
            num_epochs = 5000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 25:
            embedding_size = 100
            num_epochs = 1000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 100:
            embedding_size = 200
            num_epochs = 600
        else:
            embedding_size = 300
            num_epochs = 300
        if os.getenv("CI"):
            # Travis, AppVeyor etc. during the integration tests
            num_epochs //= 10  # integer division keeps the flag an int
        swivel.FLAGS.embedding_size = embedding_size
        swivel.FLAGS.input_base_path = tmproot
        swivel.FLAGS.output_base_path = tmproot
        swivel.FLAGS.loss_multiplier = 1.0 / shard_size
        swivel.FLAGS.num_epochs = num_epochs
        # Tensorflow 1.5 parses sys.argv unconditionally *applause*
        argv_backup = sys.argv[1:]
        del sys.argv[1:]
        swivel.main(None)
        sys.argv.extend(argv_backup)
        print("Reading Swivel embeddings...")
        embeddings = []
        with open(os.path.join(tmproot, "row_embedding.tsv")) as frow:
            with open(os.path.join(tmproot, "col_embedding.tsv")) as fcol:
                for i, (lrow, lcol) in enumerate(zip(frow, fcol)):
                    prow, pcol = (l.split("\t", 1) for l in (lrow, lcol))
                    assert prow[0] == pcol[0]
                    erow, ecol = \
                        (numpy.fromstring(p[1], dtype=numpy.float32, sep="\t")
                         for p in (prow, pcol))
                    embeddings.append((erow + ecol) / 2)
    return meta_index, embeddings


class CORSWebServer(object):
    def __init__(self):
        self.thread = threading.Thread(target=self.serve)
        self.server = None

    def serve(self):
        outer = self
        from http.server import HTTPServer, SimpleHTTPRequestHandler, test

        class ClojureServer(HTTPServer):
            def __init__(self, *args, **kwargs):
                HTTPServer.__init__(self, *args, **kwargs)
                outer.server = self

        class CORSRequestHandler(SimpleHTTPRequestHandler):
            def end_headers(self):
                self.send_header("Access-Control-Allow-Origin", "*")
                SimpleHTTPRequestHandler.end_headers(self)

        test(CORSRequestHandler, ClojureServer)

    def start(self):
        self.thread.start()

    def stop(self):
        if self.running:
            self.server.shutdown()
            self.thread.join()

    @property
    def running(self):
        return self.server is not None


web_server = CORSWebServer()
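# Module-level singleton: write_embeddings() starts it on demand so that the
# Tensorflow Projector page can fetch the generated TSV and JSON files over
# HTTP with CORS enabled.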


def write_embeddings(name, output, run_server, index, embeddings):
    print("Writing Tensorflow Projector files...")
    if not output:
        output = "couples"
    if output.endswith(".json"):
        output = os.path.join(output[:-5], "couples")
        run_server = False
    metaf = "%s_%s_meta.tsv" % (output, name)
    with open(metaf, "w") as fout:
        fout.write("name\tcommits\n")
        for pair in index:
            fout.write("%s\t%s\n" % pair)
    print("Wrote", metaf)
    dataf = "%s_%s_data.tsv" % (output, name)
    with open(dataf, "w") as fout:
        for vec in embeddings:
            fout.write("\t".join(str(v) for v in vec))
            fout.write("\n")
    print("Wrote", dataf)
    jsonf = "%s_%s.json" % (output, name)
    with open(jsonf, "w") as fout:
        fout.write("""{
  "embeddings": [
    {
      "tensorName": "%s %s coupling",
      "tensorShape": [%s, %s],
      "tensorPath": "http://0.0.0.0:8000/%s",
      "metadataPath": "http://0.0.0.0:8000/%s"
    }
  ]
}
""" % (output, name, len(embeddings), len(embeddings[0]), dataf, metaf))
    print("Wrote %s" % jsonf)
    if run_server and not web_server.running:
        web_server.start()
    url = "http://projector.tensorflow.org/?config=http://0.0.0.0:8000/" + jsonf
    print(url)
    if run_server:
        if shutil.which("xdg-open") is not None:
            os.system("xdg-open " + url)
        else:
            browser = os.getenv("BROWSER", "")
            if browser:
                os.system(browser + " " + url)
            else:
                print("\t" + url)


def show_shotness_stats(data):
    top = sorted(((r.counters[i], i) for i, r in enumerate(data)), reverse=True)
    for count, i in top:
        r = data[i]
        print("%8d %s:%s [%s]" % (count, r.file, r.name, r.internal_role))


def show_sentiment_stats(args, name, resample, start_date, data):
    from scipy.signal import convolve, slepian

    matplotlib, pyplot = import_pyplot(args.backend, args.style)
    start_date = datetime.fromtimestamp(start_date)
    data = sorted(data.items())
    mood = numpy.zeros(data[-1][0] + 1, dtype=numpy.float32)
    timeline = numpy.array([start_date + timedelta(days=i) for i in range(mood.shape[0])])
    for d, val in data:
        mood[d] = (0.5 - val.Value) * 2
    resolution = 32
    window = slepian(len(timeline) // resolution, 0.5)
    window /= window.sum()
    mood_smooth = convolve(mood, window, "same")
    pos = mood_smooth.copy()
    pos[pos < 0] = 0
    neg = mood_smooth.copy()
    neg[neg >= 0] = 0
    resolution = 4
    window = numpy.ones(len(timeline) // resolution)
    window /= window.sum()
    avg = convolve(mood, window, "same")
    pyplot.fill_between(timeline, pos, color="#8DB843", label="Positive")
    pyplot.fill_between(timeline, neg, color="#E14C35", label="Negative")
    pyplot.plot(timeline, avg, color="grey", label="Average", linewidth=5)
    legend = pyplot.legend(loc=1, fontsize=args.font_size)
    pyplot.ylabel("Comment sentiment")
    pyplot.xlabel("Time")
    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.background,
                     args.font_size, args.size)
    pyplot.xlim(parse_date(args.start_date, timeline[0]), parse_date(args.end_date, timeline[-1]))
    locator = pyplot.gca().xaxis.get_major_locator()
    # set the optimal xticks locator
    if "M" not in resample:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
    locs = pyplot.gca().get_xticks().tolist()
    if len(locs) >= 16:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
        locs = pyplot.gca().get_xticks().tolist()
        if len(locs) >= 16:
            pyplot.gca().xaxis.set_major_locator(locator)
    if locs[0] < pyplot.xlim()[0]:
        del locs[0]
    endindex = -1
    if len(locs) >= 2 and pyplot.xlim()[1] - locs[-1] > (locs[-1] - locs[-2]) / 2:
        locs.append(pyplot.xlim()[1])
        endindex = len(locs) - 1
    startindex = -1
    if len(locs) >= 2 and locs[0] - pyplot.xlim()[0] > (locs[1] - locs[0]) / 2:
        locs.append(pyplot.xlim()[0])
        startindex = len(locs) - 1
    pyplot.gca().set_xticks(locs)
    # hacking time!
    labels = pyplot.gca().get_xticklabels()
    if startindex >= 0:
        labels[startindex].set_text(timeline[0].date())
        labels[startindex].set_text = lambda _: None
        labels[startindex].set_rotation(30)
        labels[startindex].set_ha("right")
    if endindex >= 0:
        labels[endindex].set_text(timeline[-1].date())
        labels[endindex].set_text = lambda _: None
        labels[endindex].set_rotation(30)
        labels[endindex].set_ha("right")
    overall_pos = sum(2 * (0.5 - d[1].Value) for d in data if d[1].Value < 0.5)
    overall_neg = sum(2 * (d[1].Value - 0.5) for d in data if d[1].Value > 0.5)
    title = "%s sentiment +%.1f -%.1f δ=%.1f" % (
        name, overall_pos, overall_neg, overall_pos - overall_neg)
    if "all" in args.modes and args.output:
        output = get_plot_path(args.output, "sentiment")
    else:
        output = args.output
    deploy_plot(title, output, args.background)


def show_devs(args, name, start_date, end_date, people, days, max_people=50):
    from scipy.signal import convolve, slepian

    if len(people) > max_people:
        print("Picking top %s developers by commit count" % max_people)
        # pick top N developers by commit count
        commits = defaultdict(int)
        for devs in days.values():
            for dev, stats in devs.items():
                commits[dev] += stats.Commits
        commits = sorted(((v, k) for k, v in commits.items()), reverse=True)
        chosen_people = {people[k] for _, k in commits[:max_people]}
    else:
        chosen_people = set(people)
    dists, devseries, devstats, route = order_commits(chosen_people, days, people)
    route_map = {v: i for i, v in enumerate(route)}
    # determine clusters
    clusters = hdbscan_cluster_routed_series(dists, route)
    keys = list(devseries.keys())
    route = [keys[node] for node in route]
    print("Plotting")
    # smooth time series
    start_date = datetime.fromtimestamp(start_date)
    start_date = datetime(start_date.year, start_date.month, start_date.day)
    end_date = datetime.fromtimestamp(end_date)
    end_date = datetime(end_date.year, end_date.month, end_date.day)
    size = (end_date - start_date).days + 1
    plot_x = [start_date + timedelta(days=i) for i in range(size)]
    resolution = 64
    window = slepian(size // resolution, 0.5)
    final = numpy.zeros((len(devseries), size), dtype=numpy.float32)
    for i, s in enumerate(devseries.values()):
        arr = numpy.array(s).transpose()
        full_history = numpy.zeros(size, dtype=numpy.float32)
        mask = arr[0] < size
        full_history[arr[0][mask]] = arr[1][mask]
        final[route_map[i]] = convolve(full_history, window, "same")
    matplotlib, pyplot = import_pyplot(args.backend, args.style)
    pyplot.rcParams["figure.figsize"] = (32, 16)
    pyplot.rcParams["font.size"] = args.font_size
    prop_cycle = pyplot.rcParams["axes.prop_cycle"]
    colors = prop_cycle.by_key()["color"]
    fig, axes = pyplot.subplots(final.shape[0], 1)
    backgrounds = ("#C4FFDB", "#FFD0CD") if args.background == "white" else ("#05401C", "#40110E")
    max_cluster = numpy.max(clusters)
    for ax, series, cluster, dev_i in zip(axes, final, clusters, route):
        if cluster >= 0:
            color = colors[cluster % len(colors)]
            i = 1
            while color == "#777777":
                color = colors[(max_cluster + i) % len(colors)]
                i += 1
        else:
            # outlier
            color = "#777777"
        ax.fill_between(plot_x, series, color=color)
        ax.set_axis_off()
        author = people[dev_i]
        ax.text(0.03, 0.5, author[:36] + (author[36:] and "..."),
                horizontalalignment="right", verticalalignment="center",
                transform=ax.transAxes, fontsize=args.font_size,
                color="black" if args.background == "white" else "white")
        ds = devstats[dev_i]
        stats = "%5d %8s %8s" % (ds[0], _format_number(ds[1] - ds[2]), _format_number(ds[3]))
        ax.text(0.97, 0.5, stats,
                horizontalalignment="left", verticalalignment="center",
                transform=ax.transAxes, fontsize=args.font_size, family="monospace",
                backgroundcolor=backgrounds[ds[1] <= ds[2]],
                color="black" if args.background == "white" else "white")
    axes[0].text(0.97, 1.75, " cmts    delta  changed",
                 horizontalalignment="left", verticalalignment="center",
                 transform=axes[0].transAxes, fontsize=args.font_size, family="monospace",
                 color="black" if args.background == "white" else "white")
    axes[-1].set_axis_on()
    target_num_labels = 12
    num_months = (end_date.year - start_date.year) * 12 + end_date.month - start_date.month
    interval = int(numpy.ceil(num_months / target_num_labels))
    if interval >= 8:
        interval = int(numpy.ceil(num_months / (12 * target_num_labels)))
        axes[-1].xaxis.set_major_locator(matplotlib.dates.YearLocator(base=max(1, interval // 12)))
        axes[-1].xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y"))
    else:
        axes[-1].xaxis.set_major_locator(matplotlib.dates.MonthLocator(interval=interval))
        axes[-1].xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y-%m"))
    for tick in axes[-1].xaxis.get_major_ticks():
        tick.label.set_fontsize(args.font_size)
    axes[-1].spines["left"].set_visible(False)
    axes[-1].spines["right"].set_visible(False)
    axes[-1].spines["top"].set_visible(False)
    axes[-1].get_yaxis().set_visible(False)
    axes[-1].set_facecolor((1.0,) * 3 + (0.0,))
    title = ("%s commits" % name) if not args.output else ""
    if "all" in args.modes and args.output:
        output = get_plot_path(args.output, "time_series")
    else:
        output = args.output
    deploy_plot(title, output, args.background)


def order_commits(chosen_people, days, people):
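    """Collect the per-developer (day, commits) series for chosen_people, compute
    the pairwise distance matrix with dynamic time warping (fastdtw, L1 norm)
    and find a seriation order that places similar series next to each other.

    Returns a tuple of (distance matrix, series dict, per-dev stat totals,
    seriation route)."""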
    from seriate import seriate
    try:
        from fastdtw import fastdtw
    except ImportError as e:
        print("Cannot import fastdtw: %s\nInstall it from https://github.com/slaypni/fastdtw" % e)
        sys.exit(1)
    # FIXME(vmarkovtsev): remove once https://github.com/slaypni/fastdtw/pull/28 is merged & released
    try:
        sys.modules["fastdtw.fastdtw"].__norm = lambda p: lambda a, b: numpy.linalg.norm(
            numpy.atleast_1d(a) - numpy.atleast_1d(b), p)
    except KeyError:
        # the native extension does not have this bug
        pass

    devseries = defaultdict(list)
    devstats = defaultdict(lambda: DevDay(0, 0, 0, 0, {}))
    for day, devs in sorted(days.items()):
        for dev, stats in devs.items():
            if people[dev] in chosen_people:
                devseries[dev].append((day, stats.Commits))
                devstats[dev] = devstats[dev].add(stats)
    print("Calculating the distance matrix")
    # normalize each time series by its total number of commits
    series = list(devseries.values())
    for i, s in enumerate(series):
        arr = numpy.array(s).transpose().astype(numpy.float32)
        arr[1] /= arr[1].sum()
        series[i] = arr.transpose()
    # calculate the distance matrix using dynamic time warping
    dists = numpy.full((len(series),) * 2, -100500, dtype=numpy.float32)
    # one update per unordered pair of series
    with tqdm.tqdm(total=len(series) * (len(series) - 1) // 2) as pb:
        for x, serx in enumerate(series):
            dists[x, x] = 0
            for y, sery in enumerate(series[x + 1:], start=x + 1):
                min_day = int(min(serx[0][0], sery[0][0]))
                max_day = int(max(serx[-1][0], sery[-1][0]))
                arrx = numpy.zeros(max_day - min_day + 1, dtype=numpy.float32)
                arry = numpy.zeros_like(arrx)
                arrx[serx[:, 0].astype(int) - min_day] = serx[:, 1]
                arry[sery[:, 0].astype(int) - min_day] = sery[:, 1]
                # L1 norm
                dist, _ = fastdtw(arrx, arry, radius=5, dist=1)
                dists[x, y] = dists[y, x] = dist
                pb.update()
    print("Ordering the series")
    route = seriate(dists)
    return dists, devseries, devstats, route


def hdbscan_cluster_routed_series(dists, route):
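    """Cluster seriated series with HDBSCAN.

    Instead of clustering the raw distance matrix, walk the seriation route and
    accumulate the distances between consecutive series into a 1-D chain of
    positions; clustering those positions groups series that sit close together
    along the route. Returns one label per series, -1 marking an outlier."""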
    try:
        from hdbscan import HDBSCAN
    except ImportError as e:
        print("Cannot import hdbscan: %s" % e)
        sys.exit(1)

    opt_dist_chain = numpy.cumsum(numpy.array(
        [0] + [dists[route[i], route[i + 1]] for i in range(len(route) - 1)]))
    clusters = HDBSCAN(min_cluster_size=2).fit_predict(opt_dist_chain[:, numpy.newaxis])
    return clusters


def show_devs_efforts(args, name, start_date, end_date, people, days, max_people):
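    """Plot developer efforts (added + removed + changed lines): the cumulative
    totals are stacked above the x axis and the scaled daily values are mirrored
    below it. Developers beyond max_people are merged into a hatched "others"
    band."""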
    from scipy.signal import convolve, slepian

    start_date = datetime.fromtimestamp(start_date)
    start_date = datetime(start_date.year, start_date.month, start_date.day)
    end_date = datetime.fromtimestamp(end_date)
    end_date = datetime(end_date.year, end_date.month, end_date.day)

    efforts_by_dev = defaultdict(int)
    for day, devs in days.items():
        for dev, stats in devs.items():
            efforts_by_dev[dev] += stats.Added + stats.Removed + stats.Changed
    if len(efforts_by_dev) > max_people:
        chosen = {v for k, v in sorted(
            ((v, k) for k, v in efforts_by_dev.items()), reverse=True)[:max_people]}
        print("Warning: truncated people to the most active %d" % max_people)
    else:
        chosen = set(efforts_by_dev)
    chosen_efforts = sorted(((efforts_by_dev[k], k) for k in chosen), reverse=True)
    chosen_order = {k: i for i, (_, k) in enumerate(chosen_efforts)}

    efforts = numpy.zeros((len(chosen) + 1, (end_date - start_date).days + 1), dtype=numpy.float32)
    for day, devs in days.items():
        if day < efforts.shape[1]:
            for dev, stats in devs.items():
                dev = chosen_order.get(dev, len(chosen_order))
                efforts[dev][day] += stats.Added + stats.Removed + stats.Changed
    efforts_cum = numpy.cumsum(efforts, axis=1)
    # smooth with a Slepian window, but restore the trailing days afterwards so
    # the most recent part of the plot is not distorted by the convolution
    window = slepian(10, 0.5)
    window /= window.sum()
    for e in (efforts, efforts_cum):
        for i in range(e.shape[0]):
            ending = e[i][-len(window) * 2:].copy()
            e[i] = convolve(e[i], window, "same")
            e[i][-len(ending):] = ending

    matplotlib, pyplot = import_pyplot(args.backend, args.style)
    plot_x = [start_date + timedelta(days=i) for i in range(efforts.shape[1])]

    people = [people[k] for _, k in chosen_efforts] + ["others"]
    for i, name in enumerate(people):
        if len(name) > 40:
            people[i] = name[:37] + "..."

    polys = pyplot.stackplot(plot_x, efforts_cum, labels=people)
    if len(polys) == max_people + 1:
        polys[-1].set_hatch("/")
    polys = pyplot.stackplot(plot_x, -efforts * efforts_cum.max() / efforts.max())
    if len(polys) == max_people + 1:
        polys[-1].set_hatch("/")
    yticks = []
    for tick in pyplot.gca().yaxis.iter_ticks():
        if tick[1] >= 0:
            yticks.append(tick[1])
    pyplot.gca().yaxis.set_ticks(yticks)
    legend = pyplot.legend(loc=2, ncol=2, fontsize=args.font_size)
    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.background,
                     args.font_size, args.size or "16,10")
    if args.mode == "all" and args.output:
        output = get_plot_path(args.output, "efforts")
    else:
        output = args.output
    deploy_plot("Efforts through time (changed lines of code)", output, args.background)


def show_old_vs_new(args, name, start_date, end_date, people, days):
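    """Plot the volume of newly added lines against changes to existing lines
    (removed + modified), both smoothed with a Slepian window."""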
    from scipy.signal import convolve, slepian

    start_date = datetime.fromtimestamp(start_date)
    start_date = datetime(start_date.year, start_date.month, start_date.day)
    end_date = datetime.fromtimestamp(end_date)
    end_date = datetime(end_date.year, end_date.month, end_date.day)
    new_lines = numpy.zeros((end_date - start_date).days + 2)
    old_lines = numpy.zeros_like(new_lines)
    for day, devs in days.items():
        for stats in devs.values():
            new_lines[day] += stats.Added
            old_lines[day] += stats.Removed + stats.Changed
    resolution = 32
    window = slepian(max(len(new_lines) // resolution, 1), 0.5)
    new_lines = convolve(new_lines, window, "same")
    old_lines = convolve(old_lines, window, "same")
    matplotlib, pyplot = import_pyplot(args.backend, args.style)
    plot_x = [start_date + timedelta(days=i) for i in range(len(new_lines))]
    pyplot.fill_between(plot_x, new_lines, color="#8DB843", label="Changed new lines")
    pyplot.fill_between(plot_x, old_lines, color="#E14C35", label="Changed existing lines")
    pyplot.legend(loc=2, fontsize=args.font_size)
    for tick in chain(pyplot.gca().xaxis.get_major_ticks(), pyplot.gca().yaxis.get_major_ticks()):
        tick.label.set_fontsize(args.font_size)
    if args.mode == "all" and args.output:
        output = get_plot_path(args.output, "old_vs_new")
    else:
        output = args.output
    deploy_plot("Additions vs changes", output, args.background)


def show_languages(args, name, start_date, end_date, people, days):
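    """Print per-developer language statistics: for every developer, the total
    number of touched lines in each language, most active developers and
    languages first."""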
    devlangs = defaultdict(lambda: defaultdict(lambda: numpy.zeros(3, dtype=int)))
    for day, devs in days.items():
        for dev, stats in devs.items():
            for lang, vals in stats.Languages.items():
                devlangs[dev][lang] += vals
    devlangs = sorted(devlangs.items(), key=lambda p: -sum(x.sum() for x in p[1].values()))
    for dev, ls in devlangs:
        print()
        print("#", people[dev])
        ls = sorted(((vals.sum(), lang) for lang, vals in ls.items()), reverse=True)
        for vals, lang in ls:
            if lang:
                print("%s: %d" % (lang, vals))


class ParallelDevData:
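    """Positions of one developer in the orderings compared by
    show_devs_parallel: ranks by commits, lines and ownership, plus the
    seriation indexes and HDBSCAN cluster labels from the couples embeddings
    and the commit time series. -1 means "not computed yet"."""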
    def __init__(self):
        self.commits_rank = -1
        self.commits = -1
        self.lines_rank = -1
        self.lines = -1
        self.ownership_rank = -1
        self.ownership = -1
        self.couples_index = -1
        self.couples_cluster = -1
        self.commit_coocc_index = -1
        self.commit_coocc_cluster = -1

    def __str__(self):
        return str(self.__dict__)

    def __repr__(self):
        return str(self)


def load_devs_parallel(ownership, couples, devs, max_people):
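    """Compute ParallelDevData for the top developers by commit count: rank them
    by commits, lines and ownership, then derive the couples-embedding and
    commit-series seriation orders, each cyclically rolled to minimize the total
    displacement from the preceding axis."""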
    from seriate import seriate
    try:
        from hdbscan import HDBSCAN
    except ImportError as e:
        print("Cannot import hdbscan: %s" % e)
        sys.exit(1)

    people, owned = ownership
    _, cmatrix = couples
    _, days = devs

    print("calculating - commits")
    commits = defaultdict(int)
    for day, devs in days.items():
        for dev, stats in devs.items():
            commits[people[dev]] += stats.Commits
    chosen = [k for v, k in sorted(((v, k) for k, v in commits.items()),
                                   reverse=True)[:max_people]]
    result = {k: ParallelDevData() for k in chosen}
    for k, v in result.items():
        v.commits_rank = chosen.index(k)
        v.commits = commits[k]

    print("calculating - lines")
    lines = defaultdict(int)
    for day, devs in days.items():
        for dev, stats in devs.items():
            lines[people[dev]] += stats.Added + stats.Removed + stats.Changed
    lines_index = {k: i for i, (_, k) in enumerate(sorted(
        ((v, k) for k, v in lines.items() if k in chosen), reverse=True))}
    for k, v in result.items():
        v.lines_rank = lines_index[k]
        v.lines = lines[k]

    print("calculating - ownership")
    owned_index = {k: i for i, (_, k) in enumerate(sorted(
        ((owned[k][-1].sum(), k) for k in chosen), reverse=True))}
    for k, v in result.items():
        v.ownership_rank = owned_index[k]
        v.ownership = owned[k][-1].sum()

    print("calculating - couples")
    # cosine distances between the precomputed people embeddings
    embeddings = numpy.genfromtxt(fname="couples_people_data.tsv", delimiter="\t")[
        [people.index(k) for k in chosen]]
    embeddings /= numpy.linalg.norm(embeddings, axis=1)[:, None]
    cos = embeddings.dot(embeddings.T)
    cos[cos > 1] = 1  # tiny precision faults
    dists = numpy.arccos(cos)
    clusters = HDBSCAN(min_cluster_size=2, metric="precomputed").fit_predict(dists)
    for k, v in result.items():
        v.couples_cluster = clusters[chosen.index(k)]

    couples_order = seriate(dists)
    roll_options = []
    for i in range(len(couples_order)):
        loss = 0
        for k, v in result.items():
            loss += abs(
                v.ownership_rank - (couples_order.index(chosen.index(k)) + i) % len(chosen))
        roll_options.append(loss)
    best_roll = numpy.argmin(roll_options)
    couples_order = list(numpy.roll(couples_order, best_roll))
    for k, v in result.items():
        v.couples_index = couples_order.index(chosen.index(k))

    print("calculating - commit series")
    dists, devseries, _, orig_route = order_commits(chosen, days, people)
    keys = list(devseries.keys())
    route = [keys[node] for node in orig_route]
    for roll in range(len(route)):
        loss = 0
        for k, v in result.items():
            i = route.index(people.index(k))
            loss += abs(v.couples_index - ((i + roll) % len(route)))
        roll_options[roll] = loss
    best_roll = numpy.argmin(roll_options)
    route = list(numpy.roll(route, best_roll))
    orig_route = list(numpy.roll(orig_route, best_roll))
    clusters = hdbscan_cluster_routed_series(dists, orig_route)
    for k, v in result.items():
        v.commit_coocc_index = route.index(people.index(k))
        v.commit_coocc_cluster = clusters[v.commit_coocc_index]
    return result


def show_devs_parallel(args, name, start_date, end_date, devs):
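    """Draw a parallel coordinates plot: one smooth piecewise-cubic line per
    developer through their positions on the commits / lines / ownership /
    couples / commit-series axes."""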
    matplotlib, pyplot = import_pyplot(args.backend, args.style)
    from matplotlib.collections import LineCollection

    def solve_equations(x1, y1, x2, y2):
        # cubic through (x1, y1) and (x2, y2) with zero slope at both endpoints
        xcube = (x1 - x2) ** 3
        a = 2 * (y2 - y1) / xcube
        b = 3 * (y1 - y2) * (x1 + x2) / xcube
        c = 6 * (y2 - y1) * x1 * x2 / xcube
        d = y1 - a * x1 ** 3 - b * x1 ** 2 - c * x1
        return a, b, c, d

    # biggest = {k: max(getattr(d, k) for d in devs.values())
    #            for k in ("commits", "lines", "ownership")}
    for k, dev in devs.items():
        points = numpy.array([
            (1, dev.commits_rank),
            (2, dev.lines_rank),
            (3, dev.ownership_rank),
            (4, dev.couples_index),
            (5, dev.commit_coocc_index)],
            dtype=float)
        points[:, 1] = points[:, 1] / len(devs)
        splines = []
        for i in range(len(points) - 1):
            a, b, c, d = solve_equations(*points[i], *points[i + 1])
            x = numpy.linspace(i + 1, i + 2, 100)
            smooth_points = numpy.array(
                [x, a * x ** 3 + b * x ** 2 + c * x + d]).T.reshape(-1, 1, 2)
            splines.append(smooth_points)
        points = numpy.concatenate(splines)
        segments = numpy.concatenate([points[:-1], points[1:]], axis=1)
        lc = LineCollection(segments)
        lc.set_array(numpy.linspace(0, 0.1, segments.shape[0]))
        pyplot.gca().add_collection(lc)

    pyplot.xlim(0, 6)
    pyplot.ylim(-0.1, 1.1)
    deploy_plot("Developers", args.output, args.background)


def _format_number(n):
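    """Format a number compactly with a K/M suffix.

    Illustrative values: 999 -> "999", 1234 -> "1.2K", 25000 -> "25K",
    2500000 -> "2.5M"."""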
    if n == 0:
        return "0"
    power = int(numpy.log10(abs(n)))
    if power >= 6:
        n = n / 1000000
        if n >= 10:
            n = str(int(n))
        else:
            n = "%.1f" % n
            if n.endswith("0"):
                n = n[:-2]
        suffix = "M"
    elif power >= 3:
        n = n / 1000
        if n >= 10:
            n = str(int(n))
        else:
            n = "%.1f" % n
            if n.endswith("0"):
                n = n[:-2]
        suffix = "K"
    else:
        n = str(n)
        suffix = ""
    return n + suffix


def main():
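    """Entry point: parse the command line, read the hercules analysis results
    and dispatch to the requested plotting/reporting modes."""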
    args = parse_args()
    reader = read_input(args)
    header = reader.get_header()
    name = reader.get_name()

    burndown_warning = "Burndown stats were not collected. Re-run hercules with --burndown."
    burndown_files_warning = \
        "Burndown stats for files were not collected. Re-run hercules with " \
        "--burndown --burndown-files."
    burndown_people_warning = \
        "Burndown stats for people were not collected. Re-run hercules with " \
        "--burndown --burndown-people."
    couples_warning = "Coupling stats were not collected. Re-run hercules with --couples."
    shotness_warning = "Structural hotness stats were not collected. Re-run hercules with " \
                       "--shotness. Also check --languages - the output may be empty."
    sentiment_warning = "Sentiment stats were not collected. Re-run hercules with --sentiment."
    devs_warning = "Devs stats were not collected. Re-run hercules with --devs."

    def run_times():
        rt = reader.get_run_times()
        pandas = import_pandas()
        series = pandas.to_timedelta(pandas.Series(rt).sort_values(ascending=False), unit="s")
        df = pandas.concat([series, series / series.sum()], axis=1)
        df.columns = ["time", "ratio"]
        print(df)

    def project_burndown():
        try:
            full_header = header + reader.get_burndown_parameters()
        except KeyError:
            print("project: " + burndown_warning)
            return
        plot_burndown(args, "project",
                      *load_burndown(full_header, *reader.get_project_burndown(),
                                     resample=args.resample, interpolation_progress=True))

    def files_burndown():
        try:
            full_header = header + reader.get_burndown_parameters()
        except KeyError:
            print(burndown_warning)
            return
        try:
            plot_many_burndown(args, "file", full_header, reader.get_files_burndown())
        except KeyError:
            print("files: " + burndown_files_warning)

    def people_burndown():
        try:
            full_header = header + reader.get_burndown_parameters()
        except KeyError:
            print(burndown_warning)
            return
        try:
            plot_many_burndown(args, "person", full_header, reader.get_people_burndown())
        except KeyError:
            print("people: " + burndown_people_warning)
    def overwrites_matrix():
        try:
            plot_overwrites_matrix(args, name, *load_overwrites_matrix(
                *reader.get_people_interaction(), max_people=args.max_people))
            people, matrix = load_overwrites_matrix(
                *reader.get_people_interaction(), max_people=1000000, normalize=False)
            from scipy.sparse import csr_matrix
            matrix = matrix[:, 1:]
            matrix = numpy.triu(matrix) + numpy.tril(matrix).T
            matrix = matrix + matrix.T
            matrix = csr_matrix(matrix)
            try:
                write_embeddings("overwrites", args.output, not args.disable_projector,
                                 *train_embeddings(people, matrix, tmpdir=args.tmpdir))
            except AttributeError as e:
                print("Training the embeddings is not possible: %s: %s" % (type(e).__name__, e))
        except KeyError:
            print("overwrites_matrix: " + burndown_people_warning)
    def ownership_burndown():
        try:
            full_header = header + reader.get_burndown_parameters()
        except KeyError:
            print(burndown_warning)
            return
        try:
            plot_ownership(args, name, *load_ownership(
                full_header, *reader.get_ownership_burndown(), max_people=args.max_people,
                order_by_time=args.order_ownership_by_time))
        except KeyError:
            print("ownership: " + burndown_people_warning)

    def couples_files():
        try:
            write_embeddings("files", args.output, not args.disable_projector,
                             *train_embeddings(*reader.get_files_coocc(),
                                               tmpdir=args.tmpdir))
        except KeyError:
            print(couples_warning)

    def couples_people():
        try:
            write_embeddings("people", args.output, not args.disable_projector,
                             *train_embeddings(*reader.get_people_coocc(),
                                               tmpdir=args.tmpdir))
        except KeyError:
            print(couples_warning)

    def couples_shotness():
        try:
            write_embeddings("shotness", args.output, not args.disable_projector,
                             *train_embeddings(*reader.get_shotness_coocc(),
                                               tmpdir=args.tmpdir))
        except KeyError:
            print(shotness_warning)

    def shotness():
        try:
            data = reader.get_shotness()
        except KeyError:
            print(shotness_warning)
            return
        show_shotness_stats(data)

    def sentiment():
        try:
            data = reader.get_sentiment()
        except KeyError:
            print(sentiment_warning)
            return
        show_sentiment_stats(args, reader.get_name(), args.resample, reader.get_header()[0], data)

    def devs():
        try:
            data = reader.get_devs()
        except KeyError:
            print(devs_warning)
            return
        show_devs(args, reader.get_name(), *reader.get_header(), *data,
                  max_people=args.max_people)

    def devs_efforts():
        try:
            data = reader.get_devs()
        except KeyError:
            print(devs_warning)
            return
        show_devs_efforts(args, reader.get_name(), *reader.get_header(), *data,
                          max_people=args.max_people)

    def old_vs_new():
        try:
            data = reader.get_devs()
        except KeyError:
            print(devs_warning)
            return
        show_old_vs_new(args, reader.get_name(), *reader.get_header(), *data)

    def languages():
        try:
            data = reader.get_devs()
        except KeyError:
            print(devs_warning)
            return
        show_languages(args, reader.get_name(), *reader.get_header(), *data)

    def devs_parallel():
        try:
            ownership = reader.get_ownership_burndown()
        except KeyError:
            print(burndown_people_warning)
            return
        try:
            couples = reader.get_people_coocc()
        except KeyError:
            print(couples_warning)
            return
        try:
            devs = reader.get_devs()
        except KeyError:
            print(devs_warning)
            return
        show_devs_parallel(args, reader.get_name(), *reader.get_header(),
                           load_devs_parallel(ownership, couples, devs, args.max_people))
    modes = {
        "run-times": run_times,
        "burndown-project": project_burndown,
        "burndown-file": files_burndown,
        "burndown-person": people_burndown,
        "overwrites-matrix": overwrites_matrix,
        "ownership": ownership_burndown,
        "couples-files": couples_files,
        "couples-people": couples_people,
        "couples-shotness": couples_shotness,
        "shotness": shotness,
        "sentiment": sentiment,
        "devs": devs,
        "devs-efforts": devs_efforts,
        "old-vs-new": old_vs_new,
        "languages": languages,
        "devs-parallel": devs_parallel,
    }

    if "all" in args.modes:
        all_mode = True
        args.modes = [
            "burndown-project",
            "overwrites-matrix",
            "ownership",
            "couples-files",
            "couples-people",
            "couples-shotness",
            "shotness",
            "devs",
            "devs-efforts",
        ]
    else:
        all_mode = False

    for mode in args.modes:
        if mode not in modes:
            print("Unknown mode: %s" % mode)
            continue
        print("Running: %s" % mode)
        # `args.mode` is required for path determination in the mode functions
        args.mode = ("all" if all_mode else mode)
        try:
            modes[mode]()
        except ImportError as ie:
            print("A module required by the %s mode was not found: %s" % (mode, ie))
            if not all_mode:
                raise

    if web_server.running:
        secs = int(os.getenv("COUPLES_SERVER_TIME", "60"))
        print("Sleeping for %d seconds, safe to Ctrl-C" % secs)
        sys.stdout.flush()
        try:
            time.sleep(secs)
        except KeyboardInterrupt:
            pass
        web_server.stop()


if __name__ == "__main__":
    sys.exit(main())