#!/usr/bin/env python3
import argparse
from collections import defaultdict, namedtuple
from datetime import datetime, timedelta
from importlib import import_module
import io
from itertools import chain
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
import threading
import time
import warnings

try:
    from clint.textui import progress
except ImportError:
    print("Warning: clint is not installed, no fancy progressbars in the terminal for you.")
    progress = None
import numpy
import yaml

if sys.version_info[0] < 3:
    # OK, ancients, I will support Python 2, but you owe me a beer
    input = raw_input  # noqa: F821


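# Note: the available styles are listed in a subprocess so that matplotlib is
# not imported (and a backend is not locked in) before --backend is parsed;
# this reading is inferred from the code, see import_pyplot() below.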
def list_matplotlib_styles():
    script = "import sys; from matplotlib import pyplot; " \
             "sys.stdout.write(repr(pyplot.style.available))"
    styles = eval(subprocess.check_output([sys.executable, "-c", script]))
    styles.remove("classic")
    return ["default", "classic"] + styles


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--output", default="",
                        help="Path to the output file/directory (empty for display). "
                             "If the extension is JSON, the data is saved instead of "
                             "the real image.")
    parser.add_argument("-i", "--input", default="-",
                        help="Path to the input file (- for stdin).")
    parser.add_argument("-f", "--input-format", default="auto", choices=["yaml", "pb", "auto"])
    parser.add_argument("--font-size", default=12, type=int,
                        help="Size of the labels and legend.")
    parser.add_argument("--style", default="ggplot", choices=list_matplotlib_styles(),
                        help="Plot style to use.")
    parser.add_argument("--backend", help="Matplotlib backend to use.")
    parser.add_argument("--background", choices=["black", "white"], default="white",
                        help="Plot's general color scheme.")
    parser.add_argument("--size", help="Axes' size in inches, for example \"12,9\"")
    parser.add_argument("--relative", action="store_true",
                        help="Occupy 100%% height for every measurement.")
    parser.add_argument("--couples-tmp-dir", help="Temporary directory to work with couples.")
    parser.add_argument("-m", "--mode",
                        choices=["burndown-project", "burndown-file", "burndown-person",
                                 "churn-matrix", "ownership", "couples-files", "couples-people",
                                 "couples-shotness", "shotness", "sentiment", "devs",
                                 "devs-efforts", "old-vs-new", "all", "run-times", "languages",
                                 "devs-parallel"],
                        help="What to plot.")
    parser.add_argument(
        "--resample", default="year",
        help="The way to resample the time series. Possible values are: "
             "\"month\", \"year\", \"no\", \"raw\" and pandas offset aliases ("
             "http://pandas.pydata.org/pandas-docs/stable/timeseries.html"
             "#offset-aliases).")
    dateutil_url = "https://dateutil.readthedocs.io/en/stable/parser.html#dateutil.parser.parse"
    parser.add_argument("--start-date",
                        help="Start date of time-based plots. Any format is accepted which is "
                             "supported by %s" % dateutil_url)
    parser.add_argument("--end-date",
                        help="End date of time-based plots. Any format is accepted which is "
                             "supported by %s" % dateutil_url)
    parser.add_argument("--disable-projector", action="store_true",
                        help="Do not run Tensorflow Projector on couples.")
    parser.add_argument("--max-people", default=20, type=int,
                        help="Maximum number of developers in churn matrix and people plots.")
    args = parser.parse_args()
    return args


class Reader(object):
    def read(self, file):
        raise NotImplementedError

    def get_name(self):
        raise NotImplementedError

    def get_header(self):
        raise NotImplementedError

    def get_burndown_parameters(self):
        raise NotImplementedError

    def get_project_burndown(self):
        raise NotImplementedError

    def get_files_burndown(self):
        raise NotImplementedError

    def get_people_burndown(self):
        raise NotImplementedError

    def get_ownership_burndown(self):
        raise NotImplementedError

    def get_people_interaction(self):
        raise NotImplementedError

    def get_files_coocc(self):
        raise NotImplementedError

    def get_people_coocc(self):
        raise NotImplementedError

    def get_shotness_coocc(self):
        raise NotImplementedError

    def get_shotness(self):
        raise NotImplementedError

    def get_sentiment(self):
        raise NotImplementedError

    def get_devs(self):
        raise NotImplementedError


class YamlReader(Reader):
    def read(self, file):
        yaml.reader.Reader.NON_PRINTABLE = re.compile(r"(?!x)x")
        try:
            loader = yaml.CLoader
        except AttributeError:
            print("Warning: failed to import yaml.CLoader, falling back to slow yaml.Loader")
            loader = yaml.Loader
        try:
            if file != "-":
                with open(file) as fin:
                    data = yaml.load(fin, Loader=loader)
            else:
                data = yaml.load(sys.stdin, Loader=loader)
        except (UnicodeEncodeError, yaml.reader.ReaderError) as e:
            print("\nInvalid unicode in the input: %s\nPlease filter it through "
                  "fix_yaml_unicode.py" % e)
            sys.exit(1)
        if data is None:
            print("\nNo data has been read - has Hercules crashed?")
            sys.exit(1)
        self.data = data

    def get_run_times(self):
        return {}

    def get_name(self):
        return self.data["hercules"]["repository"]

    def get_header(self):
        header = self.data["hercules"]
        return header["begin_unix_time"], header["end_unix_time"]

    def get_burndown_parameters(self):
        header = self.data["Burndown"]
        return header["sampling"], header["granularity"], header["tick_size"]

    def get_project_burndown(self):
        return self.data["hercules"]["repository"], \
            self._parse_burndown_matrix(self.data["Burndown"]["project"]).T

    def get_files_burndown(self):
        return [(p[0], self._parse_burndown_matrix(p[1]).T)
                for p in self.data["Burndown"]["files"].items()]

    def get_people_burndown(self):
        return [(p[0], self._parse_burndown_matrix(p[1]).T)
                for p in self.data["Burndown"]["people"].items()]

    def get_ownership_burndown(self):
        return self.data["Burndown"]["people_sequence"].copy(), \
            {p[0]: self._parse_burndown_matrix(p[1])
             for p in self.data["Burndown"]["people"].items()}

    def get_people_interaction(self):
        return self.data["Burndown"]["people_sequence"].copy(), \
            self._parse_burndown_matrix(self.data["Burndown"]["people_interaction"])

    def get_files_coocc(self):
        coocc = self.data["Couples"]["files_coocc"]
        return coocc["index"], self._parse_coocc_matrix(coocc["matrix"])

    def get_people_coocc(self):
        coocc = self.data["Couples"]["people_coocc"]
        return coocc["index"], self._parse_coocc_matrix(coocc["matrix"])

    def get_shotness_coocc(self):
        shotness = self.data["Shotness"]
        index = ["%s:%s" % (i["file"], i["name"]) for i in shotness]
        indptr = numpy.zeros(len(shotness) + 1, dtype=numpy.int64)
        indices = []
        data = []
        for i, record in enumerate(shotness):
            pairs = [(int(k), v) for k, v in record["counters"].items()]
            pairs.sort()
            indptr[i + 1] = indptr[i] + len(pairs)
            for k, v in pairs:
                indices.append(k)
                data.append(v)
        indices = numpy.array(indices, dtype=numpy.int32)
        data = numpy.array(data, dtype=numpy.int32)
        from scipy.sparse import csr_matrix
        return index, csr_matrix((data, indices, indptr), shape=(len(shotness),) * 2)

    def get_shotness(self):
        from munch import munchify
        obj = munchify(self.data["Shotness"])
        # turn strings into ints
        for item in obj:
            item.counters = {int(k): v for k, v in item.counters.items()}
        if len(obj) == 0:
            raise KeyError
        return obj

    def get_sentiment(self):
        from munch import munchify
        return munchify({int(key): {
            "Comments": vals[2].split("|"),
            "Commits": vals[1],
            "Value": float(vals[0])
        } for key, vals in self.data["Sentiment"].items()})

    def get_devs(self):
        people = self.data["Devs"]["people"]
        days = {int(d): {int(dev): DevDay(*(int(x) for x in day[:-1]), day[-1])
                         for dev, day in devs.items()}
                for d, devs in self.data["Devs"]["ticks"].items()}
        return people, days

    def _parse_burndown_matrix(self, matrix):
        return numpy.array([numpy.fromstring(line, dtype=int, sep=" ")
                            for line in matrix.split("\n")])

    def _parse_coocc_matrix(self, matrix):
        from scipy.sparse import csr_matrix
        data = []
        indices = []
        indptr = [0]
        for row in matrix:
            for k, v in sorted(row.items()):
                data.append(v)
                indices.append(k)
            indptr.append(indptr[-1] + len(row))
        return csr_matrix((data, indices, indptr), shape=(len(matrix),) * 2)


class ProtobufReader(Reader):
    def read(self, file):
        try:
            from labours.pb_pb2 import AnalysisResults
        except ImportError as e:
            print("\n\n>>> You need to generate python/hercules/pb/pb_pb2.py - run \"make\"\n",
                  file=sys.stderr)
            raise e from None
        self.data = AnalysisResults()
        if file != "-":
            with open(file, "rb") as fin:
                bytes = fin.read()
        else:
            bytes = sys.stdin.buffer.read()
        if not bytes:
            raise ValueError("empty input")
        self.data.ParseFromString(bytes)
        self.contents = {}
        for key, val in self.data.contents.items():
            try:
                mod, name = PB_MESSAGES[key].rsplit(".", 1)
            except KeyError:
                sys.stderr.write("Warning: there is no registered PB decoder for %s\n" % key)
                continue
            cls = getattr(import_module(mod), name)
            self.contents[key] = msg = cls()
            msg.ParseFromString(val)

    def get_run_times(self):
        return {key: val for key, val in self.data.header.run_time_per_item.items()}

    def get_name(self):
        return self.data.header.repository

    def get_header(self):
        header = self.data.header
        return header.begin_unix_time, header.end_unix_time

    def get_burndown_parameters(self):
        burndown = self.contents["Burndown"]
        return burndown.sampling, burndown.granularity, burndown.tick_size / 1000000000

    def get_project_burndown(self):
        return self._parse_burndown_matrix(self.contents["Burndown"].project)

    def get_files_burndown(self):
        return [self._parse_burndown_matrix(i) for i in self.contents["Burndown"].files]

    def get_people_burndown(self):
        return [self._parse_burndown_matrix(i) for i in self.contents["Burndown"].people]

    def get_ownership_burndown(self):
        people = self.get_people_burndown()
        return [p[0] for p in people], {p[0]: p[1].T for p in people}

    def get_people_interaction(self):
        burndown = self.contents["Burndown"]
        return [i.name for i in burndown.people], \
            self._parse_sparse_matrix(burndown.people_interaction).toarray()

    def get_files_coocc(self):
        node = self.contents["Couples"].file_couples
        return list(node.index), self._parse_sparse_matrix(node.matrix)

    def get_people_coocc(self):
        node = self.contents["Couples"].people_couples
        return list(node.index), self._parse_sparse_matrix(node.matrix)

    def get_shotness_coocc(self):
        shotness = self.get_shotness()
        index = ["%s:%s" % (i.file, i.name) for i in shotness]
        indptr = numpy.zeros(len(shotness) + 1, dtype=numpy.int32)
        indices = []
        data = []
        for i, record in enumerate(shotness):
            pairs = list(record.counters.items())
            pairs.sort()
            indptr[i + 1] = indptr[i] + len(pairs)
            for k, v in pairs:
                indices.append(k)
                data.append(v)
        indices = numpy.array(indices, dtype=numpy.int32)
        data = numpy.array(data, dtype=numpy.int32)
        from scipy.sparse import csr_matrix
        return index, csr_matrix((data, indices, indptr), shape=(len(shotness),) * 2)

    def get_shotness(self):
        records = self.contents["Shotness"].records
        if len(records) == 0:
            raise KeyError
        return records

    def get_sentiment(self):
        byday = self.contents["Sentiment"].SentimentByDay
        if len(byday) == 0:
            raise KeyError
        return byday

    def get_devs(self):
        people = list(self.contents["Devs"].dev_index)
        days = {d: {dev: DevDay(stats.commits, stats.stats.added, stats.stats.removed,
                                stats.stats.changed, {k: [v.added, v.removed, v.changed]
                                                      for k, v in stats.languages.items()})
                    for dev, stats in day.devs.items()}
                for d, day in self.contents["Devs"].ticks.items()}
        return people, days

    def _parse_burndown_matrix(self, matrix):
        dense = numpy.zeros((matrix.number_of_rows, matrix.number_of_columns), dtype=int)
        for y, row in enumerate(matrix.rows):
            for x, col in enumerate(row.columns):
                dense[y, x] = col
        return matrix.name, dense.T

    def _parse_sparse_matrix(self, matrix):
        from scipy.sparse import csr_matrix
        return csr_matrix((list(matrix.data), list(matrix.indices), list(matrix.indptr)),
                          shape=(matrix.number_of_rows, matrix.number_of_columns))


READERS = {"yaml": YamlReader, "yml": YamlReader, "pb": ProtobufReader}

PB_MESSAGES = {
    "Burndown": "labours.pb_pb2.BurndownAnalysisResults",
    "Couples": "labours.pb_pb2.CouplesAnalysisResults",
    "Shotness": "labours.pb_pb2.ShotnessAnalysisResults",
    "Devs": "labours.pb_pb2.DevsAnalysisResults",
}


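# read_input() sniffs the format when --input-format is "auto": it first tries
# the file extension, then falls back to reading the first 64 KiB as text -
# a UnicodeDecodeError means the input is binary and therefore Protocol Buffers.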
def read_input(args):
    sys.stdout.write("Reading the input... ")
    sys.stdout.flush()
    if args.input != "-":
        if args.input_format == "auto":
            try:
                args.input_format = args.input.rsplit(".", 1)[1]
            except IndexError:
                try:
                    with open(args.input) as f:
                        f.read(1 << 16)
                    args.input_format = "yaml"
                except UnicodeDecodeError:
                    args.input_format = "pb"
    elif args.input_format == "auto":
        args.input_format = "yaml"
    reader = READERS[args.input_format]()
    reader.read(args.input)
    print("done")
    return reader


class DevDay(namedtuple("DevDay", ("Commits", "Added", "Removed", "Changed", "Languages"))):
    def add(self, dd):
        langs = defaultdict(lambda: [0] * 3)
        for key, val in self.Languages.items():
            for i in range(3):
                langs[key][i] += val[i]
        for key, val in dd.Languages.items():
            for i in range(3):
                langs[key][i] += val[i]
        return DevDay(Commits=self.Commits + dd.Commits,
                      Added=self.Added + dd.Added,
                      Removed=self.Removed + dd.Removed,
                      Changed=self.Changed + dd.Changed,
                      Languages=dict(langs))


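# fit_kaplan_meier() treats each burndown band row as a cohort of lines: a drop
# between consecutive samples counts as that many line "deaths" at that age,
# and lines still alive in the last sample are added as right-censored
# observations (E == 0) weighted by their surviving counts. This reading is
# inferred from the code, not from an external spec.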
def fit_kaplan_meier(matrix):
    from lifelines import KaplanMeierFitter
    T = []
    W = []
    indexes = numpy.arange(matrix.shape[0], dtype=int)
    entries = numpy.zeros(matrix.shape[0], int)
    dead = set()
    for i in range(1, matrix.shape[1]):
        diff = matrix[:, i - 1] - matrix[:, i]
        entries[diff < 0] = i
        mask = diff > 0
        deaths = diff[mask]
        T.append(numpy.full(len(deaths), i) - entries[indexes[mask]])
        W.append(deaths)
        entered = entries > 0
        entered[0] = True
        dead = dead.union(set(numpy.where((matrix[:, i] == 0) & entered)[0]))
    # add the survivors as censored
    nnzind = entries != 0
    nnzind[0] = True
    nnzind[sorted(dead)] = False
    T.append(numpy.full(nnzind.sum(), matrix.shape[1]) - entries[nnzind])
    W.append(matrix[nnzind, -1])
    T = numpy.concatenate(T)
    E = numpy.ones(len(T), bool)
    E[-nnzind.sum():] = 0
    W = numpy.concatenate(W)
    if T.size == 0:
        return None
    kmf = KaplanMeierFitter().fit(T, E, weights=W)
    return kmf


def print_survival_function(kmf, sampling):
    sf = kmf.survival_function_
    sf.index = [timedelta(days=d) for d in sf.index * sampling]
    sf.columns = ["Ratio of survived lines"]
    try:
        print(sf[len(sf) // 6::len(sf) // 6].append(sf.tail(1)))
    except ValueError:
        pass


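# interpolate_burndown_matrix() upsamples the (bands x samples) burndown matrix
# to day-by-day resolution: values grow linearly towards a recorded total and
# decay linearly towards the next one; the nested decay()/grow() closures
# implement those two regimes for the band/sample cell being processed.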
def interpolate_burndown_matrix(matrix, granularity, sampling):
    daily = numpy.zeros(
        (matrix.shape[0] * granularity, matrix.shape[1] * sampling),
        dtype=numpy.float32)
    """
    ----------> samples, x
    |
    |
    |
    bands, y
    """
    for y in range(matrix.shape[0]):
        for x in range(matrix.shape[1]):
            if y * granularity > (x + 1) * sampling:
                # the future is zeros
                continue

            def decay(start_index: int, start_val: float):
                if start_val == 0:
                    return
                k = matrix[y][x] / start_val  # <= 1
                scale = (x + 1) * sampling - start_index
                for i in range(y * granularity, (y + 1) * granularity):
                    initial = daily[i][start_index - 1]
                    for j in range(start_index, (x + 1) * sampling):
                        daily[i][j] = initial * (
                            1 + (k - 1) * (j - start_index + 1) / scale)

            def grow(finish_index: int, finish_val: float):
                initial = matrix[y][x - 1] if x > 0 else 0
                start_index = x * sampling
                if start_index < y * granularity:
                    start_index = y * granularity
                if finish_index == start_index:
                    return
                avg = (finish_val - initial) / (finish_index - start_index)
                for j in range(x * sampling, finish_index):
                    for i in range(start_index, j + 1):
                        daily[i][j] = avg
                # copy [x*g..y*s)
                for j in range(x * sampling, finish_index):
                    for i in range(y * granularity, x * sampling):
                        daily[i][j] = daily[i][j - 1]

            if (y + 1) * granularity >= (x + 1) * sampling:
                # x*granularity <= (y+1)*sampling
                # 1. x*granularity <= y*sampling
                #    y*sampling..(y+1)sampling
                #
                #       x+1
                #        /
                #       /
                #      / y+1  -|
                #     /        |
                #    / y      -|
                #   /
                #  / x
                #
                # 2. x*granularity > y*sampling
                #    x*granularity..(y+1)sampling
                #
                #       x+1
                #        /
                #       /
                #      / y+1  -|
                #     /        |
                #    / x      -|
                #   /
                #  / y
                if y * granularity <= x * sampling:
                    grow((x + 1) * sampling, matrix[y][x])
                elif (x + 1) * sampling > y * granularity:
                    grow((x + 1) * sampling, matrix[y][x])
                    avg = matrix[y][x] / ((x + 1) * sampling - y * granularity)
                    for j in range(y * granularity, (x + 1) * sampling):
                        for i in range(y * granularity, j + 1):
                            daily[i][j] = avg
            elif (y + 1) * granularity >= x * sampling:
                # y*sampling <= (x+1)*granularity < (y+1)sampling
                # y*sampling..(x+1)*granularity
                # (x+1)*granularity..(y+1)sampling
                #        x+1
                #         /\
                #        /  \
                #       /    \
                #      /    y+1
                #     /
                #    y
                v1 = matrix[y][x - 1]
                v2 = matrix[y][x]
                delta = (y + 1) * granularity - x * sampling
                previous = 0
                if x > 0 and (x - 1) * sampling >= y * granularity:
                    # x*g <= (y-1)*s <= y*s <= (x+1)*g <= (y+1)*s
                    #           |________|.......^
                    if x > 1:
                        previous = matrix[y][x - 2]
                    scale = sampling
                else:
                    # (y-1)*s < x*g <= y*s <= (x+1)*g <= (y+1)*s
                    #            |______|.......^
                    scale = sampling if x == 0 else x * sampling - y * granularity
                peak = v1 + (v1 - previous) / scale * delta
                if v2 > peak:
                    # we need to adjust the peak, it may not be less than the decayed value
                    if x < matrix.shape[1] - 1:
                        # y*s <= (x+1)*g <= (y+1)*s < (y+2)*s
                        #           ^.........|_________|
                        k = (v2 - matrix[y][x + 1]) / sampling  # > 0
                        peak = matrix[y][x] + k * ((x + 1) * sampling - (y + 1) * granularity)
                        # peak > v2 > v1
                    else:
                        peak = v2
                        # not enough data to interpolate; this is at least not restricted
                grow((y + 1) * granularity, peak)
                decay((y + 1) * granularity, peak)
            else:
                # (x+1)*granularity < y*sampling
                # y*sampling..(y+1)sampling
                decay(x * sampling, matrix[y][x - 1])
    return daily


def import_pandas():
    import pandas
    try:
        from pandas.plotting import register_matplotlib_converters
        register_matplotlib_converters()
    except ImportError:
        pass
    return pandas


def floor_datetime(dt, duration):
    return datetime.fromtimestamp(dt.timestamp() - dt.timestamp() % duration)


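# load_burndown() resamples the interpolated daily matrix into pandas periods
# ("year" -> "A", "month" -> "M", or any pandas offset alias); if yearly
# resampling yields no complete period, it automatically retries by month.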
def load_burndown(header, name, matrix, resample, report_survival=True):
    pandas = import_pandas()
    start, last, sampling, granularity, tick = header
    assert sampling > 0
    assert granularity > 0
    start = floor_datetime(datetime.fromtimestamp(start), tick)
    last = datetime.fromtimestamp(last)
    if report_survival:
        kmf = fit_kaplan_meier(matrix)
        if kmf is not None:
            print_survival_function(kmf, sampling)
    finish = start + timedelta(seconds=matrix.shape[1] * sampling * tick)
    if resample not in ("no", "raw"):
        print("resampling to %s, please wait..." % resample)
        # Interpolate the day x day matrix.
        # Each day brings equal weight in the granularity.
        # Sampling's interpolation is linear.
        daily = interpolate_burndown_matrix(matrix, granularity, sampling)
        daily[(last - start).days:] = 0
        # Resample the bands
        aliases = {
            "year": "A",
            "month": "M"
        }
        resample = aliases.get(resample, resample)
        periods = 0
        date_granularity_sampling = [start]
        while date_granularity_sampling[-1] < finish:
            periods += 1
            date_granularity_sampling = pandas.date_range(
                start, periods=periods, freq=resample)
        if date_granularity_sampling[0] > finish:
            if resample == "A":
                print("too loose resampling - by year, trying by month")
                return load_burndown(header, name, matrix, "month", report_survival=False)
            else:
                raise ValueError("Too loose resampling: %s. Try finer." % resample)
        date_range_sampling = pandas.date_range(
            date_granularity_sampling[0],
            periods=(finish - date_granularity_sampling[0]).days,
            freq="1D")
        # Fill the new square matrix
        matrix = numpy.zeros(
            (len(date_granularity_sampling), len(date_range_sampling)),
            dtype=numpy.float32)
        for i, gdt in enumerate(date_granularity_sampling):
            istart = (date_granularity_sampling[i - 1] - start).days \
                if i > 0 else 0
            ifinish = (gdt - start).days

            for j, sdt in enumerate(date_range_sampling):
                if (sdt - start).days >= istart:
                    break

            matrix[i, j:] = \
                daily[istart:ifinish, (sdt - start).days:].sum(axis=0)
        # Hardcode some cases to improve labels' readability
        if resample in ("year", "A"):
            labels = [dt.year for dt in date_granularity_sampling]
        elif resample in ("month", "M"):
            labels = [dt.strftime("%Y %B") for dt in date_granularity_sampling]
        else:
            labels = [dt.date() for dt in date_granularity_sampling]
    else:
        labels = [
            "%s - %s" % ((start + timedelta(seconds=i * granularity * tick)).date(),
                         (
                             start + timedelta(seconds=(i + 1) * granularity * tick)).date())
            for i in range(matrix.shape[0])]
        if len(labels) > 18:
            warnings.warn("Too many labels - consider resampling.")
        resample = "M"  # fake resampling type is checked while plotting
        date_range_sampling = pandas.date_range(
            start + timedelta(seconds=sampling * tick), periods=matrix.shape[1],
            freq="%dD" % sampling)
    return name, matrix, date_range_sampling, labels, granularity, sampling, resample


def load_ownership(header, sequence, contents, max_people):
    pandas = import_pandas()
    start, last, sampling, _, tick = header
    start = datetime.fromtimestamp(start)
    start = floor_datetime(start, tick)
    last = datetime.fromtimestamp(last)
    people = []
    for name in sequence:
        people.append(contents[name].sum(axis=1))
    people = numpy.array(people)
    date_range_sampling = pandas.date_range(
        start + timedelta(seconds=sampling * tick), periods=people[0].shape[0],
        freq="%dD" % sampling)
    if people.shape[0] > max_people:
        order = numpy.argsort(-people.sum(axis=1))
        chosen_people = people[order[:max_people + 1]]
        chosen_people[max_people] = people[order[max_people:]].sum(axis=0)
        people = chosen_people
        sequence = [sequence[i] for i in order[:max_people]] + ["others"]
        print("Warning: truncated people to the most owning %d" % max_people)
    for i, name in enumerate(sequence):
        if len(name) > 40:
            sequence[i] = name[:37] + "..."
    return sequence, people, date_range_sampling, last


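# load_churn_matrix(): as inferred from the code below, column 0 holds each
# developer's own written lines and the remaining columns count overwrites,
# so every row is normalized by column 0 and negated; rows with zero own
# lines are temporarily set to 1 to avoid division by zero, then zeroed.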
def load_churn_matrix(people, matrix, max_people):
    matrix = matrix.astype(float)
    if matrix.shape[0] > max_people:
        order = numpy.argsort(-matrix[:, 0])
        matrix = matrix[order[:max_people]][:, [0, 1] + list(2 + order[:max_people])]
        people = [people[i] for i in order[:max_people]]
        print("Warning: truncated people to most productive %d" % max_people)
    zeros = matrix[:, 0] == 0
    matrix[zeros, :] = 1
    matrix /= matrix[:, 0][:, None]
    matrix = -matrix[:, 1:]
    matrix[zeros, :] = 0
    for i, name in enumerate(people):
        if len(name) > 40:
            people[i] = name[:37] + "..."
    return people, matrix


def import_pyplot(backend, style):
    import matplotlib
    if backend:
        matplotlib.use(backend)
    from matplotlib import pyplot
    pyplot.style.use(style)
    print("matplotlib: backend is", matplotlib.get_backend())
    return matplotlib, pyplot


def apply_plot_style(figure, axes, legend, background, font_size, axes_size):
    foreground = "black" if background == "white" else "white"
    if axes_size is None:
        axes_size = (16, 12)
    else:
        axes_size = tuple(float(p) for p in axes_size.split(","))
    figure.set_size_inches(*axes_size)
    for side in ("bottom", "top", "left", "right"):
        axes.spines[side].set_color(foreground)
    for axis in (axes.xaxis, axes.yaxis):
        axis.label.update(dict(fontsize=font_size, color=foreground))
    for axis in ("x", "y"):
        getattr(axes, axis + "axis").get_offset_text().set_size(font_size)
        axes.tick_params(axis=axis, colors=foreground, labelsize=font_size)
    try:
        axes.ticklabel_format(axis="y", style="sci", scilimits=(0, 3))
    except AttributeError:
        pass
    figure.patch.set_facecolor(background)
    axes.set_facecolor(background)
    if legend is not None:
        frame = legend.get_frame()
        for setter in (frame.set_facecolor, frame.set_edgecolor):
            setter(background)
        for text in legend.get_texts():
            text.set_color(foreground)


def get_plot_path(base, name):
    root, ext = os.path.splitext(base)
    if not ext:
        ext = ".png"
    output = os.path.join(root, name + ext)
    os.makedirs(os.path.dirname(output), exist_ok=True)
    return output


def deploy_plot(title, output, background, tight=True):
    import matplotlib.pyplot as pyplot
    if not output:
        pyplot.gcf().canvas.set_window_title(title)
        pyplot.show()
    else:
        if title:
            pyplot.title(title, color="black" if background == "white" else "white")
        if tight:
            try:
                pyplot.tight_layout()
            except:  # noqa: E722
                print("Warning: failed to set the tight layout")
        pyplot.savefig(output, transparent=True)
    pyplot.clf()


def default_json(x):
    if hasattr(x, "tolist"):
        return x.tolist()
    if hasattr(x, "isoformat"):
        return x.isoformat()
    return x


def parse_date(text, default):
    if not text:
        return default
    from dateutil.parser import parse
    return parse(text)


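# Every plot_*() function below shares one convention: when --output ends with
# ".json", the collected locals() are dumped as JSON instead of rendering, so
# the numbers can be re-plotted elsewhere; default_json() above handles numpy
# arrays and datetimes during serialization.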
def plot_burndown(args, target, name, matrix, date_range_sampling, labels, granularity,
                  sampling, resample):
    if args.output and args.output.endswith(".json"):
        data = locals().copy()
        del data["args"]
        data["type"] = "burndown"
        if args.mode == "project" and target == "project":
            output = args.output
        else:
            if target == "project":
                name = "project"
            output = get_plot_path(args.output, name)
        with open(output, "w") as fout:
            json.dump(data, fout, sort_keys=True, default=default_json)
        return

    matplotlib, pyplot = import_pyplot(args.backend, args.style)

    pyplot.stackplot(date_range_sampling, matrix, labels=labels)
    if args.relative:
        for i in range(matrix.shape[1]):
            matrix[:, i] /= matrix[:, i].sum()
        pyplot.ylim(0, 1)
        legend_loc = 3
    else:
        legend_loc = 2
    legend = pyplot.legend(loc=legend_loc, fontsize=args.font_size)
    pyplot.ylabel("Lines of code")
    pyplot.xlabel("Time")
    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.background,
                     args.font_size, args.size)
    pyplot.xlim(parse_date(args.start_date, date_range_sampling[0]),
                parse_date(args.end_date, date_range_sampling[-1]))
    locator = pyplot.gca().xaxis.get_major_locator()
    # set the optimal xticks locator
    if "M" not in resample:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
    locs = pyplot.gca().get_xticks().tolist()
    if len(locs) >= 16:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
        locs = pyplot.gca().get_xticks().tolist()
        if len(locs) >= 16:
            pyplot.gca().xaxis.set_major_locator(locator)
    if locs[0] < pyplot.xlim()[0]:
        del locs[0]
    endindex = -1
    if len(locs) >= 2 and pyplot.xlim()[1] - locs[-1] > (locs[-1] - locs[-2]) / 2:
        locs.append(pyplot.xlim()[1])
        endindex = len(locs) - 1
    startindex = -1
    if len(locs) >= 2 and locs[0] - pyplot.xlim()[0] > (locs[1] - locs[0]) / 2:
        locs.append(pyplot.xlim()[0])
        startindex = len(locs) - 1
    pyplot.gca().set_xticks(locs)
    # hacking time!
    labels = pyplot.gca().get_xticklabels()
    if startindex >= 0:
        labels[startindex].set_text(date_range_sampling[0].date())
        labels[startindex].set_text = lambda _: None
        labels[startindex].set_rotation(30)
        labels[startindex].set_ha("right")
    if endindex >= 0:
        labels[endindex].set_text(date_range_sampling[-1].date())
        labels[endindex].set_text = lambda _: None
        labels[endindex].set_rotation(30)
        labels[endindex].set_ha("right")
    title = "%s %d x %d (granularity %d, sampling %d)" % \
        ((name,) + matrix.shape + (granularity, sampling))
    output = args.output
    if output:
        if args.mode == "project" and target == "project":
            output = args.output
        else:
            if target == "project":
                name = "project"
            output = get_plot_path(args.output, name)
    deploy_plot(title, output, args.background)


def plot_many_burndown(args, target, header, parts):
    if not args.output:
        print("Warning: output not set, showing %d plots." % len(parts))
    itercnt = progress.bar(parts, expected_size=len(parts)) \
        if progress is not None else parts
    stdout = io.StringIO()
    for name, matrix in itercnt:
        backup = sys.stdout
        sys.stdout = stdout
        plot_burndown(args, target, *load_burndown(header, name, matrix, args.resample))
        sys.stdout = backup
    sys.stdout.write(stdout.getvalue())


def plot_churn_matrix(args, repo, people, matrix):
    if args.output and args.output.endswith(".json"):
        data = locals().copy()
        del data["args"]
        data["type"] = "churn_matrix"
        if args.mode == "all":
            output = get_plot_path(args.output, "matrix")
        else:
            output = args.output
        with open(output, "w") as fout:
            json.dump(data, fout, sort_keys=True, default=default_json)
        return

    matplotlib, pyplot = import_pyplot(args.backend, args.style)

    s = 4 + matrix.shape[1] * 0.3
    fig = pyplot.figure(figsize=(s, s))
    ax = fig.add_subplot(111)
    ax.xaxis.set_label_position("top")
    ax.matshow(matrix, cmap=pyplot.cm.OrRd)
    ax.set_xticks(numpy.arange(0, matrix.shape[1]))
    ax.set_yticks(numpy.arange(0, matrix.shape[0]))
    ax.set_yticklabels(people, va="center")
    ax.set_xticks(numpy.arange(0.5, matrix.shape[1] + 0.5), minor=True)
    ax.set_xticklabels(["Unidentified"] + people, rotation=45, ha="left",
                       va="bottom", rotation_mode="anchor")
    ax.set_yticks(numpy.arange(0.5, matrix.shape[0] + 0.5), minor=True)
    ax.grid(False)
    ax.grid(which="minor")
    apply_plot_style(fig, ax, None, args.background, args.font_size, args.size)
    if not args.output:
        pos1 = ax.get_position()
        pos2 = (pos1.x0 + 0.15, pos1.y0 - 0.1, pos1.width * 0.9, pos1.height * 0.9)
        ax.set_position(pos2)
    if args.mode == "all" and args.output:
        output = get_plot_path(args.output, "matrix")
    else:
        output = args.output
    title = "%s %d developers overwrite" % (repo, matrix.shape[0])
    if args.output:
        # FIXME(vmarkovtsev): otherwise the title is screwed in savefig()
        title = ""
    deploy_plot(title, output, args.background)


def plot_ownership(args, repo, names, people, date_range, last):
    if args.output and args.output.endswith(".json"):
        data = locals().copy()
        del data["args"]
        data["type"] = "ownership"
        if args.mode == "all" and args.output:
            output = get_plot_path(args.output, "people")
        else:
            output = args.output
        with open(output, "w") as fout:
            json.dump(data, fout, sort_keys=True, default=default_json)
        return

    matplotlib, pyplot = import_pyplot(args.backend, args.style)

    polys = pyplot.stackplot(date_range, people, labels=names)
    if names[-1] == "others":
        polys[-1].set_hatch("/")
    pyplot.xlim(parse_date(args.start_date, date_range[0]), parse_date(args.end_date, last))

    if args.relative:
        for i in range(people.shape[1]):
            people[:, i] /= people[:, i].sum()
        pyplot.ylim(0, 1)
        legend_loc = 3
    else:
        legend_loc = 2
    ncol = 1 if len(names) < 15 else 2
    legend = pyplot.legend(loc=legend_loc, fontsize=args.font_size, ncol=ncol)
    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.background,
                     args.font_size, args.size)
    if args.mode == "all" and args.output:
        output = get_plot_path(args.output, "people")
    else:
        output = args.output
    deploy_plot("%s code ownership through time" % repo, output, args.background)


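# train_embeddings() prepares the input for Swivel's submatrix-wise
# factorization: the co-occurrence matrix is clipped at its 99th percentile,
# split into nshards x nshards shards over a degree-sorted vocabulary, and each
# shard is serialized as a tf.train.Example; rows that do not fit the shard
# grid are dropped, least-connected first. This summary is inferred from the
# code below.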
IDEAL_SHARD_SIZE = 4096


def train_embeddings(index, matrix, tmpdir, shard_size=IDEAL_SHARD_SIZE):
    try:
        from . import swivel
    except (SystemError, ImportError):
        import swivel
    import tensorflow as tf

    assert matrix.shape[0] == matrix.shape[1]
    assert len(index) <= matrix.shape[0]
    outlier_threshold = numpy.percentile(matrix.data, 99)
    matrix.data[matrix.data > outlier_threshold] = outlier_threshold
    nshards = len(index) // shard_size
    if nshards * shard_size < len(index):
        nshards += 1
        shard_size = len(index) // nshards
        nshards = len(index) // shard_size
    remainder = len(index) - nshards * shard_size
    if remainder > 0:
        lengths = matrix.indptr[1:] - matrix.indptr[:-1]
        filtered = sorted(numpy.argsort(lengths)[remainder:])
    else:
        filtered = list(range(len(index)))
    if len(filtered) < matrix.shape[0]:
        print("Truncating the sparse matrix...")
        matrix = matrix[filtered, :][:, filtered]
    meta_index = []
    for i, j in enumerate(filtered):
        meta_index.append((index[j], matrix[i, i]))
    index = [mi[0] for mi in meta_index]
    with tempfile.TemporaryDirectory(prefix="hercules_labours_", dir=tmpdir or None) as tmproot:
        print("Writing Swivel metadata...")
        vocabulary = "\n".join(index)
        with open(os.path.join(tmproot, "row_vocab.txt"), "w") as out:
            out.write(vocabulary)
        with open(os.path.join(tmproot, "col_vocab.txt"), "w") as out:
            out.write(vocabulary)
        del vocabulary
        bool_sums = matrix.indptr[1:] - matrix.indptr[:-1]
        bool_sums_str = "\n".join(map(str, bool_sums.tolist()))
        with open(os.path.join(tmproot, "row_sums.txt"), "w") as out:
            out.write(bool_sums_str)
        with open(os.path.join(tmproot, "col_sums.txt"), "w") as out:
            out.write(bool_sums_str)
        del bool_sums_str
        reorder = numpy.argsort(-bool_sums)

        print("Writing Swivel shards...")
        for row in range(nshards):
            for col in range(nshards):
                def _int64s(xs):
                    return tf.train.Feature(
                        int64_list=tf.train.Int64List(value=list(xs)))

                def _floats(xs):
                    return tf.train.Feature(
                        float_list=tf.train.FloatList(value=list(xs)))

                indices_row = reorder[row::nshards]
                indices_col = reorder[col::nshards]
                shard = matrix[indices_row][:, indices_col].tocoo()

                example = tf.train.Example(features=tf.train.Features(feature={
                    "global_row": _int64s(indices_row),
                    "global_col": _int64s(indices_col),
                    "sparse_local_row": _int64s(shard.row),
                    "sparse_local_col": _int64s(shard.col),
                    "sparse_value": _floats(shard.data)}))

                with open(os.path.join(tmproot, "shard-%03d-%03d.pb" % (row, col)), "wb") as out:
                    out.write(example.SerializeToString())

        print("Training Swivel model...")
        swivel.FLAGS.submatrix_rows = shard_size
        swivel.FLAGS.submatrix_cols = shard_size
        if len(meta_index) <= IDEAL_SHARD_SIZE / 16:
            embedding_size = 50
            num_epochs = 100000
        elif len(meta_index) <= IDEAL_SHARD_SIZE:
            embedding_size = 50
            num_epochs = 50000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 2:
            embedding_size = 60
            num_epochs = 10000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 4:
            embedding_size = 70
            num_epochs = 8000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 10:
            embedding_size = 80
            num_epochs = 5000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 25:
            embedding_size = 100
            num_epochs = 1000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 100:
            embedding_size = 200
            num_epochs = 600
        else:
            embedding_size = 300
            num_epochs = 300
        if os.getenv("CI"):
            # Travis, AppVeyor etc. during the integration tests
            num_epochs /= 10
        swivel.FLAGS.embedding_size = embedding_size
        swivel.FLAGS.input_base_path = tmproot
        swivel.FLAGS.output_base_path = tmproot
        swivel.FLAGS.loss_multiplier = 1.0 / shard_size
        swivel.FLAGS.num_epochs = num_epochs
        # Tensorflow 1.5 parses sys.argv unconditionally *applause*
        argv_backup = sys.argv[1:]
        del sys.argv[1:]
        swivel.main(None)
        sys.argv.extend(argv_backup)
        print("Reading Swivel embeddings...")
        embeddings = []
        with open(os.path.join(tmproot, "row_embedding.tsv")) as frow:
            with open(os.path.join(tmproot, "col_embedding.tsv")) as fcol:
                for i, (lrow, lcol) in enumerate(zip(frow, fcol)):
                    prow, pcol = (l.split("\t", 1) for l in (lrow, lcol))
                    assert prow[0] == pcol[0]
                    erow, ecol = \
                        (numpy.fromstring(p[1], dtype=numpy.float32, sep="\t")
                         for p in (prow, pcol))
                    embeddings.append((erow + ecol) / 2)
    return meta_index, embeddings


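# CORSWebServer serves the current directory with an added
# Access-Control-Allow-Origin: * header so that the hosted
# projector.tensorflow.org page is allowed to fetch the local TSV/JSON files.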
class CORSWebServer(object):
    def __init__(self):
        self.thread = threading.Thread(target=self.serve)
        self.server = None

    def serve(self):
        outer = self

        try:
            from http.server import HTTPServer, SimpleHTTPRequestHandler, test
        except ImportError:  # Python 2
            from BaseHTTPServer import HTTPServer, test
            from SimpleHTTPServer import SimpleHTTPRequestHandler

        class ClojureServer(HTTPServer):
            def __init__(self, *args, **kwargs):
                HTTPServer.__init__(self, *args, **kwargs)
                outer.server = self

        class CORSRequestHandler(SimpleHTTPRequestHandler):
            def end_headers(self):
                self.send_header("Access-Control-Allow-Origin", "*")
                SimpleHTTPRequestHandler.end_headers(self)

        test(CORSRequestHandler, ClojureServer)

    def start(self):
        self.thread.start()

    def stop(self):
        if self.running:
            self.server.shutdown()
            self.thread.join()

    @property
    def running(self):
        return self.server is not None


web_server = CORSWebServer()


def write_embeddings(name, output, run_server, index, embeddings):
    print("Writing Tensorflow Projector files...")
    if not output:
        output = "couples"
    if output.endswith(".json"):
        output = os.path.join(output[:-5], "couples")
        run_server = False
    metaf = "%s_%s_meta.tsv" % (output, name)
    with open(metaf, "w") as fout:
        fout.write("name\tcommits\n")
        for pair in index:
            fout.write("%s\t%s\n" % pair)
    print("Wrote", metaf)
    dataf = "%s_%s_data.tsv" % (output, name)
    with open(dataf, "w") as fout:
        for vec in embeddings:
            fout.write("\t".join(str(v) for v in vec))
            fout.write("\n")
    print("Wrote", dataf)
    jsonf = "%s_%s.json" % (output, name)
    with open(jsonf, "w") as fout:
        fout.write("""{
  "embeddings": [
    {
      "tensorName": "%s %s coupling",
      "tensorShape": [%s, %s],
      "tensorPath": "http://0.0.0.0:8000/%s",
      "metadataPath": "http://0.0.0.0:8000/%s"
    }
  ]
}
""" % (output, name, len(embeddings), len(embeddings[0]), dataf, metaf))
    print("Wrote %s" % jsonf)
    if run_server and not web_server.running:
        web_server.start()
    url = "http://projector.tensorflow.org/?config=http://0.0.0.0:8000/" + jsonf
    print(url)
    if run_server:
        if shutil.which("xdg-open") is not None:
            os.system("xdg-open " + url)
        else:
            browser = os.getenv("BROWSER", "")
            if browser:
                os.system(browser + " " + url)
            else:
                print("\t" + url)


def show_shotness_stats(data):
    top = sorted(((r.counters[i], i) for i, r in enumerate(data)), reverse=True)
    for count, i in top:
        r = data[i]
        print("%8d %s:%s [%s]" % (count, r.file, r.name, r.internal_role))


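# show_sentiment_stats() maps sentiment values onto [-1, 1] via
# (0.5 - Value) * 2 and smooths the daily series with a Slepian window; note
# that scipy.signal.slepian is deprecated in newer SciPy releases in favour of
# scipy.signal.windows.dpss, so this import may need updating.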
def show_sentiment_stats(args, name, resample, start_date, data):
    from scipy.signal import convolve, slepian

    matplotlib, pyplot = import_pyplot(args.backend, args.style)

    start_date = datetime.fromtimestamp(start_date)
    data = sorted(data.items())
    mood = numpy.zeros(data[-1][0] + 1, dtype=numpy.float32)
    timeline = numpy.array([start_date + timedelta(days=i) for i in range(mood.shape[0])])
    for d, val in data:
        mood[d] = (0.5 - val.Value) * 2
    resolution = 32
    window = slepian(len(timeline) // resolution, 0.5)
    window /= window.sum()
    mood_smooth = convolve(mood, window, "same")
    pos = mood_smooth.copy()
    pos[pos < 0] = 0
    neg = mood_smooth.copy()
    neg[neg >= 0] = 0
    resolution = 4
    window = numpy.ones(len(timeline) // resolution)
    window /= window.sum()
    avg = convolve(mood, window, "same")
    pyplot.fill_between(timeline, pos, color="#8DB843", label="Positive")
    pyplot.fill_between(timeline, neg, color="#E14C35", label="Negative")
    pyplot.plot(timeline, avg, color="grey", label="Average", linewidth=5)
    legend = pyplot.legend(loc=1, fontsize=args.font_size)
    pyplot.ylabel("Comment sentiment")
    pyplot.xlabel("Time")
    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.background,
                     args.font_size, args.size)
    pyplot.xlim(parse_date(args.start_date, timeline[0]), parse_date(args.end_date, timeline[-1]))
    locator = pyplot.gca().xaxis.get_major_locator()
    # set the optimal xticks locator
    if "M" not in resample:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
    locs = pyplot.gca().get_xticks().tolist()
    if len(locs) >= 16:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
        locs = pyplot.gca().get_xticks().tolist()
        if len(locs) >= 16:
            pyplot.gca().xaxis.set_major_locator(locator)
    if locs[0] < pyplot.xlim()[0]:
        del locs[0]
    endindex = -1
    if len(locs) >= 2 and pyplot.xlim()[1] - locs[-1] > (locs[-1] - locs[-2]) / 2:
        locs.append(pyplot.xlim()[1])
        endindex = len(locs) - 1
    startindex = -1
    if len(locs) >= 2 and locs[0] - pyplot.xlim()[0] > (locs[1] - locs[0]) / 2:
        locs.append(pyplot.xlim()[0])
        startindex = len(locs) - 1
    pyplot.gca().set_xticks(locs)
    # hacking time!
    labels = pyplot.gca().get_xticklabels()
    if startindex >= 0:
        labels[startindex].set_text(timeline[0].date())
        labels[startindex].set_text = lambda _: None
        labels[startindex].set_rotation(30)
        labels[startindex].set_ha("right")
    if endindex >= 0:
        labels[endindex].set_text(timeline[-1].date())
        labels[endindex].set_text = lambda _: None
        labels[endindex].set_rotation(30)
        labels[endindex].set_ha("right")
    overall_pos = sum(2 * (0.5 - d[1].Value) for d in data if d[1].Value < 0.5)
    overall_neg = sum(2 * (d[1].Value - 0.5) for d in data if d[1].Value > 0.5)
    title = "%s sentiment +%.1f -%.1f δ=%.1f" % (
        name, overall_pos, overall_neg, overall_pos - overall_neg)
    deploy_plot(title, args.output, args.background)


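# show_devs() pipeline, as inferred from the code: per-developer daily commit
# series are smoothed with a Slepian window, ordered by seriation over a
# dynamic-time-warping distance matrix (see order_commits() below), clustered
# with HDBSCAN, and rendered as one filled row per developer.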
def show_devs(args, name, start_date, end_date, people, days):
    from scipy.signal import convolve, slepian

    max_people = 50
    if len(people) > max_people:
        print("Picking top %d developers by commit count" % max_people)
        # pick top N developers by commit count
        commits = defaultdict(int)
        for devs in days.values():
            for dev, stats in devs.items():
                commits[dev] += stats.Commits
        commits = sorted(((v, k) for k, v in commits.items()), reverse=True)
        chosen_people = {people[k] for _, k in commits[:max_people]}
    else:
        chosen_people = set(people)
    dists, devseries, devstats, route = order_commits(chosen_people, days, people)
    route_map = {v: i for i, v in enumerate(route)}
    # determine clusters
    clusters = hdbscan_cluster_routed_series(dists, route)
    keys = list(devseries.keys())
    route = [keys[node] for node in route]
    print("Plotting")
    # smooth time series
    start_date = datetime.fromtimestamp(start_date)
    start_date = datetime(start_date.year, start_date.month, start_date.day)
    end_date = datetime.fromtimestamp(end_date)
    end_date = datetime(end_date.year, end_date.month, end_date.day)
    size = (end_date - start_date).days + 1
    plot_x = [start_date + timedelta(days=i) for i in range(size)]
    resolution = 64
    window = slepian(size // resolution, 0.5)
    final = numpy.zeros((len(devseries), size), dtype=numpy.float32)
    for i, s in enumerate(devseries.values()):
        arr = numpy.array(s).transpose()
        full_history = numpy.zeros(size, dtype=numpy.float32)
        mask = arr[0] < size
        full_history[arr[0][mask]] = arr[1][mask]
        final[route_map[i]] = convolve(full_history, window, "same")

    matplotlib, pyplot = import_pyplot(args.backend, args.style)
    pyplot.rcParams["figure.figsize"] = (32, 16)
    prop_cycle = pyplot.rcParams["axes.prop_cycle"]
    colors = prop_cycle.by_key()["color"]
    fig, axes = pyplot.subplots(final.shape[0], 1)
    backgrounds = ("#C4FFDB", "#FFD0CD") if args.background == "white" else ("#05401C", "#40110E")
    max_cluster = numpy.max(clusters)
    for ax, series, cluster, dev_i in zip(axes, final, clusters, route):
        if cluster >= 0:
            color = colors[cluster % len(colors)]
            i = 1
            while color == "#777777":
                color = colors[(max_cluster + i) % len(colors)]
                i += 1
        else:
            # outlier
            color = "#777777"
        ax.fill_between(plot_x, series, color=color)
        ax.set_axis_off()
        author = people[dev_i]
        ax.text(0.03, 0.5, author[:36] + (author[36:] and "..."),
                horizontalalignment="right", verticalalignment="center",
                transform=ax.transAxes, fontsize=14,
                color="black" if args.background == "white" else "white")
        ds = devstats[dev_i]
        stats = "%5d %8s %8s" % (ds[0], _format_number(ds[1] - ds[2]), _format_number(ds[3]))
        ax.text(0.97, 0.5, stats,
                horizontalalignment="left", verticalalignment="center",
                transform=ax.transAxes, fontsize=14, family="monospace",
                backgroundcolor=backgrounds[ds[1] <= ds[2]],
                color="black" if args.background == "white" else "white")
    axes[0].text(0.97, 1.75, " cmts    delta  changed",
                 horizontalalignment="left", verticalalignment="center",
                 transform=axes[0].transAxes, fontsize=14, family="monospace",
                 color="black" if args.background == "white" else "white")
    axes[-1].set_axis_on()
    target_num_labels = 12
    num_months = (end_date.year - start_date.year) * 12 + end_date.month - start_date.month
    interval = int(numpy.ceil(num_months / target_num_labels))
    if interval >= 8:
        interval = int(numpy.ceil(num_months / (12 * target_num_labels)))
        axes[-1].xaxis.set_major_locator(matplotlib.dates.YearLocator(base=max(1, interval // 12)))
        axes[-1].xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y"))
    else:
        axes[-1].xaxis.set_major_locator(matplotlib.dates.MonthLocator(interval=interval))
        axes[-1].xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y-%m"))
    for tick in axes[-1].xaxis.get_major_ticks():
        tick.label.set_fontsize(args.font_size)
    axes[-1].spines["left"].set_visible(False)
    axes[-1].spines["right"].set_visible(False)
    axes[-1].spines["top"].set_visible(False)
    axes[-1].get_yaxis().set_visible(False)
    axes[-1].set_facecolor((1.0,) * 3 + (0.0,))

    title = ("%s commits" % name) if not args.output else ""
    deploy_plot(title, args.output, args.background)




def order_commits(chosen_people, days, people):
    from seriate import seriate
    try:
        from fastdtw import fastdtw
    except ImportError as e:
        print("Cannot import fastdtw: %s\nInstall it from https://github.com/slaypni/fastdtw" % e)
        sys.exit(1)
    # FIXME(vmarkovtsev): remove once https://github.com/slaypni/fastdtw/pull/28 is merged & released
    try:
        sys.modules["fastdtw.fastdtw"].__norm = lambda p: lambda a, b: numpy.linalg.norm(
            numpy.atleast_1d(a) - numpy.atleast_1d(b), p)
    except KeyError:
        # the native extension does not have this bug
        pass
    devseries = defaultdict(list)
    devstats = defaultdict(lambda: DevDay(0, 0, 0, 0, {}))
    for day, devs in sorted(days.items()):
        for dev, stats in devs.items():
            if people[dev] in chosen_people:
                devseries[dev].append((day, stats.Commits))
                devstats[dev] = devstats[dev].add(stats)
    print("Calculating the distance matrix")
    # max-normalize the time series using a sliding window
    series = list(devseries.values())
    for i, s in enumerate(series):
        arr = numpy.array(s).transpose().astype(numpy.float32)
        commits = arr[1]
        if len(commits) < 7:
            commits /= commits.max()
        else:
            # 4 is sizeof(float32)
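            # Build overlapping 7-day windows as a zero-copy strided view:
            # shape (len - 6, 7), both strides 4 bytes, so consecutive rows
            # shift by one float32 element. Each day is then divided by the
            # max of the window centered on it; the first and last 3 days
            # reuse the edge windows.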
            windows = numpy.lib.stride_tricks.as_strided(commits, [len(commits) - 6, 7], [4, 4])
            commits = numpy.concatenate((
                [windows[0, 0] / windows[0].max(),
                 windows[0, 1] / windows[0].max(),
                 windows[0, 2] / windows[0].max()],
                windows[:, 3] / windows.max(axis=1),
                [windows[-1, 4] / windows[-1].max(),
                 windows[-1, 5] / windows[-1].max(),
                 windows[-1, 6] / windows[-1].max()]
            ))
        arr[1] = commits * 7  # 7 is a pure heuristic here and is not related to the window size
        series[i] = arr.transpose()
    # calculate the distance matrix using the dynamic time warping metric
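    # fastdtw aligns two series that may drift in time: radius=5 bounds how far
    # the warping path may stray from the diagonal, and dist=1 selects the L1
    # (absolute difference) point metric.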
    dists = numpy.full((len(series),) * 2, -100500, dtype=numpy.float32)
    for x, serx in enumerate(series):
        dists[x, x] = 0
        for y, sery in enumerate(series[x + 1:], start=x + 1):
            min_day = int(min(serx[0][0], sery[0][0]))
            max_day = int(max(serx[-1][0], sery[-1][0]))
            arrx = numpy.zeros(max_day - min_day + 1, dtype=numpy.float32)
            arry = numpy.zeros_like(arrx)
            arrx[serx[:, 0].astype(int) - min_day] = serx[:, 1]
            arry[sery[:, 0].astype(int) - min_day] = sery[:, 1]
            # L1 norm
            dist, _ = fastdtw(arrx, arry, radius=5, dist=1)
            dists[x, y] = dists[y, x] = dist
    print("Ordering the series")
    route = seriate(dists)
    return dists, devseries, devstats, route


def hdbscan_cluster_routed_series(dists, route):
    try:
        from hdbscan import HDBSCAN
    except ImportError as e:
        print("Cannot import hdbscan: %s\nInstall it with `pip install hdbscan` or from "
              "https://github.com/scikit-learn-contrib/hdbscan" % e)
        sys.exit(1)
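    # Reduce the 2-D distance matrix to 1-D: walk the seriated route, take the
    # cumulative sum of consecutive hop distances, and cluster those scalar
    # positions. Neighbors along the route that are close in DTW distance land
    # close on this line, so HDBSCAN groups contiguous runs of similar series.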
    opt_dist_chain = numpy.cumsum(numpy.array(
        [0] + [dists[route[i], route[i + 1]] for i in range(len(route) - 1)]))
    clusters = HDBSCAN(min_cluster_size=2).fit_predict(opt_dist_chain[:, numpy.newaxis])
    return clusters


def show_devs_efforts(args, name, start_date, end_date, people, days, max_people):
    from scipy.signal import convolve, slepian

    start_date = datetime.fromtimestamp(start_date)
    start_date = datetime(start_date.year, start_date.month, start_date.day)
    end_date = datetime.fromtimestamp(end_date)
    end_date = datetime(end_date.year, end_date.month, end_date.day)
    efforts_by_dev = defaultdict(int)
    for day, devs in days.items():
        for dev, stats in devs.items():
            efforts_by_dev[dev] += stats.Added + stats.Removed + stats.Changed
    if len(efforts_by_dev) > max_people:
        chosen = {v for k, v in sorted(
            ((v, k) for k, v in efforts_by_dev.items()), reverse=True)[:max_people]}
        print("Warning: truncated people to the most active %d" % max_people)
    else:
        chosen = set(efforts_by_dev)
    chosen_efforts = sorted(((efforts_by_dev[k], k) for k in chosen), reverse=True)
    chosen_order = {k: i for i, (_, k) in enumerate(chosen_efforts)}
    efforts = numpy.zeros((len(chosen) + 1, (end_date - start_date).days + 1), dtype=numpy.float32)
    for day, devs in days.items():
        if day < efforts.shape[1]:
            for dev, stats in devs.items():
                dev = chosen_order.get(dev, len(chosen_order))
                efforts[dev][day] += stats.Added + stats.Removed + stats.Changed
    efforts_cum = numpy.cumsum(efforts, axis=1)
    window = slepian(10, 0.5)
    window /= window.sum()
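    # Smooth each row, but copy the raw tail back afterwards: "same"-mode
    # convolution would otherwise bend the most recent values toward zero and
    # visually understate current activity.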
    for e in (efforts, efforts_cum):
        for i in range(e.shape[0]):
            ending = e[i][-len(window) * 2:].copy()
            e[i] = convolve(e[i], window, "same")
            e[i][-len(ending):] = ending
    matplotlib, pyplot = import_pyplot(args.backend, args.style)
    plot_x = [start_date + timedelta(days=i) for i in range(efforts.shape[1])]
    people = [people[k] for _, k in chosen_efforts] + ["others"]
    for i, name in enumerate(people):
        if len(name) > 40:
            people[i] = name[:37] + "..."
    polys = pyplot.stackplot(plot_x, efforts_cum, labels=people)
    if len(polys) == max_people + 1:
        polys[-1].set_hatch("/")
    polys = pyplot.stackplot(plot_x, -efforts * efforts_cum.max() / efforts.max())
    if len(polys) == max_people + 1:
        polys[-1].set_hatch("/")
    yticks = []
    for tick in pyplot.gca().yaxis.iter_ticks():
        if tick[1] >= 0:
            yticks.append(tick[1])
    pyplot.gca().yaxis.set_ticks(yticks)
    legend = pyplot.legend(loc=2, ncol=2, fontsize=args.font_size)
    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.background,
                     args.font_size, args.size or "16,10")
    deploy_plot("Efforts through time (changed lines of code)", args.output, args.background)


def show_old_vs_new(args, name, start_date, end_date, people, days):
    from scipy.signal import convolve, slepian

    start_date = datetime.fromtimestamp(start_date)
    start_date = datetime(start_date.year, start_date.month, start_date.day)
    end_date = datetime.fromtimestamp(end_date)
    end_date = datetime(end_date.year, end_date.month, end_date.day)
    new_lines = numpy.zeros((end_date - start_date).days + 1)
    old_lines = numpy.zeros_like(new_lines)
    for day, devs in days.items():
        for stats in devs.values():
            new_lines[day] += stats.Added
            old_lines[day] += stats.Removed + stats.Changed
    resolution = 32
    window = slepian(len(new_lines) // resolution, 0.5)
    new_lines = convolve(new_lines, window, "same")
    old_lines = convolve(old_lines, window, "same")
    matplotlib, pyplot = import_pyplot(args.backend, args.style)
    plot_x = [start_date + timedelta(days=i) for i in range(len(new_lines))]
    pyplot.fill_between(plot_x, new_lines, color="#8DB843", label="Changed new lines")
    pyplot.fill_between(plot_x, old_lines, color="#E14C35", label="Changed existing lines")
    pyplot.legend(loc=2, fontsize=args.font_size)
    for tick in chain(pyplot.gca().xaxis.get_major_ticks(), pyplot.gca().yaxis.get_major_ticks()):
        tick.label.set_fontsize(args.font_size)
    deploy_plot("Additions vs changes", args.output, args.background)


def show_languages(args, name, start_date, end_date, people, days):
    devlangs = defaultdict(lambda: defaultdict(lambda: numpy.zeros(3, dtype=int)))
    for day, devs in days.items():
        for dev, stats in devs.items():
            for lang, vals in stats.Languages.items():
                devlangs[dev][lang] += vals
    devlangs = sorted(devlangs.items(), key=lambda p: -sum(x.sum() for x in p[1].values()))
    for dev, ls in devlangs:
        print()
        print("#", people[dev])
        ls = sorted(((vals.sum(), lang) for lang, vals in ls.items()), reverse=True)
        for vals, lang in ls:
            if lang:
                print("%s: %d" % (lang, vals))


class ParallelDevData:
    def __init__(self):
        self.commits_rank = -1
        self.commits = -1
        self.lines_rank = -1
        self.lines = -1
        self.ownership_rank = -1
        self.ownership = -1
        self.couples_index = -1
        self.couples_cluster = -1
        self.commit_coocc_index = -1
        self.commit_coocc_cluster = -1

    def __str__(self):
        return str(self.__dict__)

    def __repr__(self):
        return str(self)


def load_devs_parallel(ownership, couples, devs, max_people):
    from seriate import seriate
    try:
        from hdbscan import HDBSCAN
    except ImportError as e:
        print("Cannot import hdbscan: %s\nInstall it with `pip install hdbscan` or from "
              "https://github.com/scikit-learn-contrib/hdbscan" % e)
        sys.exit(1)
    people, owned = ownership
    _, cmatrix = couples
    _, days = devs
    print("calculating - commits")
    commits = defaultdict(int)
    for day, devs in days.items():
        for dev, stats in devs.items():
            commits[people[dev]] += stats.Commits
    chosen = [k for v, k in sorted(((v, k) for k, v in commits.items()),
                                   reverse=True)[:max_people]]
    result = {k: ParallelDevData() for k in chosen}
    for k, v in result.items():
        v.commits_rank = chosen.index(k)
        v.commits = commits[k]
    print("calculating - lines")
    lines = defaultdict(int)
    for day, devs in days.items():
        for dev, stats in devs.items():
            lines[people[dev]] += stats.Added + stats.Removed + stats.Changed
    lines_index = {k: i for i, (_, k) in enumerate(sorted(
        ((v, k) for k, v in lines.items() if k in chosen), reverse=True))}
    for k, v in result.items():
        v.lines_rank = lines_index[k]
        v.lines = lines[k]
    print("calculating - ownership")
    owned_index = {k: i for i, (_, k) in enumerate(sorted(
        ((owned[k][-1].sum(), k) for k in chosen), reverse=True))}
    for k, v in result.items():
        v.ownership_rank = owned_index[k]
        v.ownership = owned[k][-1].sum()
    print("calculating - couples")
    embeddings = numpy.genfromtxt(fname="couples_people_data.tsv", delimiter="\t")[
        [people.index(k) for k in chosen]]
    embeddings /= numpy.linalg.norm(embeddings, axis=1)[:, None]
    cos = embeddings.dot(embeddings.T)
    cos[cos > 1] = 1  # tiny precision faults
    dists = numpy.arccos(cos)
    clusters = HDBSCAN(min_cluster_size=2, metric="precomputed").fit_predict(dists)
    for k, v in result.items():
        v.couples_cluster = clusters[chosen.index(k)]
    couples_order = seriate(dists)
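    # Align the seriation with the ownership ranking: try every cyclic shift
    # of couples_order and keep the one minimizing the total absolute
    # difference between each dev's shifted position and their ownership rank,
    # which should reduce line crossings in the parallel-coordinates plot.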
    roll_options = []
    for i in range(len(couples_order)):
        loss = 0
        for k, v in result.items():
            loss += abs(
                v.ownership_rank - (couples_order.index(chosen.index(k)) + i) % len(chosen))
        roll_options.append(loss)
    best_roll = numpy.argmin(roll_options)
    couples_order = list(numpy.roll(couples_order, best_roll))
    for k, v in result.items():
        v.couples_index = couples_order.index(chosen.index(k))
    print("calculating - commit series")
    dists, devseries, _, orig_route = order_commits(chosen, days, people)
    keys = list(devseries.keys())
    route = [keys[node] for node in orig_route]
    for roll in range(len(route)):
        loss = 0
        for k, v in result.items():
            i = route.index(people.index(k))
            loss += abs(v.couples_index - ((i + roll) % len(route)))
        roll_options[roll] = loss
    best_roll = numpy.argmin(roll_options)
    route = list(numpy.roll(route, best_roll))
    orig_route = list(numpy.roll(orig_route, best_roll))
    clusters = hdbscan_cluster_routed_series(dists, orig_route)
    for k, v in result.items():
        v.commit_coocc_index = route.index(people.index(k))
        v.commit_coocc_cluster = clusters[v.commit_coocc_index]
    return result


def show_devs_parallel(args, name, start_date, end_date, devs):
    matplotlib, pyplot = import_pyplot(args.backend, args.style)
    from matplotlib.collections import LineCollection

    def solve_equations(x1, y1, x2, y2):
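        # Fit the cubic y = a*x**3 + b*x**2 + c*x + d that passes through
        # (x1, y1) and (x2, y2) with zero slope at both endpoints, producing a
        # smoothstep-like segment between neighboring axes.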
        xcube = (x1 - x2) ** 3
        a = 2 * (y2 - y1) / xcube
        b = 3 * (y1 - y2) * (x1 + x2) / xcube
        c = 6 * (y2 - y1) * x1 * x2 / xcube
        d = y1 - a * x1 ** 3 - b * x1 ** 2 - c * x1
        return a, b, c, d

    # biggest = {k: max(getattr(d, k) for d in devs.values())
    #            for k in ("commits", "lines", "ownership")}
    for k, dev in devs.items():
        points = numpy.array([
            (1, dev.commits_rank),
            (2, dev.lines_rank),
            (3, dev.ownership_rank),
            (4, dev.couples_index),
            (5, dev.commit_coocc_index)],
            dtype=float)
        points[:, 1] = points[:, 1] / len(devs)
        splines = []
        for i in range(len(points) - 1):
            a, b, c, d = solve_equations(*points[i], *points[i + 1])
            x = numpy.linspace(i + 1, i + 2, 100)
            smooth_points = numpy.array(
                [x, a * x ** 3 + b * x ** 2 + c * x + d]).T.reshape(-1, 1, 2)
            splines.append(smooth_points)
        points = numpy.concatenate(splines)
        segments = numpy.concatenate([points[:-1], points[1:]], axis=1)
        lc = LineCollection(segments)
        lc.set_array(numpy.linspace(0, 0.1, segments.shape[0]))
        pyplot.gca().add_collection(lc)

    pyplot.xlim(0, 6)
    pyplot.ylim(-0.1, 1.1)
    deploy_plot("Developers", args.output, args.background)


def _format_number(n):
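    # Render a count in at most ~4 characters: 1234 -> "1.2K", 1234567 -> "1.2M",
    # dropping a trailing ".0" so round values stay short (2000 -> "2K").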
    if n == 0:
        return "0"
    power = int(numpy.log10(abs(n)))
    if power >= 6:
        n = n / 1000000
        if n >= 10:
            n = str(int(n))
        else:
            n = "%.1f" % n
            if n.endswith("0"):
                n = n[:-2]
        suffix = "M"
    elif power >= 3:
        n = n / 1000
        if n >= 10:
            n = str(int(n))
        else:
            n = "%.1f" % n
            if n.endswith("0"):
                n = n[:-2]
        suffix = "K"
    else:
        n = str(n)
        suffix = ""
    return n + suffix


def main():
    args = parse_args()
    reader = read_input(args)
    header = reader.get_header()
    name = reader.get_name()

    burndown_warning = "Burndown stats were not collected. Re-run hercules with --burndown."
    burndown_files_warning = \
        "Burndown stats for files were not collected. Re-run hercules with " \
        "--burndown --burndown-files."
    burndown_people_warning = \
        "Burndown stats for people were not collected. Re-run hercules with " \
        "--burndown --burndown-people."
    couples_warning = "Coupling stats were not collected. Re-run hercules with --couples."
    shotness_warning = "Structural hotness stats were not collected. Re-run hercules with " \
                       "--shotness. Also check --languages - the output may be empty."
    sentiment_warning = "Sentiment stats were not collected. Re-run hercules with --sentiment."
    devs_warning = "Devs stats were not collected. Re-run hercules with --devs."

    def run_times():
        rt = reader.get_run_times()
        pandas = import_pandas()
        series = pandas.to_timedelta(pandas.Series(rt).sort_values(ascending=False), unit="s")
        df = pandas.concat([series, series / series.sum()], axis=1)
        df.columns = ["time", "ratio"]
        print(df)

    def project_burndown():
        try:
            full_header = header + reader.get_burndown_parameters()
        except KeyError:
            print("project: " + burndown_warning)
            return
        plot_burndown(args, "project",
                      *load_burndown(full_header, *reader.get_project_burndown(),
                                     resample=args.resample))

    def files_burndown():
        try:
            full_header = header + reader.get_burndown_parameters()
        except KeyError:
            print(burndown_warning)
            return
        try:
            plot_many_burndown(args, "file", full_header, reader.get_files_burndown())
        except KeyError:
            print("files: " + burndown_files_warning)

    def people_burndown():
        try:
            full_header = header + reader.get_burndown_parameters()
        except KeyError:
            print(burndown_warning)
            return
        try:
            plot_many_burndown(args, "person", full_header, reader.get_people_burndown())
        except KeyError:
            print("people: " + burndown_people_warning)

    def churn_matrix():
        try:
            plot_churn_matrix(args, name, *load_churn_matrix(
                *reader.get_people_interaction(), max_people=args.max_people))
        except KeyError:
            print("churn_matrix: " + burndown_people_warning)

    def ownership_burndown():
        try:
            full_header = header + reader.get_burndown_parameters()
        except KeyError:
            print(burndown_warning)
            return
        try:
            plot_ownership(args, name, *load_ownership(
                full_header, *reader.get_ownership_burndown(), max_people=args.max_people))
        except KeyError:
            print("ownership: " + burndown_people_warning)

    def couples_files():
        try:
            write_embeddings("files", args.output, not args.disable_projector,
                             *train_embeddings(*reader.get_files_coocc(),
                                               tmpdir=args.couples_tmp_dir))
        except KeyError:
            print(couples_warning)

    def couples_people():
        try:
            write_embeddings("people", args.output, not args.disable_projector,
                             *train_embeddings(*reader.get_people_coocc(),
                                               tmpdir=args.couples_tmp_dir))
        except KeyError:
            print(couples_warning)

    def couples_shotness():
        try:
            write_embeddings("shotness", args.output, not args.disable_projector,
                             *train_embeddings(*reader.get_shotness_coocc(),
                                               tmpdir=args.couples_tmp_dir))
        except KeyError:
            print(shotness_warning)

    def shotness():
        try:
            data = reader.get_shotness()
        except KeyError:
            print(shotness_warning)
            return
        show_shotness_stats(data)

    def sentiment():
        try:
            data = reader.get_sentiment()
        except KeyError:
            print(sentiment_warning)
            return
        show_sentiment_stats(args, reader.get_name(), args.resample, reader.get_header()[0], data)

    def devs():
        try:
            data = reader.get_devs()
        except KeyError:
            print(devs_warning)
            return
        show_devs(args, reader.get_name(), *reader.get_header(), *data)

    def devs_efforts():
        try:
            data = reader.get_devs()
        except KeyError:
            print(devs_warning)
            return
        show_devs_efforts(args, reader.get_name(), *reader.get_header(), *data,
                          max_people=args.max_people)

    def old_vs_new():
        try:
            data = reader.get_devs()
        except KeyError:
            print(devs_warning)
            return
        show_old_vs_new(args, reader.get_name(), *reader.get_header(), *data)

    def languages():
        try:
            data = reader.get_devs()
        except KeyError:
            print(devs_warning)
            return
        show_languages(args, reader.get_name(), *reader.get_header(), *data)

    def devs_parallel():
        try:
            ownership = reader.get_ownership_burndown()
        except KeyError:
            print(burndown_people_warning)
            return
        try:
            couples = reader.get_people_coocc()
        except KeyError:
            print(couples_warning)
            return
        try:
            devs = reader.get_devs()
        except KeyError:
            print(devs_warning)
            return
        show_devs_parallel(args, reader.get_name(), *reader.get_header(),
                           load_devs_parallel(ownership, couples, devs, args.max_people))
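
    # Mode name -> handler. An unrecognized mode falls through to the KeyError
    # branch below, which asserts it is "all" and runs every handler in turn.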
    modes = {
        "run-times": run_times,
        "burndown-project": project_burndown,
        "burndown-file": files_burndown,
        "burndown-person": people_burndown,
        "churn-matrix": churn_matrix,
        "ownership": ownership_burndown,
        "couples-files": couples_files,
        "couples-people": couples_people,
        "couples-shotness": couples_shotness,
        "shotness": shotness,
        "sentiment": sentiment,
        "devs": devs,
        "devs-efforts": devs_efforts,
        "old-vs-new": old_vs_new,
        "languages": languages,
        "devs-parallel": devs_parallel,
    }
    try:
        modes[args.mode]()
    except KeyError:
        assert args.mode == "all"
        project_burndown()
        files_burndown()
        people_burndown()
        churn_matrix()
        ownership_burndown()
        couples_files()
        couples_people()
        couples_shotness()
        shotness()
        sentiment()
        devs()
        devs_efforts()
        # devs_parallel()
    if web_server.running:
        secs = int(os.getenv("COUPLES_SERVER_TIME", "60"))
        print("Sleeping for %d seconds, safe to Ctrl-C" % secs)
        sys.stdout.flush()
        try:
            time.sleep(secs)
        except KeyboardInterrupt:
            pass
        web_server.stop()


if __name__ == "__main__":
    sys.exit(main())