burndown.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391
  1. from argparse import Namespace
  2. import contextlib
  3. from datetime import datetime, timedelta
  4. import io
  5. import json
  6. import sys
  7. from typing import List, Tuple, TYPE_CHECKING
  8. import warnings
  9. import numpy
  10. import tqdm
  11. from labours.plotting import apply_plot_style, deploy_plot, get_plot_path, import_pyplot
  12. from labours.utils import default_json, floor_datetime, import_pandas, parse_date
  13. if TYPE_CHECKING:
  14. from lifelines import KaplanMeierFitter
  15. from pandas.core.indexes.datetimes import DatetimeIndex
  16. def plot_burndown(
  17. args: Namespace,
  18. target: str,
  19. name: str,
  20. matrix: numpy.ndarray,
  21. date_range_sampling: 'DatetimeIndex',
  22. labels: List[int],
  23. granularity: int,
  24. sampling: int,
  25. resample: str,
  26. ) -> None:
  27. if args.output and args.output.endswith(".json"):
  28. data = locals().copy()
  29. del data["args"]
  30. data["type"] = "burndown"
  31. if args.mode == "project" and target == "project":
  32. output = args.output
  33. else:
  34. if target == "project":
  35. name = "project"
  36. output = get_plot_path(args.output, name)
  37. with open(output, "w") as fout:
  38. json.dump(data, fout, sort_keys=True, default=default_json)
  39. return
  40. matplotlib, pyplot = import_pyplot(args.backend, args.style)
  41. pyplot.stackplot(date_range_sampling, matrix, labels=labels)
  42. if args.relative:
  43. for i in range(matrix.shape[1]):
  44. matrix[:, i] /= matrix[:, i].sum()
  45. pyplot.ylim(0, 1)
  46. legend_loc = 3
  47. else:
  48. legend_loc = 2
  49. legend = pyplot.legend(loc=legend_loc, fontsize=args.font_size)
  50. pyplot.ylabel("Lines of code")
  51. pyplot.xlabel("Time")
  52. apply_plot_style(
  53. pyplot.gcf(), pyplot.gca(), legend, args.background, args.font_size, args.size
  54. )
  55. pyplot.xlim(
  56. parse_date(args.start_date, date_range_sampling[0]),
  57. parse_date(args.end_date, date_range_sampling[-1]),
  58. )
  59. locator = pyplot.gca().xaxis.get_major_locator()
  60. # set the optimal xticks locator
  61. if "M" not in resample:
  62. pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
  63. locs = pyplot.gca().get_xticks().tolist()
  64. if len(locs) >= 16:
  65. pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
  66. locs = pyplot.gca().get_xticks().tolist()
  67. if len(locs) >= 16:
  68. pyplot.gca().xaxis.set_major_locator(locator)
  69. if locs[0] < pyplot.xlim()[0]:
  70. del locs[0]
  71. endindex = -1
  72. if len(locs) >= 2 and pyplot.xlim()[1] - locs[-1] > (locs[-1] - locs[-2]) / 2:
  73. locs.append(pyplot.xlim()[1])
  74. endindex = len(locs) - 1
  75. startindex = -1
  76. if len(locs) >= 2 and locs[0] - pyplot.xlim()[0] > (locs[1] - locs[0]) / 2:
  77. locs.append(pyplot.xlim()[0])
  78. startindex = len(locs) - 1
  79. pyplot.gca().set_xticks(locs)
  80. # hacking time!
  81. labels = pyplot.gca().get_xticklabels()
  82. if startindex >= 0:
  83. labels[startindex].set_text(date_range_sampling[0].date())
  84. labels[startindex].set_text = lambda _: None
  85. labels[startindex].set_rotation(30)
  86. labels[startindex].set_ha("right")
  87. if endindex >= 0:
  88. labels[endindex].set_text(date_range_sampling[-1].date())
  89. labels[endindex].set_text = lambda _: None
  90. labels[endindex].set_rotation(30)
  91. labels[endindex].set_ha("right")
  92. title = "%s %d x %d (granularity %d, sampling %d)" % (
  93. (name,) + matrix.shape + (granularity, sampling)
  94. )
  95. output = args.output
  96. if output:
  97. if args.mode == "project" and target == "project":
  98. output = args.output
  99. else:
  100. if target == "project":
  101. name = "project"
  102. output = get_plot_path(args.output, name)
  103. deploy_plot(title, output, args.background)
  104. def plot_many_burndown(args: Namespace, target: str, header, parts):
  105. if not args.output:
  106. print("Warning: output not set, showing %d plots." % len(parts))
  107. stdout = io.StringIO()
  108. for name, matrix in tqdm.tqdm(parts):
  109. with contextlib.redirect_stdout(stdout):
  110. plot_burndown(
  111. args, target, *load_burndown(header, name, matrix, args.resample)
  112. )
  113. sys.stdout.write(stdout.getvalue())
  114. def fit_kaplan_meier(matrix: numpy.ndarray) -> 'KaplanMeierFitter':
  115. from lifelines import KaplanMeierFitter
  116. T = []
  117. W = []
  118. indexes = numpy.arange(matrix.shape[0], dtype=int)
  119. entries = numpy.zeros(matrix.shape[0], int)
  120. dead = set()
  121. for i in range(1, matrix.shape[1]):
  122. diff = matrix[:, i - 1] - matrix[:, i]
  123. entries[diff < 0] = i
  124. mask = diff > 0
  125. deaths = diff[mask]
  126. T.append(numpy.full(len(deaths), i) - entries[indexes[mask]])
  127. W.append(deaths)
  128. entered = entries > 0
  129. entered[0] = True
  130. dead = dead.union(set(numpy.where((matrix[:, i] == 0) & entered)[0]))
  131. # add the survivors as censored
  132. nnzind = entries != 0
  133. nnzind[0] = True
  134. nnzind[sorted(dead)] = False
  135. T.append(numpy.full(nnzind.sum(), matrix.shape[1]) - entries[nnzind])
  136. W.append(matrix[nnzind, -1])
  137. T = numpy.concatenate(T)
  138. E = numpy.ones(len(T), bool)
  139. E[-nnzind.sum() :] = 0
  140. W = numpy.concatenate(W)
  141. if T.size == 0:
  142. return None
  143. kmf = KaplanMeierFitter().fit(T, E, weights=W)
  144. return kmf
  145. def print_survival_function(kmf: 'KaplanMeierFitter', sampling: int) -> None:
  146. sf = kmf.survival_function_
  147. sf.index = [timedelta(days=d) for d in sf.index * sampling]
  148. sf.columns = ["Ratio of survived lines"]
  149. try:
  150. print(sf[len(sf) // 6 :: len(sf) // 6].append(sf.tail(1)))
  151. except ValueError:
  152. pass
def interpolate_burndown_matrix(
    matrix: numpy.ndarray, granularity: int, sampling: int, progress: bool = False
) -> numpy.ndarray:
    """Expand a (bands x samples) burndown matrix to unit resolution on both axes.

    Every band (row) of ``matrix`` becomes ``granularity`` rows and every
    sample (column) becomes ``sampling`` columns of the result. Cells are
    filled by the nested ``grow`` and ``decay`` helpers depending on how the
    band interval [y * granularity, (y + 1) * granularity) lies relative to
    the sample interval [x * sampling, (x + 1) * sampling). Cells of bands
    that start after the end of the sample stay zero.

    :param matrix: 2-D burndown matrix indexed as ``matrix[band][sample]``.
    :param granularity: Number of output rows per band.
    :param sampling: Number of output columns per sample.
    :param progress: Show a tqdm progress bar over the bands when True.
    :return: float32 array of shape \
             (matrix.shape[0] * granularity, matrix.shape[1] * sampling).
    """
    daily = numpy.zeros(
        (matrix.shape[0] * granularity, matrix.shape[1] * sampling), dtype=numpy.float32
    )
    """
    ----------> samples, x
    |
    |
    |
    bands, y
    """
    for y in tqdm.tqdm(range(matrix.shape[0]), disable=(not progress)):
        for x in range(matrix.shape[1]):
            if y * granularity > (x + 1) * sampling:
                # the future is zeros
                continue

            # Both helpers are closures over the current (y, x) cell and write
            # straight into `daily`.

            def decay(start_index: int, start_val: float):
                # Scale every row of band y across the columns
                # [start_index, (x + 1) * sampling), starting from the row's
                # value in column start_index - 1. The factor changes linearly
                # and equals k in the last column. No-op if start_val is 0.
                if start_val == 0:
                    return
                k = matrix[y][x] / start_val  # <= 1
                scale = (x + 1) * sampling - start_index
                for i in range(y * granularity, (y + 1) * granularity):
                    initial = daily[i][start_index - 1]
                    for j in range(start_index, (x + 1) * sampling):
                        daily[i][j] = initial * (
                            1 + (k - 1) * (j - start_index + 1) / scale
                        )

            def grow(finish_index: int, finish_val: float):
                # Spread the increase from the previous sample's value (0 for
                # the first sample) to finish_val evenly over the
                # finish_index - start_index steps: in column j the rows
                # start_index..j each receive `avg`.
                initial = matrix[y][x - 1] if x > 0 else 0
                start_index = x * sampling
                if start_index < y * granularity:
                    # the band begins inside this sample
                    start_index = y * granularity
                if finish_index == start_index:
                    return
                avg = (finish_val - initial) / (finish_index - start_index)
                for j in range(x * sampling, finish_index):
                    for i in range(start_index, j + 1):
                        daily[i][j] = avg
                # copy [x*g..y*s)
                # i.e. rows of this band that lie before column x * sampling
                # repeat their value from the previous column
                for j in range(x * sampling, finish_index):
                    for i in range(y * granularity, x * sampling):
                        daily[i][j] = daily[i][j - 1]

            if (y + 1) * granularity >= (x + 1) * sampling:
                # The band ends at or after the end of the sample: only growth.
                # x*granularity <= (y+1)*sampling
                # 1. x*granularity <= y*sampling
                #    y*sampling..(y+1)sampling
                #
                #       x+1
                #        /
                #       /
                #      / y+1  -|
                #     /        |
                #    / y      -|
                #   /
                #  / x
                #
                # 2. x*granularity > y*sampling
                #    x*granularity..(y+1)sampling
                #
                #       x+1
                #        /
                #       /
                #      / y+1  -|
                #     /        |
                #    / x      -|
                #   /
                #  / y
                if y * granularity <= x * sampling:
                    grow((x + 1) * sampling, matrix[y][x])
                elif (x + 1) * sampling > y * granularity:
                    grow((x + 1) * sampling, matrix[y][x])
                    # overwrite with the average over the part of the sample
                    # that follows the band start
                    avg = matrix[y][x] / ((x + 1) * sampling - y * granularity)
                    for j in range(y * granularity, (x + 1) * sampling):
                        for i in range(y * granularity, j + 1):
                            daily[i][j] = avg
            elif (y + 1) * granularity >= x * sampling:
                # The band ends inside the sample: grow up to an estimated
                # peak at the band end, then decay to the sample value.
                # y*sampling <= (x+1)*granularity < (y+1)sampling
                # y*sampling..(x+1)*granularity
                # (x+1)*granularity..(y+1)sampling
                #        x+1
                #         /\
                #        /  \
                #       /    \
                #      /    y+1
                #     /
                #    y
                # NOTE(review): for x == 0 the index x - 1 is -1, so v1 is
                # read from the LAST sample of the row - confirm this is
                # intended.
                v1 = matrix[y][x - 1]
                v2 = matrix[y][x]
                delta = (y + 1) * granularity - x * sampling
                previous = 0
                if x > 0 and (x - 1) * sampling >= y * granularity:
                    # x*g <= (y-1)*s <= y*s <= (x+1)*g <= (y+1)*s
                    #           |________|.......^
                    if x > 1:
                        previous = matrix[y][x - 2]
                    scale = sampling
                else:
                    # (y-1)*s < x*g <= y*s <= (x+1)*g <= (y+1)*s
                    #            |______|.......^
                    scale = sampling if x == 0 else x * sampling - y * granularity
                # extrapolate the previous growth rate up to the band end
                peak = v1 + (v1 - previous) / scale * delta
                if v2 > peak:
                    # we need to adjust the peak, it may not be less than the decayed value
                    if x < matrix.shape[1] - 1:
                        # y*s <= (x+1)*g <= (y+1)*s < (y+2)*s
                        #           ^.........|_________|
                        k = (v2 - matrix[y][x + 1]) / sampling  # > 0
                        peak = matrix[y][x] + k * (
                            (x + 1) * sampling - (y + 1) * granularity
                        )
                        # peak > v2 > v1
                    else:
                        peak = v2
                        # not enough data to interpolate; this is at least not restricted
                grow((y + 1) * granularity, peak)
                decay((y + 1) * granularity, peak)
            else:
                # The band ended before the sample began: only decay.
                # (x+1)*granularity < y*sampling
                # y*sampling..(y+1)sampling
                decay(x * sampling, matrix[y][x - 1])
    return daily
  276. def load_burndown(
  277. header: Tuple[int, int, int, int, float],
  278. name: str,
  279. matrix: numpy.ndarray,
  280. resample: str,
  281. report_survival: bool = True,
  282. interpolation_progress: bool = False,
  283. ) -> Tuple[str, numpy.ndarray, 'DatetimeIndex', List[int], int, int, str]:
  284. pandas = import_pandas()
  285. start, last, sampling, granularity, tick = header
  286. assert sampling > 0
  287. assert granularity > 0
  288. start = floor_datetime(datetime.fromtimestamp(start), tick)
  289. last = datetime.fromtimestamp(last)
  290. if report_survival:
  291. kmf = fit_kaplan_meier(matrix)
  292. if kmf is not None:
  293. print_survival_function(kmf, sampling)
  294. finish = start + timedelta(seconds=matrix.shape[1] * sampling * tick)
  295. if resample not in ("no", "raw"):
  296. print("resampling to %s, please wait..." % resample)
  297. # Interpolate the day x day matrix.
  298. # Each day brings equal weight in the granularity.
  299. # Sampling's interpolation is linear.
  300. daily = interpolate_burndown_matrix(
  301. matrix=matrix,
  302. granularity=granularity,
  303. sampling=sampling,
  304. progress=interpolation_progress,
  305. )
  306. daily[(last - start).days :] = 0
  307. # Resample the bands
  308. aliases = {"year": "A", "month": "M"}
  309. resample = aliases.get(resample, resample)
  310. periods = 0
  311. date_granularity_sampling = [start]
  312. while date_granularity_sampling[-1] < finish:
  313. periods += 1
  314. date_granularity_sampling = pandas.date_range(
  315. start, periods=periods, freq=resample
  316. )
  317. if date_granularity_sampling[0] > finish:
  318. if resample == "A":
  319. print("too loose resampling - by year, trying by month")
  320. return load_burndown(
  321. header, name, matrix, "month", report_survival=False
  322. )
  323. else:
  324. raise ValueError("Too loose resampling: %s. Try finer." % resample)
  325. date_range_sampling = pandas.date_range(
  326. date_granularity_sampling[0],
  327. periods=(finish - date_granularity_sampling[0]).days,
  328. freq="1D",
  329. )
  330. # Fill the new square matrix
  331. matrix = numpy.zeros(
  332. (len(date_granularity_sampling), len(date_range_sampling)),
  333. dtype=numpy.float32,
  334. )
  335. for i, gdt in enumerate(date_granularity_sampling):
  336. istart = (date_granularity_sampling[i - 1] - start).days if i > 0 else 0
  337. ifinish = (gdt - start).days
  338. for j, sdt in enumerate(date_range_sampling):
  339. if (sdt - start).days >= istart:
  340. break
  341. matrix[i, j:] = daily[istart:ifinish, (sdt - start).days :].sum(axis=0)
  342. # Hardcode some cases to improve labels' readability
  343. if resample in ("year", "A"):
  344. labels = [dt.year for dt in date_granularity_sampling]
  345. elif resample in ("month", "M"):
  346. labels = [dt.strftime("%Y %B") for dt in date_granularity_sampling]
  347. else:
  348. labels = [dt.date() for dt in date_granularity_sampling]
  349. else:
  350. labels = [
  351. "%s - %s"
  352. % (
  353. (start + timedelta(seconds=i * granularity * tick)).date(),
  354. (start + timedelta(seconds=(i + 1) * granularity * tick)).date(),
  355. )
  356. for i in range(matrix.shape[0])
  357. ]
  358. if len(labels) > 18:
  359. warnings.warn("Too many labels - consider resampling.")
  360. resample = "M" # fake resampling type is checked while plotting
  361. date_range_sampling = pandas.date_range(
  362. start + timedelta(seconds=sampling * tick),
  363. periods=matrix.shape[1],
  364. freq="%dD" % sampling,
  365. )
  366. return name, matrix, date_range_sampling, labels, granularity, sampling, resample