burndown.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293
  1. from datetime import datetime, timedelta
  2. from typing import List, Tuple, TYPE_CHECKING
  3. import warnings
  4. import numpy
  5. import tqdm
  6. from labours.utils import floor_datetime
  7. if TYPE_CHECKING:
  8. from lifelines import KaplanMeierFitter
  9. from pandas.core.indexes.datetimes import DatetimeIndex
  10. def fit_kaplan_meier(matrix: numpy.ndarray) -> 'KaplanMeierFitter':
  11. from lifelines import KaplanMeierFitter
  12. T = []
  13. W = []
  14. indexes = numpy.arange(matrix.shape[0], dtype=int)
  15. entries = numpy.zeros(matrix.shape[0], int)
  16. dead = set()
  17. for i in range(1, matrix.shape[1]):
  18. diff = matrix[:, i - 1] - matrix[:, i]
  19. entries[diff < 0] = i
  20. mask = diff > 0
  21. deaths = diff[mask]
  22. T.append(numpy.full(len(deaths), i) - entries[indexes[mask]])
  23. W.append(deaths)
  24. entered = entries > 0
  25. entered[0] = True
  26. dead = dead.union(set(numpy.where((matrix[:, i] == 0) & entered)[0]))
  27. # add the survivors as censored
  28. nnzind = entries != 0
  29. nnzind[0] = True
  30. nnzind[sorted(dead)] = False
  31. T.append(numpy.full(nnzind.sum(), matrix.shape[1]) - entries[nnzind])
  32. W.append(matrix[nnzind, -1])
  33. T = numpy.concatenate(T)
  34. E = numpy.ones(len(T), bool)
  35. E[-nnzind.sum() :] = 0
  36. W = numpy.concatenate(W)
  37. if T.size == 0:
  38. return None
  39. kmf = KaplanMeierFitter().fit(T, E, weights=W)
  40. return kmf
  41. def print_survival_function(kmf: 'KaplanMeierFitter', sampling: int) -> None:
  42. sf = kmf.survival_function_
  43. sf.index = [timedelta(days=d) for d in sf.index * sampling]
  44. sf.columns = ["Ratio of survived lines"]
  45. try:
  46. print(sf[len(sf) // 6 :: len(sf) // 6].append(sf.tail(1)))
  47. except ValueError:
  48. pass
def interpolate_burndown_matrix(
    matrix: numpy.ndarray, granularity: int, sampling: int, progress: bool = False
) -> numpy.ndarray:
    """Expand a coarse (bands x samples) burndown matrix to daily resolution.

    :param matrix: burndown matrix; row y is the band born in granularity
        period y, column x is the amount surviving at sample x.
    :param granularity: number of days covered by one band (row).
    :param sampling: number of days covered by one sample (column).
    :param progress: show a tqdm progress bar over the bands.
    :return: float32 matrix of shape (bands * granularity, samples * sampling).
    """
    daily = numpy.zeros(
        (matrix.shape[0] * granularity, matrix.shape[1] * sampling), dtype=numpy.float32
    )
    """
    ----------> samples, x
    |
    |
    |
    bands, y
    """
    for y in tqdm.tqdm(range(matrix.shape[0]), disable=(not progress)):
        for x in range(matrix.shape[1]):
            if y * granularity > (x + 1) * sampling:
                # the future is zeros
                continue

            def decay(start_index: int, start_val: float):
                # Interpolate the downward slope from start_val at start_index
                # towards matrix[y][x] at the end of sample x; closes over y, x.
                if start_val == 0:
                    return
                k = matrix[y][x] / start_val  # <= 1
                scale = (x + 1) * sampling - start_index
                for i in range(y * granularity, (y + 1) * granularity):
                    initial = daily[i][start_index - 1]
                    for j in range(start_index, (x + 1) * sampling):
                        daily[i][j] = initial * (
                            1 + (k - 1) * (j - start_index + 1) / scale
                        )

            def grow(finish_index: int, finish_val: float):
                # Ramp newly added lines linearly up to finish_val at
                # finish_index; closes over y, x and mutates `daily`.
                initial = matrix[y][x - 1] if x > 0 else 0
                start_index = x * sampling
                if start_index < y * granularity:
                    start_index = y * granularity
                if finish_index == start_index:
                    return
                avg = (finish_val - initial) / (finish_index - start_index)
                for j in range(x * sampling, finish_index):
                    for i in range(start_index, j + 1):
                        daily[i][j] = avg
                # copy [x*g..y*s)
                for j in range(x * sampling, finish_index):
                    for i in range(y * granularity, x * sampling):
                        daily[i][j] = daily[i][j - 1]

            if (y + 1) * granularity >= (x + 1) * sampling:
                # x*granularity <= (y+1)*sampling
                # 1. x*granularity <= y*sampling
                #    y*sampling..(y+1)sampling
                #
                #       x+1
                #        /
                #       /
                #      / y+1  -|
                #     /        |
                #    / y      -|
                #   /
                #  / x
                #
                # 2. x*granularity > y*sampling
                #    x*granularity..(y+1)sampling
                #
                #       x+1
                #        /
                #       /
                #      / y+1  -|
                #     /        |
                #    / x      -|
                #   /
                #  / y
                if y * granularity <= x * sampling:
                    grow((x + 1) * sampling, matrix[y][x])
                elif (x + 1) * sampling > y * granularity:
                    grow((x + 1) * sampling, matrix[y][x])
                    avg = matrix[y][x] / ((x + 1) * sampling - y * granularity)
                    for j in range(y * granularity, (x + 1) * sampling):
                        for i in range(y * granularity, j + 1):
                            daily[i][j] = avg
            elif (y + 1) * granularity >= x * sampling:
                # y*sampling <= (x+1)*granularity < (y+1)sampling
                # y*sampling..(x+1)*granularity
                # (x+1)*granularity..(y+1)sampling
                #        x+1
                #         /\
                #        /  \
                #       /    \
                #      /    y+1
                #     /
                #    y
                v1 = matrix[y][x - 1]
                v2 = matrix[y][x]
                delta = (y + 1) * granularity - x * sampling
                previous = 0
                if x > 0 and (x - 1) * sampling >= y * granularity:
                    # x*g <= (y-1)*s <= y*s <= (x+1)*g <= (y+1)*s
                    #           |________|.......^
                    if x > 1:
                        previous = matrix[y][x - 2]
                    scale = sampling
                else:
                    # (y-1)*s < x*g <= y*s <= (x+1)*g <= (y+1)*s
                    #            |______|.......^
                    scale = sampling if x == 0 else x * sampling - y * granularity
                peak = v1 + (v1 - previous) / scale * delta
                if v2 > peak:
                    # we need to adjust the peak, it may not be less than the decayed value
                    if x < matrix.shape[1] - 1:
                        # y*s <= (x+1)*g <= (y+1)*s < (y+2)*s
                        #           ^.........|_________|
                        k = (v2 - matrix[y][x + 1]) / sampling  # > 0
                        peak = matrix[y][x] + k * (
                            (x + 1) * sampling - (y + 1) * granularity
                        )
                        # peak > v2 > v1
                    else:
                        peak = v2
                        # not enough data to interpolate; this is at least not restricted
                grow((y + 1) * granularity, peak)
                decay((y + 1) * granularity, peak)
            else:
                # (x+1)*granularity < y*sampling
                # y*sampling..(y+1)sampling
                decay(x * sampling, matrix[y][x - 1])
    return daily
  172. def import_pandas():
  173. import pandas
  174. try:
  175. from pandas.plotting import register_matplotlib_converters
  176. register_matplotlib_converters()
  177. except ImportError:
  178. pass
  179. return pandas
def load_burndown(
    header: Tuple[int, int, int, int, float],
    name: str,
    matrix: numpy.ndarray,
    resample: str,
    report_survival: bool = True,
    interpolation_progress: bool = False,
) -> Tuple[str, numpy.ndarray, 'DatetimeIndex', List[int], int, int, str]:
    """Post-process a burndown matrix for plotting, optionally resampling it.

    :param header: (start timestamp, last timestamp, sampling, granularity,
        tick length in seconds) as produced by the analysis backend.
    :param name: label of the analyzed entity, returned unchanged.
    :param matrix: burndown matrix, bands x samples.
    :param resample: resampling mode - "year", "month", a pandas frequency
        alias, or "no"/"raw" to skip resampling.
    :param report_survival: fit and print a Kaplan-Meier survival function.
    :param interpolation_progress: show a progress bar during interpolation.
    :return: (name, matrix, date_range_sampling, labels, granularity,
        sampling, resample).
    :raises ValueError: when the requested resampling is coarser than the
        whole analyzed interval (except "year", which retries by month).
    """
    pandas = import_pandas()

    start, last, sampling, granularity, tick = header
    assert sampling > 0
    assert granularity > 0
    start = floor_datetime(datetime.fromtimestamp(start), tick)
    last = datetime.fromtimestamp(last)
    if report_survival:
        kmf = fit_kaplan_meier(matrix)
        if kmf is not None:
            print_survival_function(kmf, sampling)
    finish = start + timedelta(seconds=matrix.shape[1] * sampling * tick)
    if resample not in ("no", "raw"):
        print("resampling to %s, please wait..." % resample)
        # Interpolate the day x day matrix.
        # Each day brings equal weight in the granularity.
        # Sampling's interpolation is linear.
        daily = interpolate_burndown_matrix(
            matrix=matrix,
            granularity=granularity,
            sampling=sampling,
            progress=interpolation_progress,
        )
        # zero out everything after the last recorded day
        daily[(last - start).days :] = 0
        # Resample the bands
        aliases = {"year": "A", "month": "M"}
        resample = aliases.get(resample, resample)
        periods = 0
        date_granularity_sampling = [start]
        # grow the resampled date range until it covers the whole interval
        while date_granularity_sampling[-1] < finish:
            periods += 1
            date_granularity_sampling = pandas.date_range(
                start, periods=periods, freq=resample
            )
        if date_granularity_sampling[0] > finish:
            # even a single resampled period overshoots the interval
            if resample == "A":
                print("too loose resampling - by year, trying by month")
                return load_burndown(
                    header, name, matrix, "month", report_survival=False
                )
            else:
                raise ValueError("Too loose resampling: %s. Try finer." % resample)
        date_range_sampling = pandas.date_range(
            date_granularity_sampling[0],
            periods=(finish - date_granularity_sampling[0]).days,
            freq="1D",
        )
        # Fill the new square matrix
        matrix = numpy.zeros(
            (len(date_granularity_sampling), len(date_range_sampling)),
            dtype=numpy.float32,
        )
        for i, gdt in enumerate(date_granularity_sampling):
            istart = (date_granularity_sampling[i - 1] - start).days if i > 0 else 0
            ifinish = (gdt - start).days

            for j, sdt in enumerate(date_range_sampling):
                if (sdt - start).days >= istart:
                    break

            # NOTE: j and sdt deliberately leak from the loop above - they are
            # the first sample at or past istart
            matrix[i, j:] = daily[istart:ifinish, (sdt - start).days :].sum(axis=0)
        # Hardcode some cases to improve labels' readability
        if resample in ("year", "A"):
            labels = [dt.year for dt in date_granularity_sampling]
        elif resample in ("month", "M"):
            labels = [dt.strftime("%Y %B") for dt in date_granularity_sampling]
        else:
            labels = [dt.date() for dt in date_granularity_sampling]
    else:
        labels = [
            "%s - %s"
            % (
                (start + timedelta(seconds=i * granularity * tick)).date(),
                (start + timedelta(seconds=(i + 1) * granularity * tick)).date(),
            )
            for i in range(matrix.shape[0])
        ]
        if len(labels) > 18:
            warnings.warn("Too many labels - consider resampling.")
        resample = "M"  # fake resampling type is checked while plotting
        date_range_sampling = pandas.date_range(
            start + timedelta(seconds=sampling * tick),
            periods=matrix.shape[1],
            freq="%dD" % sampling,
        )
    return name, matrix, date_range_sampling, labels, granularity, sampling, resample