# burndown.py
  1. from datetime import datetime, timedelta
  2. import warnings
  3. import numpy
  4. import tqdm
  5. from labours.utils import floor_datetime
  6. def fit_kaplan_meier(matrix):
  7. from lifelines import KaplanMeierFitter
  8. T = []
  9. W = []
  10. indexes = numpy.arange(matrix.shape[0], dtype=int)
  11. entries = numpy.zeros(matrix.shape[0], int)
  12. dead = set()
  13. for i in range(1, matrix.shape[1]):
  14. diff = matrix[:, i - 1] - matrix[:, i]
  15. entries[diff < 0] = i
  16. mask = diff > 0
  17. deaths = diff[mask]
  18. T.append(numpy.full(len(deaths), i) - entries[indexes[mask]])
  19. W.append(deaths)
  20. entered = entries > 0
  21. entered[0] = True
  22. dead = dead.union(set(numpy.where((matrix[:, i] == 0) & entered)[0]))
  23. # add the survivors as censored
  24. nnzind = entries != 0
  25. nnzind[0] = True
  26. nnzind[sorted(dead)] = False
  27. T.append(numpy.full(nnzind.sum(), matrix.shape[1]) - entries[nnzind])
  28. W.append(matrix[nnzind, -1])
  29. T = numpy.concatenate(T)
  30. E = numpy.ones(len(T), bool)
  31. E[-nnzind.sum():] = 0
  32. W = numpy.concatenate(W)
  33. if T.size == 0:
  34. return None
  35. kmf = KaplanMeierFitter().fit(T, E, weights=W)
  36. return kmf
  37. def print_survival_function(kmf, sampling):
  38. sf = kmf.survival_function_
  39. sf.index = [timedelta(days=d) for d in sf.index * sampling]
  40. sf.columns = ["Ratio of survived lines"]
  41. try:
  42. print(sf[len(sf) // 6::len(sf) // 6].append(sf.tail(1)))
  43. except ValueError:
  44. pass
def interpolate_burndown_matrix(matrix, granularity, sampling, progress=False):
    """Upsample a burndown matrix to day-by-day resolution.

    :param matrix: bands x samples matrix; ``matrix[y][x]`` is the number of
        lines of band ``y`` surviving at sample ``x``.
    :param granularity: number of days covered by one band (row).
    :param sampling: number of days between consecutive samples (columns).
    :param progress: when True, show a tqdm progress bar over the bands.
    :return: float32 matrix of shape
        ``(bands * granularity, samples * sampling)`` with linearly
        interpolated daily values.
    """
    daily = numpy.zeros(
        (matrix.shape[0] * granularity, matrix.shape[1] * sampling),
        dtype=numpy.float32)
    """
    ----------> samples, x
    |
    |
    |
    bands, y
    """
    for y in tqdm.tqdm(range(matrix.shape[0]), disable=(not progress)):
        for x in range(matrix.shape[1]):
            if y * granularity > (x + 1) * sampling:
                # the future is zeros
                continue

            def decay(start_index: int, start_val: float):
                # Linearly decrease band y's daily values from start_val at
                # start_index down to matrix[y][x] at the end of sample x.
                if start_val == 0:
                    return
                k = matrix[y][x] / start_val  # <= 1
                scale = (x + 1) * sampling - start_index
                for i in range(y * granularity, (y + 1) * granularity):
                    initial = daily[i][start_index - 1]
                    for j in range(start_index, (x + 1) * sampling):
                        daily[i][j] = initial * (
                            1 + (k - 1) * (j - start_index + 1) / scale)

            def grow(finish_index: int, finish_val: float):
                # Linearly grow band y's daily values so that they reach
                # finish_val at finish_index, spreading the growth evenly.
                initial = matrix[y][x - 1] if x > 0 else 0
                start_index = x * sampling
                if start_index < y * granularity:
                    start_index = y * granularity
                if finish_index == start_index:
                    return
                avg = (finish_val - initial) / (finish_index - start_index)
                for j in range(x * sampling, finish_index):
                    for i in range(start_index, j + 1):
                        daily[i][j] = avg
                # copy [x*g..y*s)
                for j in range(x * sampling, finish_index):
                    for i in range(y * granularity, x * sampling):
                        daily[i][j] = daily[i][j - 1]

            if (y + 1) * granularity >= (x + 1) * sampling:
                # Case: the band ends at or after the sample's end.
                # x*granularity <= (y+1)*sampling
                # 1. x*granularity <= y*sampling
                #    y*sampling..(y+1)sampling
                #
                #       x+1
                #        /
                #       /
                #      / y+1  -|
                #     /        |
                #    / y      -|
                #   /
                #  / x
                #
                # 2. x*granularity > y*sampling
                #    x*granularity..(y+1)sampling
                #
                #       x+1
                #        /
                #       /
                #      / y+1  -|
                #     /        |
                #    / x      -|
                #   /
                #  / y
                if y * granularity <= x * sampling:
                    grow((x + 1) * sampling, matrix[y][x])
                elif (x + 1) * sampling > y * granularity:
                    # The band starts in the middle of this sample: grow,
                    # then overwrite the freshly-born triangle with the
                    # average daily contribution.
                    grow((x + 1) * sampling, matrix[y][x])
                    avg = matrix[y][x] / ((x + 1) * sampling - y * granularity)
                    for j in range(y * granularity, (x + 1) * sampling):
                        for i in range(y * granularity, j + 1):
                            daily[i][j] = avg
            elif (y + 1) * granularity >= x * sampling:
                # Case: the band's end falls inside this sample — the curve
                # peaks inside the sample, then decays.
                # y*sampling <= (x+1)*granularity < (y+1)sampling
                # y*sampling..(x+1)*granularity
                # (x+1)*granularity..(y+1)sampling
                #        x+1
                #         /\
                #        /  \
                #       /    \
                #      /    y+1
                #     /
                #    y
                v1 = matrix[y][x - 1]
                v2 = matrix[y][x]
                delta = (y + 1) * granularity - x * sampling
                previous = 0
                if x > 0 and (x - 1) * sampling >= y * granularity:
                    # x*g <= (y-1)*s <= y*s <= (x+1)*g <= (y+1)*s
                    #           |________|.......^
                    if x > 1:
                        previous = matrix[y][x - 2]
                    scale = sampling
                else:
                    # (y-1)*s < x*g <= y*s <= (x+1)*g <= (y+1)*s
                    #            |______|.......^
                    scale = sampling if x == 0 else x * sampling - y * granularity
                # Extrapolate the peak from the previous slope.
                peak = v1 + (v1 - previous) / scale * delta
                if v2 > peak:
                    # we need to adjust the peak, it may not be less than the decayed value
                    if x < matrix.shape[1] - 1:
                        # y*s <= (x+1)*g <= (y+1)*s < (y+2)*s
                        #           ^.........|_________|
                        k = (v2 - matrix[y][x + 1]) / sampling  # > 0
                        peak = matrix[y][x] + k * (
                            (x + 1) * sampling - (y + 1) * granularity)
                        # peak > v2 > v1
                    else:
                        peak = v2
                        # not enough data to interpolate; this is at least not restricted
                grow((y + 1) * granularity, peak)
                decay((y + 1) * granularity, peak)
            else:
                # Case: the band fully precedes this sample — pure decay.
                # (x+1)*granularity < y*sampling
                # y*sampling..(y+1)sampling
                decay(x * sampling, matrix[y][x - 1])
    return daily
  163. def import_pandas():
  164. import pandas
  165. try:
  166. from pandas.plotting import register_matplotlib_converters
  167. register_matplotlib_converters()
  168. except ImportError:
  169. pass
  170. return pandas
def load_burndown(
    header,
    name,
    matrix,
    resample,
    report_survival=True,
    interpolation_progress=False
):
    """Prepare a burndown matrix for plotting, optionally resampled.

    :param header: tuple ``(start, last, sampling, granularity, tick)`` —
        ``start``/``last`` are POSIX timestamps; ``sampling`` and
        ``granularity`` are measured in ticks of ``tick`` seconds each
        (see the ``timedelta(seconds=... * tick)`` arithmetic below).
    :param name: display name, passed through unchanged into the result.
    :param matrix: bands x samples burndown matrix.
    :param resample: pandas frequency alias ("year"/"A", "month"/"M", ...)
        or "no"/"raw" to skip resampling entirely.
    :param report_survival: when True, fit and print the Kaplan-Meier
        survival function of the lines.
    :param interpolation_progress: show a progress bar during interpolation.
    :return: tuple ``(name, matrix, date_range_sampling, labels,
        granularity, sampling, resample)``.
    :raises ValueError: when the requested resampling is coarser than the
        whole covered period (except "A", which retries by month).
    """
    pandas = import_pandas()
    start, last, sampling, granularity, tick = header
    assert sampling > 0
    assert granularity > 0
    start = floor_datetime(datetime.fromtimestamp(start), tick)
    last = datetime.fromtimestamp(last)
    if report_survival:
        kmf = fit_kaplan_meier(matrix)
        if kmf is not None:
            print_survival_function(kmf, sampling)
    finish = start + timedelta(seconds=matrix.shape[1] * sampling * tick)
    if resample not in ("no", "raw"):
        print("resampling to %s, please wait..." % resample)
        # Interpolate the day x day matrix.
        # Each day brings equal weight in the granularity.
        # Sampling's interpolation is linear.
        daily = interpolate_burndown_matrix(
            matrix=matrix,
            granularity=granularity,
            sampling=sampling,
            progress=interpolation_progress,
        )
        # Zero out everything past the last recorded day.
        daily[(last - start).days:] = 0
        # Resample the bands
        aliases = {
            "year": "A",
            "month": "M"
        }
        resample = aliases.get(resample, resample)
        periods = 0
        date_granularity_sampling = [start]
        # Grow the period count until the granularity grid covers `finish`.
        while date_granularity_sampling[-1] < finish:
            periods += 1
            date_granularity_sampling = pandas.date_range(
                start, periods=periods, freq=resample)
        if date_granularity_sampling[0] > finish:
            # Even a single period overshoots the whole covered range.
            if resample == "A":
                print("too loose resampling - by year, trying by month")
                return load_burndown(header, name, matrix, "month", report_survival=False)
            else:
                raise ValueError("Too loose resampling: %s. Try finer." % resample)
        date_range_sampling = pandas.date_range(
            date_granularity_sampling[0],
            periods=(finish - date_granularity_sampling[0]).days,
            freq="1D")
        # Fill the new square matrix
        matrix = numpy.zeros(
            (len(date_granularity_sampling), len(date_range_sampling)),
            dtype=numpy.float32)
        for i, gdt in enumerate(date_granularity_sampling):
            # [istart, ifinish) are the daily rows belonging to band i.
            istart = (date_granularity_sampling[i - 1] - start).days \
                if i > 0 else 0
            ifinish = (gdt - start).days
            # Find the first sample column at or after this band's start;
            # `j` deliberately survives the loop.
            for j, sdt in enumerate(date_range_sampling):
                if (sdt - start).days >= istart:
                    break
            matrix[i, j:] = \
                daily[istart:ifinish, (sdt - start).days:].sum(axis=0)
        # Hardcode some cases to improve labels' readability
        if resample in ("year", "A"):
            labels = [dt.year for dt in date_granularity_sampling]
        elif resample in ("month", "M"):
            labels = [dt.strftime("%Y %B") for dt in date_granularity_sampling]
        else:
            labels = [dt.date() for dt in date_granularity_sampling]
    else:
        # No resampling: label each original band with its date span.
        labels = [
            "%s - %s" % ((start + timedelta(seconds=i * granularity * tick)).date(),
                         (
                             start + timedelta(seconds=(i + 1) * granularity * tick)).date())
            for i in range(matrix.shape[0])]
        if len(labels) > 18:
            warnings.warn("Too many labels - consider resampling.")
        resample = "M"  # fake resampling type is checked while plotting
        date_range_sampling = pandas.date_range(
            start + timedelta(seconds=sampling * tick), periods=matrix.shape[1],
            freq="%dD" % sampling)
    return name, matrix, date_range_sampling, labels, granularity, sampling, resample