package hercules

import (
	"bufio"
	"bytes"
	"errors"
	"fmt"
	"io"
	"os"
	"unicode/utf8"

	"github.com/sergi/go-diff/diffmatchpatch"
	"gopkg.in/src-d/go-git.v4"
	"gopkg.in/src-d/go-git.v4/plumbing"
	"gopkg.in/src-d/go-git.v4/plumbing/object"
	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
)

// BurndownAnalysis gathers the line burndown statistics for a Git repository.
type BurndownAnalysis struct {
	// Granularity sets the size of each band - the number of days it spans.
	// Smaller values provide better resolution but require more work and eat more
	// memory. 30 days is usually enough.
	Granularity int
	// Sampling sets how detailed the statistic is - the size of the interval in
	// days between consecutive measurements. It is usually a good idea to set it
	// <= Granularity. Try 15 or 30.
	Sampling int
	// PeopleNumber is the number of developers for which to collect the
	// burndown stats. 0 disables it.
	PeopleNumber int
	// Debug activates the debugging mode. Analyse() runs slower in this mode
	// but it accurately checks all the intermediate states for invariant
	// violations.
	Debug bool
	// repository points to the analysed Git repository struct from go-git.
	repository *git.Repository
	// globalStatus is the current daily alive number of lines; the key is the
	// number of days from the beginning of the history.
	globalStatus map[int]int64
	// globalHistory is the sampled snapshots of globalStatus.
	globalHistory [][]int64
	// fileHistories is the sampled snapshots of each file's status.
	fileHistories map[string][][]int64
	// peopleHistories is the sampled snapshots of each person's status.
	peopleHistories [][][]int64
	// files is the mapping <file path> -> hercules.File.
	files map[string]*File
	// matrix is the mutual deletions and self insertions.
	matrix []map[int]int64
	// people is the people's individual time stats.
	people []map[int]int64
	// day is the most recent day index processed.
	day int
	// previousDay is the day from the previous sample period -
	// different from DaysSinceStart.previousDay.
	previousDay int
}
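
// A minimal configuration sketch. The zero values of Granularity and Sampling
// are treated as 1 day by groupStatus() and Consume() respectively, so a
// typical setup looks like this (the pipeline wiring that feeds Consume() its
// dependencies is assumed to live elsewhere and is not part of this file):
//
//	analyser := &BurndownAnalysis{
//		Granularity:  30, // 30-day bands
//		Sampling:     30, // one snapshot every 30 days
//		PeopleNumber: 0,  // skip the per-developer stats
//	}
//	analyser.Initialize(repository) // repository is a *git.Repository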
// BurndownResult carries the final values extracted by Finalize().
type BurndownResult struct {
	GlobalHistory   [][]int64
	FileHistories   map[string][][]int64
	PeopleHistories [][][]int64
	PeopleMatrix    [][]int64
}
func (analyser *BurndownAnalysis) Name() string {
	return "BurndownAnalysis"
}

func (analyser *BurndownAnalysis) Provides() []string {
	return []string{}
}

func (analyser *BurndownAnalysis) Requires() []string {
	arr := [...]string{"renamed_changes", "blob_cache", "day", "author"}
	return arr[:]
}

func (analyser *BurndownAnalysis) Initialize(repository *git.Repository) {
	analyser.repository = repository
	analyser.globalStatus = map[int]int64{}
	analyser.globalHistory = [][]int64{}
	analyser.fileHistories = map[string][][]int64{}
	analyser.peopleHistories = make([][][]int64, analyser.PeopleNumber)
	analyser.files = map[string]*File{}
	analyser.matrix = make([]map[int]int64, analyser.PeopleNumber)
	analyser.people = make([]map[int]int64, analyser.PeopleNumber)
	analyser.day = 0
	analyser.previousDay = 0
}

func (analyser *BurndownAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
	sampling := analyser.Sampling
	if sampling == 0 {
		sampling = 1
	}
	author := deps["author"].(int)
	analyser.day = deps["day"].(int)
	delta := (analyser.day / sampling) - (analyser.previousDay / sampling)
	if delta > 0 {
		analyser.previousDay = analyser.day
		gs, fss, pss := analyser.groupStatus()
		analyser.updateHistories(gs, fss, pss, delta)
	}
	cache := deps["blob_cache"].(map[plumbing.Hash]*object.Blob)
	treeDiff := deps["renamed_changes"].(object.Changes)
	for _, change := range treeDiff {
		action, err := change.Action()
		if err != nil {
			return nil, err
		}
		switch action {
		case merkletrie.Insert:
			err = analyser.handleInsertion(change, author, cache)
		case merkletrie.Delete:
			err = analyser.handleDeletion(change, author, cache)
		case merkletrie.Modify:
			err = analyser.handleModification(change, author, cache)
		}
		if err != nil {
			return nil, err
		}
	}
	return nil, nil
}
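
// A worked example of the sampling arithmetic in Consume(), for illustration:
// with Sampling = 30, previousDay = 29 and an incoming day = 61,
// delta = 61/30 - 29/30 = 2 - 0 = 2, so the current status is snapshotted
// twice (the commit "jumped over" one whole sample period) and previousDay
// becomes 61. With day = 45 instead, delta = 1 - 0 = 1: a single snapshot.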
// Finalize returns the list of snapshots of the cumulative line edit times
// and the similar lists for every file which is alive in HEAD.
// The number of snapshots (the first dimension of [][]int64) depends on
// Sampling (the larger the value, the fewer the snapshots); the length of
// each snapshot depends on Granularity (the larger the value, the shorter
// each snapshot).
func (analyser *BurndownAnalysis) Finalize() interface{} {
	gs, fss, pss := analyser.groupStatus()
	analyser.updateHistories(gs, fss, pss, 1)
	for key, statuses := range analyser.fileHistories {
		if len(statuses) == len(analyser.globalHistory) {
			continue
		}
		padding := make([][]int64, len(analyser.globalHistory)-len(statuses))
		for i := range padding {
			padding[i] = make([]int64, len(analyser.globalStatus))
		}
		analyser.fileHistories[key] = append(padding, statuses...)
	}
	peopleMatrix := make([][]int64, analyser.PeopleNumber)
	for i, row := range analyser.matrix {
		mrow := make([]int64, analyser.PeopleNumber+2)
		peopleMatrix[i] = mrow
		for key, val := range row {
			if key == MISSING_AUTHOR {
				key = -1
			} else if key == SELF_AUTHOR {
				key = -2
			}
			mrow[key+2] = val
		}
	}
	return BurndownResult{
		GlobalHistory:   analyser.globalHistory,
		FileHistories:   analyser.fileHistories,
		PeopleHistories: analyser.peopleHistories,
		PeopleMatrix:    peopleMatrix}
}
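
// The PeopleMatrix layout produced above, spelled out for clarity: row i
// belongs to developer i; mrow[0] holds the SELF_AUTHOR count (the
// developer's own overwrites, remapped from -2), mrow[1] holds the
// MISSING_AUTHOR count (lines with an unknown author, remapped from -1), and
// mrow[j+2] counts lines of developer j that developer i removed or replaced.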
// checkClose is a defer helper which panics if Close() fails.
func checkClose(c io.Closer) {
	if err := c.Close(); err != nil {
		panic(err)
	}
}

// countLines returns the number of lines in the blob, or an error if the
// blob appears to be binary (contains invalid UTF-8).
func countLines(file *object.Blob) (int, error) {
	reader, err := file.Reader()
	if err != nil {
		return 0, err
	}
	defer checkClose(reader)
	var scanner *bufio.Scanner
	buffer := make([]byte, bufio.MaxScanTokenSize)
	counter := 0
	// A line longer than the buffer aborts the scan with ErrTooLong; the
	// outer loop restarts the scanner on the same reader and keeps counting.
	for scanner == nil || scanner.Err() == bufio.ErrTooLong {
		if scanner != nil && !utf8.Valid(scanner.Bytes()) {
			return -1, errors.New("binary")
		}
		scanner = bufio.NewScanner(reader)
		scanner.Buffer(buffer, 0)
		for scanner.Scan() {
			if !utf8.Valid(scanner.Bytes()) {
				return -1, errors.New("binary")
			}
			counter++
		}
	}
	return counter, nil
}

// blobToString reads the whole blob into a string.
func blobToString(file *object.Blob) (string, error) {
	reader, err := file.Reader()
	if err != nil {
		return "", err
	}
	defer checkClose(reader)
	buf := new(bytes.Buffer)
	if _, err = buf.ReadFrom(reader); err != nil {
		return "", err
	}
	return buf.String(), nil
}

// packPersonWithDay encodes the (person, day) pair into a single int:
// the day occupies the lower 14 bits, the person index the bits above them.
func (analyser *BurndownAnalysis) packPersonWithDay(person int, day int) int {
	if analyser.PeopleNumber == 0 {
		return day
	}
	result := day
	result |= person << 14
	// This effectively means max 16384 days (>44 years) and (131072 - 2) devs.
	return result
}

// unpackPersonWithDay is the inverse of packPersonWithDay.
func (analyser *BurndownAnalysis) unpackPersonWithDay(value int) (int, int) {
	if analyser.PeopleNumber == 0 {
		return MISSING_AUTHOR, value
	}
	return value >> 14, value & 0x3FFF
}
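
// A worked example of the packing scheme, for illustration: with person = 3
// and day = 100, packPersonWithDay returns 100 | 3<<14 = 49252, and
// unpackPersonWithDay(49252) yields (49252>>14, 49252&0x3FFF) = (3, 100).
// When PeopleNumber == 0 the day is stored as-is and the author decodes to
// MISSING_AUTHOR.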
// updateStatus adds delta lines to the global status under the day extracted
// from the packed previous time.
func (analyser *BurndownAnalysis) updateStatus(
	status interface{}, _ int, previousTime int, delta int) {
	_, previousDay := analyser.unpackPersonWithDay(previousTime)
	status.(map[int]int64)[previousDay] += int64(delta)
}

// updatePeople adds delta lines to the per-developer status of the author
// extracted from the packed previous time.
func (analyser *BurndownAnalysis) updatePeople(people interface{}, _ int, previousTime int, delta int) {
	oldAuthor, previousDay := analyser.unpackPersonWithDay(previousTime)
	if oldAuthor == MISSING_AUTHOR {
		return
	}
	casted := people.([]map[int]int64)
	stats := casted[oldAuthor]
	if stats == nil {
		stats = map[int]int64{}
		casted[oldAuthor] = stats
	}
	stats[previousDay] += int64(delta)
}

// updateMatrix records who overwrote whose lines: the cell
// matrix[old author][new author] accumulates delta.
func (analyser *BurndownAnalysis) updateMatrix(
	matrix_ interface{}, currentTime int, previousTime int, delta int) {
	matrix := matrix_.([]map[int]int64)
	newAuthor, _ := analyser.unpackPersonWithDay(currentTime)
	oldAuthor, _ := analyser.unpackPersonWithDay(previousTime)
	if oldAuthor == MISSING_AUTHOR {
		return
	}
	if newAuthor == oldAuthor && delta > 0 {
		newAuthor = SELF_AUTHOR
	}
	row := matrix[oldAuthor]
	if row == nil {
		row = map[int]int64{}
		matrix[oldAuthor] = row
	}
	row[newAuthor] += int64(delta)
}
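
// Rough illustration of the bookkeeping above: when developer 2 overwrites
// lines last touched by developer 0, the File tree invokes updateMatrix with
// oldAuthor = 0, newAuthor = 2 and the affected line count as delta, so
// matrix[0][2] accumulates how many of developer 0's lines developer 2
// removed or replaced; a positive delta with newAuthor == oldAuthor is
// recorded under SELF_AUTHOR instead.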
func (analyser *BurndownAnalysis) newFile(
	author int, day int, size int, global map[int]int64, people []map[int]int64,
	matrix []map[int]int64) *File {
	if analyser.PeopleNumber == 0 {
		return NewFile(day, size, NewStatus(global, analyser.updateStatus),
			NewStatus(make(map[int]int64), analyser.updateStatus))
	}
	return NewFile(analyser.packPersonWithDay(author, day), size,
		NewStatus(global, analyser.updateStatus),
		NewStatus(make(map[int]int64), analyser.updateStatus),
		NewStatus(people, analyser.updatePeople),
		NewStatus(matrix, analyser.updateMatrix))
}

func (analyser *BurndownAnalysis) handleInsertion(
	change *object.Change, author int, cache map[plumbing.Hash]*object.Blob) error {
	blob := cache[change.To.TreeEntry.Hash]
	lines, err := countLines(blob)
	if err != nil {
		if err.Error() == "binary" {
			return nil
		}
		return err
	}
	name := change.To.Name
	file, exists := analyser.files[name]
	if exists {
		return fmt.Errorf("file %s already exists", name)
	}
	file = analyser.newFile(
		author, analyser.day, lines, analyser.globalStatus, analyser.people, analyser.matrix)
	analyser.files[name] = file
	return nil
}

func (analyser *BurndownAnalysis) handleDeletion(
	change *object.Change, author int, cache map[plumbing.Hash]*object.Blob) error {
	blob := cache[change.From.TreeEntry.Hash]
	lines, err := countLines(blob)
	if err != nil {
		return err
	}
	name := change.From.Name
	file := analyser.files[name]
	file.Update(analyser.packPersonWithDay(author, analyser.day), 0, 0, lines)
	delete(analyser.files, name)
	return nil
}

func (analyser *BurndownAnalysis) handleModification(
	change *object.Change, author int, cache map[plumbing.Hash]*object.Blob) error {
	blobFrom := cache[change.From.TreeEntry.Hash]
	blobTo := cache[change.To.TreeEntry.Hash]
	// We are not validating UTF-8 here because, for example, fetch-pack.c in
	// git/git 4f7770c87ce3c302e1639a7737a6d2531fe4b160 is invalid UTF-8.
	strFrom, err := blobToString(blobFrom)
	if err != nil {
		return err
	}
	strTo, err := blobToString(blobTo)
	if err != nil {
		return err
	}
	file, exists := analyser.files[change.From.Name]
	if !exists {
		return analyser.handleInsertion(change, author, cache)
	}
	// possible rename
	if change.To.Name != change.From.Name {
		err = analyser.handleRename(change.From.Name, change.To.Name)
		if err != nil {
			return err
		}
	}
	dmp := diffmatchpatch.New()
	src, dst, _ := dmp.DiffLinesToRunes(strFrom, strTo)
	if file.Len() != len(src) {
		fmt.Fprintf(os.Stderr, "====TREE====\n%s", file.Dump())
		return fmt.Errorf("%s: internal integrity error src %d != %d %s -> %s",
			change.To.Name, len(src), file.Len(),
			change.From.TreeEntry.Hash.String(), change.To.TreeEntry.Hash.String())
	}
	diffs := dmp.DiffMainRunes(src, dst, false)
	// We do not call RunesToDiffLines, so the number of lines equals
	// the rune count.
	position := 0
	pending := diffmatchpatch.Diff{Text: ""}
	apply := func(edit diffmatchpatch.Diff) {
		length := utf8.RuneCountInString(edit.Text)
		if edit.Type == diffmatchpatch.DiffInsert {
			file.Update(analyser.packPersonWithDay(author, analyser.day), position, length, 0)
			position += length
		} else {
			file.Update(analyser.packPersonWithDay(author, analyser.day), position, 0, length)
		}
		if analyser.Debug {
			file.Validate()
		}
	}
	for _, edit := range diffs {
		dumpBefore := ""
		if analyser.Debug {
			dumpBefore = file.Dump()
		}
		length := utf8.RuneCountInString(edit.Text)
		debugError := func() {
			fmt.Fprintf(os.Stderr, "%s: internal diff error\n", change.To.Name)
			fmt.Fprintf(os.Stderr, "Update(%d, %d, %d (0), %d (0))\n", analyser.day, position,
				length, utf8.RuneCountInString(pending.Text))
			if dumpBefore != "" {
				fmt.Fprintf(os.Stderr, "====TREE BEFORE====\n%s====END====\n", dumpBefore)
			}
			fmt.Fprintf(os.Stderr, "====TREE AFTER====\n%s====END====\n", file.Dump())
		}
		switch edit.Type {
		case diffmatchpatch.DiffEqual:
			if pending.Text != "" {
				apply(pending)
				pending.Text = ""
			}
			position += length
		case diffmatchpatch.DiffInsert:
			if pending.Text != "" {
				if pending.Type == diffmatchpatch.DiffInsert {
					debugError()
					return errors.New("DiffInsert may not appear after DiffInsert")
				}
				// An insertion right after a deletion is a replacement:
				// apply both in a single Update().
				file.Update(analyser.packPersonWithDay(author, analyser.day), position, length,
					utf8.RuneCountInString(pending.Text))
				if analyser.Debug {
					file.Validate()
				}
				position += length
				pending.Text = ""
			} else {
				pending = edit
			}
		case diffmatchpatch.DiffDelete:
			if pending.Text != "" {
				debugError()
				return errors.New("DiffDelete may not appear after DiffInsert/DiffDelete")
			}
			pending = edit
		default:
			debugError()
			return fmt.Errorf("diff operation is not supported: %d", edit.Type)
		}
	}
	if pending.Text != "" {
		apply(pending)
		pending.Text = ""
	}
	if file.Len() != len(dst) {
		return fmt.Errorf("%s: internal integrity error dst %d != %d",
			change.To.Name, len(dst), file.Len())
	}
	return nil
}
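
// A sketch of the pending-edit state machine above, for illustration: given
// the diff sequence [Equal "a\n", Delete "b\n", Insert "c\nd\n", Equal "e\n"]
// (one rune per line after DiffLinesToRunes), the Delete is buffered in
// pending; the following Insert merges with it into a single
// file.Update(day, 1, 2, 1) call - insert two lines and delete one at the
// same position - and the trailing Equal merely advances position. A pending
// edit left over after the loop is flushed by apply().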
func (analyser *BurndownAnalysis) handleRename(from, to string) error {
	file, exists := analyser.files[from]
	if !exists {
		return fmt.Errorf("file %s does not exist", from)
	}
	analyser.files[to] = file
	delete(analyser.files, from)
	return nil
}

// groupStatus summarizes the daily line counts into Granularity-sized bands,
// globally, per file and per developer.
func (analyser *BurndownAnalysis) groupStatus() ([]int64, map[string][]int64, [][]int64) {
	granularity := analyser.Granularity
	if granularity == 0 {
		granularity = 1
	}
	day := analyser.day
	day++
	adjust := 0
	if day%granularity != 0 {
		adjust = 1
	}
	global := make([]int64, day/granularity+adjust)
	var group int64
	for i := 0; i < day; i++ {
		group += analyser.globalStatus[i]
		if (i % granularity) == (granularity - 1) {
			global[i/granularity] = group
			group = 0
		}
	}
	if day%granularity != 0 {
		global[len(global)-1] = group
	}
	locals := make(map[string][]int64)
	for key, file := range analyser.files {
		status := make([]int64, day/granularity+adjust)
		var group int64
		for i := 0; i < day; i++ {
			group += file.Status(1).(map[int]int64)[i]
			if (i % granularity) == (granularity - 1) {
				status[i/granularity] = group
				group = 0
			}
		}
		if day%granularity != 0 {
			status[len(status)-1] = group
		}
		locals[key] = status
	}
	peoples := make([][]int64, len(analyser.people))
	for key, person := range analyser.people {
		status := make([]int64, day/granularity+adjust)
		var group int64
		for i := 0; i < day; i++ {
			group += person[i]
			if (i % granularity) == (granularity - 1) {
				status[i/granularity] = group
				group = 0
			}
		}
		if day%granularity != 0 {
			status[len(status)-1] = group
		}
		peoples[key] = status
	}
	return global, locals, peoples
}
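
// A worked example of the banding in groupStatus(), for illustration: with
// analyser.day = 99 (so day = 100 after the increment) and granularity = 30,
// day/granularity+adjust = 3+1 = 4 bands are produced; days 0-29, 30-59 and
// 60-89 fill the first three, and the partial tail of days 90-99 lands in
// the last band via the day%granularity != 0 branch.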
func (analyser *BurndownAnalysis) updateHistories(
	globalStatus []int64, fileStatuses map[string][]int64, peopleStatuses [][]int64, delta int) {
	for i := 0; i < delta; i++ {
		analyser.globalHistory = append(analyser.globalHistory, globalStatus)
	}
	toDelete := make([]string, 0)
	for key, fh := range analyser.fileHistories {
		ls, exists := fileStatuses[key]
		if !exists {
			toDelete = append(toDelete, key)
		} else {
			for i := 0; i < delta; i++ {
				fh = append(fh, ls)
			}
			analyser.fileHistories[key] = fh
		}
	}
	for _, key := range toDelete {
		delete(analyser.fileHistories, key)
	}
	for key, ls := range fileStatuses {
		fh, exists := analyser.fileHistories[key]
		if exists {
			continue
		}
		for i := 0; i < delta; i++ {
			fh = append(fh, ls)
		}
		analyser.fileHistories[key] = fh
	}
	for key, ph := range analyser.peopleHistories {
		ls := peopleStatuses[key]
		for i := 0; i < delta; i++ {
			ph = append(ph, ls)
		}
		analyser.peopleHistories[key] = ph
	}
}