analyser.go 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266
  1. package hercules
  2. import (
  3. "bufio"
  4. "bytes"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "time"
  9. "unicode/utf8"
  10. "github.com/sergi/go-diff/diffmatchpatch"
  11. "gopkg.in/src-d/go-git.v4"
  12. )
  13. type Analyser struct {
  14. Repository *git.Repository
  15. Granularity int
  16. OnProgress func(int)
  17. }
  18. func checkClose(c io.Closer) {
  19. if err := c.Close(); err != nil {
  20. panic(err)
  21. }
  22. }
  23. func loc(file *git.Blob) (int, error) {
  24. reader, err := file.Reader()
  25. if err != nil {
  26. panic(err)
  27. }
  28. defer checkClose(reader)
  29. scanner := bufio.NewScanner(reader)
  30. counter := 0
  31. for scanner.Scan() {
  32. if !utf8.Valid(scanner.Bytes()) {
  33. return -1, errors.New("binary")
  34. }
  35. counter++
  36. }
  37. return counter, nil
  38. }
  39. func str(file *git.Blob) (string, error) {
  40. reader, err := file.Reader()
  41. if err != nil {
  42. panic(err)
  43. }
  44. defer checkClose(reader)
  45. buf := new(bytes.Buffer)
  46. buf.ReadFrom(reader)
  47. if !utf8.Valid(buf.Bytes()) {
  48. return "", errors.New("binary")
  49. }
  50. return buf.String(), nil
  51. }
  52. func (analyser *Analyser) handleInsertion(
  53. change *git.Change, day int, status map[int]int64, files map[string]*File) {
  54. blob, err := analyser.Repository.Blob(change.To.TreeEntry.Hash)
  55. if err != nil {
  56. panic(err)
  57. }
  58. lines, err := loc(blob)
  59. if err != nil {
  60. return
  61. }
  62. name := change.To.Name
  63. file, exists := files[name]
  64. if exists {
  65. panic(fmt.Sprintf("file %s already exists", name))
  66. }
  67. file = NewFile(day, lines, status)
  68. files[name] = file
  69. }
  70. func (analyser *Analyser) handleDeletion(
  71. change *git.Change, day int, status map[int]int64, files map[string]*File) {
  72. blob, err := analyser.Repository.Blob(change.From.TreeEntry.Hash)
  73. if err != nil {
  74. panic(err)
  75. }
  76. lines, err := loc(blob)
  77. if err != nil {
  78. return
  79. }
  80. name := change.From.Name
  81. file := files[name]
  82. file.Update(day, 0, 0, lines)
  83. delete(files, name)
  84. }
  85. func (analyser *Analyser) handleModification(
  86. change *git.Change, day int, status map[int]int64, files map[string]*File) {
  87. blob_from, err := analyser.Repository.Blob(change.From.TreeEntry.Hash)
  88. if err != nil {
  89. panic(err)
  90. }
  91. blob_to, err := analyser.Repository.Blob(change.To.TreeEntry.Hash)
  92. if err != nil {
  93. panic(err)
  94. }
  95. str_from, err := str(blob_from)
  96. if err != nil {
  97. return
  98. }
  99. str_to, _ := str(blob_to)
  100. file, exists := files[change.From.Name]
  101. if !exists {
  102. panic(fmt.Sprintf("file %s does not exist", change.From.Name))
  103. }
  104. // possible rename
  105. if change.To.Name != change.From.Name {
  106. analyser.handleRename(change.From.Name, change.To.Name, files)
  107. }
  108. dmp := diffmatchpatch.New()
  109. src, dst, _ := dmp.DiffLinesToRunes(str_from, str_to)
  110. diffs := dmp.DiffMainRunes(src, dst, false)
  111. // we do not call RunesToDiffLines so the number of lines equals
  112. // to the rune count
  113. position := 0
  114. for _, edit := range diffs {
  115. length := utf8.RuneCountInString(edit.Text)
  116. switch edit.Type {
  117. case diffmatchpatch.DiffEqual:
  118. position += length
  119. case diffmatchpatch.DiffInsert:
  120. file.Update(day, position, length, 0)
  121. position += length
  122. case diffmatchpatch.DiffDelete:
  123. file.Update(day, position, 0, length)
  124. break
  125. default:
  126. panic(fmt.Sprintf("diff operation is not supported: %d", edit.Type))
  127. }
  128. }
  129. }
  130. func (analyser *Analyser) handleRename(from, to string, files map[string]*File) {
  131. file, exists := files[from]
  132. if !exists {
  133. panic(fmt.Sprintf("file %s does not exist", from))
  134. }
  135. files[to] = file
  136. delete(files, from)
  137. }
  138. func (analyser *Analyser) commits() []*git.Commit {
  139. result := []*git.Commit{}
  140. repository := analyser.Repository
  141. head, err := repository.Head()
  142. if err != nil {
  143. panic(err)
  144. }
  145. commit, err := repository.Commit(head.Hash())
  146. if err != nil {
  147. panic(err)
  148. }
  149. result = append(result, commit)
  150. for ; err != io.EOF; commit, err = commit.Parents().Next() {
  151. if err != nil {
  152. panic(err)
  153. }
  154. result = append(result, commit)
  155. }
  156. // reverse the order
  157. for i, j := 0, len(result)-1; i < j; i, j = i+1, j-1 {
  158. result[i], result[j] = result[j], result[i]
  159. }
  160. return result
  161. }
  162. func (analyser *Analyser) groupStatus(status map[int]int64, day int) []int64 {
  163. granularity := analyser.Granularity
  164. result := make([]int64, day / granularity)
  165. var group int64
  166. for i := 0; i < day; i++ {
  167. group += status[i]
  168. if i % granularity == (granularity - 1) {
  169. result[i / granularity] = group
  170. group = 0
  171. }
  172. }
  173. return result
  174. }
  175. func (analyser *Analyser) Analyse() [][]int64 {
  176. granularity := analyser.Granularity
  177. if granularity == 0 {
  178. granularity = 1
  179. }
  180. onProgress := analyser.OnProgress
  181. if onProgress == nil {
  182. onProgress = func(int) {}
  183. }
  184. // current daily alive number of lines; key is the number of days from the
  185. // beginning of the history
  186. status := map[int]int64{}
  187. // weekly snapshots of status
  188. statuses := [][]int64{}
  189. // mapping <file path> -> hercules.File
  190. files := map[string]*File{}
  191. // list of commits belonging to the default branch, from oldest to newest
  192. commits := analyser.commits()
  193. var day0 time.Time // will be initialized in the first iteration
  194. var prev_tree *git.Tree = nil
  195. prev_day := 0
  196. for index, commit := range commits {
  197. onProgress(index)
  198. tree, err := commit.Tree()
  199. if err != nil {
  200. panic(err)
  201. }
  202. if index == 0 {
  203. // first iteration - initialize the file objects from the tree
  204. day0 = commit.Author.When
  205. func() {
  206. file_iter := tree.Files()
  207. defer file_iter.Close()
  208. for {
  209. file, err := file_iter.Next()
  210. if err != nil {
  211. if err == io.EOF {
  212. break
  213. }
  214. panic(err)
  215. }
  216. lines, err := loc(&file.Blob)
  217. if err == nil {
  218. files[file.Name] = NewFile(0, lines, status)
  219. }
  220. }
  221. }()
  222. } else {
  223. day := int(commit.Author.When.Sub(day0).Hours() / 24)
  224. delta := (day / granularity) - (prev_day / granularity)
  225. if delta > 0 {
  226. prev_day = day
  227. statuses = append(statuses, analyser.groupStatus(status, day))
  228. }
  229. tree_diff, err := git.DiffTree(prev_tree, tree)
  230. if err != nil {
  231. panic(err)
  232. }
  233. for _, change := range tree_diff {
  234. switch change.Action {
  235. case git.Insert:
  236. analyser.handleInsertion(change, day, status, files)
  237. case git.Delete:
  238. analyser.handleDeletion(change, day, status, files)
  239. case git.Modify:
  240. analyser.handleModification(change, day, status, files)
  241. default:
  242. panic(fmt.Sprintf("unsupported action: %d", change.Action))
  243. }
  244. }
  245. }
  246. prev_tree = tree
  247. }
  248. return statuses
  249. }