analyser.go 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295
  1. package hercules
  2. import (
  3. "bufio"
  4. "bytes"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "os"
  9. "time"
  10. "unicode/utf8"
  11. "github.com/sergi/go-diff/diffmatchpatch"
  12. "gopkg.in/src-d/go-git.v4"
  13. )
  14. type Analyser struct {
  15. Repository *git.Repository
  16. Granularity int
  17. OnProgress func(int)
  18. }
  19. func checkClose(c io.Closer) {
  20. if err := c.Close(); err != nil {
  21. panic(err)
  22. }
  23. }
  24. func loc(file *git.Blob) (int, error) {
  25. reader, err := file.Reader()
  26. if err != nil {
  27. panic(err)
  28. }
  29. defer checkClose(reader)
  30. scanner := bufio.NewScanner(reader)
  31. counter := 0
  32. for scanner.Scan() {
  33. if !utf8.Valid(scanner.Bytes()) {
  34. return -1, errors.New("binary")
  35. }
  36. counter++
  37. }
  38. return counter, nil
  39. }
  40. func str(file *git.Blob) string {
  41. reader, err := file.Reader()
  42. if err != nil {
  43. panic(err)
  44. }
  45. defer checkClose(reader)
  46. buf := new(bytes.Buffer)
  47. buf.ReadFrom(reader)
  48. return buf.String()
  49. }
  50. func (analyser *Analyser) handleInsertion(
  51. change *git.Change, day int, status map[int]int64, files map[string]*File) {
  52. blob, err := analyser.Repository.Blob(change.To.TreeEntry.Hash)
  53. if err != nil {
  54. panic(err)
  55. }
  56. lines, err := loc(blob)
  57. if err != nil {
  58. return
  59. }
  60. name := change.To.Name
  61. file, exists := files[name]
  62. if exists {
  63. panic(fmt.Sprintf("file %s already exists", name))
  64. }
  65. file = NewFile(day, lines, status)
  66. files[name] = file
  67. }
  68. func (analyser *Analyser) handleDeletion(
  69. change *git.Change, day int, status map[int]int64, files map[string]*File) {
  70. blob, err := analyser.Repository.Blob(change.From.TreeEntry.Hash)
  71. if err != nil {
  72. panic(err)
  73. }
  74. lines, err := loc(blob)
  75. if err != nil {
  76. return
  77. }
  78. name := change.From.Name
  79. file := files[name]
  80. file.Update(day, 0, 0, lines)
  81. delete(files, name)
  82. }
  83. func (analyser *Analyser) handleModification(
  84. change *git.Change, day int, status map[int]int64, files map[string]*File) {
  85. blob_from, err := analyser.Repository.Blob(change.From.TreeEntry.Hash)
  86. if err != nil {
  87. panic(err)
  88. }
  89. blob_to, err := analyser.Repository.Blob(change.To.TreeEntry.Hash)
  90. if err != nil {
  91. panic(err)
  92. }
  93. // we are not validating UTF-8 here because for example
  94. // git/git 4f7770c87ce3c302e1639a7737a6d2531fe4b160 fetch-pack.c is invalid UTF-8
  95. str_from := str(blob_from)
  96. str_to := str(blob_to)
  97. file, exists := files[change.From.Name]
  98. if !exists {
  99. fmt.Fprintf(os.Stderr, "warning: file %s does not exist\n", change.From.Name)
  100. analyser.handleInsertion(change, day, status, files)
  101. return
  102. }
  103. // possible rename
  104. if change.To.Name != change.From.Name {
  105. analyser.handleRename(change.From.Name, change.To.Name, files)
  106. }
  107. dmp := diffmatchpatch.New()
  108. src, dst, _ := dmp.DiffLinesToRunes(str_from, str_to)
  109. if file.Len() != len(src) {
  110. panic(fmt.Sprintf("%s: internal integrity error src %d != %d",
  111. change.To.Name, len(src), file.Len()))
  112. }
  113. diffs := dmp.DiffMainRunes(src, dst, false)
  114. // we do not call RunesToDiffLines so the number of lines equals
  115. // to the rune count
  116. position := 0
  117. for _, edit := range diffs {
  118. length := utf8.RuneCountInString(edit.Text)
  119. func() {
  120. defer func() {
  121. r := recover()
  122. if r != nil {
  123. fmt.Fprintf(os.Stderr, "%s: internal diff error\n", change.To.Name)
  124. fmt.Fprint(os.Stderr, "====BEFORE====\n")
  125. fmt.Fprint(os.Stderr, str_from)
  126. fmt.Fprint(os.Stderr, "====AFTER====\n")
  127. fmt.Fprint(os.Stderr, str_to)
  128. fmt.Fprint(os.Stderr, "====END====\n")
  129. panic(r)
  130. }
  131. }()
  132. switch edit.Type {
  133. case diffmatchpatch.DiffEqual:
  134. position += length
  135. case diffmatchpatch.DiffInsert:
  136. file.Update(day, position, length, 0)
  137. position += length
  138. case diffmatchpatch.DiffDelete:
  139. file.Update(day, position, 0, length)
  140. default:
  141. panic(fmt.Sprintf("diff operation is not supported: %d", edit.Type))
  142. }
  143. }()
  144. }
  145. if file.Len() != len(dst) {
  146. panic(fmt.Sprintf("%s: internal integrity error dst %d != %d",
  147. change.To.Name, len(dst), file.Len()))
  148. }
  149. }
  150. func (analyser *Analyser) handleRename(from, to string, files map[string]*File) {
  151. file, exists := files[from]
  152. if !exists {
  153. panic(fmt.Sprintf("file %s does not exist", from))
  154. }
  155. files[to] = file
  156. delete(files, from)
  157. }
  158. func (analyser *Analyser) commits() []*git.Commit {
  159. result := []*git.Commit{}
  160. repository := analyser.Repository
  161. head, err := repository.Head()
  162. if err != nil {
  163. panic(err)
  164. }
  165. commit, err := repository.Commit(head.Hash())
  166. if err != nil {
  167. panic(err)
  168. }
  169. result = append(result, commit)
  170. for ; err != io.EOF; commit, err = commit.Parents().Next() {
  171. if err != nil {
  172. panic(err)
  173. }
  174. result = append(result, commit)
  175. }
  176. // reverse the order
  177. for i, j := 0, len(result)-1; i < j; i, j = i+1, j-1 {
  178. result[i], result[j] = result[j], result[i]
  179. }
  180. return result
  181. }
  182. func (analyser *Analyser) groupStatus(status map[int]int64, day int) []int64 {
  183. granularity := analyser.Granularity
  184. result := make([]int64, day/granularity)
  185. var group int64
  186. for i := 0; i < day; i++ {
  187. group += status[i]
  188. if i%granularity == (granularity - 1) {
  189. result[i/granularity] = group
  190. group = 0
  191. }
  192. }
  193. return result
  194. }
  195. func (analyser *Analyser) Analyse() [][]int64 {
  196. granularity := analyser.Granularity
  197. if granularity == 0 {
  198. granularity = 1
  199. }
  200. onProgress := analyser.OnProgress
  201. if onProgress == nil {
  202. onProgress = func(int) {}
  203. }
  204. // current daily alive number of lines; key is the number of days from the
  205. // beginning of the history
  206. status := map[int]int64{}
  207. // weekly snapshots of status
  208. statuses := [][]int64{}
  209. // mapping <file path> -> hercules.File
  210. files := map[string]*File{}
  211. // list of commits belonging to the default branch, from oldest to newest
  212. commits := analyser.commits()
  213. var day0 time.Time // will be initialized in the first iteration
  214. var prev_tree *git.Tree = nil
  215. prev_day := 0
  216. for index, commit := range commits {
  217. onProgress(index)
  218. tree, err := commit.Tree()
  219. if err != nil {
  220. panic(err)
  221. }
  222. if index == 0 {
  223. // first iteration - initialize the file objects from the tree
  224. day0 = commit.Author.When
  225. func() {
  226. file_iter := tree.Files()
  227. defer file_iter.Close()
  228. for {
  229. file, err := file_iter.Next()
  230. if err != nil {
  231. if err == io.EOF {
  232. break
  233. }
  234. panic(err)
  235. }
  236. lines, err := loc(&file.Blob)
  237. if err == nil {
  238. files[file.Name] = NewFile(0, lines, status)
  239. }
  240. }
  241. }()
  242. } else {
  243. day := int(commit.Author.When.Sub(day0).Hours() / 24)
  244. delta := (day / granularity) - (prev_day / granularity)
  245. if delta > 0 {
  246. prev_day = day
  247. statuses = append(statuses, analyser.groupStatus(status, day))
  248. }
  249. tree_diff, err := git.DiffTree(prev_tree, tree)
  250. if err != nil {
  251. panic(err)
  252. }
  253. for _, change := range tree_diff {
  254. switch change.Action {
  255. case git.Insert:
  256. analyser.handleInsertion(change, day, status, files)
  257. case git.Delete:
  258. analyser.handleDeletion(change, day, status, files)
  259. case git.Modify:
  260. func() {
  261. defer func() {
  262. r := recover()
  263. if r != nil {
  264. fmt.Fprintf(os.Stderr, "%s: modification error\n", commit.Hash.String())
  265. panic(r)
  266. }
  267. }()
  268. analyser.handleModification(change, day, status, files)
  269. }()
  270. default:
  271. panic(fmt.Sprintf("unsupported action: %d", change.Action))
  272. }
  273. }
  274. }
  275. prev_tree = tree
  276. }
  277. return statuses
  278. }