analyser.go 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308
  1. package hercules
  2. import (
  3. "bufio"
  4. "bytes"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "os"
  9. "time"
  10. "unicode/utf8"
  11. "github.com/sergi/go-diff/diffmatchpatch"
  12. "gopkg.in/src-d/go-git.v4"
  13. )
  14. type Analyser struct {
  15. Repository *git.Repository
  16. Granularity int
  17. Sampling int
  18. OnProgress func(int, int)
  19. }
  20. func checkClose(c io.Closer) {
  21. if err := c.Close(); err != nil {
  22. panic(err)
  23. }
  24. }
  25. func loc(file *git.Blob) (int, error) {
  26. reader, err := file.Reader()
  27. if err != nil {
  28. panic(err)
  29. }
  30. defer checkClose(reader)
  31. scanner := bufio.NewScanner(reader)
  32. counter := 0
  33. for scanner.Scan() {
  34. if !utf8.Valid(scanner.Bytes()) {
  35. return -1, errors.New("binary")
  36. }
  37. counter++
  38. }
  39. return counter, nil
  40. }
  41. func str(file *git.Blob) string {
  42. reader, err := file.Reader()
  43. if err != nil {
  44. panic(err)
  45. }
  46. defer checkClose(reader)
  47. buf := new(bytes.Buffer)
  48. buf.ReadFrom(reader)
  49. return buf.String()
  50. }
  51. func (analyser *Analyser) handleInsertion(
  52. change *git.Change, day int, status map[int]int64, files map[string]*File) {
  53. blob, err := analyser.Repository.Blob(change.To.TreeEntry.Hash)
  54. if err != nil {
  55. panic(err)
  56. }
  57. lines, err := loc(blob)
  58. if err != nil {
  59. return
  60. }
  61. name := change.To.Name
  62. file, exists := files[name]
  63. if exists {
  64. panic(fmt.Sprintf("file %s already exists", name))
  65. }
  66. file = NewFile(day, lines, status)
  67. files[name] = file
  68. }
  69. func (analyser *Analyser) handleDeletion(
  70. change *git.Change, day int, status map[int]int64, files map[string]*File) {
  71. blob, err := analyser.Repository.Blob(change.From.TreeEntry.Hash)
  72. if err != nil {
  73. panic(err)
  74. }
  75. lines, err := loc(blob)
  76. if err != nil {
  77. return
  78. }
  79. name := change.From.Name
  80. file := files[name]
  81. file.Update(day, 0, 0, lines)
  82. delete(files, name)
  83. }
  84. func (analyser *Analyser) handleModification(
  85. change *git.Change, day int, status map[int]int64, files map[string]*File) {
  86. blob_from, err := analyser.Repository.Blob(change.From.TreeEntry.Hash)
  87. if err != nil {
  88. panic(err)
  89. }
  90. blob_to, err := analyser.Repository.Blob(change.To.TreeEntry.Hash)
  91. if err != nil {
  92. panic(err)
  93. }
  94. // we are not validating UTF-8 here because for example
  95. // git/git 4f7770c87ce3c302e1639a7737a6d2531fe4b160 fetch-pack.c is invalid UTF-8
  96. str_from := str(blob_from)
  97. str_to := str(blob_to)
  98. file, exists := files[change.From.Name]
  99. if !exists {
  100. // fmt.Fprintf(os.Stderr, "warning: file %s does not exist\n", change.From.Name)
  101. analyser.handleInsertion(change, day, status, files)
  102. return
  103. }
  104. // possible rename
  105. if change.To.Name != change.From.Name {
  106. analyser.handleRename(change.From.Name, change.To.Name, files)
  107. }
  108. dmp := diffmatchpatch.New()
  109. src, dst, _ := dmp.DiffLinesToRunes(str_from, str_to)
  110. if file.Len() != len(src) {
  111. panic(fmt.Sprintf("%s: internal integrity error src %d != %d",
  112. change.To.Name, len(src), file.Len()))
  113. }
  114. diffs := dmp.DiffMainRunes(src, dst, false)
  115. // we do not call RunesToDiffLines so the number of lines equals
  116. // to the rune count
  117. position := 0
  118. for _, edit := range diffs {
  119. length := utf8.RuneCountInString(edit.Text)
  120. func() {
  121. defer func() {
  122. r := recover()
  123. if r != nil {
  124. fmt.Fprintf(os.Stderr, "%s: internal diff error\n", change.To.Name)
  125. fmt.Fprint(os.Stderr, "====BEFORE====\n")
  126. fmt.Fprint(os.Stderr, str_from)
  127. fmt.Fprint(os.Stderr, "====AFTER====\n")
  128. fmt.Fprint(os.Stderr, str_to)
  129. fmt.Fprint(os.Stderr, "====END====\n")
  130. panic(r)
  131. }
  132. }()
  133. switch edit.Type {
  134. case diffmatchpatch.DiffEqual:
  135. position += length
  136. case diffmatchpatch.DiffInsert:
  137. file.Update(day, position, length, 0)
  138. position += length
  139. case diffmatchpatch.DiffDelete:
  140. file.Update(day, position, 0, length)
  141. default:
  142. panic(fmt.Sprintf("diff operation is not supported: %d", edit.Type))
  143. }
  144. }()
  145. }
  146. if file.Len() != len(dst) {
  147. panic(fmt.Sprintf("%s: internal integrity error dst %d != %d",
  148. change.To.Name, len(dst), file.Len()))
  149. }
  150. }
  151. func (analyser *Analyser) handleRename(from, to string, files map[string]*File) {
  152. file, exists := files[from]
  153. if !exists {
  154. panic(fmt.Sprintf("file %s does not exist", from))
  155. }
  156. files[to] = file
  157. delete(files, from)
  158. }
  159. func (analyser *Analyser) Commits() []*git.Commit {
  160. result := []*git.Commit{}
  161. repository := analyser.Repository
  162. head, err := repository.Head()
  163. if err != nil {
  164. panic(err)
  165. }
  166. commit, err := repository.Commit(head.Hash())
  167. if err != nil {
  168. panic(err)
  169. }
  170. result = append(result, commit)
  171. for ; err != io.EOF; commit, err = commit.Parents().Next() {
  172. if err != nil {
  173. panic(err)
  174. }
  175. result = append(result, commit)
  176. }
  177. // reverse the order
  178. for i, j := 0, len(result)-1; i < j; i, j = i+1, j-1 {
  179. result[i], result[j] = result[j], result[i]
  180. }
  181. return result
  182. }
  183. func (analyser *Analyser) groupStatus(status map[int]int64, day int) []int64 {
  184. granularity := analyser.Granularity
  185. if granularity == 0 {
  186. granularity = 1
  187. }
  188. day++
  189. adjust := 0
  190. if day%granularity < granularity-1 {
  191. adjust = 1
  192. }
  193. result := make([]int64, day/granularity+adjust)
  194. var group int64
  195. for i := 0; i < day; i++ {
  196. group += status[i]
  197. if i%granularity == (granularity - 1) {
  198. result[i/granularity] = group
  199. group = 0
  200. }
  201. }
  202. if day%granularity < granularity-1 {
  203. result[len(result)-1] = group
  204. }
  205. return result
  206. }
  207. func (analyser *Analyser) Analyse(commits []*git.Commit) [][]int64 {
  208. sampling := analyser.Sampling
  209. if sampling == 0 {
  210. sampling = 1
  211. }
  212. onProgress := analyser.OnProgress
  213. if onProgress == nil {
  214. onProgress = func(int, int) {}
  215. }
  216. // current daily alive number of lines; key is the number of days from the
  217. // beginning of the history
  218. status := map[int]int64{}
  219. // weekly snapshots of status
  220. statuses := [][]int64{}
  221. // mapping <file path> -> hercules.File
  222. files := map[string]*File{}
  223. var day0 time.Time // will be initialized in the first iteration
  224. var prev_tree *git.Tree = nil
  225. prev_day := 0
  226. for index, commit := range commits {
  227. onProgress(index, len(commits))
  228. tree, err := commit.Tree()
  229. if err != nil {
  230. panic(err)
  231. }
  232. if index == 0 {
  233. // first iteration - initialize the file objects from the tree
  234. day0 = commit.Author.When
  235. func() {
  236. file_iter := tree.Files()
  237. defer file_iter.Close()
  238. for {
  239. file, err := file_iter.Next()
  240. if err != nil {
  241. if err == io.EOF {
  242. break
  243. }
  244. panic(err)
  245. }
  246. lines, err := loc(&file.Blob)
  247. if err == nil {
  248. files[file.Name] = NewFile(0, lines, status)
  249. }
  250. }
  251. }()
  252. } else {
  253. day := int(commit.Author.When.Sub(day0).Hours() / 24)
  254. delta := (day / sampling) - (prev_day / sampling)
  255. if delta > 0 {
  256. prev_day = day
  257. gs := analyser.groupStatus(status, day)
  258. for i := 0; i < delta; i++ {
  259. statuses = append(statuses, gs)
  260. }
  261. }
  262. tree_diff, err := git.DiffTree(prev_tree, tree)
  263. if err != nil {
  264. panic(err)
  265. }
  266. for _, change := range tree_diff {
  267. switch change.Action {
  268. case git.Insert:
  269. analyser.handleInsertion(change, day, status, files)
  270. case git.Delete:
  271. analyser.handleDeletion(change, day, status, files)
  272. case git.Modify:
  273. func() {
  274. defer func() {
  275. r := recover()
  276. if r != nil {
  277. fmt.Fprintf(os.Stderr, "%s: modification error\n", commit.Hash.String())
  278. panic(r)
  279. }
  280. }()
  281. analyser.handleModification(change, day, status, files)
  282. }()
  283. default:
  284. panic(fmt.Sprintf("unsupported action: %d", change.Action))
  285. }
  286. }
  287. }
  288. prev_tree = tree
  289. }
  290. return statuses
  291. }