// analyser.go — incremental line burndown analysis over a Git repository history.
package hercules
import (
	"bufio"
	"bytes"
	"errors"
	"fmt"
	"io"
	"os"
	"sort"
	"time"
	"unicode/utf8"

	"github.com/sergi/go-diff/diffmatchpatch"
	"gopkg.in/src-d/go-git.v4"
	"gopkg.in/src-d/go-git.v4/plumbing"
	"gopkg.in/src-d/go-git.v4/plumbing/object"
	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
)
// Analyser gathers line burndown statistics for a Git repository.
type Analyser struct {
	// Repository is the repository to analyse.
	Repository *git.Repository
	// Granularity is the band width in days used by groupStatus when
	// folding per-day line counts into groups.
	Granularity int
	// Sampling is the period, in days, between emitted snapshots in Analyse.
	Sampling int
	// SimilarityThreshold is the rename-detection cutoff in percent (0..100);
	// blobs whose size and content differ by more than (100 - threshold)%
	// are not considered renames.
	SimilarityThreshold int
	// OnProgress, if set, is called as OnProgress(currentIndex, totalCommits)
	// once per processed commit.
	OnProgress func(int, int)
}
  25. func checkClose(c io.Closer) {
  26. if err := c.Close(); err != nil {
  27. panic(err)
  28. }
  29. }
  30. func loc(file *object.Blob) (int, error) {
  31. reader, err := file.Reader()
  32. if err != nil {
  33. panic(err)
  34. }
  35. defer checkClose(reader)
  36. scanner := bufio.NewScanner(reader)
  37. counter := 0
  38. for scanner.Scan() {
  39. if !utf8.Valid(scanner.Bytes()) {
  40. return -1, errors.New("binary")
  41. }
  42. counter++
  43. }
  44. return counter, nil
  45. }
  46. func str(file *object.Blob) string {
  47. reader, err := file.Reader()
  48. if err != nil {
  49. panic(err)
  50. }
  51. defer checkClose(reader)
  52. buf := new(bytes.Buffer)
  53. buf.ReadFrom(reader)
  54. return buf.String()
  55. }
  56. func (analyser *Analyser) handleInsertion(
  57. change *object.Change, day int, status map[int]int64, files map[string]*File,
  58. cache *map[plumbing.Hash]*object.Blob) {
  59. blob := (*cache)[change.To.TreeEntry.Hash]
  60. lines, err := loc(blob)
  61. if err != nil {
  62. return
  63. }
  64. name := change.To.Name
  65. file, exists := files[name]
  66. if exists {
  67. panic(fmt.Sprintf("file %s already exists", name))
  68. }
  69. file = NewFile(day, lines, status)
  70. files[name] = file
  71. }
  72. func (analyser *Analyser) handleDeletion(
  73. change *object.Change, day int, status map[int]int64, files map[string]*File,
  74. cache *map[plumbing.Hash]*object.Blob) {
  75. blob := (*cache)[change.From.TreeEntry.Hash]
  76. lines, err := loc(blob)
  77. if err != nil {
  78. return
  79. }
  80. name := change.From.Name
  81. file := files[name]
  82. file.Update(day, 0, 0, lines)
  83. delete(files, name)
  84. }
// handleModification applies the diff between the old and the new revision of
// a changed file to the line-tracking model. If the file is not tracked yet
// (e.g. it was previously skipped as binary) the change degrades to an
// insertion. Integrity is checked against both revisions' line counts.
func (analyser *Analyser) handleModification(
	change *object.Change, day int, status map[int]int64, files map[string]*File,
	cache *map[plumbing.Hash]*object.Blob) {
	blob_from := (*cache)[change.From.TreeEntry.Hash]
	blob_to := (*cache)[change.To.TreeEntry.Hash]
	// we are not validating UTF-8 here because for example
	// git/git 4f7770c87ce3c302e1639a7737a6d2531fe4b160 fetch-pack.c is invalid UTF-8
	str_from := str(blob_from)
	str_to := str(blob_to)
	file, exists := files[change.From.Name]
	if !exists {
		analyser.handleInsertion(change, day, status, files, cache)
		return
	}
	// possible rename
	if change.To.Name != change.From.Name {
		analyser.handleRename(change.From.Name, change.To.Name, files)
	}
	dmp := diffmatchpatch.New()
	src, dst, _ := dmp.DiffLinesToRunes(str_from, str_to)
	// sanity check: the tracked length must match the old revision
	if file.Len() != len(src) {
		panic(fmt.Sprintf("%s: internal integrity error src %d != %d",
			change.To.Name, len(src), file.Len()))
	}
	diffs := dmp.DiffMainRunes(src, dst, false)
	// we do not call RunesToDiffLines so the number of lines equals
	// to the rune count
	position := 0
	// pending buffers a single Insert/Delete so that a Delete immediately
	// followed by an Insert can be applied as one replacement
	pending := diffmatchpatch.Diff{Text: ""}
	// apply flushes one buffered edit into the file model at `position`
	apply := func(edit diffmatchpatch.Diff) {
		length := utf8.RuneCountInString(edit.Text)
		if edit.Type == diffmatchpatch.DiffInsert {
			file.Update(day, position, length, 0)
			position += length
		} else {
			file.Update(day, position, 0, length)
		}
	}
	for _, edit := range diffs {
		length := utf8.RuneCountInString(edit.Text)
		func() {
			// on a panic while applying the edit, dump both revisions to
			// stderr for debugging and re-panic with the original value
			defer func() {
				r := recover()
				if r != nil {
					fmt.Fprintf(os.Stderr, "%s: internal diff error\n", change.To.Name)
					fmt.Fprint(os.Stderr, "====BEFORE====\n")
					fmt.Fprint(os.Stderr, str_from)
					fmt.Fprint(os.Stderr, "====AFTER====\n")
					fmt.Fprint(os.Stderr, str_to)
					fmt.Fprint(os.Stderr, "====END====\n")
					panic(r)
				}
			}()
			switch edit.Type {
			case diffmatchpatch.DiffEqual:
				// unchanged span: flush anything buffered, then skip ahead
				if pending.Text != "" {
					apply(pending)
					pending.Text = ""
				}
				position += length
			case diffmatchpatch.DiffInsert:
				if pending.Text != "" {
					if pending.Type == diffmatchpatch.DiffInsert {
						panic("DiffInsert may not appear after DiffInsert")
					}
					// Delete followed by Insert: apply as a replacement
					file.Update(day, position, length, utf8.RuneCountInString(pending.Text))
					position += length
					pending.Text = ""
				} else {
					pending = edit
				}
			case diffmatchpatch.DiffDelete:
				if pending.Text != "" {
					panic("DiffDelete may not appear after DiffInsert/DiffDelete")
				}
				pending = edit
			default:
				panic(fmt.Sprintf("diff operation is not supported: %d", edit.Type))
			}
		}()
	}
	// flush a trailing buffered edit, if any
	if pending.Text != "" {
		apply(pending)
		pending.Text = ""
	}
	// sanity check: the tracked length must now match the new revision
	if file.Len() != len(dst) {
		panic(fmt.Sprintf("%s: internal integrity error dst %d != %d",
			change.To.Name, len(dst), file.Len()))
	}
}
  175. func (analyser *Analyser) handleRename(from, to string, files map[string]*File) {
  176. file, exists := files[from]
  177. if !exists {
  178. panic(fmt.Sprintf("file %s does not exist", from))
  179. }
  180. files[to] = file
  181. delete(files, from)
  182. }
  183. func (analyser *Analyser) Commits() []*object.Commit {
  184. result := []*object.Commit{}
  185. repository := analyser.Repository
  186. head, err := repository.Head()
  187. if err != nil {
  188. panic(err)
  189. }
  190. commit, err := repository.CommitObject(head.Hash())
  191. if err != nil {
  192. panic(err)
  193. }
  194. result = append(result, commit)
  195. for ; err != io.EOF; commit, err = commit.Parents().Next() {
  196. if err != nil {
  197. panic(err)
  198. }
  199. result = append(result, commit)
  200. }
  201. // reverse the order
  202. for i, j := 0, len(result)-1; i < j; i, j = i+1, j-1 {
  203. result[i], result[j] = result[j], result[i]
  204. }
  205. return result
  206. }
  207. func (analyser *Analyser) groupStatus(status map[int]int64, day int) []int64 {
  208. granularity := analyser.Granularity
  209. if granularity == 0 {
  210. granularity = 1
  211. }
  212. day++
  213. adjust := 0
  214. if day%granularity < granularity-1 {
  215. adjust = 1
  216. }
  217. result := make([]int64, day/granularity+adjust)
  218. var group int64
  219. for i := 0; i < day; i++ {
  220. group += status[i]
  221. if i%granularity == (granularity - 1) {
  222. result[i/granularity] = group
  223. group = 0
  224. }
  225. }
  226. if day%granularity < granularity-1 {
  227. result[len(result)-1] = group
  228. }
  229. return result
  230. }
// sortableChange pairs a tree change with the blob hash used as its sort key.
type sortableChange struct {
	change *object.Change
	hash   plumbing.Hash
}

// sortableChanges adapts a slice of sortableChange to sort.Interface
// (see the Len/Less/Swap methods below).
type sortableChanges []sortableChange
  236. func (change *sortableChange) Less(other *sortableChange) bool {
  237. for x := 0; x < 20; x++ {
  238. if change.hash[x] < other.hash[x] {
  239. return true
  240. }
  241. }
  242. return false
  243. }
// Len implements sort.Interface.
func (slice sortableChanges) Len() int {
	return len(slice)
}

// Less implements sort.Interface; elements are ordered by blob hash.
func (slice sortableChanges) Less(i, j int) bool {
	return slice[i].Less(&slice[j])
}

// Swap implements sort.Interface.
func (slice sortableChanges) Swap(i, j int) {
	slice[i], slice[j] = slice[j], slice[i]
}
// sortableBlob pairs a tree change with its blob size, the sort key used by
// the similarity-based rename detection.
type sortableBlob struct {
	change *object.Change
	size   int64
}

// sortableBlobs adapts a slice of sortableBlob to sort.Interface
// (see the Len/Less/Swap methods below).
type sortableBlobs []sortableBlob
// Less orders blobs by size, ascending.
func (change *sortableBlob) Less(other *sortableBlob) bool {
	return change.size < other.size
}
// Len implements sort.Interface.
func (slice sortableBlobs) Len() int {
	return len(slice)
}

// Less implements sort.Interface; elements are ordered by blob size.
func (slice sortableBlobs) Less(i, j int) bool {
	return slice[i].Less(&slice[j])
}

// Swap implements sort.Interface.
func (slice sortableBlobs) Swap(i, j int) {
	slice[i], slice[j] = slice[j], slice[i]
}
  270. func (analyser *Analyser) sizesAreClose(size1 int64, size2 int64) bool {
  271. return abs64(size1-size2)*100/min64(size1, size2) <=
  272. int64(100-analyser.SimilarityThreshold)
  273. }
  274. func (analyser *Analyser) blobsAreClose(
  275. blob1 *object.Blob, blob2 *object.Blob) bool {
  276. str_from := str(blob1)
  277. str_to := str(blob2)
  278. dmp := diffmatchpatch.New()
  279. src, dst, _ := dmp.DiffLinesToRunes(str_from, str_to)
  280. diffs := dmp.DiffMainRunes(src, dst, false)
  281. common := 0
  282. for _, edit := range diffs {
  283. if edit.Type == diffmatchpatch.DiffEqual {
  284. common += utf8.RuneCountInString(edit.Text)
  285. }
  286. }
  287. return common*100/min(len(src), len(dst)) >=
  288. analyser.SimilarityThreshold
  289. }
  290. func (analyser *Analyser) getBlob(hash plumbing.Hash) *object.Blob {
  291. blob, err := analyser.Repository.BlobObject(hash)
  292. if err != nil {
  293. panic(err)
  294. }
  295. return blob
  296. }
  297. func (analyser *Analyser) cacheBlobs(changes object.Changes) *map[plumbing.Hash]*object.Blob {
  298. cache := make(map[plumbing.Hash]*object.Blob)
  299. for _, change := range changes {
  300. action, err := change.Action()
  301. if err != nil {
  302. panic(err)
  303. }
  304. switch action {
  305. case merkletrie.Insert:
  306. cache[change.To.TreeEntry.Hash] = analyser.getBlob(change.To.TreeEntry.Hash)
  307. case merkletrie.Delete:
  308. cache[change.From.TreeEntry.Hash] = analyser.getBlob(change.From.TreeEntry.Hash)
  309. case merkletrie.Modify:
  310. cache[change.To.TreeEntry.Hash] = analyser.getBlob(change.To.TreeEntry.Hash)
  311. cache[change.From.TreeEntry.Hash] = analyser.getBlob(change.From.TreeEntry.Hash)
  312. default:
  313. panic(fmt.Sprintf("unsupported action: %d", change.Action))
  314. }
  315. }
  316. return &cache
  317. }
  318. func (analyser *Analyser) detectRenames(
  319. changes object.Changes, cache *map[plumbing.Hash]*object.Blob) object.Changes {
  320. reduced_changes := make(object.Changes, 0, changes.Len())
  321. // Stage 1 - find renames by matching the hashes
  322. // n log(n)
  323. // We sort additions and deletions by hash and then do the single scan along
  324. // both slices.
  325. deleted := make(sortableChanges, 0, changes.Len())
  326. added := make(sortableChanges, 0, changes.Len())
  327. for _, change := range changes {
  328. action, err := change.Action()
  329. if err != nil {
  330. panic(err)
  331. }
  332. switch action {
  333. case merkletrie.Insert:
  334. added = append(added, sortableChange{change, change.To.TreeEntry.Hash})
  335. case merkletrie.Delete:
  336. deleted = append(deleted, sortableChange{change, change.From.TreeEntry.Hash})
  337. case merkletrie.Modify:
  338. reduced_changes = append(reduced_changes, change)
  339. default:
  340. panic(fmt.Sprintf("unsupported action: %d", change.Action))
  341. }
  342. }
  343. sort.Sort(deleted)
  344. sort.Sort(added)
  345. a := 0
  346. d := 0
  347. still_deleted := make(object.Changes, 0, deleted.Len())
  348. still_added := make(object.Changes, 0, added.Len())
  349. for a < added.Len() && d < deleted.Len() {
  350. if added[a].hash == deleted[d].hash {
  351. reduced_changes = append(
  352. reduced_changes,
  353. &object.Change{From: deleted[d].change.From, To: added[a].change.To})
  354. a++
  355. d++
  356. } else if added[a].Less(&deleted[d]) {
  357. still_added = append(still_added, added[a].change)
  358. a++
  359. } else {
  360. still_deleted = append(still_deleted, deleted[d].change)
  361. d++
  362. }
  363. }
  364. for ; a < added.Len(); a++ {
  365. still_added = append(still_added, added[a].change)
  366. }
  367. for ; d < deleted.Len(); d++ {
  368. still_deleted = append(still_deleted, deleted[d].change)
  369. }
  370. // Stage 2 - apply the similarity threshold
  371. // n^2 but actually linear
  372. // We sort the blobs by size and do the single linear scan.
  373. added_blobs := make(sortableBlobs, 0, still_added.Len())
  374. deleted_blobs := make(sortableBlobs, 0, still_deleted.Len())
  375. for _, change := range still_added {
  376. blob := (*cache)[change.To.TreeEntry.Hash]
  377. added_blobs = append(
  378. added_blobs, sortableBlob{change: change, size: blob.Size})
  379. }
  380. for _, change := range still_deleted {
  381. blob := (*cache)[change.From.TreeEntry.Hash]
  382. deleted_blobs = append(
  383. deleted_blobs, sortableBlob{change: change, size: blob.Size})
  384. }
  385. sort.Sort(added_blobs)
  386. sort.Sort(deleted_blobs)
  387. d_start := 0
  388. for a = 0; a < added_blobs.Len(); a++ {
  389. my_blob := (*cache)[added_blobs[a].change.To.TreeEntry.Hash]
  390. my_size := added_blobs[a].size
  391. for d = d_start; d < deleted_blobs.Len() && !analyser.sizesAreClose(my_size, deleted_blobs[d].size); d++ {
  392. }
  393. d_start = d
  394. found_match := false
  395. for d = d_start; d < deleted_blobs.Len() && analyser.sizesAreClose(my_size, deleted_blobs[d].size); d++ {
  396. if analyser.blobsAreClose(
  397. my_blob, (*cache)[deleted_blobs[d].change.From.TreeEntry.Hash]) {
  398. found_match = true
  399. reduced_changes = append(
  400. reduced_changes,
  401. &object.Change{From: deleted_blobs[d].change.From,
  402. To: added_blobs[a].change.To})
  403. break
  404. }
  405. }
  406. if found_match {
  407. added_blobs = append(added_blobs[:a], added_blobs[a+1:]...)
  408. a--
  409. deleted_blobs = append(deleted_blobs[:d], deleted_blobs[d+1:]...)
  410. }
  411. }
  412. // Stage 3 - we give up, everything left are independent additions and deletions
  413. for _, blob := range added_blobs {
  414. reduced_changes = append(reduced_changes, blob.change)
  415. }
  416. for _, blob := range deleted_blobs {
  417. reduced_changes = append(reduced_changes, blob.change)
  418. }
  419. return reduced_changes
  420. }
// Analyse processes the commit chain (oldest first, as produced by Commits)
// and returns the burndown matrix: one groupStatus row appended for every
// elapsed sampling period.
func (analyser *Analyser) Analyse(commits []*object.Commit) [][]int64 {
	sampling := analyser.Sampling
	if sampling == 0 {
		sampling = 1
	}
	onProgress := analyser.OnProgress
	if onProgress == nil {
		// default to a no-op progress callback
		onProgress = func(int, int) {}
	}
	if analyser.SimilarityThreshold < 0 || analyser.SimilarityThreshold > 100 {
		panic("hercules.Analyser: an invalid SimilarityThreshold was specified")
	}
	// current daily alive number of lines; key is the number of days from the
	// beginning of the history
	status := map[int]int64{}
	// periodic snapshots of status, one row per elapsed sampling interval
	statuses := [][]int64{}
	// mapping <file path> -> hercules.File
	files := map[string]*File{}
	var day0 time.Time // will be initialized in the first iteration
	var prev_tree *object.Tree = nil
	prev_day := 0
	for index, commit := range commits {
		onProgress(index, len(commits))
		tree, err := commit.Tree()
		if err != nil {
			panic(err)
		}
		if index == 0 {
			// first iteration - initialize the file objects from the tree
			day0 = commit.Author.When
			// wrapped in a closure so the iterator is closed promptly
			func() {
				file_iter := tree.Files()
				defer file_iter.Close()
				for {
					file, err := file_iter.Next()
					if err != nil {
						if err == io.EOF {
							break
						}
						panic(err)
					}
					lines, err := loc(&file.Blob)
					if err == nil {
						files[file.Name] = NewFile(0, lines, status)
					}
					// binary files (loc error) are silently skipped
				}
			}()
		} else {
			day := int(commit.Author.When.Sub(day0).Hours() / 24)
			delta := (day / sampling) - (prev_day / sampling)
			if delta > 0 {
				// crossed one or more sampling boundaries: emit the snapshot,
				// repeated delta times so row count tracks elapsed periods
				prev_day = day
				gs := analyser.groupStatus(status, day)
				for i := 0; i < delta; i++ {
					statuses = append(statuses, gs)
				}
			}
			tree_diff, err := object.DiffTree(prev_tree, tree)
			if err != nil {
				panic(err)
			}
			cache := analyser.cacheBlobs(tree_diff)
			tree_diff = analyser.detectRenames(tree_diff, cache)
			for _, change := range tree_diff {
				action, err := change.Action()
				if err != nil {
					panic(err)
				}
				switch action {
				case merkletrie.Insert:
					analyser.handleInsertion(change, day, status, files, cache)
				case merkletrie.Delete:
					analyser.handleDeletion(change, day, status, files, cache)
				case merkletrie.Modify:
					// wrap in a closure so a panic can be annotated with the
					// offending commit hash before re-panicking
					func() {
						defer func() {
							r := recover()
							if r != nil {
								fmt.Fprintf(os.Stderr, "%s: modification error\n", commit.Hash.String())
								panic(r)
							}
						}()
						analyser.handleModification(change, day, status, files, cache)
					}()
				}
			}
		}
		prev_tree = tree
	}
	return statuses
}