analyser.go

package hercules

import (
    "bufio"
    "bytes"
    "errors"
    "fmt"
    "io"
    "os"
    "sort"
    "time"
    "unicode/utf8"

    "github.com/sergi/go-diff/diffmatchpatch"
    "gopkg.in/src-d/go-git.v4"
    "gopkg.in/src-d/go-git.v4/config"
    "gopkg.in/src-d/go-git.v4/plumbing"
    "gopkg.in/src-d/go-git.v4/plumbing/object"
    "gopkg.in/src-d/go-git.v4/utils/merkletrie"
)
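
// Analyser drives the line-burndown computation over a Git repository.
// A short summary of the fields, inferred from how they are used below:
//
//   - Repository: the go-git repository to analyse.
//   - Granularity: the number of days summed into one burndown band (see groupStatus).
//   - Sampling: how often, in days, snapshots of the band totals are taken (see Analyse).
//   - SimilarityThreshold: the minimum percentage of common lines for an
//     added/deleted pair to be considered a rename (0-100).
//   - Debug: enables expensive per-edit integrity validation.
//   - OnProgress: optional callback invoked with (current, total) commit counts.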
type Analyser struct {
    Repository          *git.Repository
    Granularity         int
    Sampling            int
    SimilarityThreshold int
    Debug               bool
    OnProgress          func(int, int)
}

func checkClose(c io.Closer) {
    if err := c.Close(); err != nil {
        panic(err)
    }
}

// loc counts the lines in a blob; it returns an error for binary content,
// detected as invalid UTF-8.
func loc(file *object.Blob) (int, error) {
    reader, err := file.Reader()
    if err != nil {
        panic(err)
    }
    defer checkClose(reader)
    scanner := bufio.NewScanner(reader)
    counter := 0
    for scanner.Scan() {
        if !utf8.Valid(scanner.Bytes()) {
            return -1, errors.New("binary")
        }
        counter++
    }
    return counter, nil
}

// str reads the whole blob into a string without any validation.
func str(file *object.Blob) string {
    reader, err := file.Reader()
    if err != nil {
        panic(err)
    }
    defer checkClose(reader)
    buf := new(bytes.Buffer)
    buf.ReadFrom(reader)
    return buf.String()
}

// DummyIO is a no-op ReadWriteCloser backing DummyEncodedObject.
type DummyIO struct{}

func (DummyIO) Read(p []byte) (int, error) {
    return 0, io.EOF
}

func (DummyIO) Write(p []byte) (int, error) {
    return len(p), nil
}

func (DummyIO) Close() error {
    return nil
}

// DummyEncodedObject is an empty blob with a predefined hash.
type DummyEncodedObject struct {
    FakeHash plumbing.Hash
}

func (obj DummyEncodedObject) Hash() plumbing.Hash {
    return obj.FakeHash
}

func (obj DummyEncodedObject) Type() plumbing.ObjectType {
    return plumbing.BlobObject
}

func (obj DummyEncodedObject) SetType(plumbing.ObjectType) {
}

func (obj DummyEncodedObject) Size() int64 {
    return 0
}

func (obj DummyEncodedObject) SetSize(int64) {
}

func (obj DummyEncodedObject) Reader() (io.ReadCloser, error) {
    return DummyIO{}, nil
}

func (obj DummyEncodedObject) Writer() (io.WriteCloser, error) {
    return DummyIO{}, nil
}
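
// createDummyBlob wraps a zero-length in-memory object in an *object.Blob.
// It stands in for submodule entries, whose hashes point at commits in
// another repository and therefore cannot be resolved as blobs here
// (see getBlob below).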
func createDummyBlob(hash *plumbing.Hash) (*object.Blob, error) {
    return object.DecodeBlob(DummyEncodedObject{*hash})
}

func (analyser *Analyser) handleInsertion(
    change *object.Change, day int, status map[int]int64, files map[string]*File,
    cache *map[plumbing.Hash]*object.Blob) {
    blob := (*cache)[change.To.TreeEntry.Hash]
    lines, err := loc(blob)
    if err != nil {
        // binary files are not tracked
        return
    }
    name := change.To.Name
    if _, exists := files[name]; exists {
        panic(fmt.Sprintf("file %s already exists", name))
    }
    files[name] = NewFile(day, lines, status)
}

func (analyser *Analyser) handleDeletion(
    change *object.Change, day int, status map[int]int64, files map[string]*File,
    cache *map[plumbing.Hash]*object.Blob) {
    blob := (*cache)[change.From.TreeEntry.Hash]
    lines, err := loc(blob)
    if err != nil {
        // binary files were never tracked, so there is nothing to remove
        return
    }
    name := change.From.Name
    file := files[name]
    file.Update(day, 0, 0, lines)
    delete(files, name)
}
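
// handleModification applies a line-level diff between the old and the new
// revision of a file. Edits are buffered in `pending` so that a deletion
// immediately followed by an insertion collapses into a single Update call;
// see the DiffInsert branch below. Note the line/rune trick: DiffLinesToRunes
// maps every distinct line to one rune, so rune counts in the resulting diff
// equal line counts.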
func (analyser *Analyser) handleModification(
    change *object.Change, day int, status map[int]int64, files map[string]*File,
    cache *map[plumbing.Hash]*object.Blob) {
    blob_from := (*cache)[change.From.TreeEntry.Hash]
    blob_to := (*cache)[change.To.TreeEntry.Hash]
    // we are not validating UTF-8 here because for example
    // git/git 4f7770c87ce3c302e1639a7737a6d2531fe4b160 fetch-pack.c is invalid UTF-8
    str_from := str(blob_from)
    str_to := str(blob_to)
    file, exists := files[change.From.Name]
    if !exists {
        // the file is not tracked (e.g. it used to be binary) - treat as an insertion
        analyser.handleInsertion(change, day, status, files, cache)
        return
    }
    // possible rename
    if change.To.Name != change.From.Name {
        analyser.handleRename(change.From.Name, change.To.Name, files)
    }
    dmp := diffmatchpatch.New()
    src, dst, _ := dmp.DiffLinesToRunes(str_from, str_to)
    if file.Len() != len(src) {
        panic(fmt.Sprintf("%s: internal integrity error src %d != %d",
            change.To.Name, len(src), file.Len()))
    }
    diffs := dmp.DiffMainRunes(src, dst, false)
    // we do not call RunesToDiffLines, so the number of lines equals
    // the rune count
    position := 0
    pending := diffmatchpatch.Diff{Text: ""}
    apply := func(edit diffmatchpatch.Diff) {
        length := utf8.RuneCountInString(edit.Text)
        if edit.Type == diffmatchpatch.DiffInsert {
            file.Update(day, position, length, 0)
            position += length
        } else {
            file.Update(day, position, 0, length)
        }
        if analyser.Debug {
            file.Validate()
        }
    }
    for _, edit := range diffs {
        dump_before := ""
        if analyser.Debug {
            dump_before = file.Dump()
        }
        length := utf8.RuneCountInString(edit.Text)
        func() {
            defer func() {
                r := recover()
                if r != nil {
                    fmt.Fprintf(os.Stderr, "%s: internal diff error\n", change.To.Name)
                    fmt.Fprintf(os.Stderr, "Update(%d, %d, %d (0), %d (0))\n", day, position,
                        length, utf8.RuneCountInString(pending.Text))
                    if dump_before != "" {
                        fmt.Fprintf(os.Stderr, "====TREE BEFORE====\n%s====END====\n", dump_before)
                    }
                    fmt.Fprintf(os.Stderr, "====TREE AFTER====\n%s====END====\n", file.Dump())
                    panic(r)
                }
            }()
            switch edit.Type {
            case diffmatchpatch.DiffEqual:
                if pending.Text != "" {
                    apply(pending)
                    pending.Text = ""
                }
                position += length
            case diffmatchpatch.DiffInsert:
                if pending.Text != "" {
                    if pending.Type == diffmatchpatch.DiffInsert {
                        panic("DiffInsert may not appear after DiffInsert")
                    }
                    // an insertion right after a deletion - merge into one Update
                    file.Update(day, position, length, utf8.RuneCountInString(pending.Text))
                    if analyser.Debug {
                        file.Validate()
                    }
                    position += length
                    pending.Text = ""
                } else {
                    pending = edit
                }
            case diffmatchpatch.DiffDelete:
                if pending.Text != "" {
                    panic("DiffDelete may not appear after DiffInsert/DiffDelete")
                }
                pending = edit
            default:
                panic(fmt.Sprintf("diff operation is not supported: %d", edit.Type))
            }
        }()
    }
    if pending.Text != "" {
        apply(pending)
        pending.Text = ""
    }
    if file.Len() != len(dst) {
        panic(fmt.Sprintf("%s: internal integrity error dst %d != %d",
            change.To.Name, len(dst), file.Len()))
    }
}

func (analyser *Analyser) handleRename(from, to string, files map[string]*File) {
    file, exists := files[from]
    if !exists {
        panic(fmt.Sprintf("file %s does not exist", from))
    }
    files[to] = file
    delete(files, from)
}
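
// Commits returns the commit history of the repository, oldest first.
// Note that it follows only the first parent of each commit, so side
// branches of merges are not traversed.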
func (analyser *Analyser) Commits() []*object.Commit {
    result := []*object.Commit{}
    repository := analyser.Repository
    head, err := repository.Head()
    if err != nil {
        panic(err)
    }
    commit, err := repository.CommitObject(head.Hash())
    if err != nil {
        panic(err)
    }
    // walk the first-parent chain from HEAD down to the root;
    // the original loop appended the HEAD commit twice
    for {
        result = append(result, commit)
        commit, err = commit.Parents().Next()
        if err == io.EOF {
            break
        }
        if err != nil {
            panic(err)
        }
    }
    // reverse the order, oldest first
    for i, j := 0, len(result)-1; i < j; i, j = i+1, j-1 {
        result[i], result[j] = result[j], result[i]
    }
    return result
}
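
// groupStatus sums the per-day line counters into bands of Granularity days.
// For example, with Granularity = 7 and day = 16 there are 17 days of
// history (days 0-16), which yields three bands: days 0-6, days 7-13, and
// the partial band of days 14-16.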
func (analyser *Analyser) groupStatus(status map[int]int64, day int) []int64 {
    granularity := analyser.Granularity
    if granularity == 0 {
        granularity = 1
    }
    // days 0..day inclusive
    day++
    adjust := 0
    // the original condition `day%granularity < granularity-1` dropped the
    // partial band when day%granularity == granularity-1 and appended an
    // empty band when it was 0; a partial band exists iff day%granularity != 0
    if day%granularity != 0 {
        adjust = 1
    }
    result := make([]int64, day/granularity+adjust)
    var group int64
    for i := 0; i < day; i++ {
        group += status[i]
        if i%granularity == granularity-1 {
            result[i/granularity] = group
            group = 0
        }
    }
    if day%granularity != 0 {
        result[len(result)-1] = group
    }
    return result
}
type sortableChange struct {
    change *object.Change
    hash   plumbing.Hash
}

type sortableChanges []sortableChange

func (change *sortableChange) Less(other *sortableChange) bool {
    // lexicographic comparison of the 20-byte SHA-1 hashes; the original
    // kept scanning after a greater byte, e.g. reporting {2,0,...} < {1,5,...}
    for x := 0; x < 20; x++ {
        if change.hash[x] != other.hash[x] {
            return change.hash[x] < other.hash[x]
        }
    }
    return false
}

func (slice sortableChanges) Len() int {
    return len(slice)
}

func (slice sortableChanges) Less(i, j int) bool {
    return slice[i].Less(&slice[j])
}

func (slice sortableChanges) Swap(i, j int) {
    slice[i], slice[j] = slice[j], slice[i]
}

type sortableBlob struct {
    change *object.Change
    size   int64
}

type sortableBlobs []sortableBlob

func (change *sortableBlob) Less(other *sortableBlob) bool {
    return change.size < other.size
}

func (slice sortableBlobs) Len() int {
    return len(slice)
}

func (slice sortableBlobs) Less(i, j int) bool {
    return slice[i].Less(&slice[j])
}

func (slice sortableBlobs) Swap(i, j int) {
    slice[i], slice[j] = slice[j], slice[i]
}
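
// The two predicates below implement the rename similarity test.
// sizesAreClose is a cheap pre-filter: the relative size difference
// |size1-size2| / min(size1, size2), in percent, must not exceed
// 100 - SimilarityThreshold. blobsAreClose then runs an actual diff and
// requires that the shared lines make up at least SimilarityThreshold
// percent of the shorter file. E.g. with SimilarityThreshold = 90, a
// candidate pair may differ by at most 10% in size.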
func (analyser *Analyser) sizesAreClose(size1 int64, size2 int64) bool {
    return abs64(size1-size2)*100/max64(1, min64(size1, size2)) <=
        int64(100-analyser.SimilarityThreshold)
}

func (analyser *Analyser) blobsAreClose(
    blob1 *object.Blob, blob2 *object.Blob) bool {
    str_from := str(blob1)
    str_to := str(blob2)
    dmp := diffmatchpatch.New()
    src, dst, _ := dmp.DiffLinesToRunes(str_from, str_to)
    diffs := dmp.DiffMainRunes(src, dst, false)
    common := 0
    for _, edit := range diffs {
        if edit.Type == diffmatchpatch.DiffEqual {
            common += utf8.RuneCountInString(edit.Text)
        }
    }
    return common*100/max(1, min(len(src), len(dst))) >=
        analyser.SimilarityThreshold
}

func (analyser *Analyser) getBlob(entry *object.ChangeEntry, commit *object.Commit) (
    *object.Blob, error) {
    blob, err := analyser.Repository.BlobObject(entry.TreeEntry.Hash)
    if err != nil {
        if err.Error() != git.ErrObjectNotFound.Error() {
            fmt.Fprintf(os.Stderr, "getBlob(%s)\n", entry.TreeEntry.Hash.String())
            return nil, err
        }
        // the hash may point at a submodule commit; check .gitmodules
        file, err_modules := commit.File(".gitmodules")
        if err_modules != nil {
            return nil, err
        }
        contents, err_modules := file.Contents()
        if err_modules != nil {
            return nil, err
        }
        modules := config.NewModules()
        err_modules = modules.Unmarshal([]byte(contents))
        if err_modules != nil {
            return nil, err
        }
        _, exists := modules.Submodules[entry.Name]
        if exists {
            // we found that this is a submodule
            return createDummyBlob(&entry.TreeEntry.Hash)
        }
        return nil, err
    }
    return blob, nil
}

func (analyser *Analyser) cacheBlobs(changes *object.Changes, commit *object.Commit) (
    *map[plumbing.Hash]*object.Blob, error) {
    cache := make(map[plumbing.Hash]*object.Blob)
    for _, change := range *changes {
        action, err := change.Action()
        if err != nil {
            return nil, err
        }
        switch action {
        case merkletrie.Insert:
            cache[change.To.TreeEntry.Hash], err = analyser.getBlob(&change.To, commit)
            if err != nil {
                fmt.Fprintf(os.Stderr, "file to %s\n", change.To.Name)
            }
        case merkletrie.Delete:
            cache[change.From.TreeEntry.Hash], err = analyser.getBlob(&change.From, commit)
            if err != nil {
                if err.Error() != git.ErrObjectNotFound.Error() {
                    fmt.Fprintf(os.Stderr, "file from %s\n", change.From.Name)
                } else {
                    cache[change.From.TreeEntry.Hash], err = createDummyBlob(
                        &change.From.TreeEntry.Hash)
                }
            }
        case merkletrie.Modify:
            cache[change.To.TreeEntry.Hash], err = analyser.getBlob(&change.To, commit)
            if err != nil {
                // return immediately - the original fell through and the
                // second getBlob silently overwrote this error
                fmt.Fprintf(os.Stderr, "file to %s\n", change.To.Name)
                return nil, err
            }
            cache[change.From.TreeEntry.Hash], err = analyser.getBlob(&change.From, commit)
            if err != nil {
                fmt.Fprintf(os.Stderr, "file from %s\n", change.From.Name)
            }
        default:
            // the original printed the method value change.Action with %d
            panic(fmt.Sprintf("unsupported action: %d", action))
        }
        if err != nil {
            return nil, err
        }
    }
    return &cache, nil
}
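
// detectRenames folds matching insert/delete pairs into modifications.
// Stage 1 pairs additions and deletions with identical blob hashes (exact
// renames) via a merge scan over two hash-sorted slices. Stage 2 pairs the
// remainder by content similarity, using the size pre-filter to keep the
// scan close to linear. Stage 3 passes everything still unmatched through
// as plain additions and deletions.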
func (analyser *Analyser) detectRenames(
    changes *object.Changes, cache *map[plumbing.Hash]*object.Blob) object.Changes {
    reduced_changes := make(object.Changes, 0, changes.Len())

    // Stage 1 - find renames by matching the hashes
    // n log(n)
    // We sort additions and deletions by hash and then do a single scan along
    // both slices.
    deleted := make(sortableChanges, 0, changes.Len())
    added := make(sortableChanges, 0, changes.Len())
    for _, change := range *changes {
        action, err := change.Action()
        if err != nil {
            panic(err)
        }
        switch action {
        case merkletrie.Insert:
            added = append(added, sortableChange{change, change.To.TreeEntry.Hash})
        case merkletrie.Delete:
            deleted = append(deleted, sortableChange{change, change.From.TreeEntry.Hash})
        case merkletrie.Modify:
            reduced_changes = append(reduced_changes, change)
        default:
            panic(fmt.Sprintf("unsupported action: %d", action))
        }
    }
    sort.Sort(deleted)
    sort.Sort(added)
    a := 0
    d := 0
    still_deleted := make(object.Changes, 0, deleted.Len())
    still_added := make(object.Changes, 0, added.Len())
    for a < added.Len() && d < deleted.Len() {
        if added[a].hash == deleted[d].hash {
            reduced_changes = append(
                reduced_changes,
                &object.Change{From: deleted[d].change.From, To: added[a].change.To})
            a++
            d++
        } else if added[a].Less(&deleted[d]) {
            still_added = append(still_added, added[a].change)
            a++
        } else {
            still_deleted = append(still_deleted, deleted[d].change)
            d++
        }
    }
    for ; a < added.Len(); a++ {
        still_added = append(still_added, added[a].change)
    }
    for ; d < deleted.Len(); d++ {
        still_deleted = append(still_deleted, deleted[d].change)
    }

    // Stage 2 - apply the similarity threshold
    // n^2 in the worst case, but close to linear in practice
    // We sort the blobs by size and do a single linear scan.
    added_blobs := make(sortableBlobs, 0, still_added.Len())
    deleted_blobs := make(sortableBlobs, 0, still_deleted.Len())
    for _, change := range still_added {
        blob := (*cache)[change.To.TreeEntry.Hash]
        added_blobs = append(
            added_blobs, sortableBlob{change: change, size: blob.Size})
    }
    for _, change := range still_deleted {
        blob := (*cache)[change.From.TreeEntry.Hash]
        deleted_blobs = append(
            deleted_blobs, sortableBlob{change: change, size: blob.Size})
    }
    sort.Sort(added_blobs)
    sort.Sort(deleted_blobs)
    d_start := 0
    for a = 0; a < added_blobs.Len(); a++ {
        my_blob := (*cache)[added_blobs[a].change.To.TreeEntry.Hash]
        my_size := added_blobs[a].size
        // both slices are sorted by size, so we can skip the prefix of
        // deletions whose size is not close to my_size
        for d = d_start; d < deleted_blobs.Len() && !analyser.sizesAreClose(my_size, deleted_blobs[d].size); d++ {
        }
        d_start = d
        found_match := false
        for d = d_start; d < deleted_blobs.Len() && analyser.sizesAreClose(my_size, deleted_blobs[d].size); d++ {
            if analyser.blobsAreClose(
                my_blob, (*cache)[deleted_blobs[d].change.From.TreeEntry.Hash]) {
                found_match = true
                reduced_changes = append(
                    reduced_changes,
                    &object.Change{From: deleted_blobs[d].change.From,
                        To: added_blobs[a].change.To})
                break
            }
        }
        if found_match {
            added_blobs = append(added_blobs[:a], added_blobs[a+1:]...)
            a--
            deleted_blobs = append(deleted_blobs[:d], deleted_blobs[d+1:]...)
        }
    }

    // Stage 3 - we give up; everything left consists of independent additions
    // and deletions
    for _, blob := range added_blobs {
        reduced_changes = append(reduced_changes, blob.change)
    }
    for _, blob := range deleted_blobs {
        reduced_changes = append(reduced_changes, blob.change)
    }
    return reduced_changes
}
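
// Analyse is the entry point: it consumes the commits in the order produced
// by Commits() and returns one []int64 of band totals per sampling period.
// A minimal usage sketch (hypothetical caller, not part of this file):
//
//	repository, err := git.PlainOpen("/path/to/repo")
//	if err != nil {
//		panic(err)
//	}
//	analyser := &Analyser{
//		Repository:          repository,
//		Granularity:         30,
//		Sampling:            15,
//		SimilarityThreshold: 90,
//	}
//	statuses := analyser.Analyse(analyser.Commits())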
func (analyser *Analyser) Analyse(commits []*object.Commit) [][]int64 {
    sampling := analyser.Sampling
    if sampling == 0 {
        sampling = 1
    }
    onProgress := analyser.OnProgress
    if onProgress == nil {
        onProgress = func(int, int) {}
    }
    if analyser.SimilarityThreshold < 0 || analyser.SimilarityThreshold > 100 {
        panic("hercules.Analyser: an invalid SimilarityThreshold was specified")
    }
    // current daily alive number of lines; key is the number of days from the
    // beginning of the history
    status := map[int]int64{}
    // periodic snapshots of status, taken every `sampling` days
    statuses := [][]int64{}
    // mapping <file path> -> hercules.File
    files := map[string]*File{}
    var day0 time.Time // will be initialized in the first iteration
    var prev_tree *object.Tree
    prev_day := 0
    for index, commit := range commits {
        onProgress(index, len(commits))
        tree, err := commit.Tree()
        if err != nil {
            panic(err)
        }
        if index == 0 {
            // first iteration - initialize the file objects from the tree
            day0 = commit.Author.When
            func() {
                file_iter := tree.Files()
                defer file_iter.Close()
                for {
                    file, err := file_iter.Next()
                    if err != nil {
                        if err == io.EOF {
                            break
                        }
                        panic(err)
                    }
                    lines, err := loc(&file.Blob)
                    if err == nil {
                        files[file.Name] = NewFile(0, lines, status)
                    }
                }
            }()
        } else {
            day := int(commit.Author.When.Sub(day0).Hours() / 24)
            delta := (day / sampling) - (prev_day / sampling)
            if delta > 0 {
                prev_day = day
                gs := analyser.groupStatus(status, day)
                for i := 0; i < delta; i++ {
                    statuses = append(statuses, gs)
                }
            }
            tree_diff, err := object.DiffTree(prev_tree, tree)
            if err != nil {
                fmt.Fprintf(os.Stderr, "commit #%d %s\n", index, commit.Hash.String())
                panic(err)
            }
            cache, err := analyser.cacheBlobs(&tree_diff, commit)
            if err != nil {
                fmt.Fprintf(os.Stderr, "commit #%d %s\n", index, commit.Hash.String())
                panic(err)
            }
            tree_diff = analyser.detectRenames(&tree_diff, cache)
            for _, change := range tree_diff {
                action, err := change.Action()
                if err != nil {
                    fmt.Fprintf(os.Stderr, "commit #%d %s\n", index, commit.Hash.String())
                    panic(err)
                }
                switch action {
                case merkletrie.Insert:
                    analyser.handleInsertion(change, day, status, files, cache)
                case merkletrie.Delete:
                    analyser.handleDeletion(change, day, status, files, cache)
                case merkletrie.Modify:
                    func() {
                        defer func() {
                            r := recover()
                            if r != nil {
                                fmt.Fprintf(os.Stderr, "#%d - %s: modification error\n",
                                    index, commit.Hash.String())
                                panic(r)
                            }
                        }()
                        analyser.handleModification(change, day, status, files, cache)
                    }()
                }
            }
        }
        prev_tree = tree
    }
    return statuses
}