renames.go 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578
  1. package plumbing
  2. import (
  3. "log"
  4. "path/filepath"
  5. "sort"
  6. "strings"
  7. "sync"
  8. "unicode/utf8"
  9. "github.com/sergi/go-diff/diffmatchpatch"
  10. "gopkg.in/src-d/go-git.v4"
  11. "gopkg.in/src-d/go-git.v4/plumbing"
  12. "gopkg.in/src-d/go-git.v4/plumbing/object"
  13. "gopkg.in/src-d/go-git.v4/utils/merkletrie"
  14. "gopkg.in/src-d/hercules.v9/internal"
  15. "gopkg.in/src-d/hercules.v9/internal/core"
  16. )
  17. // RenameAnalysis improves TreeDiff's results by searching for changed blobs under different
  18. // paths which are likely to be the result of a rename with subsequent edits.
  19. // RenameAnalysis is a PipelineItem.
  20. type RenameAnalysis struct {
  21. core.NoopMerger
  22. // SimilarityThreshold adjusts the heuristic to determine file renames.
  23. // It has the same units as cgit's -X rename-threshold or -M. Better to
  24. // set it to the default value of 80 (80%).
  25. SimilarityThreshold int
  26. repository *git.Repository
  27. }
  28. const (
  29. // RenameAnalysisDefaultThreshold specifies the default percentage of common lines in a pair
  30. // of files to consider them linked. The exact code of the decision is sizesAreClose().
  31. // CGit's default is 50%. Ours is 80% because 50% can be too computationally expensive.
  32. RenameAnalysisDefaultThreshold = 80
  33. // ConfigRenameAnalysisSimilarityThreshold is the name of the configuration option
  34. // (RenameAnalysis.Configure()) which sets the similarity threshold.
  35. ConfigRenameAnalysisSimilarityThreshold = "RenameAnalysis.SimilarityThreshold"
  36. // RenameAnalysisMinimumSize is the minimum size of a blob to be considered.
  37. RenameAnalysisMinimumSize = 32
  38. // RenameAnalysisMaxCandidates is the maximum number of rename candidates to consider per file.
  39. RenameAnalysisMaxCandidates = 50
  40. // RenameAnalysisSetSizeLimit is the maximum number of added + removed files for
  41. // RenameAnalysisMaxCandidates to be active; the bigger numbers set it to 1.
  42. RenameAnalysisSetSizeLimit = 1000
  43. // RenameAnalysisByteDiffSizeThreshold is the maximum size of each of the compared parts
  44. // to be diff-ed on byte level.
  45. RenameAnalysisByteDiffSizeThreshold = 100000
  46. )
  47. // Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
  48. func (ra *RenameAnalysis) Name() string {
  49. return "RenameAnalysis"
  50. }
  51. // Provides returns the list of names of entities which are produced by this PipelineItem.
  52. // Each produced entity will be inserted into `deps` of dependent Consume()-s according
  53. // to this list. Also used by core.Registry to build the global map of providers.
  54. func (ra *RenameAnalysis) Provides() []string {
  55. arr := [...]string{DependencyTreeChanges}
  56. return arr[:]
  57. }
  58. // Requires returns the list of names of entities which are needed by this PipelineItem.
  59. // Each requested entity will be inserted into `deps` of Consume(). In turn, those
  60. // entities are Provides() upstream.
  61. func (ra *RenameAnalysis) Requires() []string {
  62. arr := [...]string{DependencyBlobCache, DependencyTreeChanges}
  63. return arr[:]
  64. }
  65. // ListConfigurationOptions returns the list of changeable public properties of this PipelineItem.
  66. func (ra *RenameAnalysis) ListConfigurationOptions() []core.ConfigurationOption {
  67. options := [...]core.ConfigurationOption{{
  68. Name: ConfigRenameAnalysisSimilarityThreshold,
  69. Description: "The threshold on the similarity index used to detect renames.",
  70. Flag: "M",
  71. Type: core.IntConfigurationOption,
  72. Default: RenameAnalysisDefaultThreshold},
  73. }
  74. return options[:]
  75. }
  76. // Configure sets the properties previously published by ListConfigurationOptions().
  77. func (ra *RenameAnalysis) Configure(facts map[string]interface{}) error {
  78. if val, exists := facts[ConfigRenameAnalysisSimilarityThreshold].(int); exists {
  79. ra.SimilarityThreshold = val
  80. }
  81. return nil
  82. }
  83. // Initialize resets the temporary caches and prepares this PipelineItem for a series of Consume()
  84. // calls. The repository which is going to be analysed is supplied as an argument.
  85. func (ra *RenameAnalysis) Initialize(repository *git.Repository) error {
  86. if ra.SimilarityThreshold < 0 || ra.SimilarityThreshold > 100 {
  87. log.Printf("Warning: adjusted the similarity threshold to %d\n",
  88. RenameAnalysisDefaultThreshold)
  89. ra.SimilarityThreshold = RenameAnalysisDefaultThreshold
  90. }
  91. ra.repository = repository
  92. return nil
  93. }
  94. // Consume runs this PipelineItem on the next commit data.
  95. // `deps` contain all the results from upstream PipelineItem-s as requested by Requires().
  96. // Additionally, DependencyCommit is always present there and represents the analysed *object.Commit.
  97. // This function returns the mapping with analysis results. The keys must be the same as
  98. // in Provides(). If there was an error, nil is returned.
  99. func (ra *RenameAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  100. changes := deps[DependencyTreeChanges].(object.Changes)
  101. cache := deps[DependencyBlobCache].(map[plumbing.Hash]*CachedBlob)
  102. reducedChanges := make(object.Changes, 0, changes.Len())
  103. // Stage 1 - find renames by matching the hashes
  104. // n log(n)
  105. // We sort additions and deletions by hash and then do the single scan along
  106. // both slices.
  107. deleted := make(sortableChanges, 0, changes.Len())
  108. added := make(sortableChanges, 0, changes.Len())
  109. for _, change := range changes {
  110. action, err := change.Action()
  111. if err != nil {
  112. return nil, err
  113. }
  114. switch action {
  115. case merkletrie.Insert:
  116. added = append(added, sortableChange{change, change.To.TreeEntry.Hash})
  117. case merkletrie.Delete:
  118. deleted = append(deleted, sortableChange{change, change.From.TreeEntry.Hash})
  119. case merkletrie.Modify:
  120. reducedChanges = append(reducedChanges, change)
  121. }
  122. }
  123. sort.Sort(deleted)
  124. sort.Sort(added)
  125. stillDeleted := make(object.Changes, 0, deleted.Len())
  126. stillAdded := make(object.Changes, 0, added.Len())
  127. {
  128. a := 0
  129. d := 0
  130. for a < added.Len() && d < deleted.Len() {
  131. if added[a].hash == deleted[d].hash {
  132. reducedChanges = append(
  133. reducedChanges,
  134. &object.Change{From: deleted[d].change.From, To: added[a].change.To})
  135. a++
  136. d++
  137. } else if added[a].Less(&deleted[d]) {
  138. stillAdded = append(stillAdded, added[a].change)
  139. a++
  140. } else {
  141. stillDeleted = append(stillDeleted, deleted[d].change)
  142. d++
  143. }
  144. }
  145. for ; a < added.Len(); a++ {
  146. stillAdded = append(stillAdded, added[a].change)
  147. }
  148. for ; d < deleted.Len(); d++ {
  149. stillDeleted = append(stillDeleted, deleted[d].change)
  150. }
  151. }
  152. // Stage 2 - apply the similarity threshold
  153. // n^2 but actually linear
  154. // We sort the blobs by size and do the single linear scan.
  155. maxCandidates := RenameAnalysisMaxCandidates
  156. if len(stillAdded)+len(stillDeleted) > RenameAnalysisSetSizeLimit {
  157. maxCandidates = 1
  158. }
  159. addedBlobs := make(sortableBlobs, 0, stillAdded.Len())
  160. deletedBlobs := make(sortableBlobs, 0, stillDeleted.Len())
  161. var smallChanges []*object.Change
  162. for _, change := range stillAdded {
  163. blob := cache[change.To.TreeEntry.Hash]
  164. if blob.Size < RenameAnalysisMinimumSize {
  165. smallChanges = append(smallChanges, change)
  166. } else {
  167. addedBlobs = append(
  168. addedBlobs, sortableBlob{change: change, size: blob.Size})
  169. }
  170. }
  171. for _, change := range stillDeleted {
  172. blob := cache[change.From.TreeEntry.Hash]
  173. if blob.Size < RenameAnalysisMinimumSize {
  174. smallChanges = append(smallChanges, change)
  175. } else {
  176. deletedBlobs = append(
  177. deletedBlobs, sortableBlob{change: change, size: blob.Size})
  178. }
  179. }
  180. sort.Sort(addedBlobs)
  181. sort.Sort(deletedBlobs)
  182. finished := make(chan bool, 2)
  183. finishedA := make(chan bool, 1)
  184. finishedB := make(chan bool, 1)
  185. errs := make(chan error)
  186. matchesA := make(object.Changes, 0, changes.Len())
  187. matchesB := make(object.Changes, 0, changes.Len())
  188. addedBlobsA := addedBlobs
  189. addedBlobsB := make(sortableBlobs, len(addedBlobs))
  190. copy(addedBlobsB, addedBlobs)
  191. deletedBlobsA := deletedBlobs
  192. deletedBlobsB := make(sortableBlobs, len(deletedBlobs))
  193. copy(deletedBlobsB, deletedBlobs)
  194. wg := sync.WaitGroup{}
  195. matchA := func() {
  196. defer func() {
  197. finished <- true
  198. wg.Done()
  199. }()
  200. aStart := 0
  201. // we will try to find a matching added blob for each deleted blob
  202. for d := 0; d < deletedBlobsA.Len(); d++ {
  203. myBlob := cache[deletedBlobsA[d].change.From.TreeEntry.Hash]
  204. mySize := deletedBlobsA[d].size
  205. myName := filepath.Base(deletedBlobsA[d].change.From.Name)
  206. var a int
  207. for a = aStart; a < addedBlobsA.Len() && !ra.sizesAreClose(mySize, addedBlobsA[a].size); a++ {
  208. }
  209. aStart = a
  210. foundMatch := false
  211. // get the list of possible candidates and sort by file name similarity
  212. var candidates []int
  213. for a = aStart; a < addedBlobsA.Len() && ra.sizesAreClose(mySize, addedBlobsA[a].size); a++ {
  214. candidates = append(candidates, a)
  215. }
  216. sortRenameCandidates(candidates, myName, func(a int) string {
  217. return addedBlobsA[a].change.To.Name
  218. })
  219. var ci int
  220. for ci, a = range candidates {
  221. select {
  222. case <-finished:
  223. return
  224. default:
  225. break
  226. }
  227. if ci > maxCandidates {
  228. break
  229. }
  230. blobsAreClose, err := ra.blobsAreClose(
  231. myBlob, cache[addedBlobsA[a].change.To.TreeEntry.Hash])
  232. if err != nil {
  233. errs <- err
  234. return
  235. }
  236. if blobsAreClose {
  237. foundMatch = true
  238. matchesA = append(
  239. matchesA,
  240. &object.Change{
  241. From: deletedBlobsA[d].change.From,
  242. To: addedBlobsA[a].change.To})
  243. break
  244. }
  245. }
  246. if foundMatch {
  247. deletedBlobsA = append(deletedBlobsA[:d], deletedBlobsA[d+1:]...)
  248. d--
  249. addedBlobsA = append(addedBlobsA[:a], addedBlobsA[a+1:]...)
  250. }
  251. }
  252. finishedA <- true
  253. }
  254. matchB := func() {
  255. defer func() {
  256. finished <- true
  257. wg.Done()
  258. }()
  259. dStart := 0
  260. for a := 0; a < addedBlobsB.Len(); a++ {
  261. myBlob := cache[addedBlobsB[a].change.To.TreeEntry.Hash]
  262. mySize := addedBlobsB[a].size
  263. myName := filepath.Base(addedBlobsB[a].change.To.Name)
  264. var d int
  265. for d = dStart; d < deletedBlobsB.Len() && !ra.sizesAreClose(mySize, deletedBlobsB[d].size); d++ {
  266. }
  267. dStart = d
  268. foundMatch := false
  269. // get the list of possible candidates and sort by file name similarity
  270. var candidates []int
  271. for d = dStart; d < deletedBlobsB.Len() && ra.sizesAreClose(mySize, deletedBlobsB[d].size); d++ {
  272. candidates = append(candidates, d)
  273. }
  274. sortRenameCandidates(candidates, myName, func(d int) string {
  275. return deletedBlobsB[d].change.From.Name
  276. })
  277. var ci int
  278. for ci, d = range candidates {
  279. select {
  280. case <-finished:
  281. return
  282. default:
  283. break
  284. }
  285. if ci > maxCandidates {
  286. break
  287. }
  288. blobsAreClose, err := ra.blobsAreClose(
  289. myBlob, cache[deletedBlobsB[d].change.From.TreeEntry.Hash])
  290. if err != nil {
  291. errs <- err
  292. return
  293. }
  294. if blobsAreClose {
  295. foundMatch = true
  296. matchesB = append(
  297. matchesB,
  298. &object.Change{
  299. From: deletedBlobsB[d].change.From,
  300. To: addedBlobsB[a].change.To})
  301. break
  302. }
  303. }
  304. if foundMatch {
  305. addedBlobsB = append(addedBlobsB[:a], addedBlobsB[a+1:]...)
  306. a--
  307. deletedBlobsB = append(deletedBlobsB[:d], deletedBlobsB[d+1:]...)
  308. }
  309. }
  310. finishedB <- true
  311. }
  312. // run two functions in parallel, and take the result from the one which finished earlier
  313. wg.Add(2)
  314. go func() { matchA() }()
  315. go func() { matchB() }()
  316. wg.Wait()
  317. var matches object.Changes
  318. select {
  319. case err := <-errs:
  320. return nil, err
  321. case <-finishedA:
  322. addedBlobs = addedBlobsA
  323. deletedBlobs = deletedBlobsA
  324. matches = matchesA
  325. case <-finishedB:
  326. addedBlobs = addedBlobsB
  327. deletedBlobs = deletedBlobsB
  328. matches = matchesB
  329. default:
  330. panic("Impossible happened: two functions returned without an error " +
  331. "but no results from both")
  332. }
  333. // Stage 3 - we give up, everything left are independent additions and deletions
  334. for _, change := range matches {
  335. reducedChanges = append(reducedChanges, change)
  336. }
  337. for _, blob := range addedBlobs {
  338. reducedChanges = append(reducedChanges, blob.change)
  339. }
  340. for _, blob := range deletedBlobs {
  341. reducedChanges = append(reducedChanges, blob.change)
  342. }
  343. for _, change := range smallChanges {
  344. reducedChanges = append(reducedChanges, change)
  345. }
  346. return map[string]interface{}{DependencyTreeChanges: reducedChanges}, nil
  347. }
  348. // Fork clones this PipelineItem.
  349. func (ra *RenameAnalysis) Fork(n int) []core.PipelineItem {
  350. return core.ForkSamePipelineItem(ra, n)
  351. }
  352. func (ra *RenameAnalysis) sizesAreClose(size1 int64, size2 int64) bool {
  353. size := internal.Max64(1, internal.Max64(size1, size2))
  354. return (internal.Abs64(size1-size2)*10000)/size <= int64(100-ra.SimilarityThreshold)*100
  355. }
  356. func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (bool, error) {
  357. cleanReturn := false
  358. defer func() {
  359. if !cleanReturn {
  360. log.Println()
  361. log.Println(blob1.Hash.String())
  362. log.Println(blob2.Hash.String())
  363. }
  364. }()
  365. _, err1 := blob1.CountLines()
  366. _, err2 := blob2.CountLines()
  367. if err1 == ErrorBinary || err2 == ErrorBinary {
  368. // binary mode
  369. bsdifflen := DiffBytes(blob1.Data, blob2.Data)
  370. delta := int((int64(bsdifflen) * 100) / internal.Max64(
  371. internal.Min64(blob1.Size, blob2.Size), 1))
  372. cleanReturn = true
  373. return 100-delta >= ra.SimilarityThreshold, nil
  374. }
  375. src, dst := string(blob1.Data), string(blob2.Data)
  376. maxSize := internal.Max(1, internal.Max(utf8.RuneCountInString(src), utf8.RuneCountInString(dst)))
  377. // compute the line-by-line diff, then the char-level diffs of the del-ins blocks
  378. // yes, this algorithm is greedy and not exact
  379. dmp := diffmatchpatch.New()
  380. srcLineRunes, dstLineRunes, _ := dmp.DiffLinesToRunes(src, dst)
  381. // the third returned value, []string, is the mapping from runes to lines
  382. // we cannot use it because it is approximate and has string collisions
  383. // that is, the mapping is wrong for huge files
  384. diffs := dmp.DiffMainRunes(srcLineRunes, dstLineRunes, false)
  385. srcPositions := calcLinePositions(src)
  386. dstPositions := calcLinePositions(dst)
  387. var common, posSrc, prevPosSrc, posDst int
  388. possibleDelInsBlock := false
  389. for _, edit := range diffs {
  390. switch edit.Type {
  391. case diffmatchpatch.DiffDelete:
  392. possibleDelInsBlock = true
  393. prevPosSrc = posSrc
  394. posSrc += utf8.RuneCountInString(edit.Text)
  395. case diffmatchpatch.DiffInsert:
  396. nextPosDst := posDst + utf8.RuneCountInString(edit.Text)
  397. if possibleDelInsBlock {
  398. possibleDelInsBlock = false
  399. if internal.Max(srcPositions[posSrc]-srcPositions[prevPosSrc],
  400. dstPositions[nextPosDst]-dstPositions[posDst]) < RenameAnalysisByteDiffSizeThreshold {
  401. localDmp := diffmatchpatch.New()
  402. localSrc := src[srcPositions[prevPosSrc]:srcPositions[posSrc]]
  403. localDst := dst[dstPositions[posDst]:dstPositions[nextPosDst]]
  404. localDiffs := localDmp.DiffMainRunes(
  405. strToLiteralRunes(localSrc), strToLiteralRunes(localDst), false)
  406. for _, localEdit := range localDiffs {
  407. if localEdit.Type == diffmatchpatch.DiffEqual {
  408. common += utf8.RuneCountInString(localEdit.Text)
  409. }
  410. }
  411. }
  412. }
  413. posDst = nextPosDst
  414. case diffmatchpatch.DiffEqual:
  415. possibleDelInsBlock = false
  416. step := utf8.RuneCountInString(edit.Text)
  417. // for i := range edit.Text does *not* work
  418. // idk why, but `i` appears to be bigger than the number of runes
  419. for i := 0; i < step; i++ {
  420. common += srcPositions[posSrc+i+1] - srcPositions[posSrc+i]
  421. }
  422. posSrc += step
  423. posDst += step
  424. }
  425. if possibleDelInsBlock {
  426. continue
  427. }
  428. // supposing that the rest of the lines are the same (they are not - too optimistic),
  429. // estimate the maximum similarity and exit the loop if it lower than our threshold
  430. var srcPendingSize, dstPendingSize int
  431. srcPendingSize = len(src) - srcPositions[posSrc]
  432. dstPendingSize = len(dst) - dstPositions[posDst]
  433. maxCommon := common + internal.Min(srcPendingSize, dstPendingSize)
  434. similarity := (maxCommon * 100) / maxSize
  435. if similarity < ra.SimilarityThreshold {
  436. cleanReturn = true
  437. return false, nil
  438. }
  439. similarity = (common * 100) / maxSize
  440. if similarity >= ra.SimilarityThreshold {
  441. cleanReturn = true
  442. return true, nil
  443. }
  444. }
  445. // the very last "overly optimistic" estimate was actually precise, so since we are still here
  446. // the blobs are similar
  447. cleanReturn = true
  448. return true, nil
  449. }
  450. func calcLinePositions(text string) []int {
  451. if text == "" {
  452. return []int{0}
  453. }
  454. lines := strings.Split(text, "\n")
  455. positions := make([]int, len(lines)+1)
  456. accum := 0
  457. for i, l := range lines {
  458. positions[i] = accum
  459. accum += len(l) + 1 // +1 for \n
  460. }
  461. if len(lines) > 0 && lines[len(lines)-1] != "\n" {
  462. accum--
  463. }
  464. positions[len(lines)] = accum
  465. return positions
  466. }
  467. func strToLiteralRunes(s string) []rune {
  468. lrunes := make([]rune, len(s))
  469. for i, b := range []byte(s) {
  470. lrunes[i] = rune(b)
  471. }
  472. return lrunes
  473. }
  474. type sortableChange struct {
  475. change *object.Change
  476. hash plumbing.Hash
  477. }
  478. type sortableChanges []sortableChange
  479. func (change *sortableChange) Less(other *sortableChange) bool {
  480. for x := 0; x < 20; x++ {
  481. if change.hash[x] < other.hash[x] {
  482. return true
  483. }
  484. }
  485. return false
  486. }
  487. func (slice sortableChanges) Len() int {
  488. return len(slice)
  489. }
  490. func (slice sortableChanges) Less(i, j int) bool {
  491. return slice[i].Less(&slice[j])
  492. }
  493. func (slice sortableChanges) Swap(i, j int) {
  494. slice[i], slice[j] = slice[j], slice[i]
  495. }
  496. type sortableBlob struct {
  497. change *object.Change
  498. size int64
  499. }
  500. type sortableBlobs []sortableBlob
  501. func (change *sortableBlob) Less(other *sortableBlob) bool {
  502. return change.size < other.size
  503. }
  504. func (slice sortableBlobs) Len() int {
  505. return len(slice)
  506. }
  507. func (slice sortableBlobs) Less(i, j int) bool {
  508. return slice[i].Less(&slice[j])
  509. }
  510. func (slice sortableBlobs) Swap(i, j int) {
  511. slice[i], slice[j] = slice[j], slice[i]
  512. }
  513. type candidateDistance struct {
  514. Candidate int
  515. Distance int
  516. }
  517. func sortRenameCandidates(candidates []int, origin string, nameGetter func(int) string) {
  518. distances := make([]candidateDistance, len(candidates))
  519. ctx := LevenshteinContext{}
  520. for i, x := range candidates {
  521. name := filepath.Base(nameGetter(x))
  522. distances[i] = candidateDistance{x, ctx.Distance(origin, name)}
  523. }
  524. sort.Slice(distances, func(i, j int) bool {
  525. return distances[i].Distance < distances[j].Distance
  526. })
  527. for i, cd := range distances {
  528. candidates[i] = cd.Candidate
  529. }
  530. }
  531. func init() {
  532. core.Registry.Register(&RenameAnalysis{})
  533. }