renames.go

package plumbing

import (
	"log"
	"path/filepath"
	"sort"
	"strings"
	"sync"
	"unicode/utf8"

	"github.com/sergi/go-diff/diffmatchpatch"
	"gopkg.in/src-d/go-git.v4"
	"gopkg.in/src-d/go-git.v4/plumbing"
	"gopkg.in/src-d/go-git.v4/plumbing/object"
	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
	"gopkg.in/src-d/hercules.v9/internal"
	"gopkg.in/src-d/hercules.v9/internal/core"
)

// RenameAnalysis improves TreeDiff's results by searching for changed blobs under different
// paths which are likely the result of a rename with subsequent edits.
// RenameAnalysis is a PipelineItem.
type RenameAnalysis struct {
	core.NoopMerger
	// SimilarityThreshold adjusts the heuristic that detects file renames.
	// It is measured in the same units as cgit's -X rename-threshold or -M
	// options. It is best left at the default value of 80 (80%).
	SimilarityThreshold int

	repository *git.Repository
}

const (
	// RenameAnalysisDefaultThreshold specifies the default percentage of common lines in a pair
	// of files to consider them linked. The decision is implemented in sizesAreClose().
	// CGit's default is 50%. Ours is 80% because 50% can be too computationally expensive.
	RenameAnalysisDefaultThreshold = 80

	// ConfigRenameAnalysisSimilarityThreshold is the name of the configuration option
	// (RenameAnalysis.Configure()) which sets the similarity threshold.
	ConfigRenameAnalysisSimilarityThreshold = "RenameAnalysis.SimilarityThreshold"

	// RenameAnalysisMinimumSize is the minimum size of a blob to be considered.
	RenameAnalysisMinimumSize = 32

	// RenameAnalysisMaxCandidates is the maximum number of rename candidates to consider per file.
	RenameAnalysisMaxCandidates = 50

	// RenameAnalysisSetSizeLimit is the maximum number of added + removed files for
	// RenameAnalysisMaxCandidates to be active; above this limit it drops to 1.
	RenameAnalysisSetSizeLimit = 1000

	// RenameAnalysisByteDiffSizeThreshold is the maximum size of each of the compared parts
	// to be diff-ed on the byte level.
	RenameAnalysisByteDiffSizeThreshold = 100000
)

// Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
func (ra *RenameAnalysis) Name() string {
	return "RenameAnalysis"
}

// Provides returns the list of names of entities which are produced by this PipelineItem.
// Each produced entity will be inserted into `deps` of dependent Consume()-s according
// to this list. Also used by core.Registry to build the global map of providers.
func (ra *RenameAnalysis) Provides() []string {
	arr := [...]string{DependencyTreeChanges}
	return arr[:]
}

// Requires returns the list of names of entities which are needed by this PipelineItem.
// Each requested entity will be inserted into `deps` of Consume(). In turn, those
// entities are provided by the upstream items' Provides().
func (ra *RenameAnalysis) Requires() []string {
	arr := [...]string{DependencyBlobCache, DependencyTreeChanges}
	return arr[:]
}

// ListConfigurationOptions returns the list of changeable public properties of this PipelineItem.
func (ra *RenameAnalysis) ListConfigurationOptions() []core.ConfigurationOption {
	options := [...]core.ConfigurationOption{{
		Name:        ConfigRenameAnalysisSimilarityThreshold,
		Description: "The threshold on the similarity index used to detect renames.",
		Flag:        "M",
		Type:        core.IntConfigurationOption,
		Default:     RenameAnalysisDefaultThreshold},
	}
	return options[:]
}

// Configure sets the properties previously published by ListConfigurationOptions().
func (ra *RenameAnalysis) Configure(facts map[string]interface{}) error {
	if val, exists := facts[ConfigRenameAnalysisSimilarityThreshold].(int); exists {
		ra.SimilarityThreshold = val
	}
	return nil
}

// Initialize resets the temporary caches and prepares this PipelineItem for a series of Consume()
// calls. The repository which is going to be analysed is supplied as an argument.
func (ra *RenameAnalysis) Initialize(repository *git.Repository) error {
	if ra.SimilarityThreshold < 0 || ra.SimilarityThreshold > 100 {
		log.Printf("Warning: adjusted the similarity threshold to %d\n",
			RenameAnalysisDefaultThreshold)
		ra.SimilarityThreshold = RenameAnalysisDefaultThreshold
	}
	ra.repository = repository
	return nil
}

// Consume runs this PipelineItem on the next commit data.
// `deps` contains all the results from the upstream PipelineItem-s as requested by Requires().
// Additionally, DependencyCommit is always present there and represents the analysed *object.Commit.
// This function returns the mapping with analysis results. The keys must be the same as
// in Provides(). If there was an error, nil is returned.
func (ra *RenameAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
	changes := deps[DependencyTreeChanges].(object.Changes)
	cache := deps[DependencyBlobCache].(map[plumbing.Hash]*CachedBlob)

	reducedChanges := make(object.Changes, 0, changes.Len())

	// Stage 1 - find renames by matching the hashes
	// n log(n)
	// We sort additions and deletions by hash and then do a single scan along
	// both slices.
	deleted := make(sortableChanges, 0, changes.Len())
	added := make(sortableChanges, 0, changes.Len())
	for _, change := range changes {
		action, err := change.Action()
		if err != nil {
			return nil, err
		}
		switch action {
		case merkletrie.Insert:
			added = append(added, sortableChange{change, change.To.TreeEntry.Hash})
		case merkletrie.Delete:
			deleted = append(deleted, sortableChange{change, change.From.TreeEntry.Hash})
		case merkletrie.Modify:
			reducedChanges = append(reducedChanges, change)
		}
	}
	sort.Sort(deleted)
	sort.Sort(added)
	stillDeleted := make(object.Changes, 0, deleted.Len())
	stillAdded := make(object.Changes, 0, added.Len())
	{
		a := 0
		d := 0
		for a < added.Len() && d < deleted.Len() {
			if added[a].hash == deleted[d].hash {
				reducedChanges = append(
					reducedChanges,
					&object.Change{From: deleted[d].change.From, To: added[a].change.To})
				a++
				d++
			} else if added[a].Less(&deleted[d]) {
				stillAdded = append(stillAdded, added[a].change)
				a++
			} else {
				stillDeleted = append(stillDeleted, deleted[d].change)
				d++
			}
		}
		for ; a < added.Len(); a++ {
			stillAdded = append(stillAdded, added[a].change)
		}
		for ; d < deleted.Len(); d++ {
			stillDeleted = append(stillDeleted, deleted[d].change)
		}
	}

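	// To illustrate the scan above (hypothetical one-byte hashes): if `added`,
	// sorted by hash, is [0x1f fileA, 0x3c fileB] and `deleted` is
	// [0x3c oldB, 0x77 oldC], then 0x3c pairs with 0x3c as an exact rename
	// (oldB -> fileB), fileA lands in stillAdded and oldC in stillDeleted.
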
	// Stage 2 - apply the similarity threshold
	// n^2 but actually linear
	// We sort the blobs by size and do a single linear scan.
	maxCandidates := RenameAnalysisMaxCandidates
	if len(stillAdded)+len(stillDeleted) > RenameAnalysisSetSizeLimit {
		maxCandidates = 1
	}
	addedBlobs := make(sortableBlobs, 0, stillAdded.Len())
	deletedBlobs := make(sortableBlobs, 0, stillDeleted.Len())
	var smallChanges []*object.Change
	for _, change := range stillAdded {
		blob := cache[change.To.TreeEntry.Hash]
		if blob.Size < RenameAnalysisMinimumSize {
			smallChanges = append(smallChanges, change)
		} else {
			addedBlobs = append(
				addedBlobs, sortableBlob{change: change, size: blob.Size})
		}
	}
	for _, change := range stillDeleted {
		blob := cache[change.From.TreeEntry.Hash]
		if blob.Size < RenameAnalysisMinimumSize {
			smallChanges = append(smallChanges, change)
		} else {
			deletedBlobs = append(
				deletedBlobs, sortableBlob{change: change, size: blob.Size})
		}
	}
	sort.Sort(addedBlobs)
	sort.Sort(deletedBlobs)

	finished := make(chan bool, 2)
	finishedA := make(chan bool, 1)
	finishedB := make(chan bool, 1)
	errs := make(chan error)
	matchesA := make(object.Changes, 0, changes.Len())
	matchesB := make(object.Changes, 0, changes.Len())
	addedBlobsA := addedBlobs
	addedBlobsB := make(sortableBlobs, len(addedBlobs))
	copy(addedBlobsB, addedBlobs)
	deletedBlobsA := deletedBlobs
	deletedBlobsB := make(sortableBlobs, len(deletedBlobs))
	copy(deletedBlobsB, deletedBlobs)
	wg := sync.WaitGroup{}
	matchA := func() {
		defer func() {
			finished <- true
			wg.Done()
		}()
		aStart := 0
		// we will try to find a matching added blob for each deleted blob
		for d := 0; d < deletedBlobsA.Len(); d++ {
			myBlob := cache[deletedBlobsA[d].change.From.TreeEntry.Hash]
			mySize := deletedBlobsA[d].size
			myName := filepath.Base(deletedBlobsA[d].change.From.Name)
			var a int
			for a = aStart; a < addedBlobsA.Len() && !ra.sizesAreClose(mySize, addedBlobsA[a].size); a++ {
			}
			aStart = a
			foundMatch := false
			// get the list of possible candidates and sort by file name similarity
			var candidates []int
			for a = aStart; a < addedBlobsA.Len() && ra.sizesAreClose(mySize, addedBlobsA[a].size); a++ {
				candidates = append(candidates, a)
			}
			sortRenameCandidates(candidates, myName, func(a int) string {
				return addedBlobsA[a].change.To.Name
			})
			var ci int
			for ci, a = range candidates {
				select {
				case <-finished:
					return
				default:
					break
				}
				if ci > maxCandidates {
					break
				}
				blobsAreClose, err := ra.blobsAreClose(
					myBlob, cache[addedBlobsA[a].change.To.TreeEntry.Hash])
				if err != nil {
					errs <- err
					return
				}
				if blobsAreClose {
					foundMatch = true
					matchesA = append(
						matchesA,
						&object.Change{
							From: deletedBlobsA[d].change.From,
							To:   addedBlobsA[a].change.To})
					break
				}
			}
			if foundMatch {
				deletedBlobsA = append(deletedBlobsA[:d], deletedBlobsA[d+1:]...)
				d--
				addedBlobsA = append(addedBlobsA[:a], addedBlobsA[a+1:]...)
			}
		}
		finishedA <- true
	}
	matchB := func() {
		defer func() {
			finished <- true
			wg.Done()
		}()
		dStart := 0
		for a := 0; a < addedBlobsB.Len(); a++ {
			myBlob := cache[addedBlobsB[a].change.To.TreeEntry.Hash]
			mySize := addedBlobsB[a].size
			myName := filepath.Base(addedBlobsB[a].change.To.Name)
			var d int
			for d = dStart; d < deletedBlobsB.Len() && !ra.sizesAreClose(mySize, deletedBlobsB[d].size); d++ {
			}
			dStart = d
			foundMatch := false
			// get the list of possible candidates and sort by file name similarity
			var candidates []int
			for d = dStart; d < deletedBlobsB.Len() && ra.sizesAreClose(mySize, deletedBlobsB[d].size); d++ {
				candidates = append(candidates, d)
			}
			sortRenameCandidates(candidates, myName, func(d int) string {
				return deletedBlobsB[d].change.From.Name
			})
			var ci int
			for ci, d = range candidates {
				select {
				case <-finished:
					return
				default:
					break
				}
				if ci > maxCandidates {
					break
				}
				blobsAreClose, err := ra.blobsAreClose(
					myBlob, cache[deletedBlobsB[d].change.From.TreeEntry.Hash])
				if err != nil {
					errs <- err
					return
				}
				if blobsAreClose {
					foundMatch = true
					matchesB = append(
						matchesB,
						&object.Change{
							From: deletedBlobsB[d].change.From,
							To:   addedBlobsB[a].change.To})
					break
				}
			}
			if foundMatch {
				addedBlobsB = append(addedBlobsB[:a], addedBlobsB[a+1:]...)
				a--
				deletedBlobsB = append(deletedBlobsB[:d], deletedBlobsB[d+1:]...)
			}
		}
		finishedB <- true
	}
	// run the two functions in parallel and take the result from the one which finished earlier
	wg.Add(2)
	go matchA()
	go matchB()
	wg.Wait()
	var matches object.Changes
	select {
	case err := <-errs:
		return nil, err
	case <-finishedA:
		addedBlobs = addedBlobsA
		deletedBlobs = deletedBlobsA
		matches = matchesA
	case <-finishedB:
		addedBlobs = addedBlobsB
		deletedBlobs = deletedBlobsB
		matches = matchesB
	default:
		panic("Impossible happened: two functions returned without an error " +
			"but no results from both")
	}

	// Stage 3 - we give up; everything left is independent additions and deletions
	for _, change := range matches {
		reducedChanges = append(reducedChanges, change)
	}
	for _, blob := range addedBlobs {
		reducedChanges = append(reducedChanges, blob.change)
	}
	for _, blob := range deletedBlobs {
		reducedChanges = append(reducedChanges, blob.change)
	}
	for _, change := range smallChanges {
		reducedChanges = append(reducedChanges, change)
	}
	return map[string]interface{}{DependencyTreeChanges: reducedChanges}, nil
}

// Fork clones this PipelineItem.
func (ra *RenameAnalysis) Fork(n int) []core.PipelineItem {
	return core.ForkSamePipelineItem(ra, n)
}

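// A minimal wiring sketch. It assumes the hercules core pipeline API
// (core.NewPipeline / DeployItem); in practice RenameAnalysis is deployed
// automatically whenever a downstream item requires DependencyTreeChanges:
//
//	pipeline := core.NewPipeline(repository)
//	pipeline.DeployItem(&RenameAnalysis{SimilarityThreshold: 90})
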
// sizesAreClose decides whether two blob sizes are close enough for the blobs
// to be rename candidates, given the configured SimilarityThreshold.
func (ra *RenameAnalysis) sizesAreClose(size1 int64, size2 int64) bool {
	size := internal.Max64(1, internal.Max64(size1, size2))
	return (internal.Abs64(size1-size2)*10000)/size <= int64(100-ra.SimilarityThreshold)*100
}

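// Worked example (illustrative numbers): with the default threshold of 80,
// sizes 900 and 1000 give |900-1000|*10000/1000 = 1000 <= (100-80)*100 = 2000,
// so they are close; sizes 700 and 1000 give 3000 > 2000, so they are not.
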
// blobsAreClose estimates whether two blobs are similar enough to be
// considered two versions of the same renamed file.
func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (bool, error) {
	cleanReturn := false
	defer func() {
		// if we exit abnormally below, log the blob hashes to ease debugging
		if !cleanReturn {
			log.Println()
			log.Println(blob1.Hash.String())
			log.Println(blob2.Hash.String())
		}
	}()
	_, err1 := blob1.CountLines()
	_, err2 := blob2.CountLines()
	if err1 == ErrorBinary || err2 == ErrorBinary {
		// binary mode
		bsdifflen := DiffBytes(blob1.Data, blob2.Data)
		delta := int((int64(bsdifflen) * 100) / internal.Max64(
			internal.Min64(blob1.Size, blob2.Size), 1))
		cleanReturn = true
		return 100-delta >= ra.SimilarityThreshold, nil
	}
	src, dst := string(blob1.Data), string(blob2.Data)
	maxSize := internal.Max(1, internal.Max(utf8.RuneCountInString(src), utf8.RuneCountInString(dst)))

	// compute the line-by-line diff, then the char-level diffs of the del-ins blocks
	// yes, this algorithm is greedy and not exact
	dmp := diffmatchpatch.New()
	srcLineRunes, dstLineRunes, _ := dmp.DiffLinesToRunes(src, dst)
	// the third returned value, []string, is the mapping from runes to lines;
	// we cannot use it because it is approximate and has string collisions,
	// that is, the mapping is wrong for huge files
	diffs := dmp.DiffMainRunes(srcLineRunes, dstLineRunes, false)

	srcPositions := calcLinePositions(src)
	dstPositions := calcLinePositions(dst)
	var common, posSrc, prevPosSrc, posDst int
	possibleDelInsBlock := false
	for _, edit := range diffs {
		switch edit.Type {
		case diffmatchpatch.DiffDelete:
			possibleDelInsBlock = true
			prevPosSrc = posSrc
			posSrc += utf8.RuneCountInString(edit.Text)
		case diffmatchpatch.DiffInsert:
			nextPosDst := posDst + utf8.RuneCountInString(edit.Text)
			if possibleDelInsBlock {
				possibleDelInsBlock = false
				if internal.Max(srcPositions[posSrc]-srcPositions[prevPosSrc],
					dstPositions[nextPosDst]-dstPositions[posDst]) < RenameAnalysisByteDiffSizeThreshold {
					localDmp := diffmatchpatch.New()
					localSrc := src[srcPositions[prevPosSrc]:srcPositions[posSrc]]
					localDst := dst[dstPositions[posDst]:dstPositions[nextPosDst]]
					localDiffs := localDmp.DiffMainRunes(
						strToLiteralRunes(localSrc), strToLiteralRunes(localDst), false)
					for _, localEdit := range localDiffs {
						if localEdit.Type == diffmatchpatch.DiffEqual {
							common += utf8.RuneCountInString(localEdit.Text)
						}
					}
				}
			}
			posDst = nextPosDst
		case diffmatchpatch.DiffEqual:
			possibleDelInsBlock = false
			step := utf8.RuneCountInString(edit.Text)
			// `for i := range edit.Text` does *not* work here: ranging over a
			// string yields byte offsets, not sequential rune indices
			for i := 0; i < step; i++ {
				common += srcPositions[posSrc+i+1] - srcPositions[posSrc+i]
			}
			posSrc += step
			posDst += step
		}
		if possibleDelInsBlock {
			continue
		}
		// supposing that the rest of the lines are the same (they are not - too optimistic),
		// estimate the maximum similarity and exit the loop early if it is lower than our threshold
		var srcPendingSize, dstPendingSize int
		srcPendingSize = len(src) - srcPositions[posSrc]
		dstPendingSize = len(dst) - dstPositions[posDst]
		maxCommon := common + internal.Min(srcPendingSize, dstPendingSize)
		similarity := (maxCommon * 100) / maxSize
		if similarity < ra.SimilarityThreshold {
			cleanReturn = true
			return false, nil
		}
		similarity = (common * 100) / maxSize
		if similarity >= ra.SimilarityThreshold {
			cleanReturn = true
			return true, nil
		}
	}
	// the very last "overly optimistic" estimate was actually precise, so since we are still here
	// the blobs are similar
	cleanReturn = true
	return true, nil
}

// calcLinePositions returns the byte offset at which each line of `text`
// begins, followed by a final sentinel equal to len(text).
func calcLinePositions(text string) []int {
	if text == "" {
		return []int{0}
	}
	lines := strings.Split(text, "\n")
	positions := make([]int, len(lines)+1)
	accum := 0
	for i, l := range lines {
		positions[i] = accum
		accum += len(l) + 1 // +1 for \n
	}
	if len(lines) > 0 && lines[len(lines)-1] != "\n" {
		accum--
	}
	positions[len(lines)] = accum
	return positions
}

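// For example, calcLinePositions("a\nbb") returns [0, 2, 4]: line "a" starts
// at byte 0, line "bb" starts at byte 2, and the sentinel is len("a\nbb") = 4.
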
// strToLiteralRunes converts each byte of `s` to its own rune, skipping UTF-8
// decoding, so that the char-level diff operates on bytes and the resulting
// `common` counts stay consistent with the byte-based line positions.
func strToLiteralRunes(s string) []rune {
	lrunes := make([]rune, len(s))
	for i, b := range []byte(s) {
		lrunes[i] = rune(b)
	}
	return lrunes
}

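// E.g. the two-byte UTF-8 sequence "é" (0xC3, 0xA9) becomes two separate
// runes, U+00C3 and U+00A9, so the diff treats it as two one-byte units.
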
type sortableChange struct {
	change *object.Change
	hash   plumbing.Hash
}

type sortableChanges []sortableChange

func (change *sortableChange) Less(other *sortableChange) bool {
	// lexicographic comparison of the 20-byte hashes; the comparison must stop
	// at the first differing byte, otherwise the ordering is inconsistent
	for x := 0; x < 20; x++ {
		if change.hash[x] < other.hash[x] {
			return true
		}
		if change.hash[x] > other.hash[x] {
			return false
		}
	}
	return false
}

func (slice sortableChanges) Len() int {
	return len(slice)
}

func (slice sortableChanges) Less(i, j int) bool {
	return slice[i].Less(&slice[j])
}

func (slice sortableChanges) Swap(i, j int) {
	slice[i], slice[j] = slice[j], slice[i]
}

type sortableBlob struct {
	change *object.Change
	size   int64
}

type sortableBlobs []sortableBlob

func (blob *sortableBlob) Less(other *sortableBlob) bool {
	return blob.size < other.size
}

func (slice sortableBlobs) Len() int {
	return len(slice)
}

func (slice sortableBlobs) Less(i, j int) bool {
	return slice[i].Less(&slice[j])
}

func (slice sortableBlobs) Swap(i, j int) {
	slice[i], slice[j] = slice[j], slice[i]
}

type candidateDistance struct {
	Candidate int
	Distance  int
}

// sortRenameCandidates reorders `candidates` in place by the Levenshtein
// distance between each candidate's base file name and `origin`, closest first.
func sortRenameCandidates(candidates []int, origin string, nameGetter func(int) string) {
	distances := make([]candidateDistance, len(candidates))
	ctx := LevenshteinContext{}
	for i, x := range candidates {
		name := filepath.Base(nameGetter(x))
		distances[i] = candidateDistance{x, ctx.Distance(origin, name)}
	}
	sort.Slice(distances, func(i, j int) bool {
		return distances[i].Distance < distances[j].Distance
	})
	for i, cd := range distances {
		candidates[i] = cd.Candidate
	}
}

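// For instance (hypothetical names), with origin "renames.go" the candidates
// ["docs/renames.md", "internal/plumbing/rename.go"] would be reordered so
// that "rename.go" (distance 1) precedes "renames.md" (distance 2).
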
func init() {
	core.Registry.Register(&RenameAnalysis{})
}