package plumbing

import (
	"log"
	"path/filepath"
	"sort"
	"strings"
	"sync"
	"time"
	"unicode/utf8"

	"github.com/sergi/go-diff/diffmatchpatch"
	"gopkg.in/src-d/go-git.v4"
	"gopkg.in/src-d/go-git.v4/plumbing"
	"gopkg.in/src-d/go-git.v4/plumbing/object"
	"gopkg.in/src-d/go-git.v4/utils/merkletrie"

	"gopkg.in/src-d/hercules.v10/internal"
	"gopkg.in/src-d/hercules.v10/internal/core"
	"gopkg.in/src-d/hercules.v10/internal/levenshtein"
)

// RenameAnalysis improves TreeDiff's results by searching for changed blobs under different
// paths which are likely to be the result of a rename with subsequent edits.
// RenameAnalysis is a PipelineItem.
type RenameAnalysis struct {
	core.NoopMerger
	// SimilarityThreshold adjusts the heuristic to determine file renames.
	// It has the same units as cgit's -X rename-threshold or -M. Better to
	// set it to the default value of 80 (80%).
	SimilarityThreshold int

	repository *git.Repository
}

const (
	// RenameAnalysisDefaultThreshold specifies the default percentage of common lines in a pair
	// of files to consider them linked. The exact code of the decision is sizesAreClose().
	// CGit's default is 50%. Ours is 80% because 50% can be too computationally expensive.
	RenameAnalysisDefaultThreshold = 80

	// ConfigRenameAnalysisSimilarityThreshold is the name of the configuration option
	// (RenameAnalysis.Configure()) which sets the similarity threshold.
	ConfigRenameAnalysisSimilarityThreshold = "RenameAnalysis.SimilarityThreshold"

	// RenameAnalysisMinimumSize is the minimum size of a blob to be considered.
	RenameAnalysisMinimumSize = 32

	// RenameAnalysisMaxCandidates is the maximum number of rename candidates to consider per file.
	RenameAnalysisMaxCandidates = 50

	// RenameAnalysisSetSizeLimit is the maximum number of added + removed files for
	// RenameAnalysisMaxCandidates to stay active; bigger change sets reduce the
	// number of candidates per file to 1.
	RenameAnalysisSetSizeLimit = 1000

	// RenameAnalysisByteDiffSizeThreshold is the maximum size of each of the compared parts
	// to be diff-ed on byte level.
	RenameAnalysisByteDiffSizeThreshold = 100000
)

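// A minimal deployment sketch (an assumption, not verbatim from the hercules docs;
// NewPipeline and AddItem follow the core package API as we understand it). The item
// is normally discovered via core.Registry (see init() at the bottom), but it can
// also be wired in explicitly:
//
//	pipeline := core.NewPipeline(repository)
//	pipeline.AddItem(&RenameAnalysis{SimilarityThreshold: 90})
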
// Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
func (ra *RenameAnalysis) Name() string {
	return "RenameAnalysis"
}

// Provides returns the list of names of entities which are produced by this PipelineItem.
// Each produced entity will be inserted into `deps` of dependent Consume()-s according
// to this list. Also used by core.Registry to build the global map of providers.
func (ra *RenameAnalysis) Provides() []string {
	arr := [...]string{DependencyTreeChanges}
	return arr[:]
}

// Requires returns the list of names of entities which are needed by this PipelineItem.
// Each requested entity will be inserted into `deps` of Consume(). In turn, those
// entities are produced by the Provides() of upstream PipelineItems.
func (ra *RenameAnalysis) Requires() []string {
	arr := [...]string{DependencyBlobCache, DependencyTreeChanges}
	return arr[:]
}

// ListConfigurationOptions returns the list of changeable public properties of this PipelineItem.
func (ra *RenameAnalysis) ListConfigurationOptions() []core.ConfigurationOption {
	options := [...]core.ConfigurationOption{{
		Name:        ConfigRenameAnalysisSimilarityThreshold,
		Description: "The threshold on the similarity index used to detect renames.",
		Flag:        "M",
		Type:        core.IntConfigurationOption,
		Default:     RenameAnalysisDefaultThreshold},
	}
	return options[:]
}

// Configure sets the properties previously published by ListConfigurationOptions().
func (ra *RenameAnalysis) Configure(facts map[string]interface{}) error {
	if val, exists := facts[ConfigRenameAnalysisSimilarityThreshold].(int); exists {
		ra.SimilarityThreshold = val
	}
	return nil
}

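// For example (a sketch, not from the hercules docs): seeding
// facts[ConfigRenameAnalysisSimilarityThreshold] = 90 before the pipeline calls
// Configure() raises the threshold to 90%.
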
// Initialize resets the temporary caches and prepares this PipelineItem for a series of Consume()
// calls. The repository which is going to be analysed is supplied as an argument.
func (ra *RenameAnalysis) Initialize(repository *git.Repository) error {
	if ra.SimilarityThreshold < 0 || ra.SimilarityThreshold > 100 {
		log.Printf("Warning: adjusted the similarity threshold to %d\n",
			RenameAnalysisDefaultThreshold)
		ra.SimilarityThreshold = RenameAnalysisDefaultThreshold
	}
	ra.repository = repository
	return nil
}

// Consume runs this PipelineItem on the next commit data.
// `deps` contains all the results from upstream PipelineItem-s as requested by Requires().
// Additionally, DependencyCommit is always present there and represents the analysed *object.Commit.
// This function returns the mapping with analysis results. The keys must be the same as
// in Provides(). If there was an error, nil is returned.
func (ra *RenameAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
	changes := deps[DependencyTreeChanges].(object.Changes)
	cache := deps[DependencyBlobCache].(map[plumbing.Hash]*CachedBlob)
	reducedChanges := make(object.Changes, 0, changes.Len())

	// Stage 1 - find renames by matching the hashes
	// n log(n)
	// We sort additions and deletions by hash and then do a single scan along
	// both slices.
	deleted := make(sortableChanges, 0, changes.Len())
	added := make(sortableChanges, 0, changes.Len())
	for _, change := range changes {
		action, err := change.Action()
		if err != nil {
			return nil, err
		}
		switch action {
		case merkletrie.Insert:
			added = append(added, sortableChange{change, change.To.TreeEntry.Hash})
		case merkletrie.Delete:
			deleted = append(deleted, sortableChange{change, change.From.TreeEntry.Hash})
		case merkletrie.Modify:
			reducedChanges = append(reducedChanges, change)
		}
	}
	sort.Sort(deleted)
	sort.Sort(added)
	stillDeleted := make(object.Changes, 0, deleted.Len())
	stillAdded := make(object.Changes, 0, added.Len())
	{
		a := 0
		d := 0
		for a < added.Len() && d < deleted.Len() {
			if added[a].hash == deleted[d].hash {
				reducedChanges = append(
					reducedChanges,
					&object.Change{From: deleted[d].change.From, To: added[a].change.To})
				a++
				d++
			} else if added[a].Less(&deleted[d]) {
				stillAdded = append(stillAdded, added[a].change)
				a++
			} else {
				stillDeleted = append(stillDeleted, deleted[d].change)
				d++
			}
		}
		for ; a < added.Len(); a++ {
			stillAdded = append(stillAdded, added[a].change)
		}
		for ; d < deleted.Len(); d++ {
			stillDeleted = append(stillDeleted, deleted[d].change)
		}
	}

	// Stage 2 - apply the similarity threshold
	// n^2 but actually linear
	// We sort the blobs by size and do a single linear scan.
	maxCandidates := RenameAnalysisMaxCandidates
	if len(stillAdded)+len(stillDeleted) > RenameAnalysisSetSizeLimit {
		maxCandidates = 1
	}
	addedBlobs := make(sortableBlobs, 0, stillAdded.Len())
	deletedBlobs := make(sortableBlobs, 0, stillDeleted.Len())
	var smallChanges []*object.Change
	for _, change := range stillAdded {
		blob := cache[change.To.TreeEntry.Hash]
		if blob.Size < RenameAnalysisMinimumSize {
			smallChanges = append(smallChanges, change)
		} else {
			addedBlobs = append(
				addedBlobs, sortableBlob{change: change, size: blob.Size})
		}
	}
	for _, change := range stillDeleted {
		blob := cache[change.From.TreeEntry.Hash]
		if blob.Size < RenameAnalysisMinimumSize {
			smallChanges = append(smallChanges, change)
		} else {
			deletedBlobs = append(
				deletedBlobs, sortableBlob{change: change, size: blob.Size})
		}
	}
	sort.Sort(addedBlobs)
	sort.Sort(deletedBlobs)

	finished := make(chan bool, 2)
	finishedA := make(chan bool, 1)
	finishedB := make(chan bool, 1)
	// the buffer guarantees that a failing goroutine can always send its error
	// without blocking; with an unbuffered channel the send would hang while the
	// main goroutine is still parked in wg.Wait(), deadlocking the pipeline
	errs := make(chan error, 2)
	matchesA := make(object.Changes, 0, changes.Len())
	matchesB := make(object.Changes, 0, changes.Len())
	addedBlobsA := addedBlobs
	addedBlobsB := make(sortableBlobs, len(addedBlobs))
	copy(addedBlobsB, addedBlobs)
	deletedBlobsA := deletedBlobs
	deletedBlobsB := make(sortableBlobs, len(deletedBlobs))
	copy(deletedBlobsB, deletedBlobs)
	wg := sync.WaitGroup{}
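	// The two matchers below race each other over copies of the same input: matchA
	// walks the deleted blobs and searches the added ones for a partner, matchB does
	// the opposite. Whichever finishes first signals `finished`, which makes the
	// other goroutine bail out at its next candidate check. Both directions are
	// greedy, so they can produce different (equally valid) pairings; we simply keep
	// the winner's result.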
	matchA := func() {
		defer func() {
			finished <- true
			wg.Done()
		}()
		aStart := 0
		// we will try to find a matching added blob for each deleted blob
		for d := 0; d < deletedBlobsA.Len(); d++ {
			myBlob := cache[deletedBlobsA[d].change.From.TreeEntry.Hash]
			mySize := deletedBlobsA[d].size
			myName := filepath.Base(deletedBlobsA[d].change.From.Name)
			var a int
			// both slices are sorted by size, so fast-forward over the added blobs
			// which are too small to be close in size to myBlob
			for a = aStart; a < addedBlobsA.Len() && !ra.sizesAreClose(mySize, addedBlobsA[a].size); a++ {
			}
			aStart = a
			foundMatch := false
			// collect the possible candidates and sort them by file name similarity
			var candidates []int
			for a = aStart; a < addedBlobsA.Len() && ra.sizesAreClose(mySize, addedBlobsA[a].size); a++ {
				candidates = append(candidates, a)
			}
			sortRenameCandidates(candidates, myName, func(a int) string {
				return addedBlobsA[a].change.To.Name
			})
			var ci int
			for ci, a = range candidates {
				select {
				case <-finished:
					return
				default:
				}
				if ci > maxCandidates {
					break
				}
				blobsAreClose, err := ra.blobsAreClose(
					myBlob, cache[addedBlobsA[a].change.To.TreeEntry.Hash])
				if err != nil {
					errs <- err
					return
				}
				if blobsAreClose {
					foundMatch = true
					matchesA = append(
						matchesA,
						&object.Change{
							From: deletedBlobsA[d].change.From,
							To:   addedBlobsA[a].change.To})
					break
				}
			}
			if foundMatch {
				deletedBlobsA = append(deletedBlobsA[:d], deletedBlobsA[d+1:]...)
				d--
				addedBlobsA = append(addedBlobsA[:a], addedBlobsA[a+1:]...)
			}
		}
		finishedA <- true
	}
	matchB := func() {
		defer func() {
			finished <- true
			wg.Done()
		}()
		dStart := 0
		// symmetric to matchA: for each added blob, look for a matching deleted blob
		for a := 0; a < addedBlobsB.Len(); a++ {
			myBlob := cache[addedBlobsB[a].change.To.TreeEntry.Hash]
			mySize := addedBlobsB[a].size
			myName := filepath.Base(addedBlobsB[a].change.To.Name)
			var d int
			for d = dStart; d < deletedBlobsB.Len() && !ra.sizesAreClose(mySize, deletedBlobsB[d].size); d++ {
			}
			dStart = d
			foundMatch := false
			// collect the possible candidates and sort them by file name similarity
			var candidates []int
			for d = dStart; d < deletedBlobsB.Len() && ra.sizesAreClose(mySize, deletedBlobsB[d].size); d++ {
				candidates = append(candidates, d)
			}
			sortRenameCandidates(candidates, myName, func(d int) string {
				return deletedBlobsB[d].change.From.Name
			})
			var ci int
			for ci, d = range candidates {
				select {
				case <-finished:
					return
				default:
				}
				if ci > maxCandidates {
					break
				}
				blobsAreClose, err := ra.blobsAreClose(
					myBlob, cache[deletedBlobsB[d].change.From.TreeEntry.Hash])
				if err != nil {
					errs <- err
					return
				}
				if blobsAreClose {
					foundMatch = true
					matchesB = append(
						matchesB,
						&object.Change{
							From: deletedBlobsB[d].change.From,
							To:   addedBlobsB[a].change.To})
					break
				}
			}
			if foundMatch {
				addedBlobsB = append(addedBlobsB[:a], addedBlobsB[a+1:]...)
				a--
				deletedBlobsB = append(deletedBlobsB[:d], deletedBlobsB[d+1:]...)
			}
		}
		finishedB <- true
	}
	// run the two functions in parallel and take the result of the one which finished earlier
	wg.Add(2)
	go matchA()
	go matchB()
	wg.Wait()
	var matches object.Changes
	select {
	case err := <-errs:
		return nil, err
	case <-finishedA:
		addedBlobs = addedBlobsA
		deletedBlobs = deletedBlobsA
		matches = matchesA
	case <-finishedB:
		addedBlobs = addedBlobsB
		deletedBlobs = deletedBlobsB
		matches = matchesB
	default:
		panic("Impossible happened: both functions returned without an error " +
			"but neither produced a result")
	}

	// Stage 3 - we give up; everything left is independent additions and deletions
	for _, change := range matches {
		reducedChanges = append(reducedChanges, change)
	}
	for _, blob := range addedBlobs {
		reducedChanges = append(reducedChanges, blob.change)
	}
	for _, blob := range deletedBlobs {
		reducedChanges = append(reducedChanges, blob.change)
	}
	for _, change := range smallChanges {
		reducedChanges = append(reducedChanges, change)
	}
	return map[string]interface{}{DependencyTreeChanges: reducedChanges}, nil
}

// Fork clones this PipelineItem.
func (ra *RenameAnalysis) Fork(n int) []core.PipelineItem {
	return core.ForkSamePipelineItem(ra, n)
}

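// sizesAreClose is a cheap pre-filter: two blobs can only form a rename pair if their
// sizes differ by at most (100 - SimilarityThreshold) percent of the bigger one. Both
// sides of the comparison are scaled by 100 to stay in integer arithmetic. For example,
// with the default threshold of 80, sizes 1000 and 1150 pass the check:
// |1000-1150|*10000/1150 = 1304 <= (100-80)*100 = 2000.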
func (ra *RenameAnalysis) sizesAreClose(size1 int64, size2 int64) bool {
	size := internal.Max64(1, internal.Max64(size1, size2))
	return (internal.Abs64(size1-size2)*10000)/size <= int64(100-ra.SimilarityThreshold)*100
}

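// blobsAreClose estimates the content similarity of two blobs and reports whether it
// reaches SimilarityThreshold. Binary blobs are compared byte-wise with DiffBytes;
// text blobs are compared with a line-level diff, refined by char-level diffs inside
// each delete-insert block. The loop maintains an upper bound on the achievable
// similarity and returns early as soon as the threshold becomes unreachable.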
func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (bool, error) {
	cleanReturn := false
	defer func() {
		// log the offending blob hashes if we exit abnormally, e.g. via a panic
		if !cleanReturn {
			log.Println()
			log.Println(blob1.Hash.String())
			log.Println(blob2.Hash.String())
		}
	}()
	_, err1 := blob1.CountLines()
	_, err2 := blob2.CountLines()
	if err1 == ErrorBinary || err2 == ErrorBinary {
		// binary mode
		bsdifflen := DiffBytes(blob1.Data, blob2.Data)
		delta := int((int64(bsdifflen) * 100) / internal.Max64(
			internal.Min64(blob1.Size, blob2.Size), 1))
		cleanReturn = true
		return 100-delta >= ra.SimilarityThreshold, nil
	}
	src, dst := string(blob1.Data), string(blob2.Data)
	maxSize := internal.Max(1, internal.Max(utf8.RuneCountInString(src), utf8.RuneCountInString(dst)))

	// compute the line-by-line diff, then the char-level diffs of the del-ins blocks
	// yes, this algorithm is greedy and not exact
	dmp := diffmatchpatch.New()
	dmp.DiffTimeout = time.Hour
	srcLineRunes, dstLineRunes, _ := dmp.DiffLinesToRunes(src, dst)
	// the third returned value, []string, is the mapping from runes to lines
	// we cannot use it because it is approximate and has string collisions,
	// that is, the mapping is wrong for huge files
	diffs := dmp.DiffMainRunes(srcLineRunes, dstLineRunes, false)

	srcPositions := calcLinePositions(src)
	dstPositions := calcLinePositions(dst)
	var common, posSrc, prevPosSrc, posDst int
	possibleDelInsBlock := false
	for _, edit := range diffs {
		switch edit.Type {
		case diffmatchpatch.DiffDelete:
			possibleDelInsBlock = true
			prevPosSrc = posSrc
			posSrc += utf8.RuneCountInString(edit.Text)
		case diffmatchpatch.DiffInsert:
			nextPosDst := posDst + utf8.RuneCountInString(edit.Text)
			if possibleDelInsBlock {
				possibleDelInsBlock = false
				if internal.Max(srcPositions[posSrc]-srcPositions[prevPosSrc],
					dstPositions[nextPosDst]-dstPositions[posDst]) < RenameAnalysisByteDiffSizeThreshold {
					localDmp := diffmatchpatch.New()
					localDmp.DiffTimeout = time.Hour
					localSrc := src[srcPositions[prevPosSrc]:srcPositions[posSrc]]
					localDst := dst[dstPositions[posDst]:dstPositions[nextPosDst]]
					localDiffs := localDmp.DiffMainRunes(
						strToLiteralRunes(localSrc), strToLiteralRunes(localDst), false)
					for _, localEdit := range localDiffs {
						if localEdit.Type == diffmatchpatch.DiffEqual {
							common += utf8.RuneCountInString(localEdit.Text)
						}
					}
				}
			}
			posDst = nextPosDst
		case diffmatchpatch.DiffEqual:
			possibleDelInsBlock = false
			step := utf8.RuneCountInString(edit.Text)
			// `for i := range edit.Text` does *not* work here: ranging over a string
			// yields the byte offsets of the rune starts, not the sequence 0..step-1
			for i := 0; i < step; i++ {
				common += srcPositions[posSrc+i+1] - srcPositions[posSrc+i]
			}
			posSrc += step
			posDst += step
		}
		if possibleDelInsBlock {
			continue
		}
		// supposing that the rest of the lines are the same (they are not - too optimistic),
		// estimate the maximum achievable similarity and exit the loop if it is lower
		// than our threshold
		var srcPendingSize, dstPendingSize int
		srcPendingSize = len(src) - srcPositions[posSrc]
		dstPendingSize = len(dst) - dstPositions[posDst]
		maxCommon := common + internal.Min(srcPendingSize, dstPendingSize)
		similarity := (maxCommon * 100) / maxSize
		if similarity < ra.SimilarityThreshold {
			cleanReturn = true
			return false, nil
		}
		similarity = (common * 100) / maxSize
		if similarity >= ra.SimilarityThreshold {
			cleanReturn = true
			return true, nil
		}
	}
	// the very last "overly optimistic" estimate was actually precise, so since we are
	// still here, the blobs are similar
	cleanReturn = true
	return true, nil
}

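// calcLinePositions returns the byte offset of the start of every line in text, plus a
// final sentinel equal to len(text), so that positions[i+1]-positions[i] is the byte
// length of line i including its trailing "\n". For example,
// calcLinePositions("ab\ncd") returns [0, 3, 5].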
func calcLinePositions(text string) []int {
	if text == "" {
		return []int{0}
	}
	lines := strings.Split(text, "\n")
	positions := make([]int, len(lines)+1)
	accum := 0
	for i, l := range lines {
		positions[i] = accum
		accum += len(l) + 1 // +1 for \n
	}
	if len(lines) > 0 && lines[len(lines)-1] != "\n" {
		// strings.Split never leaves a "\n" inside an element, so this branch always
		// fires and compensates for the spurious +1 counted after the last line
		accum--
	}
	positions[len(lines)] = accum
	return positions
}

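// strToLiteralRunes maps every *byte* of s to its own rune. This deliberately bypasses
// UTF-8 decoding: the subsequent DiffMainRunes call then effectively diffs raw bytes,
// which is what the char-level pass inside a delete-insert block needs.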
func strToLiteralRunes(s string) []rune {
	lrunes := make([]rune, len(s))
	for i, b := range []byte(s) {
		lrunes[i] = rune(b)
	}
	return lrunes
}

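// sortableChange pairs a tree change with the blob hash it was keyed on, so that
// additions and deletions can be sorted by hash and merged in a single pass
// (Stage 1 of Consume).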
type sortableChange struct {
	change *object.Change
	hash   plumbing.Hash
}

type sortableChanges []sortableChange

func (change *sortableChange) Less(other *sortableChange) bool {
	// lexicographic comparison of the 20-byte SHA-1 hashes; it must stop at the
	// first differing byte, otherwise e.g. {2, 1, ...} would sort before {1, 2, ...}
	for x := 0; x < 20; x++ {
		if change.hash[x] != other.hash[x] {
			return change.hash[x] < other.hash[x]
		}
	}
	return false
}

func (slice sortableChanges) Len() int {
	return len(slice)
}

func (slice sortableChanges) Less(i, j int) bool {
	return slice[i].Less(&slice[j])
}

func (slice sortableChanges) Swap(i, j int) {
	slice[i], slice[j] = slice[j], slice[i]
}

type sortableBlob struct {
	change *object.Change
	size   int64
}

type sortableBlobs []sortableBlob

func (change *sortableBlob) Less(other *sortableBlob) bool {
	return change.size < other.size
}

func (slice sortableBlobs) Len() int {
	return len(slice)
}

func (slice sortableBlobs) Less(i, j int) bool {
	return slice[i].Less(&slice[j])
}

func (slice sortableBlobs) Swap(i, j int) {
	slice[i], slice[j] = slice[j], slice[i]
}

type candidateDistance struct {
	Candidate int
	Distance  int
}

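// sortRenameCandidates reorders candidates in place so that the files whose base names
// are closest to origin by Levenshtein distance come first; e.g. for origin
// "renames.go", a candidate named "renames_test.go" outranks one named "diff.go".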
func sortRenameCandidates(candidates []int, origin string, nameGetter func(int) string) {
	distances := make([]candidateDistance, len(candidates))
	ctx := levenshtein.Context{}
	for i, x := range candidates {
		name := filepath.Base(nameGetter(x))
		distances[i] = candidateDistance{x, ctx.Distance(origin, name)}
	}
	sort.Slice(distances, func(i, j int) bool {
		return distances[i].Distance < distances[j].Distance
	})
	for i, cd := range distances {
		candidates[i] = cd.Candidate
	}
}

func init() {
	core.Registry.Register(&RenameAnalysis{})
}