renames.go 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586
  1. package plumbing
  2. import (
  3. "path/filepath"
  4. "sort"
  5. "strings"
  6. "sync"
  7. "time"
  8. "unicode/utf8"
  9. "github.com/sergi/go-diff/diffmatchpatch"
  10. "gopkg.in/src-d/go-git.v4"
  11. "gopkg.in/src-d/go-git.v4/plumbing"
  12. "gopkg.in/src-d/go-git.v4/plumbing/object"
  13. "gopkg.in/src-d/go-git.v4/utils/merkletrie"
  14. "gopkg.in/src-d/hercules.v10/internal"
  15. "gopkg.in/src-d/hercules.v10/internal/core"
  16. "gopkg.in/src-d/hercules.v10/internal/levenshtein"
  17. )
  18. // RenameAnalysis improves TreeDiff's results by searching for changed blobs under different
  19. // paths which are likely to be the result of a rename with subsequent edits.
  20. // RenameAnalysis is a PipelineItem.
  21. type RenameAnalysis struct {
  22. core.NoopMerger
  23. // SimilarityThreshold adjusts the heuristic to determine file renames.
  24. // It has the same units as cgit's -X rename-threshold or -M. Better to
  25. // set it to the default value of 80 (80%).
  26. SimilarityThreshold int
  27. repository *git.Repository
  28. l core.Logger
  29. }
  30. const (
  31. // RenameAnalysisDefaultThreshold specifies the default percentage of common lines in a pair
  32. // of files to consider them linked. The exact code of the decision is sizesAreClose().
  33. // CGit's default is 50%. Ours is 80% because 50% can be too computationally expensive.
  34. RenameAnalysisDefaultThreshold = 80
  35. // ConfigRenameAnalysisSimilarityThreshold is the name of the configuration option
  36. // (RenameAnalysis.Configure()) which sets the similarity threshold.
  37. ConfigRenameAnalysisSimilarityThreshold = "RenameAnalysis.SimilarityThreshold"
  38. // RenameAnalysisMinimumSize is the minimum size of a blob to be considered.
  39. RenameAnalysisMinimumSize = 32
  40. // RenameAnalysisMaxCandidates is the maximum number of rename candidates to consider per file.
  41. RenameAnalysisMaxCandidates = 50
  42. // RenameAnalysisSetSizeLimit is the maximum number of added + removed files for
  43. // RenameAnalysisMaxCandidates to be active; the bigger numbers set it to 1.
  44. RenameAnalysisSetSizeLimit = 1000
  45. // RenameAnalysisByteDiffSizeThreshold is the maximum size of each of the compared parts
  46. // to be diff-ed on byte level.
  47. RenameAnalysisByteDiffSizeThreshold = 100000
  48. )
  49. // Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
  50. func (ra *RenameAnalysis) Name() string {
  51. return "RenameAnalysis"
  52. }
  53. // Provides returns the list of names of entities which are produced by this PipelineItem.
  54. // Each produced entity will be inserted into `deps` of dependent Consume()-s according
  55. // to this list. Also used by core.Registry to build the global map of providers.
  56. func (ra *RenameAnalysis) Provides() []string {
  57. arr := [...]string{DependencyTreeChanges}
  58. return arr[:]
  59. }
  60. // Requires returns the list of names of entities which are needed by this PipelineItem.
  61. // Each requested entity will be inserted into `deps` of Consume(). In turn, those
  62. // entities are Provides() upstream.
  63. func (ra *RenameAnalysis) Requires() []string {
  64. arr := [...]string{DependencyBlobCache, DependencyTreeChanges}
  65. return arr[:]
  66. }
  67. // ListConfigurationOptions returns the list of changeable public properties of this PipelineItem.
  68. func (ra *RenameAnalysis) ListConfigurationOptions() []core.ConfigurationOption {
  69. options := [...]core.ConfigurationOption{{
  70. Name: ConfigRenameAnalysisSimilarityThreshold,
  71. Description: "The threshold on the similarity index used to detect renames.",
  72. Flag: "M",
  73. Type: core.IntConfigurationOption,
  74. Default: RenameAnalysisDefaultThreshold},
  75. }
  76. return options[:]
  77. }
  78. // Configure sets the properties previously published by ListConfigurationOptions().
  79. func (ra *RenameAnalysis) Configure(facts map[string]interface{}) error {
  80. if l, exists := facts[core.ConfigLogger].(core.Logger); exists {
  81. ra.l = l
  82. }
  83. if val, exists := facts[ConfigRenameAnalysisSimilarityThreshold].(int); exists {
  84. ra.SimilarityThreshold = val
  85. }
  86. return nil
  87. }
  88. // Initialize resets the temporary caches and prepares this PipelineItem for a series of Consume()
  89. // calls. The repository which is going to be analysed is supplied as an argument.
  90. func (ra *RenameAnalysis) Initialize(repository *git.Repository) error {
  91. ra.l = core.NewLogger()
  92. if ra.SimilarityThreshold < 0 || ra.SimilarityThreshold > 100 {
  93. ra.l.Warnf("adjusted the similarity threshold to %d\n",
  94. RenameAnalysisDefaultThreshold)
  95. ra.SimilarityThreshold = RenameAnalysisDefaultThreshold
  96. }
  97. ra.repository = repository
  98. return nil
  99. }
  100. // Consume runs this PipelineItem on the next commit data.
  101. // `deps` contain all the results from upstream PipelineItem-s as requested by Requires().
  102. // Additionally, DependencyCommit is always present there and represents the analysed *object.Commit.
  103. // This function returns the mapping with analysis results. The keys must be the same as
  104. // in Provides(). If there was an error, nil is returned.
  105. func (ra *RenameAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  106. changes := deps[DependencyTreeChanges].(object.Changes)
  107. cache := deps[DependencyBlobCache].(map[plumbing.Hash]*CachedBlob)
  108. reducedChanges := make(object.Changes, 0, changes.Len())
  109. // Stage 1 - find renames by matching the hashes
  110. // n log(n)
  111. // We sort additions and deletions by hash and then do the single scan along
  112. // both slices.
  113. deleted := make(sortableChanges, 0, changes.Len())
  114. added := make(sortableChanges, 0, changes.Len())
  115. for _, change := range changes {
  116. action, err := change.Action()
  117. if err != nil {
  118. return nil, err
  119. }
  120. switch action {
  121. case merkletrie.Insert:
  122. added = append(added, sortableChange{change, change.To.TreeEntry.Hash})
  123. case merkletrie.Delete:
  124. deleted = append(deleted, sortableChange{change, change.From.TreeEntry.Hash})
  125. case merkletrie.Modify:
  126. reducedChanges = append(reducedChanges, change)
  127. }
  128. }
  129. sort.Sort(deleted)
  130. sort.Sort(added)
  131. stillDeleted := make(object.Changes, 0, deleted.Len())
  132. stillAdded := make(object.Changes, 0, added.Len())
  133. {
  134. a := 0
  135. d := 0
  136. for a < added.Len() && d < deleted.Len() {
  137. if added[a].hash == deleted[d].hash {
  138. reducedChanges = append(
  139. reducedChanges,
  140. &object.Change{From: deleted[d].change.From, To: added[a].change.To})
  141. a++
  142. d++
  143. } else if added[a].Less(&deleted[d]) {
  144. stillAdded = append(stillAdded, added[a].change)
  145. a++
  146. } else {
  147. stillDeleted = append(stillDeleted, deleted[d].change)
  148. d++
  149. }
  150. }
  151. for ; a < added.Len(); a++ {
  152. stillAdded = append(stillAdded, added[a].change)
  153. }
  154. for ; d < deleted.Len(); d++ {
  155. stillDeleted = append(stillDeleted, deleted[d].change)
  156. }
  157. }
  158. // Stage 2 - apply the similarity threshold
  159. // n^2 but actually linear
  160. // We sort the blobs by size and do the single linear scan.
  161. maxCandidates := RenameAnalysisMaxCandidates
  162. if len(stillAdded)+len(stillDeleted) > RenameAnalysisSetSizeLimit {
  163. maxCandidates = 1
  164. }
  165. addedBlobs := make(sortableBlobs, 0, stillAdded.Len())
  166. deletedBlobs := make(sortableBlobs, 0, stillDeleted.Len())
  167. var smallChanges []*object.Change
  168. for _, change := range stillAdded {
  169. blob := cache[change.To.TreeEntry.Hash]
  170. if blob.Size < RenameAnalysisMinimumSize {
  171. smallChanges = append(smallChanges, change)
  172. } else {
  173. addedBlobs = append(
  174. addedBlobs, sortableBlob{change: change, size: blob.Size})
  175. }
  176. }
  177. for _, change := range stillDeleted {
  178. blob := cache[change.From.TreeEntry.Hash]
  179. if blob.Size < RenameAnalysisMinimumSize {
  180. smallChanges = append(smallChanges, change)
  181. } else {
  182. deletedBlobs = append(
  183. deletedBlobs, sortableBlob{change: change, size: blob.Size})
  184. }
  185. }
  186. sort.Sort(addedBlobs)
  187. sort.Sort(deletedBlobs)
  188. finished := make(chan bool, 2)
  189. finishedA := make(chan bool, 1)
  190. finishedB := make(chan bool, 1)
  191. errs := make(chan error)
  192. matchesA := make(object.Changes, 0, changes.Len())
  193. matchesB := make(object.Changes, 0, changes.Len())
  194. addedBlobsA := addedBlobs
  195. addedBlobsB := make(sortableBlobs, len(addedBlobs))
  196. copy(addedBlobsB, addedBlobs)
  197. deletedBlobsA := deletedBlobs
  198. deletedBlobsB := make(sortableBlobs, len(deletedBlobs))
  199. copy(deletedBlobsB, deletedBlobs)
  200. wg := sync.WaitGroup{}
  201. matchA := func() {
  202. defer func() {
  203. finished <- true
  204. wg.Done()
  205. }()
  206. aStart := 0
  207. // we will try to find a matching added blob for each deleted blob
  208. for d := 0; d < deletedBlobsA.Len(); d++ {
  209. myBlob := cache[deletedBlobsA[d].change.From.TreeEntry.Hash]
  210. mySize := deletedBlobsA[d].size
  211. myName := filepath.Base(deletedBlobsA[d].change.From.Name)
  212. var a int
  213. for a = aStart; a < addedBlobsA.Len() && !ra.sizesAreClose(mySize, addedBlobsA[a].size); a++ {
  214. }
  215. aStart = a
  216. foundMatch := false
  217. // get the list of possible candidates and sort by file name similarity
  218. var candidates []int
  219. for a = aStart; a < addedBlobsA.Len() && ra.sizesAreClose(mySize, addedBlobsA[a].size); a++ {
  220. candidates = append(candidates, a)
  221. }
  222. sortRenameCandidates(candidates, myName, func(a int) string {
  223. return addedBlobsA[a].change.To.Name
  224. })
  225. var ci int
  226. for ci, a = range candidates {
  227. select {
  228. case <-finished:
  229. return
  230. default:
  231. break
  232. }
  233. if ci > maxCandidates {
  234. break
  235. }
  236. blobsAreClose, err := ra.blobsAreClose(
  237. myBlob, cache[addedBlobsA[a].change.To.TreeEntry.Hash])
  238. if err != nil {
  239. errs <- err
  240. return
  241. }
  242. if blobsAreClose {
  243. foundMatch = true
  244. matchesA = append(
  245. matchesA,
  246. &object.Change{
  247. From: deletedBlobsA[d].change.From,
  248. To: addedBlobsA[a].change.To})
  249. break
  250. }
  251. }
  252. if foundMatch {
  253. deletedBlobsA = append(deletedBlobsA[:d], deletedBlobsA[d+1:]...)
  254. d--
  255. addedBlobsA = append(addedBlobsA[:a], addedBlobsA[a+1:]...)
  256. }
  257. }
  258. finishedA <- true
  259. }
  260. matchB := func() {
  261. defer func() {
  262. finished <- true
  263. wg.Done()
  264. }()
  265. dStart := 0
  266. for a := 0; a < addedBlobsB.Len(); a++ {
  267. myBlob := cache[addedBlobsB[a].change.To.TreeEntry.Hash]
  268. mySize := addedBlobsB[a].size
  269. myName := filepath.Base(addedBlobsB[a].change.To.Name)
  270. var d int
  271. for d = dStart; d < deletedBlobsB.Len() && !ra.sizesAreClose(mySize, deletedBlobsB[d].size); d++ {
  272. }
  273. dStart = d
  274. foundMatch := false
  275. // get the list of possible candidates and sort by file name similarity
  276. var candidates []int
  277. for d = dStart; d < deletedBlobsB.Len() && ra.sizesAreClose(mySize, deletedBlobsB[d].size); d++ {
  278. candidates = append(candidates, d)
  279. }
  280. sortRenameCandidates(candidates, myName, func(d int) string {
  281. return deletedBlobsB[d].change.From.Name
  282. })
  283. var ci int
  284. for ci, d = range candidates {
  285. select {
  286. case <-finished:
  287. return
  288. default:
  289. break
  290. }
  291. if ci > maxCandidates {
  292. break
  293. }
  294. blobsAreClose, err := ra.blobsAreClose(
  295. myBlob, cache[deletedBlobsB[d].change.From.TreeEntry.Hash])
  296. if err != nil {
  297. errs <- err
  298. return
  299. }
  300. if blobsAreClose {
  301. foundMatch = true
  302. matchesB = append(
  303. matchesB,
  304. &object.Change{
  305. From: deletedBlobsB[d].change.From,
  306. To: addedBlobsB[a].change.To})
  307. break
  308. }
  309. }
  310. if foundMatch {
  311. addedBlobsB = append(addedBlobsB[:a], addedBlobsB[a+1:]...)
  312. a--
  313. deletedBlobsB = append(deletedBlobsB[:d], deletedBlobsB[d+1:]...)
  314. }
  315. }
  316. finishedB <- true
  317. }
  318. // run two functions in parallel, and take the result from the one which finished earlier
  319. wg.Add(2)
  320. go matchA()
  321. go matchB()
  322. wg.Wait()
  323. var matches object.Changes
  324. select {
  325. case err := <-errs:
  326. return nil, err
  327. case <-finishedA:
  328. addedBlobs = addedBlobsA
  329. deletedBlobs = deletedBlobsA
  330. matches = matchesA
  331. case <-finishedB:
  332. addedBlobs = addedBlobsB
  333. deletedBlobs = deletedBlobsB
  334. matches = matchesB
  335. default:
  336. panic("Impossible happened: two functions returned without an error " +
  337. "but no results from both")
  338. }
  339. // Stage 3 - we give up, everything left are independent additions and deletions
  340. for _, change := range matches {
  341. reducedChanges = append(reducedChanges, change)
  342. }
  343. for _, blob := range addedBlobs {
  344. reducedChanges = append(reducedChanges, blob.change)
  345. }
  346. for _, blob := range deletedBlobs {
  347. reducedChanges = append(reducedChanges, blob.change)
  348. }
  349. for _, change := range smallChanges {
  350. reducedChanges = append(reducedChanges, change)
  351. }
  352. return map[string]interface{}{DependencyTreeChanges: reducedChanges}, nil
  353. }
  354. // Fork clones this PipelineItem.
  355. func (ra *RenameAnalysis) Fork(n int) []core.PipelineItem {
  356. return core.ForkSamePipelineItem(ra, n)
  357. }
  358. func (ra *RenameAnalysis) sizesAreClose(size1 int64, size2 int64) bool {
  359. size := internal.Max64(1, internal.Max64(size1, size2))
  360. return (internal.Abs64(size1-size2)*10000)/size <= int64(100-ra.SimilarityThreshold)*100
  361. }
  362. func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (bool, error) {
  363. cleanReturn := false
  364. defer func() {
  365. if !cleanReturn {
  366. ra.l.Warnf("unclean return detected for blobs '%s' and '%s'\n",
  367. blob1.Hash.String(), blob2.Hash.String())
  368. }
  369. }()
  370. _, err1 := blob1.CountLines()
  371. _, err2 := blob2.CountLines()
  372. if err1 == ErrorBinary || err2 == ErrorBinary {
  373. // binary mode
  374. bsdifflen := DiffBytes(blob1.Data, blob2.Data)
  375. delta := int((int64(bsdifflen) * 100) / internal.Max64(
  376. internal.Min64(blob1.Size, blob2.Size), 1))
  377. cleanReturn = true
  378. return 100-delta >= ra.SimilarityThreshold, nil
  379. }
  380. src, dst := string(blob1.Data), string(blob2.Data)
  381. maxSize := internal.Max(1, internal.Max(utf8.RuneCountInString(src), utf8.RuneCountInString(dst)))
  382. // compute the line-by-line diff, then the char-level diffs of the del-ins blocks
  383. // yes, this algorithm is greedy and not exact
  384. dmp := diffmatchpatch.New()
  385. dmp.DiffTimeout = time.Hour
  386. srcLineRunes, dstLineRunes, _ := dmp.DiffLinesToRunes(src, dst)
  387. // the third returned value, []string, is the mapping from runes to lines
  388. // we cannot use it because it is approximate and has string collisions
  389. // that is, the mapping is wrong for huge files
  390. diffs := dmp.DiffMainRunes(srcLineRunes, dstLineRunes, false)
  391. srcPositions := calcLinePositions(src)
  392. dstPositions := calcLinePositions(dst)
  393. var common, posSrc, prevPosSrc, posDst int
  394. possibleDelInsBlock := false
  395. for _, edit := range diffs {
  396. switch edit.Type {
  397. case diffmatchpatch.DiffDelete:
  398. possibleDelInsBlock = true
  399. prevPosSrc = posSrc
  400. posSrc += utf8.RuneCountInString(edit.Text)
  401. case diffmatchpatch.DiffInsert:
  402. nextPosDst := posDst + utf8.RuneCountInString(edit.Text)
  403. if possibleDelInsBlock {
  404. possibleDelInsBlock = false
  405. if internal.Max(srcPositions[posSrc]-srcPositions[prevPosSrc],
  406. dstPositions[nextPosDst]-dstPositions[posDst]) < RenameAnalysisByteDiffSizeThreshold {
  407. localDmp := diffmatchpatch.New()
  408. localDmp.DiffTimeout = time.Hour
  409. localSrc := src[srcPositions[prevPosSrc]:srcPositions[posSrc]]
  410. localDst := dst[dstPositions[posDst]:dstPositions[nextPosDst]]
  411. localDiffs := localDmp.DiffMainRunes(
  412. strToLiteralRunes(localSrc), strToLiteralRunes(localDst), false)
  413. for _, localEdit := range localDiffs {
  414. if localEdit.Type == diffmatchpatch.DiffEqual {
  415. common += utf8.RuneCountInString(localEdit.Text)
  416. }
  417. }
  418. }
  419. }
  420. posDst = nextPosDst
  421. case diffmatchpatch.DiffEqual:
  422. possibleDelInsBlock = false
  423. step := utf8.RuneCountInString(edit.Text)
  424. // for i := range edit.Text does *not* work
  425. // idk why, but `i` appears to be bigger than the number of runes
  426. for i := 0; i < step; i++ {
  427. common += srcPositions[posSrc+i+1] - srcPositions[posSrc+i]
  428. }
  429. posSrc += step
  430. posDst += step
  431. }
  432. if possibleDelInsBlock {
  433. continue
  434. }
  435. // supposing that the rest of the lines are the same (they are not - too optimistic),
  436. // estimate the maximum similarity and exit the loop if it lower than our threshold
  437. var srcPendingSize, dstPendingSize int
  438. srcPendingSize = len(src) - srcPositions[posSrc]
  439. dstPendingSize = len(dst) - dstPositions[posDst]
  440. maxCommon := common + internal.Min(srcPendingSize, dstPendingSize)
  441. similarity := (maxCommon * 100) / maxSize
  442. if similarity < ra.SimilarityThreshold {
  443. cleanReturn = true
  444. return false, nil
  445. }
  446. similarity = (common * 100) / maxSize
  447. if similarity >= ra.SimilarityThreshold {
  448. cleanReturn = true
  449. return true, nil
  450. }
  451. }
  452. // the very last "overly optimistic" estimate was actually precise, so since we are still here
  453. // the blobs are similar
  454. cleanReturn = true
  455. return true, nil
  456. }
  457. func calcLinePositions(text string) []int {
  458. if text == "" {
  459. return []int{0}
  460. }
  461. lines := strings.Split(text, "\n")
  462. positions := make([]int, len(lines)+1)
  463. accum := 0
  464. for i, l := range lines {
  465. positions[i] = accum
  466. accum += len(l) + 1 // +1 for \n
  467. }
  468. if len(lines) > 0 && lines[len(lines)-1] != "\n" {
  469. accum--
  470. }
  471. positions[len(lines)] = accum
  472. return positions
  473. }
  474. func strToLiteralRunes(s string) []rune {
  475. lrunes := make([]rune, len(s))
  476. for i, b := range []byte(s) {
  477. lrunes[i] = rune(b)
  478. }
  479. return lrunes
  480. }
  481. type sortableChange struct {
  482. change *object.Change
  483. hash plumbing.Hash
  484. }
  485. type sortableChanges []sortableChange
  486. func (change *sortableChange) Less(other *sortableChange) bool {
  487. for x := 0; x < 20; x++ {
  488. if change.hash[x] < other.hash[x] {
  489. return true
  490. }
  491. }
  492. return false
  493. }
  494. func (slice sortableChanges) Len() int {
  495. return len(slice)
  496. }
  497. func (slice sortableChanges) Less(i, j int) bool {
  498. return slice[i].Less(&slice[j])
  499. }
  500. func (slice sortableChanges) Swap(i, j int) {
  501. slice[i], slice[j] = slice[j], slice[i]
  502. }
  503. type sortableBlob struct {
  504. change *object.Change
  505. size int64
  506. }
  507. type sortableBlobs []sortableBlob
  508. func (change *sortableBlob) Less(other *sortableBlob) bool {
  509. return change.size < other.size
  510. }
  511. func (slice sortableBlobs) Len() int {
  512. return len(slice)
  513. }
  514. func (slice sortableBlobs) Less(i, j int) bool {
  515. return slice[i].Less(&slice[j])
  516. }
  517. func (slice sortableBlobs) Swap(i, j int) {
  518. slice[i], slice[j] = slice[j], slice[i]
  519. }
  520. type candidateDistance struct {
  521. Candidate int
  522. Distance int
  523. }
  524. func sortRenameCandidates(candidates []int, origin string, nameGetter func(int) string) {
  525. distances := make([]candidateDistance, len(candidates))
  526. ctx := levenshtein.Context{}
  527. for i, x := range candidates {
  528. name := filepath.Base(nameGetter(x))
  529. distances[i] = candidateDistance{x, ctx.Distance(origin, name)}
  530. }
  531. sort.Slice(distances, func(i, j int) bool {
  532. return distances[i].Distance < distances[j].Distance
  533. })
  534. for i, cd := range distances {
  535. candidates[i] = cd.Candidate
  536. }
  537. }
  538. func init() {
  539. core.Registry.Register(&RenameAnalysis{})
  540. }