renames.go 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584
  1. package plumbing
  2. import (
  3. "path/filepath"
  4. "sort"
  5. "strings"
  6. "sync"
  7. "time"
  8. "unicode/utf8"
  9. "github.com/sergi/go-diff/diffmatchpatch"
  10. "gopkg.in/src-d/go-git.v4"
  11. "gopkg.in/src-d/go-git.v4/plumbing"
  12. "gopkg.in/src-d/go-git.v4/plumbing/object"
  13. "gopkg.in/src-d/go-git.v4/utils/merkletrie"
  14. "gopkg.in/src-d/hercules.v10/internal"
  15. "gopkg.in/src-d/hercules.v10/internal/core"
  16. "gopkg.in/src-d/hercules.v10/internal/levenshtein"
  17. )
  18. // RenameAnalysis improves TreeDiff's results by searching for changed blobs under different
  19. // paths which are likely to be the result of a rename with subsequent edits.
  20. // RenameAnalysis is a PipelineItem.
  21. type RenameAnalysis struct {
  22. core.NoopMerger
  23. // SimilarityThreshold adjusts the heuristic to determine file renames.
  24. // It has the same units as cgit's -X rename-threshold or -M. Better to
  25. // set it to the default value of 80 (80%).
  26. SimilarityThreshold int
  27. repository *git.Repository
  28. l core.Logger
  29. }
  30. const (
  31. // RenameAnalysisDefaultThreshold specifies the default percentage of common lines in a pair
  32. // of files to consider them linked. The exact code of the decision is sizesAreClose().
  33. // CGit's default is 50%. Ours is 80% because 50% can be too computationally expensive.
  34. RenameAnalysisDefaultThreshold = 80
  35. // ConfigRenameAnalysisSimilarityThreshold is the name of the configuration option
  36. // (RenameAnalysis.Configure()) which sets the similarity threshold.
  37. ConfigRenameAnalysisSimilarityThreshold = "RenameAnalysis.SimilarityThreshold"
  38. // RenameAnalysisMinimumSize is the minimum size of a blob to be considered.
  39. RenameAnalysisMinimumSize = 32
  40. // RenameAnalysisMaxCandidates is the maximum number of rename candidates to consider per file.
  41. RenameAnalysisMaxCandidates = 50
  42. // RenameAnalysisSetSizeLimit is the maximum number of added + removed files for
  43. // RenameAnalysisMaxCandidates to be active; the bigger numbers set it to 1.
  44. RenameAnalysisSetSizeLimit = 1000
  45. // RenameAnalysisByteDiffSizeThreshold is the maximum size of each of the compared parts
  46. // to be diff-ed on byte level.
  47. RenameAnalysisByteDiffSizeThreshold = 100000
  48. )
  49. // Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
  50. func (ra *RenameAnalysis) Name() string {
  51. return "RenameAnalysis"
  52. }
  53. // Provides returns the list of names of entities which are produced by this PipelineItem.
  54. // Each produced entity will be inserted into `deps` of dependent Consume()-s according
  55. // to this list. Also used by core.Registry to build the global map of providers.
  56. func (ra *RenameAnalysis) Provides() []string {
  57. return []string{DependencyTreeChanges}
  58. }
  59. // Requires returns the list of names of entities which are needed by this PipelineItem.
  60. // Each requested entity will be inserted into `deps` of Consume(). In turn, those
  61. // entities are Provides() upstream.
  62. func (ra *RenameAnalysis) Requires() []string {
  63. return []string{DependencyBlobCache, DependencyTreeChanges}
  64. }
  65. // ListConfigurationOptions returns the list of changeable public properties of this PipelineItem.
  66. func (ra *RenameAnalysis) ListConfigurationOptions() []core.ConfigurationOption {
  67. options := [...]core.ConfigurationOption{{
  68. Name: ConfigRenameAnalysisSimilarityThreshold,
  69. Description: "The threshold on the similarity index used to detect renames.",
  70. Flag: "M",
  71. Type: core.IntConfigurationOption,
  72. Default: RenameAnalysisDefaultThreshold},
  73. }
  74. return options[:]
  75. }
  76. // Configure sets the properties previously published by ListConfigurationOptions().
  77. func (ra *RenameAnalysis) Configure(facts map[string]interface{}) error {
  78. if l, exists := facts[core.ConfigLogger].(core.Logger); exists {
  79. ra.l = l
  80. }
  81. if val, exists := facts[ConfigRenameAnalysisSimilarityThreshold].(int); exists {
  82. ra.SimilarityThreshold = val
  83. }
  84. return nil
  85. }
  86. // Initialize resets the temporary caches and prepares this PipelineItem for a series of Consume()
  87. // calls. The repository which is going to be analysed is supplied as an argument.
  88. func (ra *RenameAnalysis) Initialize(repository *git.Repository) error {
  89. ra.l = core.NewLogger()
  90. if ra.SimilarityThreshold < 0 || ra.SimilarityThreshold > 100 {
  91. ra.l.Warnf("adjusted the similarity threshold to %d\n",
  92. RenameAnalysisDefaultThreshold)
  93. ra.SimilarityThreshold = RenameAnalysisDefaultThreshold
  94. }
  95. ra.repository = repository
  96. return nil
  97. }
  98. // Consume runs this PipelineItem on the next commit data.
  99. // `deps` contain all the results from upstream PipelineItem-s as requested by Requires().
  100. // Additionally, DependencyCommit is always present there and represents the analysed *object.Commit.
  101. // This function returns the mapping with analysis results. The keys must be the same as
  102. // in Provides(). If there was an error, nil is returned.
  103. func (ra *RenameAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  104. changes := deps[DependencyTreeChanges].(object.Changes)
  105. cache := deps[DependencyBlobCache].(map[plumbing.Hash]*CachedBlob)
  106. reducedChanges := make(object.Changes, 0, changes.Len())
  107. // Stage 1 - find renames by matching the hashes
  108. // n log(n)
  109. // We sort additions and deletions by hash and then do the single scan along
  110. // both slices.
  111. deleted := make(sortableChanges, 0, changes.Len())
  112. added := make(sortableChanges, 0, changes.Len())
  113. for _, change := range changes {
  114. action, err := change.Action()
  115. if err != nil {
  116. return nil, err
  117. }
  118. switch action {
  119. case merkletrie.Insert:
  120. added = append(added, sortableChange{change, change.To.TreeEntry.Hash})
  121. case merkletrie.Delete:
  122. deleted = append(deleted, sortableChange{change, change.From.TreeEntry.Hash})
  123. case merkletrie.Modify:
  124. reducedChanges = append(reducedChanges, change)
  125. }
  126. }
  127. sort.Sort(deleted)
  128. sort.Sort(added)
  129. stillDeleted := make(object.Changes, 0, deleted.Len())
  130. stillAdded := make(object.Changes, 0, added.Len())
  131. {
  132. a := 0
  133. d := 0
  134. for a < added.Len() && d < deleted.Len() {
  135. if added[a].hash == deleted[d].hash {
  136. reducedChanges = append(
  137. reducedChanges,
  138. &object.Change{From: deleted[d].change.From, To: added[a].change.To})
  139. a++
  140. d++
  141. } else if added[a].Less(&deleted[d]) {
  142. stillAdded = append(stillAdded, added[a].change)
  143. a++
  144. } else {
  145. stillDeleted = append(stillDeleted, deleted[d].change)
  146. d++
  147. }
  148. }
  149. for ; a < added.Len(); a++ {
  150. stillAdded = append(stillAdded, added[a].change)
  151. }
  152. for ; d < deleted.Len(); d++ {
  153. stillDeleted = append(stillDeleted, deleted[d].change)
  154. }
  155. }
  156. // Stage 2 - apply the similarity threshold
  157. // n^2 but actually linear
  158. // We sort the blobs by size and do the single linear scan.
  159. maxCandidates := RenameAnalysisMaxCandidates
  160. if len(stillAdded)+len(stillDeleted) > RenameAnalysisSetSizeLimit {
  161. maxCandidates = 1
  162. }
  163. addedBlobs := make(sortableBlobs, 0, stillAdded.Len())
  164. deletedBlobs := make(sortableBlobs, 0, stillDeleted.Len())
  165. var smallChanges []*object.Change
  166. for _, change := range stillAdded {
  167. blob := cache[change.To.TreeEntry.Hash]
  168. if blob.Size < RenameAnalysisMinimumSize {
  169. smallChanges = append(smallChanges, change)
  170. } else {
  171. addedBlobs = append(
  172. addedBlobs, sortableBlob{change: change, size: blob.Size})
  173. }
  174. }
  175. for _, change := range stillDeleted {
  176. blob := cache[change.From.TreeEntry.Hash]
  177. if blob.Size < RenameAnalysisMinimumSize {
  178. smallChanges = append(smallChanges, change)
  179. } else {
  180. deletedBlobs = append(
  181. deletedBlobs, sortableBlob{change: change, size: blob.Size})
  182. }
  183. }
  184. sort.Sort(addedBlobs)
  185. sort.Sort(deletedBlobs)
  186. finished := make(chan bool, 2)
  187. finishedA := make(chan bool, 1)
  188. finishedB := make(chan bool, 1)
  189. errs := make(chan error)
  190. matchesA := make(object.Changes, 0, changes.Len())
  191. matchesB := make(object.Changes, 0, changes.Len())
  192. addedBlobsA := addedBlobs
  193. addedBlobsB := make(sortableBlobs, len(addedBlobs))
  194. copy(addedBlobsB, addedBlobs)
  195. deletedBlobsA := deletedBlobs
  196. deletedBlobsB := make(sortableBlobs, len(deletedBlobs))
  197. copy(deletedBlobsB, deletedBlobs)
  198. wg := sync.WaitGroup{}
  199. matchA := func() {
  200. defer func() {
  201. finished <- true
  202. wg.Done()
  203. }()
  204. aStart := 0
  205. // we will try to find a matching added blob for each deleted blob
  206. for d := 0; d < deletedBlobsA.Len(); d++ {
  207. myBlob := cache[deletedBlobsA[d].change.From.TreeEntry.Hash]
  208. mySize := deletedBlobsA[d].size
  209. myName := filepath.Base(deletedBlobsA[d].change.From.Name)
  210. var a int
  211. for a = aStart; a < addedBlobsA.Len() && !ra.sizesAreClose(mySize, addedBlobsA[a].size); a++ {
  212. }
  213. aStart = a
  214. foundMatch := false
  215. // get the list of possible candidates and sort by file name similarity
  216. var candidates []int
  217. for a = aStart; a < addedBlobsA.Len() && ra.sizesAreClose(mySize, addedBlobsA[a].size); a++ {
  218. candidates = append(candidates, a)
  219. }
  220. sortRenameCandidates(candidates, myName, func(a int) string {
  221. return addedBlobsA[a].change.To.Name
  222. })
  223. var ci int
  224. for ci, a = range candidates {
  225. select {
  226. case <-finished:
  227. return
  228. default:
  229. break
  230. }
  231. if ci > maxCandidates {
  232. break
  233. }
  234. blobsAreClose, err := ra.blobsAreClose(
  235. myBlob, cache[addedBlobsA[a].change.To.TreeEntry.Hash])
  236. if err != nil {
  237. errs <- err
  238. return
  239. }
  240. if blobsAreClose {
  241. foundMatch = true
  242. matchesA = append(
  243. matchesA,
  244. &object.Change{
  245. From: deletedBlobsA[d].change.From,
  246. To: addedBlobsA[a].change.To})
  247. break
  248. }
  249. }
  250. if foundMatch {
  251. deletedBlobsA = append(deletedBlobsA[:d], deletedBlobsA[d+1:]...)
  252. d--
  253. addedBlobsA = append(addedBlobsA[:a], addedBlobsA[a+1:]...)
  254. }
  255. }
  256. finishedA <- true
  257. }
  258. matchB := func() {
  259. defer func() {
  260. finished <- true
  261. wg.Done()
  262. }()
  263. dStart := 0
  264. for a := 0; a < addedBlobsB.Len(); a++ {
  265. myBlob := cache[addedBlobsB[a].change.To.TreeEntry.Hash]
  266. mySize := addedBlobsB[a].size
  267. myName := filepath.Base(addedBlobsB[a].change.To.Name)
  268. var d int
  269. for d = dStart; d < deletedBlobsB.Len() && !ra.sizesAreClose(mySize, deletedBlobsB[d].size); d++ {
  270. }
  271. dStart = d
  272. foundMatch := false
  273. // get the list of possible candidates and sort by file name similarity
  274. var candidates []int
  275. for d = dStart; d < deletedBlobsB.Len() && ra.sizesAreClose(mySize, deletedBlobsB[d].size); d++ {
  276. candidates = append(candidates, d)
  277. }
  278. sortRenameCandidates(candidates, myName, func(d int) string {
  279. return deletedBlobsB[d].change.From.Name
  280. })
  281. var ci int
  282. for ci, d = range candidates {
  283. select {
  284. case <-finished:
  285. return
  286. default:
  287. break
  288. }
  289. if ci > maxCandidates {
  290. break
  291. }
  292. blobsAreClose, err := ra.blobsAreClose(
  293. myBlob, cache[deletedBlobsB[d].change.From.TreeEntry.Hash])
  294. if err != nil {
  295. errs <- err
  296. return
  297. }
  298. if blobsAreClose {
  299. foundMatch = true
  300. matchesB = append(
  301. matchesB,
  302. &object.Change{
  303. From: deletedBlobsB[d].change.From,
  304. To: addedBlobsB[a].change.To})
  305. break
  306. }
  307. }
  308. if foundMatch {
  309. addedBlobsB = append(addedBlobsB[:a], addedBlobsB[a+1:]...)
  310. a--
  311. deletedBlobsB = append(deletedBlobsB[:d], deletedBlobsB[d+1:]...)
  312. }
  313. }
  314. finishedB <- true
  315. }
  316. // run two functions in parallel, and take the result from the one which finished earlier
  317. wg.Add(2)
  318. go matchA()
  319. go matchB()
  320. wg.Wait()
  321. var matches object.Changes
  322. select {
  323. case err := <-errs:
  324. return nil, err
  325. case <-finishedA:
  326. addedBlobs = addedBlobsA
  327. deletedBlobs = deletedBlobsA
  328. matches = matchesA
  329. case <-finishedB:
  330. addedBlobs = addedBlobsB
  331. deletedBlobs = deletedBlobsB
  332. matches = matchesB
  333. default:
  334. panic("Impossible happened: two functions returned without an error " +
  335. "but no results from both")
  336. }
  337. // Stage 3 - we give up, everything left are independent additions and deletions
  338. for _, change := range matches {
  339. reducedChanges = append(reducedChanges, change)
  340. }
  341. for _, blob := range addedBlobs {
  342. reducedChanges = append(reducedChanges, blob.change)
  343. }
  344. for _, blob := range deletedBlobs {
  345. reducedChanges = append(reducedChanges, blob.change)
  346. }
  347. for _, change := range smallChanges {
  348. reducedChanges = append(reducedChanges, change)
  349. }
  350. return map[string]interface{}{DependencyTreeChanges: reducedChanges}, nil
  351. }
  352. // Fork clones this PipelineItem.
  353. func (ra *RenameAnalysis) Fork(n int) []core.PipelineItem {
  354. return core.ForkSamePipelineItem(ra, n)
  355. }
  356. func (ra *RenameAnalysis) sizesAreClose(size1 int64, size2 int64) bool {
  357. size := internal.Max64(1, internal.Max64(size1, size2))
  358. return (internal.Abs64(size1-size2)*10000)/size <= int64(100-ra.SimilarityThreshold)*100
  359. }
  360. func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (bool, error) {
  361. cleanReturn := false
  362. defer func() {
  363. if !cleanReturn {
  364. ra.l.Warnf("\nunclean return detected for blobs '%s' and '%s'\n",
  365. blob1.Hash.String(), blob2.Hash.String())
  366. }
  367. }()
  368. _, err1 := blob1.CountLines()
  369. _, err2 := blob2.CountLines()
  370. if err1 == ErrorBinary || err2 == ErrorBinary {
  371. // binary mode
  372. bsdifflen := DiffBytes(blob1.Data, blob2.Data)
  373. delta := int((int64(bsdifflen) * 100) / internal.Max64(
  374. internal.Min64(blob1.Size, blob2.Size), 1))
  375. cleanReturn = true
  376. return 100-delta >= ra.SimilarityThreshold, nil
  377. }
  378. src, dst := string(blob1.Data), string(blob2.Data)
  379. maxSize := internal.Max(1, internal.Max(utf8.RuneCountInString(src), utf8.RuneCountInString(dst)))
  380. // compute the line-by-line diff, then the char-level diffs of the del-ins blocks
  381. // yes, this algorithm is greedy and not exact
  382. dmp := diffmatchpatch.New()
  383. dmp.DiffTimeout = time.Hour
  384. srcLineRunes, dstLineRunes, _ := dmp.DiffLinesToRunes(src, dst)
  385. // the third returned value, []string, is the mapping from runes to lines
  386. // we cannot use it because it is approximate and has string collisions
  387. // that is, the mapping is wrong for huge files
  388. diffs := dmp.DiffMainRunes(srcLineRunes, dstLineRunes, false)
  389. srcPositions := calcLinePositions(src)
  390. dstPositions := calcLinePositions(dst)
  391. var common, posSrc, prevPosSrc, posDst int
  392. possibleDelInsBlock := false
  393. for _, edit := range diffs {
  394. switch edit.Type {
  395. case diffmatchpatch.DiffDelete:
  396. possibleDelInsBlock = true
  397. prevPosSrc = posSrc
  398. posSrc += utf8.RuneCountInString(edit.Text)
  399. case diffmatchpatch.DiffInsert:
  400. nextPosDst := posDst + utf8.RuneCountInString(edit.Text)
  401. if possibleDelInsBlock {
  402. possibleDelInsBlock = false
  403. if internal.Max(srcPositions[posSrc]-srcPositions[prevPosSrc],
  404. dstPositions[nextPosDst]-dstPositions[posDst]) < RenameAnalysisByteDiffSizeThreshold {
  405. localDmp := diffmatchpatch.New()
  406. localDmp.DiffTimeout = time.Hour
  407. localSrc := src[srcPositions[prevPosSrc]:srcPositions[posSrc]]
  408. localDst := dst[dstPositions[posDst]:dstPositions[nextPosDst]]
  409. localDiffs := localDmp.DiffMainRunes(
  410. strToLiteralRunes(localSrc), strToLiteralRunes(localDst), false)
  411. for _, localEdit := range localDiffs {
  412. if localEdit.Type == diffmatchpatch.DiffEqual {
  413. common += utf8.RuneCountInString(localEdit.Text)
  414. }
  415. }
  416. }
  417. }
  418. posDst = nextPosDst
  419. case diffmatchpatch.DiffEqual:
  420. possibleDelInsBlock = false
  421. step := utf8.RuneCountInString(edit.Text)
  422. // for i := range edit.Text does *not* work
  423. // idk why, but `i` appears to be bigger than the number of runes
  424. for i := 0; i < step; i++ {
  425. common += srcPositions[posSrc+i+1] - srcPositions[posSrc+i]
  426. }
  427. posSrc += step
  428. posDst += step
  429. }
  430. if possibleDelInsBlock {
  431. continue
  432. }
  433. // supposing that the rest of the lines are the same (they are not - too optimistic),
  434. // estimate the maximum similarity and exit the loop if it lower than our threshold
  435. var srcPendingSize, dstPendingSize int
  436. srcPendingSize = len(src) - srcPositions[posSrc]
  437. dstPendingSize = len(dst) - dstPositions[posDst]
  438. maxCommon := common + internal.Min(srcPendingSize, dstPendingSize)
  439. similarity := (maxCommon * 100) / maxSize
  440. if similarity < ra.SimilarityThreshold {
  441. cleanReturn = true
  442. return false, nil
  443. }
  444. similarity = (common * 100) / maxSize
  445. if similarity >= ra.SimilarityThreshold {
  446. cleanReturn = true
  447. return true, nil
  448. }
  449. }
  450. // the very last "overly optimistic" estimate was actually precise, so since we are still here
  451. // the blobs are similar
  452. cleanReturn = true
  453. return true, nil
  454. }
  455. func calcLinePositions(text string) []int {
  456. if text == "" {
  457. return []int{0}
  458. }
  459. lines := strings.Split(text, "\n")
  460. positions := make([]int, len(lines)+1)
  461. accum := 0
  462. for i, l := range lines {
  463. positions[i] = accum
  464. accum += len(l) + 1 // +1 for \n
  465. }
  466. if len(lines) > 0 && lines[len(lines)-1] != "\n" {
  467. accum--
  468. }
  469. positions[len(lines)] = accum
  470. return positions
  471. }
  472. func strToLiteralRunes(s string) []rune {
  473. lrunes := make([]rune, len(s))
  474. for i, b := range []byte(s) {
  475. lrunes[i] = rune(b)
  476. }
  477. return lrunes
  478. }
  479. type sortableChange struct {
  480. change *object.Change
  481. hash plumbing.Hash
  482. }
  483. type sortableChanges []sortableChange
  484. func (change *sortableChange) Less(other *sortableChange) bool {
  485. for x := 0; x < 20; x++ {
  486. if change.hash[x] < other.hash[x] {
  487. return true
  488. }
  489. }
  490. return false
  491. }
  492. func (slice sortableChanges) Len() int {
  493. return len(slice)
  494. }
  495. func (slice sortableChanges) Less(i, j int) bool {
  496. return slice[i].Less(&slice[j])
  497. }
  498. func (slice sortableChanges) Swap(i, j int) {
  499. slice[i], slice[j] = slice[j], slice[i]
  500. }
  501. type sortableBlob struct {
  502. change *object.Change
  503. size int64
  504. }
  505. type sortableBlobs []sortableBlob
  506. func (change *sortableBlob) Less(other *sortableBlob) bool {
  507. return change.size < other.size
  508. }
  509. func (slice sortableBlobs) Len() int {
  510. return len(slice)
  511. }
  512. func (slice sortableBlobs) Less(i, j int) bool {
  513. return slice[i].Less(&slice[j])
  514. }
  515. func (slice sortableBlobs) Swap(i, j int) {
  516. slice[i], slice[j] = slice[j], slice[i]
  517. }
  518. type candidateDistance struct {
  519. Candidate int
  520. Distance int
  521. }
  522. func sortRenameCandidates(candidates []int, origin string, nameGetter func(int) string) {
  523. distances := make([]candidateDistance, len(candidates))
  524. ctx := levenshtein.Context{}
  525. for i, x := range candidates {
  526. name := filepath.Base(nameGetter(x))
  527. distances[i] = candidateDistance{x, ctx.Distance(origin, name)}
  528. }
  529. sort.Slice(distances, func(i, j int) bool {
  530. return distances[i].Distance < distances[j].Distance
  531. })
  532. for i, cd := range distances {
  533. candidates[i] = cd.Candidate
  534. }
  535. }
  536. func init() {
  537. core.Registry.Register(&RenameAnalysis{})
  538. }