renames.go 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581
  1. package plumbing
  2. import (
  3. "log"
  4. "path/filepath"
  5. "sort"
  6. "strings"
  7. "sync"
  8. "time"
  9. "unicode/utf8"
  10. "github.com/sergi/go-diff/diffmatchpatch"
  11. "gopkg.in/src-d/go-git.v4"
  12. "gopkg.in/src-d/go-git.v4/plumbing"
  13. "gopkg.in/src-d/go-git.v4/plumbing/object"
  14. "gopkg.in/src-d/go-git.v4/utils/merkletrie"
  15. "gopkg.in/src-d/hercules.v9/internal"
  16. "gopkg.in/src-d/hercules.v9/internal/core"
  17. )
  18. // RenameAnalysis improves TreeDiff's results by searching for changed blobs under different
  19. // paths which are likely to be the result of a rename with subsequent edits.
  20. // RenameAnalysis is a PipelineItem.
  21. type RenameAnalysis struct {
  22. core.NoopMerger
  23. // SimilarityThreshold adjusts the heuristic to determine file renames.
  24. // It has the same units as cgit's -X rename-threshold or -M. Better to
  25. // set it to the default value of 80 (80%).
  26. SimilarityThreshold int
  27. repository *git.Repository
  28. }
  29. const (
  30. // RenameAnalysisDefaultThreshold specifies the default percentage of common lines in a pair
  31. // of files to consider them linked. The exact code of the decision is sizesAreClose().
  32. // CGit's default is 50%. Ours is 80% because 50% can be too computationally expensive.
  33. RenameAnalysisDefaultThreshold = 80
  34. // ConfigRenameAnalysisSimilarityThreshold is the name of the configuration option
  35. // (RenameAnalysis.Configure()) which sets the similarity threshold.
  36. ConfigRenameAnalysisSimilarityThreshold = "RenameAnalysis.SimilarityThreshold"
  37. // RenameAnalysisMinimumSize is the minimum size of a blob to be considered.
  38. RenameAnalysisMinimumSize = 32
  39. // RenameAnalysisMaxCandidates is the maximum number of rename candidates to consider per file.
  40. RenameAnalysisMaxCandidates = 50
  41. // RenameAnalysisSetSizeLimit is the maximum number of added + removed files for
  42. // RenameAnalysisMaxCandidates to be active; the bigger numbers set it to 1.
  43. RenameAnalysisSetSizeLimit = 1000
  44. // RenameAnalysisByteDiffSizeThreshold is the maximum size of each of the compared parts
  45. // to be diff-ed on byte level.
  46. RenameAnalysisByteDiffSizeThreshold = 100000
  47. )
  48. // Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
  49. func (ra *RenameAnalysis) Name() string {
  50. return "RenameAnalysis"
  51. }
  52. // Provides returns the list of names of entities which are produced by this PipelineItem.
  53. // Each produced entity will be inserted into `deps` of dependent Consume()-s according
  54. // to this list. Also used by core.Registry to build the global map of providers.
  55. func (ra *RenameAnalysis) Provides() []string {
  56. arr := [...]string{DependencyTreeChanges}
  57. return arr[:]
  58. }
  59. // Requires returns the list of names of entities which are needed by this PipelineItem.
  60. // Each requested entity will be inserted into `deps` of Consume(). In turn, those
  61. // entities are Provides() upstream.
  62. func (ra *RenameAnalysis) Requires() []string {
  63. arr := [...]string{DependencyBlobCache, DependencyTreeChanges}
  64. return arr[:]
  65. }
  66. // ListConfigurationOptions returns the list of changeable public properties of this PipelineItem.
  67. func (ra *RenameAnalysis) ListConfigurationOptions() []core.ConfigurationOption {
  68. options := [...]core.ConfigurationOption{{
  69. Name: ConfigRenameAnalysisSimilarityThreshold,
  70. Description: "The threshold on the similarity index used to detect renames.",
  71. Flag: "M",
  72. Type: core.IntConfigurationOption,
  73. Default: RenameAnalysisDefaultThreshold},
  74. }
  75. return options[:]
  76. }
  77. // Configure sets the properties previously published by ListConfigurationOptions().
  78. func (ra *RenameAnalysis) Configure(facts map[string]interface{}) error {
  79. if val, exists := facts[ConfigRenameAnalysisSimilarityThreshold].(int); exists {
  80. ra.SimilarityThreshold = val
  81. }
  82. return nil
  83. }
  84. // Initialize resets the temporary caches and prepares this PipelineItem for a series of Consume()
  85. // calls. The repository which is going to be analysed is supplied as an argument.
  86. func (ra *RenameAnalysis) Initialize(repository *git.Repository) error {
  87. if ra.SimilarityThreshold < 0 || ra.SimilarityThreshold > 100 {
  88. log.Printf("Warning: adjusted the similarity threshold to %d\n",
  89. RenameAnalysisDefaultThreshold)
  90. ra.SimilarityThreshold = RenameAnalysisDefaultThreshold
  91. }
  92. ra.repository = repository
  93. return nil
  94. }
  95. // Consume runs this PipelineItem on the next commit data.
  96. // `deps` contain all the results from upstream PipelineItem-s as requested by Requires().
  97. // Additionally, DependencyCommit is always present there and represents the analysed *object.Commit.
  98. // This function returns the mapping with analysis results. The keys must be the same as
  99. // in Provides(). If there was an error, nil is returned.
  100. func (ra *RenameAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  101. changes := deps[DependencyTreeChanges].(object.Changes)
  102. cache := deps[DependencyBlobCache].(map[plumbing.Hash]*CachedBlob)
  103. reducedChanges := make(object.Changes, 0, changes.Len())
  104. // Stage 1 - find renames by matching the hashes
  105. // n log(n)
  106. // We sort additions and deletions by hash and then do the single scan along
  107. // both slices.
  108. deleted := make(sortableChanges, 0, changes.Len())
  109. added := make(sortableChanges, 0, changes.Len())
  110. for _, change := range changes {
  111. action, err := change.Action()
  112. if err != nil {
  113. return nil, err
  114. }
  115. switch action {
  116. case merkletrie.Insert:
  117. added = append(added, sortableChange{change, change.To.TreeEntry.Hash})
  118. case merkletrie.Delete:
  119. deleted = append(deleted, sortableChange{change, change.From.TreeEntry.Hash})
  120. case merkletrie.Modify:
  121. reducedChanges = append(reducedChanges, change)
  122. }
  123. }
  124. sort.Sort(deleted)
  125. sort.Sort(added)
  126. stillDeleted := make(object.Changes, 0, deleted.Len())
  127. stillAdded := make(object.Changes, 0, added.Len())
  128. {
  129. a := 0
  130. d := 0
  131. for a < added.Len() && d < deleted.Len() {
  132. if added[a].hash == deleted[d].hash {
  133. reducedChanges = append(
  134. reducedChanges,
  135. &object.Change{From: deleted[d].change.From, To: added[a].change.To})
  136. a++
  137. d++
  138. } else if added[a].Less(&deleted[d]) {
  139. stillAdded = append(stillAdded, added[a].change)
  140. a++
  141. } else {
  142. stillDeleted = append(stillDeleted, deleted[d].change)
  143. d++
  144. }
  145. }
  146. for ; a < added.Len(); a++ {
  147. stillAdded = append(stillAdded, added[a].change)
  148. }
  149. for ; d < deleted.Len(); d++ {
  150. stillDeleted = append(stillDeleted, deleted[d].change)
  151. }
  152. }
  153. // Stage 2 - apply the similarity threshold
  154. // n^2 but actually linear
  155. // We sort the blobs by size and do the single linear scan.
  156. maxCandidates := RenameAnalysisMaxCandidates
  157. if len(stillAdded)+len(stillDeleted) > RenameAnalysisSetSizeLimit {
  158. maxCandidates = 1
  159. }
  160. addedBlobs := make(sortableBlobs, 0, stillAdded.Len())
  161. deletedBlobs := make(sortableBlobs, 0, stillDeleted.Len())
  162. var smallChanges []*object.Change
  163. for _, change := range stillAdded {
  164. blob := cache[change.To.TreeEntry.Hash]
  165. if blob.Size < RenameAnalysisMinimumSize {
  166. smallChanges = append(smallChanges, change)
  167. } else {
  168. addedBlobs = append(
  169. addedBlobs, sortableBlob{change: change, size: blob.Size})
  170. }
  171. }
  172. for _, change := range stillDeleted {
  173. blob := cache[change.From.TreeEntry.Hash]
  174. if blob.Size < RenameAnalysisMinimumSize {
  175. smallChanges = append(smallChanges, change)
  176. } else {
  177. deletedBlobs = append(
  178. deletedBlobs, sortableBlob{change: change, size: blob.Size})
  179. }
  180. }
  181. sort.Sort(addedBlobs)
  182. sort.Sort(deletedBlobs)
  183. finished := make(chan bool, 2)
  184. finishedA := make(chan bool, 1)
  185. finishedB := make(chan bool, 1)
  186. errs := make(chan error)
  187. matchesA := make(object.Changes, 0, changes.Len())
  188. matchesB := make(object.Changes, 0, changes.Len())
  189. addedBlobsA := addedBlobs
  190. addedBlobsB := make(sortableBlobs, len(addedBlobs))
  191. copy(addedBlobsB, addedBlobs)
  192. deletedBlobsA := deletedBlobs
  193. deletedBlobsB := make(sortableBlobs, len(deletedBlobs))
  194. copy(deletedBlobsB, deletedBlobs)
  195. wg := sync.WaitGroup{}
  196. matchA := func() {
  197. defer func() {
  198. finished <- true
  199. wg.Done()
  200. }()
  201. aStart := 0
  202. // we will try to find a matching added blob for each deleted blob
  203. for d := 0; d < deletedBlobsA.Len(); d++ {
  204. myBlob := cache[deletedBlobsA[d].change.From.TreeEntry.Hash]
  205. mySize := deletedBlobsA[d].size
  206. myName := filepath.Base(deletedBlobsA[d].change.From.Name)
  207. var a int
  208. for a = aStart; a < addedBlobsA.Len() && !ra.sizesAreClose(mySize, addedBlobsA[a].size); a++ {
  209. }
  210. aStart = a
  211. foundMatch := false
  212. // get the list of possible candidates and sort by file name similarity
  213. var candidates []int
  214. for a = aStart; a < addedBlobsA.Len() && ra.sizesAreClose(mySize, addedBlobsA[a].size); a++ {
  215. candidates = append(candidates, a)
  216. }
  217. sortRenameCandidates(candidates, myName, func(a int) string {
  218. return addedBlobsA[a].change.To.Name
  219. })
  220. var ci int
  221. for ci, a = range candidates {
  222. select {
  223. case <-finished:
  224. return
  225. default:
  226. break
  227. }
  228. if ci > maxCandidates {
  229. break
  230. }
  231. blobsAreClose, err := ra.blobsAreClose(
  232. myBlob, cache[addedBlobsA[a].change.To.TreeEntry.Hash])
  233. if err != nil {
  234. errs <- err
  235. return
  236. }
  237. if blobsAreClose {
  238. foundMatch = true
  239. matchesA = append(
  240. matchesA,
  241. &object.Change{
  242. From: deletedBlobsA[d].change.From,
  243. To: addedBlobsA[a].change.To})
  244. break
  245. }
  246. }
  247. if foundMatch {
  248. deletedBlobsA = append(deletedBlobsA[:d], deletedBlobsA[d+1:]...)
  249. d--
  250. addedBlobsA = append(addedBlobsA[:a], addedBlobsA[a+1:]...)
  251. }
  252. }
  253. finishedA <- true
  254. }
  255. matchB := func() {
  256. defer func() {
  257. finished <- true
  258. wg.Done()
  259. }()
  260. dStart := 0
  261. for a := 0; a < addedBlobsB.Len(); a++ {
  262. myBlob := cache[addedBlobsB[a].change.To.TreeEntry.Hash]
  263. mySize := addedBlobsB[a].size
  264. myName := filepath.Base(addedBlobsB[a].change.To.Name)
  265. var d int
  266. for d = dStart; d < deletedBlobsB.Len() && !ra.sizesAreClose(mySize, deletedBlobsB[d].size); d++ {
  267. }
  268. dStart = d
  269. foundMatch := false
  270. // get the list of possible candidates and sort by file name similarity
  271. var candidates []int
  272. for d = dStart; d < deletedBlobsB.Len() && ra.sizesAreClose(mySize, deletedBlobsB[d].size); d++ {
  273. candidates = append(candidates, d)
  274. }
  275. sortRenameCandidates(candidates, myName, func(d int) string {
  276. return deletedBlobsB[d].change.From.Name
  277. })
  278. var ci int
  279. for ci, d = range candidates {
  280. select {
  281. case <-finished:
  282. return
  283. default:
  284. break
  285. }
  286. if ci > maxCandidates {
  287. break
  288. }
  289. blobsAreClose, err := ra.blobsAreClose(
  290. myBlob, cache[deletedBlobsB[d].change.From.TreeEntry.Hash])
  291. if err != nil {
  292. errs <- err
  293. return
  294. }
  295. if blobsAreClose {
  296. foundMatch = true
  297. matchesB = append(
  298. matchesB,
  299. &object.Change{
  300. From: deletedBlobsB[d].change.From,
  301. To: addedBlobsB[a].change.To})
  302. break
  303. }
  304. }
  305. if foundMatch {
  306. addedBlobsB = append(addedBlobsB[:a], addedBlobsB[a+1:]...)
  307. a--
  308. deletedBlobsB = append(deletedBlobsB[:d], deletedBlobsB[d+1:]...)
  309. }
  310. }
  311. finishedB <- true
  312. }
  313. // run two functions in parallel, and take the result from the one which finished earlier
  314. wg.Add(2)
  315. go matchA()
  316. go matchB()
  317. wg.Wait()
  318. var matches object.Changes
  319. select {
  320. case err := <-errs:
  321. return nil, err
  322. case <-finishedA:
  323. addedBlobs = addedBlobsA
  324. deletedBlobs = deletedBlobsA
  325. matches = matchesA
  326. case <-finishedB:
  327. addedBlobs = addedBlobsB
  328. deletedBlobs = deletedBlobsB
  329. matches = matchesB
  330. default:
  331. panic("Impossible happened: two functions returned without an error " +
  332. "but no results from both")
  333. }
  334. // Stage 3 - we give up, everything left are independent additions and deletions
  335. for _, change := range matches {
  336. reducedChanges = append(reducedChanges, change)
  337. }
  338. for _, blob := range addedBlobs {
  339. reducedChanges = append(reducedChanges, blob.change)
  340. }
  341. for _, blob := range deletedBlobs {
  342. reducedChanges = append(reducedChanges, blob.change)
  343. }
  344. for _, change := range smallChanges {
  345. reducedChanges = append(reducedChanges, change)
  346. }
  347. return map[string]interface{}{DependencyTreeChanges: reducedChanges}, nil
  348. }
  349. // Fork clones this PipelineItem.
  350. func (ra *RenameAnalysis) Fork(n int) []core.PipelineItem {
  351. return core.ForkSamePipelineItem(ra, n)
  352. }
  353. func (ra *RenameAnalysis) sizesAreClose(size1 int64, size2 int64) bool {
  354. size := internal.Max64(1, internal.Max64(size1, size2))
  355. return (internal.Abs64(size1-size2)*10000)/size <= int64(100-ra.SimilarityThreshold)*100
  356. }
  357. func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (bool, error) {
  358. cleanReturn := false
  359. defer func() {
  360. if !cleanReturn {
  361. log.Println()
  362. log.Println(blob1.Hash.String())
  363. log.Println(blob2.Hash.String())
  364. }
  365. }()
  366. _, err1 := blob1.CountLines()
  367. _, err2 := blob2.CountLines()
  368. if err1 == ErrorBinary || err2 == ErrorBinary {
  369. // binary mode
  370. bsdifflen := DiffBytes(blob1.Data, blob2.Data)
  371. delta := int((int64(bsdifflen) * 100) / internal.Max64(
  372. internal.Min64(blob1.Size, blob2.Size), 1))
  373. cleanReturn = true
  374. return 100-delta >= ra.SimilarityThreshold, nil
  375. }
  376. src, dst := string(blob1.Data), string(blob2.Data)
  377. maxSize := internal.Max(1, internal.Max(utf8.RuneCountInString(src), utf8.RuneCountInString(dst)))
  378. // compute the line-by-line diff, then the char-level diffs of the del-ins blocks
  379. // yes, this algorithm is greedy and not exact
  380. dmp := diffmatchpatch.New()
  381. dmp.DiffTimeout = time.Hour
  382. srcLineRunes, dstLineRunes, _ := dmp.DiffLinesToRunes(src, dst)
  383. // the third returned value, []string, is the mapping from runes to lines
  384. // we cannot use it because it is approximate and has string collisions
  385. // that is, the mapping is wrong for huge files
  386. diffs := dmp.DiffMainRunes(srcLineRunes, dstLineRunes, false)
  387. srcPositions := calcLinePositions(src)
  388. dstPositions := calcLinePositions(dst)
  389. var common, posSrc, prevPosSrc, posDst int
  390. possibleDelInsBlock := false
  391. for _, edit := range diffs {
  392. switch edit.Type {
  393. case diffmatchpatch.DiffDelete:
  394. possibleDelInsBlock = true
  395. prevPosSrc = posSrc
  396. posSrc += utf8.RuneCountInString(edit.Text)
  397. case diffmatchpatch.DiffInsert:
  398. nextPosDst := posDst + utf8.RuneCountInString(edit.Text)
  399. if possibleDelInsBlock {
  400. possibleDelInsBlock = false
  401. if internal.Max(srcPositions[posSrc]-srcPositions[prevPosSrc],
  402. dstPositions[nextPosDst]-dstPositions[posDst]) < RenameAnalysisByteDiffSizeThreshold {
  403. localDmp := diffmatchpatch.New()
  404. localDmp.DiffTimeout = time.Hour
  405. localSrc := src[srcPositions[prevPosSrc]:srcPositions[posSrc]]
  406. localDst := dst[dstPositions[posDst]:dstPositions[nextPosDst]]
  407. localDiffs := localDmp.DiffMainRunes(
  408. strToLiteralRunes(localSrc), strToLiteralRunes(localDst), false)
  409. for _, localEdit := range localDiffs {
  410. if localEdit.Type == diffmatchpatch.DiffEqual {
  411. common += utf8.RuneCountInString(localEdit.Text)
  412. }
  413. }
  414. }
  415. }
  416. posDst = nextPosDst
  417. case diffmatchpatch.DiffEqual:
  418. possibleDelInsBlock = false
  419. step := utf8.RuneCountInString(edit.Text)
  420. // for i := range edit.Text does *not* work
  421. // idk why, but `i` appears to be bigger than the number of runes
  422. for i := 0; i < step; i++ {
  423. common += srcPositions[posSrc+i+1] - srcPositions[posSrc+i]
  424. }
  425. posSrc += step
  426. posDst += step
  427. }
  428. if possibleDelInsBlock {
  429. continue
  430. }
  431. // supposing that the rest of the lines are the same (they are not - too optimistic),
  432. // estimate the maximum similarity and exit the loop if it lower than our threshold
  433. var srcPendingSize, dstPendingSize int
  434. srcPendingSize = len(src) - srcPositions[posSrc]
  435. dstPendingSize = len(dst) - dstPositions[posDst]
  436. maxCommon := common + internal.Min(srcPendingSize, dstPendingSize)
  437. similarity := (maxCommon * 100) / maxSize
  438. if similarity < ra.SimilarityThreshold {
  439. cleanReturn = true
  440. return false, nil
  441. }
  442. similarity = (common * 100) / maxSize
  443. if similarity >= ra.SimilarityThreshold {
  444. cleanReturn = true
  445. return true, nil
  446. }
  447. }
  448. // the very last "overly optimistic" estimate was actually precise, so since we are still here
  449. // the blobs are similar
  450. cleanReturn = true
  451. return true, nil
  452. }
  453. func calcLinePositions(text string) []int {
  454. if text == "" {
  455. return []int{0}
  456. }
  457. lines := strings.Split(text, "\n")
  458. positions := make([]int, len(lines)+1)
  459. accum := 0
  460. for i, l := range lines {
  461. positions[i] = accum
  462. accum += len(l) + 1 // +1 for \n
  463. }
  464. if len(lines) > 0 && lines[len(lines)-1] != "\n" {
  465. accum--
  466. }
  467. positions[len(lines)] = accum
  468. return positions
  469. }
  470. func strToLiteralRunes(s string) []rune {
  471. lrunes := make([]rune, len(s))
  472. for i, b := range []byte(s) {
  473. lrunes[i] = rune(b)
  474. }
  475. return lrunes
  476. }
  477. type sortableChange struct {
  478. change *object.Change
  479. hash plumbing.Hash
  480. }
  481. type sortableChanges []sortableChange
  482. func (change *sortableChange) Less(other *sortableChange) bool {
  483. for x := 0; x < 20; x++ {
  484. if change.hash[x] < other.hash[x] {
  485. return true
  486. }
  487. }
  488. return false
  489. }
  490. func (slice sortableChanges) Len() int {
  491. return len(slice)
  492. }
  493. func (slice sortableChanges) Less(i, j int) bool {
  494. return slice[i].Less(&slice[j])
  495. }
  496. func (slice sortableChanges) Swap(i, j int) {
  497. slice[i], slice[j] = slice[j], slice[i]
  498. }
  499. type sortableBlob struct {
  500. change *object.Change
  501. size int64
  502. }
  503. type sortableBlobs []sortableBlob
  504. func (change *sortableBlob) Less(other *sortableBlob) bool {
  505. return change.size < other.size
  506. }
  507. func (slice sortableBlobs) Len() int {
  508. return len(slice)
  509. }
  510. func (slice sortableBlobs) Less(i, j int) bool {
  511. return slice[i].Less(&slice[j])
  512. }
  513. func (slice sortableBlobs) Swap(i, j int) {
  514. slice[i], slice[j] = slice[j], slice[i]
  515. }
  516. type candidateDistance struct {
  517. Candidate int
  518. Distance int
  519. }
  520. func sortRenameCandidates(candidates []int, origin string, nameGetter func(int) string) {
  521. distances := make([]candidateDistance, len(candidates))
  522. ctx := LevenshteinContext{}
  523. for i, x := range candidates {
  524. name := filepath.Base(nameGetter(x))
  525. distances[i] = candidateDistance{x, ctx.Distance(origin, name)}
  526. }
  527. sort.Slice(distances, func(i, j int) bool {
  528. return distances[i].Distance < distances[j].Distance
  529. })
  530. for i, cd := range distances {
  531. candidates[i] = cd.Candidate
  532. }
  533. }
  534. func init() {
  535. core.Registry.Register(&RenameAnalysis{})
  536. }