package plumbing

import (
    "log"
    "path/filepath"
    "sort"
    "strings"
    "sync"
    "sync/atomic"
    "unicode/utf8"

    "github.com/sergi/go-diff/diffmatchpatch"
    "gopkg.in/src-d/go-git.v4"
    "gopkg.in/src-d/go-git.v4/plumbing"
    "gopkg.in/src-d/go-git.v4/plumbing/object"
    "gopkg.in/src-d/go-git.v4/utils/merkletrie"

    "gopkg.in/src-d/hercules.v8/internal"
    "gopkg.in/src-d/hercules.v8/internal/core"
)

// RenameAnalysis improves TreeDiff's results by searching for changed blobs under different
// paths which are likely to be the result of a rename with subsequent edits.
// RenameAnalysis is a PipelineItem.
type RenameAnalysis struct {
    core.NoopMerger
    // SimilarityThreshold adjusts the heuristic to determine file renames.
    // It has the same units as cgit's -X rename-threshold or -M. Better to
    // set it to the default value of 80 (80%).
    SimilarityThreshold int

    repository *git.Repository
}

const (
    // RenameAnalysisDefaultThreshold specifies the default percentage of common lines in a pair
    // of files to consider them linked. The decision is made in sizesAreClose() and blobsAreClose().
    // CGit's default is 50%. Ours is 80% because 50% can be too computationally expensive.
    RenameAnalysisDefaultThreshold = 80

    // ConfigRenameAnalysisSimilarityThreshold is the name of the configuration option
    // (RenameAnalysis.Configure()) which sets the similarity threshold.
    ConfigRenameAnalysisSimilarityThreshold = "RenameAnalysis.SimilarityThreshold"

    // RenameAnalysisMinimumSize is the minimum size of a blob in bytes to be considered.
    RenameAnalysisMinimumSize = 32

    // RenameAnalysisMaxCandidates is the maximum number of rename candidates to consider per file.
    RenameAnalysisMaxCandidates = 50

    // RenameAnalysisSetSizeLimit is the maximum number of added + removed files for
    // RenameAnalysisMaxCandidates to be active; bigger change sets reduce it to 1.
    RenameAnalysisSetSizeLimit = 1000

    // RenameAnalysisByteDiffSizeThreshold is the maximum size of each of the compared parts
    // to be diff-ed on byte level.
    RenameAnalysisByteDiffSizeThreshold = 100000
)

// Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
func (ra *RenameAnalysis) Name() string {
    return "RenameAnalysis"
}

// Provides returns the list of names of entities which are produced by this PipelineItem.
// Each produced entity will be inserted into `deps` of dependent Consume()-s according
// to this list. Also used by core.Registry to build the global map of providers.
func (ra *RenameAnalysis) Provides() []string {
    arr := [...]string{DependencyTreeChanges}
    return arr[:]
}

// Requires returns the list of names of entities which are needed by this PipelineItem.
// Each requested entity will be inserted into `deps` of Consume(). In turn, those
// entities are Provides() upstream.
func (ra *RenameAnalysis) Requires() []string {
    arr := [...]string{DependencyBlobCache, DependencyTreeChanges}
    return arr[:]
}

// ListConfigurationOptions returns the list of changeable public properties of this PipelineItem.
func (ra *RenameAnalysis) ListConfigurationOptions() []core.ConfigurationOption {
    options := [...]core.ConfigurationOption{{
        Name:        ConfigRenameAnalysisSimilarityThreshold,
        Description: "The threshold on the similarity index used to detect renames.",
        Flag:        "M",
        Type:        core.IntConfigurationOption,
        Default:     RenameAnalysisDefaultThreshold},
    }
    return options[:]
}

// Configure sets the properties previously published by ListConfigurationOptions().
func (ra *RenameAnalysis) Configure(facts map[string]interface{}) error {
    if val, exists := facts[ConfigRenameAnalysisSimilarityThreshold].(int); exists {
        ra.SimilarityThreshold = val
    }
    return nil
}

// Initialize resets the temporary caches and prepares this PipelineItem for a series of Consume()
// calls. The repository which is going to be analysed is supplied as an argument.
func (ra *RenameAnalysis) Initialize(repository *git.Repository) error {
    if ra.SimilarityThreshold < 0 || ra.SimilarityThreshold > 100 {
        log.Printf("Warning: adjusted the similarity threshold to %d\n",
            RenameAnalysisDefaultThreshold)
        ra.SimilarityThreshold = RenameAnalysisDefaultThreshold
    }
    ra.repository = repository
    return nil
}

// Consume runs this PipelineItem on the next commit data.
// `deps` contain all the results from upstream PipelineItem-s as requested by Requires().
// Additionally, DependencyCommit is always present there and represents the analysed *object.Commit.
// This function returns the mapping with analysis results. The keys must be the same as
// in Provides(). If there was an error, nil is returned.
func (ra *RenameAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
    changes := deps[DependencyTreeChanges].(object.Changes)
    cache := deps[DependencyBlobCache].(map[plumbing.Hash]*CachedBlob)
    reducedChanges := make(object.Changes, 0, changes.Len())

    // Stage 1 - find renames by matching the hashes
    // O(n log n)
    // We sort additions and deletions by hash and then do a single scan along
    // both slices.
    deleted := make(sortableChanges, 0, changes.Len())
    added := make(sortableChanges, 0, changes.Len())
    for _, change := range changes {
        action, err := change.Action()
        if err != nil {
            return nil, err
        }
        switch action {
        case merkletrie.Insert:
            added = append(added, sortableChange{change, change.To.TreeEntry.Hash})
        case merkletrie.Delete:
            deleted = append(deleted, sortableChange{change, change.From.TreeEntry.Hash})
        case merkletrie.Modify:
            reducedChanges = append(reducedChanges, change)
        }
    }
    sort.Sort(deleted)
    sort.Sort(added)
    stillDeleted := make(object.Changes, 0, deleted.Len())
    stillAdded := make(object.Changes, 0, added.Len())
    {
        a := 0
        d := 0
        for a < added.Len() && d < deleted.Len() {
            if added[a].hash == deleted[d].hash {
                reducedChanges = append(
                    reducedChanges,
                    &object.Change{From: deleted[d].change.From, To: added[a].change.To})
                a++
                d++
            } else if added[a].Less(&deleted[d]) {
                stillAdded = append(stillAdded, added[a].change)
                a++
            } else {
                stillDeleted = append(stillDeleted, deleted[d].change)
                d++
            }
        }
        for ; a < added.Len(); a++ {
            stillAdded = append(stillAdded, added[a].change)
        }
        for ; d < deleted.Len(); d++ {
            stillDeleted = append(stillDeleted, deleted[d].change)
        }
    }

    // Stage 2 - apply the similarity threshold
    // O(n^2) in the worst case, but close to linear in practice
    // We sort the blobs by size and do a single linear scan.
    maxCandidates := RenameAnalysisMaxCandidates
    if len(stillAdded)+len(stillDeleted) > RenameAnalysisSetSizeLimit {
        maxCandidates = 1
    }
    addedBlobs := make(sortableBlobs, 0, stillAdded.Len())
    deletedBlobs := make(sortableBlobs, 0, stillDeleted.Len())
    var smallChanges []*object.Change
    for _, change := range stillAdded {
        blob := cache[change.To.TreeEntry.Hash]
        if blob.Size < RenameAnalysisMinimumSize {
            smallChanges = append(smallChanges, change)
        } else {
            addedBlobs = append(
                addedBlobs, sortableBlob{change: change, size: blob.Size})
        }
    }
    for _, change := range stillDeleted {
        blob := cache[change.From.TreeEntry.Hash]
        if blob.Size < RenameAnalysisMinimumSize {
            smallChanges = append(smallChanges, change)
        } else {
            deletedBlobs = append(
                deletedBlobs, sortableBlob{change: change, size: blob.Size})
        }
    }
    sort.Sort(addedBlobs)
    sort.Sort(deletedBlobs)

    // `finished` is written by both goroutines below and read inside their loops,
    // so it must be accessed atomically
    var finished int32
    var finishedA, finishedB bool
    matchesA := make(object.Changes, 0, changes.Len())
    matchesB := make(object.Changes, 0, changes.Len())
    addedBlobsA := addedBlobs
    addedBlobsB := make(sortableBlobs, len(addedBlobs))
    copy(addedBlobsB, addedBlobs)
    deletedBlobsA := deletedBlobs
    deletedBlobsB := make(sortableBlobs, len(deletedBlobs))
    copy(deletedBlobsB, deletedBlobs)
    wg := sync.WaitGroup{}
    matchA := func() error {
        defer func() {
            atomic.StoreInt32(&finished, 1)
            wg.Done()
        }()
        aStart := 0
        // we will try to find a matching added blob for each deleted blob
        for d := 0; d < deletedBlobsA.Len(); d++ {
            myBlob := cache[deletedBlobsA[d].change.From.TreeEntry.Hash]
            mySize := deletedBlobsA[d].size
            myName := filepath.Base(deletedBlobsA[d].change.From.Name)
            var a int
            // skip the added blobs which are too small to pass the size check
            for a = aStart; a < addedBlobsA.Len() && !ra.sizesAreClose(mySize, addedBlobsA[a].size); a++ {
            }
            aStart = a
            foundMatch := false
            // collect the candidates with close sizes and sort them by file name similarity
            var candidates []int
            for a = aStart; a < addedBlobsA.Len() && ra.sizesAreClose(mySize, addedBlobsA[a].size); a++ {
                candidates = append(candidates, a)
            }
            sortRenameCandidates(candidates, myName, func(a int) string {
                return addedBlobsA[a].change.To.Name
            })
            var ci int
            for ci, a = range candidates {
                if atomic.LoadInt32(&finished) != 0 {
                    return nil
                }
                if ci > maxCandidates {
                    break
                }
                blobsAreClose, err := ra.blobsAreClose(
                    myBlob, cache[addedBlobsA[a].change.To.TreeEntry.Hash])
                if err != nil {
                    return err
                }
                if blobsAreClose {
                    foundMatch = true
                    matchesA = append(
                        matchesA,
                        &object.Change{
                            From: deletedBlobsA[d].change.From,
                            To:   addedBlobsA[a].change.To})
                    break
                }
            }
            if foundMatch {
                deletedBlobsA = append(deletedBlobsA[:d], deletedBlobsA[d+1:]...)
                d--
                addedBlobsA = append(addedBlobsA[:a], addedBlobsA[a+1:]...)
            }
        }
        finishedA = true
        return nil
    }

    matchB := func() error {
        defer func() {
            atomic.StoreInt32(&finished, 1)
            wg.Done()
        }()
        dStart := 0
        // symmetrically, try to find a matching deleted blob for each added blob
        for a := 0; a < addedBlobsB.Len(); a++ {
            myBlob := cache[addedBlobsB[a].change.To.TreeEntry.Hash]
            mySize := addedBlobsB[a].size
            myName := filepath.Base(addedBlobsB[a].change.To.Name)
            var d int
            // skip the deleted blobs which are too small to pass the size check
            for d = dStart; d < deletedBlobsB.Len() && !ra.sizesAreClose(mySize, deletedBlobsB[d].size); d++ {
            }
            dStart = d
            foundMatch := false
            // collect the candidates with close sizes and sort them by file name similarity
            var candidates []int
            for d = dStart; d < deletedBlobsB.Len() && ra.sizesAreClose(mySize, deletedBlobsB[d].size); d++ {
                candidates = append(candidates, d)
            }
            sortRenameCandidates(candidates, myName, func(d int) string {
                return deletedBlobsB[d].change.From.Name
            })
            var ci int
            for ci, d = range candidates {
                if atomic.LoadInt32(&finished) != 0 {
                    return nil
                }
                if ci > maxCandidates {
                    break
                }
                blobsAreClose, err := ra.blobsAreClose(
                    myBlob, cache[deletedBlobsB[d].change.From.TreeEntry.Hash])
                if err != nil {
                    return err
                }
                if blobsAreClose {
                    foundMatch = true
                    matchesB = append(
                        matchesB,
                        &object.Change{
                            From: deletedBlobsB[d].change.From,
                            To:   addedBlobsB[a].change.To})
                    break
                }
            }
            if foundMatch {
                addedBlobsB = append(addedBlobsB[:a], addedBlobsB[a+1:]...)
                a--
                deletedBlobsB = append(deletedBlobsB[:d], deletedBlobsB[d+1:]...)
            }
        }
        finishedB = true
        return nil
    }

    // run the two matchers in parallel and take the result from the one which finishes first;
    // the shared `finished` flag makes the loser bail out early
    wg.Add(2)
    var errA, errB error
    go func() { errA = matchA() }()
    go func() { errB = matchB() }()
    wg.Wait()
    if errA != nil {
        return nil, errA
    }
    if errB != nil {
        return nil, errB
    }

    var matches object.Changes
    if finishedA {
        addedBlobs = addedBlobsA
        deletedBlobs = deletedBlobsA
        matches = matchesA
    } else {
        if !finishedB {
            panic("impossible happened: both goroutines returned without an error, " +
                "but neither ran to completion")
        }
        addedBlobs = addedBlobsB
        deletedBlobs = deletedBlobsB
        matches = matchesB
    }

    // Stage 3 - we give up; whatever is left is independent additions and deletions
    reducedChanges = append(reducedChanges, matches...)
    for _, blob := range addedBlobs {
        reducedChanges = append(reducedChanges, blob.change)
    }
    for _, blob := range deletedBlobs {
        reducedChanges = append(reducedChanges, blob.change)
    }
    reducedChanges = append(reducedChanges, smallChanges...)
    return map[string]interface{}{DependencyTreeChanges: reducedChanges}, nil
}

// Fork clones this PipelineItem.
func (ra *RenameAnalysis) Fork(n int) []core.PipelineItem {
    return core.ForkSamePipelineItem(ra, n)
}

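// sizesAreClose reports whether two blob sizes differ by at most
// (100 - SimilarityThreshold) percent of the bigger one; the comparison is done
// in fixed point with two extra decimal digits so integer division does not lose
// precision. With the default threshold of 80, for example, a 1000-byte blob is
// considered close to anything between 800 and 1250 bytes.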
func (ra *RenameAnalysis) sizesAreClose(size1 int64, size2 int64) bool {
    size := internal.Max64(1, internal.Max64(size1, size2))
    return (internal.Abs64(size1-size2)*10000)/size <= int64(100-ra.SimilarityThreshold)*100
}

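// blobsAreClose reports whether the contents of two blobs are at least
// SimilarityThreshold percent identical. Binary blobs are compared byte-wise
// with DiffBytes(); text blobs are compared with a line-level diff, refined by
// char-level diffs inside the delete-insert blocks. The scan exits early once
// the best achievable similarity falls below the threshold.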
func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (bool, error) {
    cleanReturn := false
    defer func() {
        if !cleanReturn {
            // if we crash below, leave a trace of the offending blob pair
            log.Println()
            log.Println(blob1.Hash.String())
            log.Println(blob2.Hash.String())
        }
    }()
    _, err1 := blob1.CountLines()
    _, err2 := blob2.CountLines()
    if err1 == ErrorBinary || err2 == ErrorBinary {
        // binary mode
        bsdifflen := DiffBytes(blob1.Data, blob2.Data)
        delta := int((int64(bsdifflen) * 100) / internal.Max64(
            internal.Min64(blob1.Size, blob2.Size), 1))
        cleanReturn = true
        return 100-delta >= ra.SimilarityThreshold, nil
    }
    src, dst := string(blob1.Data), string(blob2.Data)
    maxSize := internal.Max(1, internal.Max(utf8.RuneCountInString(src), utf8.RuneCountInString(dst)))

    // compute the line-by-line diff, then the char-level diffs of the del-ins blocks
    // yes, this algorithm is greedy and not exact
    dmp := diffmatchpatch.New()
    srcLineRunes, dstLineRunes, _ := dmp.DiffLinesToRunes(src, dst)
    // the third returned value, []string, is the mapping from runes to lines;
    // we cannot use it because it is approximate and has string collisions,
    // that is, the mapping is wrong for huge files
    diffs := dmp.DiffMainRunes(srcLineRunes, dstLineRunes, false)

    srcPositions := calcLinePositions(src)
    dstPositions := calcLinePositions(dst)
    var common, posSrc, prevPosSrc, posDst int
    possibleDelInsBlock := false
    for _, edit := range diffs {
        switch edit.Type {
        case diffmatchpatch.DiffDelete:
            possibleDelInsBlock = true
            prevPosSrc = posSrc
            posSrc += utf8.RuneCountInString(edit.Text)
        case diffmatchpatch.DiffInsert:
            nextPosDst := posDst + utf8.RuneCountInString(edit.Text)
            if possibleDelInsBlock {
                possibleDelInsBlock = false
                if internal.Max(srcPositions[posSrc]-srcPositions[prevPosSrc],
                    dstPositions[nextPosDst]-dstPositions[posDst]) < RenameAnalysisByteDiffSizeThreshold {
                    localDmp := diffmatchpatch.New()
                    localSrc := src[srcPositions[prevPosSrc]:srcPositions[posSrc]]
                    localDst := dst[dstPositions[posDst]:dstPositions[nextPosDst]]
                    localDiffs := localDmp.DiffMainRunes(
                        strToLiteralRunes(localSrc), strToLiteralRunes(localDst), false)
                    for _, localEdit := range localDiffs {
                        if localEdit.Type == diffmatchpatch.DiffEqual {
                            common += utf8.RuneCountInString(localEdit.Text)
                        }
                    }
                }
            }
            posDst = nextPosDst
        case diffmatchpatch.DiffEqual:
            possibleDelInsBlock = false
            step := utf8.RuneCountInString(edit.Text)
            // `for i := range edit.Text` would *not* work here: ranging over a string
            // yields byte offsets of the rune starts, not consecutive rune indices
            for i := 0; i < step; i++ {
                common += srcPositions[posSrc+i+1] - srcPositions[posSrc+i]
            }
            posSrc += step
            posDst += step
        }
        if possibleDelInsBlock {
            continue
        }
        // supposing that the rest of the lines are the same (they are not - too optimistic),
        // estimate the maximum similarity and exit the loop if it is lower than our threshold
        var srcPendingSize, dstPendingSize int
        srcPendingSize = len(src) - srcPositions[posSrc]
        dstPendingSize = len(dst) - dstPositions[posDst]
        maxCommon := common + internal.Min(srcPendingSize, dstPendingSize)
        similarity := (maxCommon * 100) / maxSize
        if similarity < ra.SimilarityThreshold {
            cleanReturn = true
            return false, nil
        }
        similarity = (common * 100) / maxSize
        if similarity >= ra.SimilarityThreshold {
            cleanReturn = true
            return true, nil
        }
    }
    // if we are still here, the last "overly optimistic" estimate was actually precise,
    // so the blobs are similar
    cleanReturn = true
    return true, nil
}

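// calcLinePositions returns the byte offset of the start of each line in text,
// plus a final sentinel equal to len(text), so that the byte length of line i
// is positions[i+1] - positions[i].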
func calcLinePositions(text string) []int {
    if text == "" {
        return []int{0}
    }
    lines := strings.Split(text, "\n")
    positions := make([]int, len(lines)+1)
    accum := 0
    for i, l := range lines {
        positions[i] = accum
        accum += len(l) + 1 // +1 for \n
    }
    // strings.Split never leaves the separator inside the elements, so the loop
    // above always over-counts by one trailing \n; compensate unconditionally
    accum--
    positions[len(lines)] = accum
    return positions
}

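// strToLiteralRunes maps every byte of s to its own rune, so that
// DiffMainRunes() compares the strings byte by byte instead of decoding UTF-8
// and keeps the char-level diff consistent with the byte offsets used above.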
func strToLiteralRunes(s string) []rune {
    lrunes := make([]rune, len(s))
    for i, b := range []byte(s) {
        lrunes[i] = rune(b)
    }
    return lrunes
}

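// sortableChange pairs a tree change with the relevant blob hash so that
// additions and deletions can be sorted by hash and matched in a single scan.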
type sortableChange struct {
    change *object.Change
    hash   plumbing.Hash
}

type sortableChanges []sortableChange

func (change *sortableChange) Less(other *sortableChange) bool {
    // lexicographic comparison of the 20-byte hashes; we must stop at the first
    // differing byte, otherwise a smaller later byte could incorrectly win
    for x := 0; x < 20; x++ {
        if change.hash[x] != other.hash[x] {
            return change.hash[x] < other.hash[x]
        }
    }
    return false
}

func (slice sortableChanges) Len() int {
    return len(slice)
}

func (slice sortableChanges) Less(i, j int) bool {
    return slice[i].Less(&slice[j])
}

func (slice sortableChanges) Swap(i, j int) {
    slice[i], slice[j] = slice[j], slice[i]
}

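// sortableBlob pairs a tree change with its blob size; sorting by size lets the
// matchers scan only the contiguous window of candidates with close sizes.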
type sortableBlob struct {
    change *object.Change
    size   int64
}

type sortableBlobs []sortableBlob

func (change *sortableBlob) Less(other *sortableBlob) bool {
    return change.size < other.size
}

func (slice sortableBlobs) Len() int {
    return len(slice)
}

func (slice sortableBlobs) Less(i, j int) bool {
    return slice[i].Less(&slice[j])
}

func (slice sortableBlobs) Swap(i, j int) {
    slice[i], slice[j] = slice[j], slice[i]
}

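// candidateDistance carries a candidate index together with the Levenshtein
// distance of its file name from the origin name.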
type candidateDistance struct {
    Candidate int
    Distance  int
}

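// sortRenameCandidates reorders candidates in place so that the file names most
// similar to origin (by Levenshtein distance of the base names) come first.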
func sortRenameCandidates(candidates []int, origin string, nameGetter func(int) string) {
    distances := make([]candidateDistance, len(candidates))
    ctx := LevenshteinContext{}
    for i, x := range candidates {
        name := filepath.Base(nameGetter(x))
        distances[i] = candidateDistance{x, ctx.Distance(origin, name)}
    }
    sort.Slice(distances, func(i, j int) bool {
        return distances[i].Distance < distances[j].Distance
    })
    for i, cd := range distances {
        candidates[i] = cd.Candidate
    }
}

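// init registers RenameAnalysis in the global pipeline item registry.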
func init() {
    core.Registry.Register(&RenameAnalysis{})
}