package plumbing

import (
	"log"
	"path/filepath"
	"sort"
	"sync"
	"unicode/utf8"

	"github.com/sergi/go-diff/diffmatchpatch"
	"gopkg.in/src-d/go-git.v4"
	"gopkg.in/src-d/go-git.v4/plumbing"
	"gopkg.in/src-d/go-git.v4/plumbing/object"
	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
	"gopkg.in/src-d/hercules.v6/internal"
	"gopkg.in/src-d/hercules.v6/internal/core"
)

// RenameAnalysis improves TreeDiff's results by searching for changed blobs under different
// paths which are likely to be the result of a rename with subsequent edits.
// RenameAnalysis is a PipelineItem.
type RenameAnalysis struct {
	core.NoopMerger
	// SimilarityThreshold adjusts the heuristic to determine file renames.
	// It has the same units as cgit's -X rename-threshold or -M. The default
	// value of 80 (80%) is usually a good choice.
	SimilarityThreshold int

	repository *git.Repository
}

const (
	// RenameAnalysisDefaultThreshold specifies the default percentage of common lines in a pair
	// of files to consider them linked. The decision is made in sizesAreClose().
	// CGit's default is 50%. Ours is 80% because 50% can be too computationally expensive.
	RenameAnalysisDefaultThreshold = 80

	// ConfigRenameAnalysisSimilarityThreshold is the name of the configuration option
	// (RenameAnalysis.Configure()) which sets the similarity threshold.
	ConfigRenameAnalysisSimilarityThreshold = "RenameAnalysis.SimilarityThreshold"

	// RenameAnalysisMinimumSize is the minimum size of a blob to be considered.
	RenameAnalysisMinimumSize = 32

	// RenameAnalysisMaxCandidates is the maximum number of rename candidates to consider per file.
	RenameAnalysisMaxCandidates = 50
)

// Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
func (ra *RenameAnalysis) Name() string {
	return "RenameAnalysis"
}

// Provides returns the list of names of entities which are produced by this PipelineItem.
// Each produced entity will be inserted into `deps` of dependent Consume()-s according
// to this list. Also used by core.Registry to build the global map of providers.
func (ra *RenameAnalysis) Provides() []string {
	arr := [...]string{DependencyTreeChanges}
	return arr[:]
}

// Requires returns the list of names of entities which are needed by this PipelineItem.
// Each requested entity will be inserted into `deps` of Consume(). In turn, those
// entities are produced by the upstream PipelineItem-s' Provides().
func (ra *RenameAnalysis) Requires() []string {
	arr := [...]string{DependencyBlobCache, DependencyTreeChanges}
	return arr[:]
}

// ListConfigurationOptions returns the list of changeable public properties of this PipelineItem.
func (ra *RenameAnalysis) ListConfigurationOptions() []core.ConfigurationOption {
	options := [...]core.ConfigurationOption{{
		Name:        ConfigRenameAnalysisSimilarityThreshold,
		Description: "The threshold on the similarity index used to detect renames.",
		Flag:        "M",
		Type:        core.IntConfigurationOption,
		Default:     RenameAnalysisDefaultThreshold},
	}
	return options[:]
}
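
// The "M" flag mirrors cgit's -M rename threshold option; how exactly it is surfaced
// on the command line depends on the hosting application's flag parsing.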

// Configure sets the properties previously published by ListConfigurationOptions().
func (ra *RenameAnalysis) Configure(facts map[string]interface{}) error {
	if val, exists := facts[ConfigRenameAnalysisSimilarityThreshold].(int); exists {
		ra.SimilarityThreshold = val
	}
	return nil
}
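
// A minimal configuration sketch (the 90 here is an arbitrary example value):
//
//	ra := &RenameAnalysis{}
//	_ = ra.Configure(map[string]interface{}{
//		ConfigRenameAnalysisSimilarityThreshold: 90,
//	})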

// Initialize resets the temporary caches and prepares this PipelineItem for a series of Consume()
// calls. The repository which is going to be analysed is supplied as an argument.
func (ra *RenameAnalysis) Initialize(repository *git.Repository) error {
	if ra.SimilarityThreshold < 0 || ra.SimilarityThreshold > 100 {
		log.Printf("Warning: adjusted the similarity threshold to %d\n",
			RenameAnalysisDefaultThreshold)
		ra.SimilarityThreshold = RenameAnalysisDefaultThreshold
	}
	ra.repository = repository
	return nil
}

// Consume runs this PipelineItem on the next commit data.
// `deps` contain all the results from upstream PipelineItem-s as requested by Requires().
// Additionally, DependencyCommit is always present there and represents the analysed *object.Commit.
// This function returns the mapping with analysis results. The keys must be the same as
// in Provides(). If there was an error, nil is returned.
func (ra *RenameAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
	changes := deps[DependencyTreeChanges].(object.Changes)
	cache := deps[DependencyBlobCache].(map[plumbing.Hash]*CachedBlob)
	reducedChanges := make(object.Changes, 0, changes.Len())

	// Stage 1 - find renames by matching the hashes
	// O(n log n): we sort additions and deletions by hash and then do a single scan
	// along both slices.
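	// For example, a plain `git mv old.go new.go` produces a Delete and an Insert with
	// the same blob hash, so stage 1 pairs them without comparing any content.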
	deleted := make(sortableChanges, 0, changes.Len())
	added := make(sortableChanges, 0, changes.Len())
	for _, change := range changes {
		action, err := change.Action()
		if err != nil {
			return nil, err
		}
		switch action {
		case merkletrie.Insert:
			added = append(added, sortableChange{change, change.To.TreeEntry.Hash})
		case merkletrie.Delete:
			deleted = append(deleted, sortableChange{change, change.From.TreeEntry.Hash})
		case merkletrie.Modify:
			reducedChanges = append(reducedChanges, change)
		}
	}
	sort.Sort(deleted)
	sort.Sort(added)
	stillDeleted := make(object.Changes, 0, deleted.Len())
	stillAdded := make(object.Changes, 0, added.Len())
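	// A classic merge of two sorted sequences: advance the index with the smaller hash;
	// equal hashes mean an exact rename, emitted as a single combined change. Changes
	// whose hashes never match are collected into stillAdded/stillDeleted.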
	{
		a := 0
		d := 0
		for a < added.Len() && d < deleted.Len() {
			if added[a].hash == deleted[d].hash {
				reducedChanges = append(
					reducedChanges,
					&object.Change{From: deleted[d].change.From, To: added[a].change.To})
				a++
				d++
			} else if added[a].Less(&deleted[d]) {
				stillAdded = append(stillAdded, added[a].change)
				a++
			} else {
				stillDeleted = append(stillDeleted, deleted[d].change)
				d++
			}
		}
		for ; a < added.Len(); a++ {
			stillAdded = append(stillAdded, added[a].change)
		}
		for ; d < deleted.Len(); d++ {
			stillDeleted = append(stillDeleted, deleted[d].change)
		}
	}

	// Stage 2 - apply the similarity threshold
	// Theoretically O(n^2), in practice close to linear: we sort the blobs by size and
	// do a single linear scan.
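	// Only blobs whose sizes fall within the threshold window can possibly match (see
	// sizesAreClose), so each blob is compared against just a contiguous run of
	// counterparts in the size-sorted slice.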
	addedBlobs := make(sortableBlobs, 0, stillAdded.Len())
	deletedBlobs := make(sortableBlobs, 0, stillDeleted.Len())
	var smallChanges []*object.Change
	// blobs smaller than RenameAnalysisMinimumSize are too small to compare reliably
	// and are passed through unchanged
	for _, change := range stillAdded {
		blob := cache[change.To.TreeEntry.Hash]
		if blob.Size < RenameAnalysisMinimumSize {
			smallChanges = append(smallChanges, change)
		} else {
			addedBlobs = append(
				addedBlobs, sortableBlob{change: change, size: blob.Size})
		}
	}
	for _, change := range stillDeleted {
		blob := cache[change.From.TreeEntry.Hash]
		if blob.Size < RenameAnalysisMinimumSize {
			smallChanges = append(smallChanges, change)
		} else {
			deletedBlobs = append(
				deletedBlobs, sortableBlob{change: change, size: blob.Size})
		}
	}
	sort.Sort(addedBlobs)
	sort.Sort(deletedBlobs)

	var finished, finishedA, finishedB bool
	matchesA := make(object.Changes, 0, changes.Len())
	matchesB := make(object.Changes, 0, changes.Len())
	// each matcher works on its own copies of the slices because both mutate them
	addedBlobsA := addedBlobs
	addedBlobsB := make(sortableBlobs, len(addedBlobs))
	copy(addedBlobsB, addedBlobs)
	deletedBlobsA := deletedBlobs
	deletedBlobsB := make(sortableBlobs, len(deletedBlobs))
	copy(deletedBlobsB, deletedBlobs)
	wg := sync.WaitGroup{}
	matchA := func() error {
		defer func() {
			finished = true
			wg.Done()
		}()
		aStart := 0
		// we will try to find a matching added blob for each deleted blob
		for d := 0; d < deletedBlobsA.Len(); d++ {
			myBlob := cache[deletedBlobsA[d].change.From.TreeEntry.Hash]
			mySize := deletedBlobsA[d].size
			myName := filepath.Base(deletedBlobsA[d].change.From.Name)
			var a int
			// skip the added blobs which are too small to be close to mySize
			for a = aStart; a < addedBlobsA.Len() && !ra.sizesAreClose(mySize, addedBlobsA[a].size); a++ {
			}
			aStart = a
			foundMatch := false
			// get the list of possible candidates and sort by file name similarity
			var candidates []int
			for a = aStart; a < addedBlobsA.Len() && ra.sizesAreClose(mySize, addedBlobsA[a].size); a++ {
				candidates = append(candidates, a)
			}
			sortRenameCandidates(candidates, myName, func(a int) string {
				return addedBlobsA[a].change.To.Name
			})
			var ci int
			for ci, a = range candidates {
				if finished {
					return nil
				}
				if ci > RenameAnalysisMaxCandidates {
					break
				}
				blobsAreClose, err := ra.blobsAreClose(
					myBlob, cache[addedBlobsA[a].change.To.TreeEntry.Hash])
				if err != nil {
					return err
				}
				if blobsAreClose {
					foundMatch = true
					matchesA = append(
						matchesA,
						&object.Change{
							From: deletedBlobsA[d].change.From,
							To:   addedBlobsA[a].change.To})
					break
				}
			}
			if foundMatch {
				// remove the matched pair from further consideration
				deletedBlobsA = append(deletedBlobsA[:d], deletedBlobsA[d+1:]...)
				d--
				addedBlobsA = append(addedBlobsA[:a], addedBlobsA[a+1:]...)
			}
		}
		finishedA = true
		return nil
	}
	matchB := func() error {
		defer func() {
			finished = true
			wg.Done()
		}()
		dStart := 0
		// symmetric to matchA: find a matching deleted blob for each added blob
		for a := 0; a < addedBlobsB.Len(); a++ {
			myBlob := cache[addedBlobsB[a].change.To.TreeEntry.Hash]
			mySize := addedBlobsB[a].size
			myName := filepath.Base(addedBlobsB[a].change.To.Name)
			var d int
			for d = dStart; d < deletedBlobsB.Len() && !ra.sizesAreClose(mySize, deletedBlobsB[d].size); d++ {
			}
			dStart = d
			foundMatch := false
			// get the list of possible candidates and sort by file name similarity
			var candidates []int
			for d = dStart; d < deletedBlobsB.Len() && ra.sizesAreClose(mySize, deletedBlobsB[d].size); d++ {
				candidates = append(candidates, d)
			}
			sortRenameCandidates(candidates, myName, func(d int) string {
				return deletedBlobsB[d].change.From.Name
			})
			var ci int
			for ci, d = range candidates {
				if finished {
					return nil
				}
				if ci > RenameAnalysisMaxCandidates {
					break
				}
				blobsAreClose, err := ra.blobsAreClose(
					myBlob, cache[deletedBlobsB[d].change.From.TreeEntry.Hash])
				if err != nil {
					return err
				}
				if blobsAreClose {
					foundMatch = true
					matchesB = append(
						matchesB,
						&object.Change{
							From: deletedBlobsB[d].change.From,
							To:   addedBlobsB[a].change.To})
					break
				}
			}
			if foundMatch {
				addedBlobsB = append(addedBlobsB[:a], addedBlobsB[a+1:]...)
				a--
				deletedBlobsB = append(deletedBlobsB[:d], deletedBlobsB[d+1:]...)
			}
		}
		finishedB = true
		return nil
	}
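
	// The two matchers walk the same search space from opposite ends and may produce
	// different (both valid) pairings. Whichever goroutine completes first sets
	// `finished`, its result wins, and the loser bails out at its next iteration.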
	wg.Add(2)
	// note: each goroutine gets its own error variable to avoid a data race
	var errA, errB error
	go func() { errA = matchA() }()
	go func() { errB = matchB() }()
	wg.Wait()
	if errA != nil {
		return nil, errA
	}
	if errB != nil {
		return nil, errB
	}
	var matches object.Changes
	if finishedA {
		addedBlobs = addedBlobsA
		deletedBlobs = deletedBlobsA
		matches = matchesA
	} else {
		if !finishedB {
			panic("impossible: both matchers returned without an error, yet neither finished")
		}
		addedBlobs = addedBlobsB
		deletedBlobs = deletedBlobsB
		matches = matchesB
	}

	// Stage 3 - we give up; everything which is left consists of independent additions
	// and deletions
	for _, change := range matches {
		reducedChanges = append(reducedChanges, change)
	}
	for _, blob := range addedBlobs {
		reducedChanges = append(reducedChanges, blob.change)
	}
	for _, blob := range deletedBlobs {
		reducedChanges = append(reducedChanges, blob.change)
	}
	for _, change := range smallChanges {
		reducedChanges = append(reducedChanges, change)
	}
	return map[string]interface{}{DependencyTreeChanges: reducedChanges}, nil
}

// Fork clones this PipelineItem.
func (ra *RenameAnalysis) Fork(n int) []core.PipelineItem {
	return core.ForkSamePipelineItem(ra, n)
}
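
// sizesAreClose reports whether two blob sizes differ by no more than
// (100 - SimilarityThreshold) percent of the bigger one, using fixed-point
// arithmetic with two extra decimal digits. E.g. with the default threshold of 80,
// sizes 100 and 85 give |100-85|*10000/100 = 1500 <= (100-80)*100 = 2000,
// so they are considered close.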
func (ra *RenameAnalysis) sizesAreClose(size1 int64, size2 int64) bool {
	size := internal.Max64(1, internal.Max64(size1, size2))
	return (internal.Abs64(size1-size2)*10000)/size <= int64(100-ra.SimilarityThreshold)*100
}
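
// blobsAreClose reports whether the contents of two blobs are at least
// SimilarityThreshold percent similar. Binary blobs are compared with a byte-level
// diff; text blobs with a line-level diff which is refined by character-level diffs
// inside each delete-insert block.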
func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (bool, error) {
	defer func() {
		if err := recover(); err != nil {
			// print the offending blob hashes to ease debugging, then re-panic
			log.Println()
			log.Println(blob1.Hash.String())
			log.Println(blob2.Hash.String())
			panic(err)
		}
	}()
	_, err1 := blob1.CountLines()
	_, err2 := blob2.CountLines()
	if err1 == ErrorBinary || err2 == ErrorBinary {
		// binary mode: the similarity is the percentage of bytes left unchanged by the diff
		bsdifflen := DiffBytes(blob1.Data, blob2.Data)
		delta := int((int64(bsdifflen) * 100) / internal.Max64(
			internal.Min64(blob1.Size, blob2.Size), 1))
		return 100-delta >= ra.SimilarityThreshold, nil
	}
	src, dst := string(blob1.Data), string(blob2.Data)
	maxSize := internal.Max(1, internal.Max(utf8.RuneCountInString(src), utf8.RuneCountInString(dst)))

	// compute the line-by-line diff, then the char-level diffs of the del-ins blocks
	// yes, this algorithm is greedy and not exact
	dmp := diffmatchpatch.New()
	srcLines, dstLines, lines := dmp.DiffLinesToRunes(src, dst)
	diffs := dmp.DiffMainRunes(srcLines, dstLines, false)

	var common, posSrc, prevPosSrc, posDst int
	possibleDelInsBlock := false
	for _, edit := range diffs {
		switch edit.Type {
		case diffmatchpatch.DiffDelete:
			possibleDelInsBlock = true
			prevPosSrc = posSrc
			for _, lineno := range edit.Text {
				posSrc += len(lines[lineno])
			}
		case diffmatchpatch.DiffInsert:
			nextPosDst := posDst
			for _, lineno := range edit.Text {
				nextPosDst += len(lines[lineno])
			}
			if possibleDelInsBlock {
				possibleDelInsBlock = false
				// a deletion followed by an insertion: diff the two blocks again,
				// character by character, and count the equal characters as common
				localDmp := diffmatchpatch.New()
				localSrc := src[prevPosSrc:posSrc]
				localDst := dst[posDst:nextPosDst]
				localDiffs := localDmp.DiffMainRunes([]rune(localSrc), []rune(localDst), false)
				for _, localEdit := range localDiffs {
					if localEdit.Type == diffmatchpatch.DiffEqual {
						common += utf8.RuneCountInString(localEdit.Text)
					}
				}
			}
			posDst = nextPosDst
		case diffmatchpatch.DiffEqual:
			possibleDelInsBlock = false
			for _, lineno := range edit.Text {
				common += utf8.RuneCountInString(lines[lineno])
				step := len(lines[lineno])
				posSrc += step
				posDst += step
			}
		}
		if possibleDelInsBlock {
			continue
		}
		// Supposing that the rest of the lines are the same (they are not - this is
		// overly optimistic), estimate the maximum achievable similarity and exit the
		// loop early if it is lower than our threshold.
		maxCommon := common + internal.Min(
			utf8.RuneCountInString(src[posSrc:]),
			utf8.RuneCountInString(dst[posDst:]))
		similarity := (maxCommon * 100) / maxSize
		if similarity < ra.SimilarityThreshold {
			return false, nil
		}
	}
	// After the loop the "overly optimistic" estimate is exact (no lines are left),
	// so since we are still here, the blobs are similar.
	return true, nil
}

type sortableChange struct {
	change *object.Change
	hash   plumbing.Hash
}

type sortableChanges []sortableChange

// Less compares the 20-byte hashes lexicographically.
func (change *sortableChange) Less(other *sortableChange) bool {
	for x := 0; x < 20; x++ {
		if change.hash[x] < other.hash[x] {
			return true
		}
		if change.hash[x] > other.hash[x] {
			return false
		}
	}
	return false
}

func (slice sortableChanges) Len() int {
	return len(slice)
}

func (slice sortableChanges) Less(i, j int) bool {
	return slice[i].Less(&slice[j])
}

func (slice sortableChanges) Swap(i, j int) {
	slice[i], slice[j] = slice[j], slice[i]
}

type sortableBlob struct {
	change *object.Change
	size   int64
}

type sortableBlobs []sortableBlob

func (change *sortableBlob) Less(other *sortableBlob) bool {
	return change.size < other.size
}

func (slice sortableBlobs) Len() int {
	return len(slice)
}

func (slice sortableBlobs) Less(i, j int) bool {
	return slice[i].Less(&slice[j])
}

func (slice sortableBlobs) Swap(i, j int) {
	slice[i], slice[j] = slice[j], slice[i]
}

type candidateDistance struct {
	Candidate int
	Distance  int
}
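
// sortRenameCandidates reorders candidates in place by ascending Levenshtein distance
// between origin and each candidate's base file name, so the most plausible rename
// targets are tried first. For example, given origin "rename.go", a candidate named
// "renames.go" (distance 1) sorts ahead of one named "merge.go".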
func sortRenameCandidates(candidates []int, origin string, nameGetter func(int) string) {
	distances := make([]candidateDistance, len(candidates))
	ctx := LevenshteinContext{}
	for i, x := range candidates {
		name := filepath.Base(nameGetter(x))
		distances[i] = candidateDistance{x, ctx.Distance(origin, name)}
	}
	sort.Slice(distances, func(i, j int) bool {
		return distances[i].Distance < distances[j].Distance
	})
	for i, cd := range distances {
		candidates[i] = cd.Candidate
	}
}

func init() {
	core.Registry.Register(&RenameAnalysis{})
}
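
// Usage sketch (hypothetical wiring, assuming hercules' pipeline API - adapt to your
// setup; the threshold value 90 is arbitrary):
//
//	pipeline := core.NewPipeline(repository)
//	pipeline.DeployItem(&RenameAnalysis{SimilarityThreshold: 90})
//
// After deployment, downstream items receive DependencyTreeChanges with exact and
// fuzzy renames merged into single changes instead of separate deletions and insertions.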