renames.go

package plumbing

import (
	"log"
	"path/filepath"
	"sort"
	"sync"
	"unicode/utf8"

	"github.com/sergi/go-diff/diffmatchpatch"
	"gopkg.in/src-d/go-git.v4"
	"gopkg.in/src-d/go-git.v4/plumbing"
	"gopkg.in/src-d/go-git.v4/plumbing/object"
	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
	"gopkg.in/src-d/hercules.v5/internal"
	"gopkg.in/src-d/hercules.v5/internal/core"
)

// RenameAnalysis improves TreeDiff's results by searching for changed blobs under different
// paths which are likely to be the result of a rename with subsequent edits.
// RenameAnalysis is a PipelineItem.
type RenameAnalysis struct {
	core.NoopMerger
	// SimilarityThreshold adjusts the heuristic to determine file renames.
	// It has the same units as cgit's -X rename-threshold or -M. Better to
	// set it to the default value of 80 (80%).
	SimilarityThreshold int

	repository *git.Repository
}

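// A hand-driven sketch of how this item can be exercised (in practice
// core.Pipeline performs the wiring; `repository` and `deps` are assumed to be
// prepared by the caller with the dependencies listed in Requires()):
//
//	ra := &RenameAnalysis{SimilarityThreshold: RenameAnalysisDefaultThreshold}
//	ra.Initialize(repository)
//	result, err := ra.Consume(deps) // needs DependencyBlobCache and DependencyTreeChanges
//	reduced := result[DependencyTreeChanges].(object.Changes)
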
const (
	// RenameAnalysisDefaultThreshold specifies the default percentage of common lines in a pair
	// of files to consider them linked. The exact code of the decision is sizesAreClose().
	// CGit's default is 50%. Ours is 80% because 50% can be too computationally expensive.
	RenameAnalysisDefaultThreshold = 80

	// ConfigRenameAnalysisSimilarityThreshold is the name of the configuration option
	// (RenameAnalysis.Configure()) which sets the similarity threshold.
	ConfigRenameAnalysisSimilarityThreshold = "RenameAnalysis.SimilarityThreshold"

	// RenameAnalysisMinimumSize is the minimum size of a blob to be considered.
	RenameAnalysisMinimumSize = 32

	// RenameAnalysisMaxCandidates is the maximum number of rename candidates to consider per file.
	RenameAnalysisMaxCandidates = 50
)

// Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
func (ra *RenameAnalysis) Name() string {
	return "RenameAnalysis"
}

// Provides returns the list of names of entities which are produced by this PipelineItem.
// Each produced entity will be inserted into `deps` of dependent Consume()-s according
// to this list. Also used by core.Registry to build the global map of providers.
func (ra *RenameAnalysis) Provides() []string {
	arr := [...]string{DependencyTreeChanges}
	return arr[:]
}

// Requires returns the list of names of entities which are needed by this PipelineItem.
// Each requested entity will be inserted into `deps` of Consume(). In turn, those
// entities are Provides() upstream.
func (ra *RenameAnalysis) Requires() []string {
	arr := [...]string{DependencyBlobCache, DependencyTreeChanges}
	return arr[:]
}

// ListConfigurationOptions returns the list of changeable public properties of this PipelineItem.
func (ra *RenameAnalysis) ListConfigurationOptions() []core.ConfigurationOption {
	options := [...]core.ConfigurationOption{{
		Name:        ConfigRenameAnalysisSimilarityThreshold,
		Description: "The threshold on the similarity index used to detect renames.",
		Flag:        "M",
		Type:        core.IntConfigurationOption,
		Default:     RenameAnalysisDefaultThreshold},
	}
	return options[:]
}

// Configure sets the properties previously published by ListConfigurationOptions().
func (ra *RenameAnalysis) Configure(facts map[string]interface{}) {
	if val, exists := facts[ConfigRenameAnalysisSimilarityThreshold].(int); exists {
		ra.SimilarityThreshold = val
	}
}

// Initialize resets the temporary caches and prepares this PipelineItem for a series of Consume()
// calls. The repository which is going to be analysed is supplied as an argument.
func (ra *RenameAnalysis) Initialize(repository *git.Repository) {
	if ra.SimilarityThreshold < 0 || ra.SimilarityThreshold > 100 {
		log.Printf("Warning: adjusted the similarity threshold to %d\n",
			RenameAnalysisDefaultThreshold)
		ra.SimilarityThreshold = RenameAnalysisDefaultThreshold
	}
	ra.repository = repository
}

// Consume runs this PipelineItem on the next commit data.
// `deps` contain all the results from upstream PipelineItem-s as requested by Requires().
// Additionally, DependencyCommit is always present there and represents the analysed *object.Commit.
// This function returns the mapping with analysis results. The keys must be the same as
// in Provides(). If there was an error, nil is returned.
func (ra *RenameAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
	changes := deps[DependencyTreeChanges].(object.Changes)
	cache := deps[DependencyBlobCache].(map[plumbing.Hash]*CachedBlob)
	reducedChanges := make(object.Changes, 0, changes.Len())

	// Stage 1 - find renames by matching the hashes
	// n log(n)
	// We sort additions and deletions by hash and then do the single scan along
	// both slices.
	deleted := make(sortableChanges, 0, changes.Len())
	added := make(sortableChanges, 0, changes.Len())
	for _, change := range changes {
		action, err := change.Action()
		if err != nil {
			return nil, err
		}
		switch action {
		case merkletrie.Insert:
			added = append(added, sortableChange{change, change.To.TreeEntry.Hash})
		case merkletrie.Delete:
			deleted = append(deleted, sortableChange{change, change.From.TreeEntry.Hash})
		case merkletrie.Modify:
			reducedChanges = append(reducedChanges, change)
		}
	}
	sort.Sort(deleted)
	sort.Sort(added)
	stillDeleted := make(object.Changes, 0, deleted.Len())
	stillAdded := make(object.Changes, 0, added.Len())
	{
		a := 0
		d := 0
		for a < added.Len() && d < deleted.Len() {
			if added[a].hash == deleted[d].hash {
				reducedChanges = append(
					reducedChanges,
					&object.Change{From: deleted[d].change.From, To: added[a].change.To})
				a++
				d++
			} else if added[a].Less(&deleted[d]) {
				stillAdded = append(stillAdded, added[a].change)
				a++
			} else {
				stillDeleted = append(stillDeleted, deleted[d].change)
				d++
			}
		}
		for ; a < added.Len(); a++ {
			stillAdded = append(stillAdded, added[a].change)
		}
		for ; d < deleted.Len(); d++ {
			stillDeleted = append(stillDeleted, deleted[d].change)
		}
	}

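	// An example of the merge scan above: if the deleted hashes sort to (a, b, d)
	// and the added hashes to (b, c), the b/b pair is folded into reducedChanges
	// as an exact rename, a and d stay in stillDeleted, and c stays in stillAdded.
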
	// Stage 2 - apply the similarity threshold
	// n^2 but actually linear
	// We sort the blobs by size and do the single linear scan.
	addedBlobs := make(sortableBlobs, 0, stillAdded.Len())
	deletedBlobs := make(sortableBlobs, 0, stillDeleted.Len())
	var smallChanges []*object.Change
	for _, change := range stillAdded {
		blob := cache[change.To.TreeEntry.Hash]
		if blob.Size < RenameAnalysisMinimumSize {
			smallChanges = append(smallChanges, change)
		} else {
			addedBlobs = append(
				addedBlobs, sortableBlob{change: change, size: blob.Size})
		}
	}
	for _, change := range stillDeleted {
		blob := cache[change.From.TreeEntry.Hash]
		if blob.Size < RenameAnalysisMinimumSize {
			smallChanges = append(smallChanges, change)
		} else {
			deletedBlobs = append(
				deletedBlobs, sortableBlob{change: change, size: blob.Size})
		}
	}
	sort.Sort(addedBlobs)
	sort.Sort(deletedBlobs)

	var finished, finishedA, finishedB bool
	matchesA := make(object.Changes, 0, changes.Len())
	matchesB := make(object.Changes, 0, changes.Len())
	addedBlobsA := addedBlobs
	addedBlobsB := make(sortableBlobs, len(addedBlobs))
	copy(addedBlobsB, addedBlobs)
	deletedBlobsA := deletedBlobs
	deletedBlobsB := make(sortableBlobs, len(deletedBlobs))
	copy(deletedBlobsB, deletedBlobs)
	wg := sync.WaitGroup{}
	matchA := func() error {
		defer func() {
			finished = true
			wg.Done()
		}()
		aStart := 0
		// we will try to find a matching added blob for each deleted blob
		for d := 0; d < deletedBlobsA.Len(); d++ {
			myBlob := cache[deletedBlobsA[d].change.From.TreeEntry.Hash]
			mySize := deletedBlobsA[d].size
			myName := filepath.Base(deletedBlobsA[d].change.From.Name)
			var a int
			for a = aStart; a < addedBlobsA.Len() && !ra.sizesAreClose(mySize, addedBlobsA[a].size); a++ {
			}
			aStart = a
			foundMatch := false
			// get the list of possible candidates and sort by file name similarity
			var candidates []int
			for a = aStart; a < addedBlobsA.Len() && ra.sizesAreClose(mySize, addedBlobsA[a].size); a++ {
				candidates = append(candidates, a)
			}
			sortRenameCandidates(candidates, myName, func(a int) string {
				return addedBlobsA[a].change.To.Name
			})
			var ci int
			for ci, a = range candidates {
				if finished {
					return nil
				}
				if ci > RenameAnalysisMaxCandidates {
					break
				}
				blobsAreClose, err := ra.blobsAreClose(
					myBlob, cache[addedBlobsA[a].change.To.TreeEntry.Hash])
				if err != nil {
					return err
				}
				if blobsAreClose {
					foundMatch = true
					matchesA = append(
						matchesA,
						&object.Change{
							From: deletedBlobsA[d].change.From,
							To:   addedBlobsA[a].change.To})
					break
				}
			}
			if foundMatch {
				deletedBlobsA = append(deletedBlobsA[:d], deletedBlobsA[d+1:]...)
				d--
				addedBlobsA = append(addedBlobsA[:a], addedBlobsA[a+1:]...)
			}
		}
		finishedA = true
		return nil
	}
	matchB := func() error {
		defer func() {
			finished = true
			wg.Done()
		}()
		dStart := 0
		for a := 0; a < addedBlobsB.Len(); a++ {
			myBlob := cache[addedBlobsB[a].change.To.TreeEntry.Hash]
			mySize := addedBlobsB[a].size
			myName := filepath.Base(addedBlobsB[a].change.To.Name)
			var d int
			for d = dStart; d < deletedBlobsB.Len() && !ra.sizesAreClose(mySize, deletedBlobsB[d].size); d++ {
			}
			dStart = d
			foundMatch := false
			// get the list of possible candidates and sort by file name similarity
			var candidates []int
			for d = dStart; d < deletedBlobsB.Len() && ra.sizesAreClose(mySize, deletedBlobsB[d].size); d++ {
				candidates = append(candidates, d)
			}
			sortRenameCandidates(candidates, myName, func(d int) string {
				return deletedBlobsB[d].change.From.Name
			})
			var ci int
			for ci, d = range candidates {
				if finished {
					return nil
				}
				if ci > RenameAnalysisMaxCandidates {
					break
				}
				blobsAreClose, err := ra.blobsAreClose(
					myBlob, cache[deletedBlobsB[d].change.From.TreeEntry.Hash])
				if err != nil {
					return err
				}
				if blobsAreClose {
					foundMatch = true
					matchesB = append(
						matchesB,
						&object.Change{
							From: deletedBlobsB[d].change.From,
							To:   addedBlobsB[a].change.To})
					break
				}
			}
			if foundMatch {
				addedBlobsB = append(addedBlobsB[:a], addedBlobsB[a+1:]...)
				a--
				deletedBlobsB = append(deletedBlobsB[:d], deletedBlobsB[d+1:]...)
			}
		}
		finishedB = true
		return nil
	}
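	// Both closures poll the shared `finished` flag as a best-effort early-exit
	// signal: whichever matching direction (deleted->added or added->deleted)
	// completes first makes the other bail out, since both produce comparable
	// pairings. The flag is not synchronized, so the losing goroutine may run a
	// few extra iterations before it notices.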
	// run two functions in parallel, and take the result from the one which finished earlier
	wg.Add(2)
	// keep the errors separate so that one goroutine cannot overwrite the
	// other's failure with its own nil result
	var errA, errB error
	go func() { errA = matchA() }()
	go func() { errB = matchB() }()
	wg.Wait()
	if errA != nil {
		return nil, errA
	}
	if errB != nil {
		return nil, errB
	}
	var matches object.Changes
	if finishedA {
		addedBlobs = addedBlobsA
		deletedBlobs = deletedBlobsA
		matches = matchesA
	} else {
		if !finishedB {
			panic("Impossible happened: both functions returned without an error, " +
				"but neither reported results")
		}
		addedBlobs = addedBlobsB
		deletedBlobs = deletedBlobsB
		matches = matchesB
	}

	// Stage 3 - we give up; everything that is left consists of independent additions and deletions
	for _, change := range matches {
		reducedChanges = append(reducedChanges, change)
	}
	for _, blob := range addedBlobs {
		reducedChanges = append(reducedChanges, blob.change)
	}
	for _, blob := range deletedBlobs {
		reducedChanges = append(reducedChanges, blob.change)
	}
	for _, change := range smallChanges {
		reducedChanges = append(reducedChanges, change)
	}
	return map[string]interface{}{DependencyTreeChanges: reducedChanges}, nil
}

// Fork clones this PipelineItem.
func (ra *RenameAnalysis) Fork(n int) []core.PipelineItem {
	return core.ForkSamePipelineItem(ra, n)
}

func (ra *RenameAnalysis) sizesAreClose(size1 int64, size2 int64) bool {
	size := internal.Max64(1, internal.Max64(size1, size2))
	return (internal.Abs64(size1-size2)*10000)/size <= int64(100-ra.SimilarityThreshold)*100
}

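// A worked example of the formula above, with the default threshold of 80:
// sizesAreClose(1000, 1150) holds because |1000-1150|*10000/1150 = 1304, which
// is <= (100-80)*100 = 2000; sizesAreClose(1000, 1300) gives 2307 and fails.
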
func (ra *RenameAnalysis) blobsAreClose(
	blob1 *CachedBlob, blob2 *CachedBlob) (bool, error) {
	src, dst := string(blob1.Data), string(blob2.Data)
	maxSize := internal.Max(1, internal.Max(utf8.RuneCountInString(src), utf8.RuneCountInString(dst)))

	// compute the line-by-line diff, then the char-level diffs of the del-ins blocks
	// yes, this algorithm is greedy and not exact
	dmp := diffmatchpatch.New()
	srcLines, dstLines, lines := dmp.DiffLinesToRunes(src, dst)
	diffs := dmp.DiffMainRunes(srcLines, dstLines, false)
	var common, posSrc, prevPosSrc, posDst int
	possibleDelInsBlock := false
	for _, edit := range diffs {
		switch edit.Type {
		case diffmatchpatch.DiffDelete:
			possibleDelInsBlock = true
			prevPosSrc = posSrc
			for _, lineno := range edit.Text {
				posSrc += len(lines[lineno])
			}
		case diffmatchpatch.DiffInsert:
			nextPosDst := posDst
			for _, lineno := range edit.Text {
				nextPosDst += len(lines[lineno])
			}
			if possibleDelInsBlock {
				possibleDelInsBlock = false
				localDmp := diffmatchpatch.New()
				localSrc := src[prevPosSrc:posSrc]
				localDst := dst[posDst:nextPosDst]
				localDiffs := localDmp.DiffMainRunes([]rune(localSrc), []rune(localDst), false)
				for _, localEdit := range localDiffs {
					if localEdit.Type == diffmatchpatch.DiffEqual {
						common += utf8.RuneCountInString(localEdit.Text)
					}
				}
			}
			posDst = nextPosDst
		case diffmatchpatch.DiffEqual:
			possibleDelInsBlock = false
			for _, lineno := range edit.Text {
				common += utf8.RuneCountInString(lines[lineno])
				step := len(lines[lineno])
				posSrc += step
				posDst += step
			}
		}
		// supposing that the rest of the lines are the same (they are not; this is
		// too optimistic), estimate the maximum similarity and exit the loop early
		// if the estimate is already lower than our threshold
		maxCommon := common + internal.Min(
			utf8.RuneCountInString(src[posSrc:]),
			utf8.RuneCountInString(dst[posDst:]))
		similarity := (maxCommon * 100) / maxSize
		if similarity < ra.SimilarityThreshold {
			return false, nil
		}
	}
	// the very last "overly optimistic" estimate was actually precise, so since
	// we are still here, the blobs are similar
	return true, nil
}

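// Roughly speaking, the check above requires at least SimilarityThreshold
// percent of the runes of the longer blob to land in common diff regions: with
// the default 80, a 1000-rune file may differ from its renamed counterpart by
// at most 200 runes.
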
type sortableChange struct {
	change *object.Change
	hash   plumbing.Hash
}

type sortableChanges []sortableChange

// Less compares two changes by their hashes, byte-wise and lexicographically:
// the first differing byte decides the order.
func (change *sortableChange) Less(other *sortableChange) bool {
	for x := 0; x < 20; x++ {
		if change.hash[x] != other.hash[x] {
			return change.hash[x] < other.hash[x]
		}
	}
	return false
}

func (slice sortableChanges) Len() int {
	return len(slice)
}

func (slice sortableChanges) Less(i, j int) bool {
	return slice[i].Less(&slice[j])
}

func (slice sortableChanges) Swap(i, j int) {
	slice[i], slice[j] = slice[j], slice[i]
}

type sortableBlob struct {
	change *object.Change
	size   int64
}

type sortableBlobs []sortableBlob

func (change *sortableBlob) Less(other *sortableBlob) bool {
	return change.size < other.size
}

func (slice sortableBlobs) Len() int {
	return len(slice)
}

func (slice sortableBlobs) Less(i, j int) bool {
	return slice[i].Less(&slice[j])
}

func (slice sortableBlobs) Swap(i, j int) {
	slice[i], slice[j] = slice[j], slice[i]
}

type candidateDistance struct {
	Candidate int
	Distance  int
}

func sortRenameCandidates(candidates []int, origin string, nameGetter func(int) string) {
	distances := make([]candidateDistance, len(candidates))
	ctx := LevenshteinContext{}
	for i, x := range candidates {
		name := filepath.Base(nameGetter(x))
		distances[i] = candidateDistance{x, ctx.Distance(origin, name)}
	}
	sort.Slice(distances, func(i, j int) bool {
		return distances[i].Distance < distances[j].Distance
	})
	for i, cd := range distances {
		candidates[i] = cd.Candidate
	}
}

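// Example: with origin "foo.go", the base names "foo.go", "food.go" and
// "bar.go" have Levenshtein distances 0, 1 and 3, so sortRenameCandidates
// reorders the candidate indices nearest-first in exactly that order.
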
func init() {
	core.Registry.Register(&RenameAnalysis{})
}