renames.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533
  1. package plumbing
  2. import (
  3. "log"
  4. "path/filepath"
  5. "sort"
  6. "sync"
  7. "unicode/utf8"
  8. "github.com/sergi/go-diff/diffmatchpatch"
  9. "gopkg.in/src-d/go-git.v4"
  10. "gopkg.in/src-d/go-git.v4/plumbing"
  11. "gopkg.in/src-d/go-git.v4/plumbing/object"
  12. "gopkg.in/src-d/go-git.v4/utils/merkletrie"
  13. "gopkg.in/src-d/hercules.v7/internal"
  14. "gopkg.in/src-d/hercules.v7/internal/core"
  15. )
  16. // RenameAnalysis improves TreeDiff's results by searching for changed blobs under different
  17. // paths which are likely to be the result of a rename with subsequent edits.
  18. // RenameAnalysis is a PipelineItem.
  19. type RenameAnalysis struct {
  20. core.NoopMerger
  21. // SimilarityThreshold adjusts the heuristic to determine file renames.
  22. // It has the same units as cgit's -X rename-threshold or -M. Better to
  23. // set it to the default value of 80 (80%).
  24. SimilarityThreshold int
  25. repository *git.Repository
  26. }
  27. const (
  28. // RenameAnalysisDefaultThreshold specifies the default percentage of common lines in a pair
  29. // of files to consider them linked. The exact code of the decision is sizesAreClose().
  30. // CGit's default is 50%. Ours is 80% because 50% can be too computationally expensive.
  31. RenameAnalysisDefaultThreshold = 80
  32. // ConfigRenameAnalysisSimilarityThreshold is the name of the configuration option
  33. // (RenameAnalysis.Configure()) which sets the similarity threshold.
  34. ConfigRenameAnalysisSimilarityThreshold = "RenameAnalysis.SimilarityThreshold"
  35. // RenameAnalysisMinimumSize is the minimum size of a blob to be considered.
  36. RenameAnalysisMinimumSize = 32
  37. // RenameAnalysisMaxCandidates is the maximum number of rename candidates to consider per file.
  38. RenameAnalysisMaxCandidates = 50
  39. // RenameAnalysisSetSizeLimit is the maximum number of added + removed files for
  40. // RenameAnalysisMaxCandidates to be active; the bigger numbers set it to 1.
  41. RenameAnalysisSetSizeLimit = 1000
  42. )
  43. // Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
  44. func (ra *RenameAnalysis) Name() string {
  45. return "RenameAnalysis"
  46. }
  47. // Provides returns the list of names of entities which are produced by this PipelineItem.
  48. // Each produced entity will be inserted into `deps` of dependent Consume()-s according
  49. // to this list. Also used by core.Registry to build the global map of providers.
  50. func (ra *RenameAnalysis) Provides() []string {
  51. arr := [...]string{DependencyTreeChanges}
  52. return arr[:]
  53. }
  54. // Requires returns the list of names of entities which are needed by this PipelineItem.
  55. // Each requested entity will be inserted into `deps` of Consume(). In turn, those
  56. // entities are Provides() upstream.
  57. func (ra *RenameAnalysis) Requires() []string {
  58. arr := [...]string{DependencyBlobCache, DependencyTreeChanges}
  59. return arr[:]
  60. }
  61. // ListConfigurationOptions returns the list of changeable public properties of this PipelineItem.
  62. func (ra *RenameAnalysis) ListConfigurationOptions() []core.ConfigurationOption {
  63. options := [...]core.ConfigurationOption{{
  64. Name: ConfigRenameAnalysisSimilarityThreshold,
  65. Description: "The threshold on the similarity index used to detect renames.",
  66. Flag: "M",
  67. Type: core.IntConfigurationOption,
  68. Default: RenameAnalysisDefaultThreshold},
  69. }
  70. return options[:]
  71. }
  72. // Configure sets the properties previously published by ListConfigurationOptions().
  73. func (ra *RenameAnalysis) Configure(facts map[string]interface{}) error {
  74. if val, exists := facts[ConfigRenameAnalysisSimilarityThreshold].(int); exists {
  75. ra.SimilarityThreshold = val
  76. }
  77. return nil
  78. }
  79. // Initialize resets the temporary caches and prepares this PipelineItem for a series of Consume()
  80. // calls. The repository which is going to be analysed is supplied as an argument.
  81. func (ra *RenameAnalysis) Initialize(repository *git.Repository) error {
  82. if ra.SimilarityThreshold < 0 || ra.SimilarityThreshold > 100 {
  83. log.Printf("Warning: adjusted the similarity threshold to %d\n",
  84. RenameAnalysisDefaultThreshold)
  85. ra.SimilarityThreshold = RenameAnalysisDefaultThreshold
  86. }
  87. ra.repository = repository
  88. return nil
  89. }
  90. // Consume runs this PipelineItem on the next commit data.
  91. // `deps` contain all the results from upstream PipelineItem-s as requested by Requires().
  92. // Additionally, DependencyCommit is always present there and represents the analysed *object.Commit.
  93. // This function returns the mapping with analysis results. The keys must be the same as
  94. // in Provides(). If there was an error, nil is returned.
  95. func (ra *RenameAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  96. changes := deps[DependencyTreeChanges].(object.Changes)
  97. cache := deps[DependencyBlobCache].(map[plumbing.Hash]*CachedBlob)
  98. reducedChanges := make(object.Changes, 0, changes.Len())
  99. // Stage 1 - find renames by matching the hashes
  100. // n log(n)
  101. // We sort additions and deletions by hash and then do the single scan along
  102. // both slices.
  103. deleted := make(sortableChanges, 0, changes.Len())
  104. added := make(sortableChanges, 0, changes.Len())
  105. for _, change := range changes {
  106. action, err := change.Action()
  107. if err != nil {
  108. return nil, err
  109. }
  110. switch action {
  111. case merkletrie.Insert:
  112. added = append(added, sortableChange{change, change.To.TreeEntry.Hash})
  113. case merkletrie.Delete:
  114. deleted = append(deleted, sortableChange{change, change.From.TreeEntry.Hash})
  115. case merkletrie.Modify:
  116. reducedChanges = append(reducedChanges, change)
  117. }
  118. }
  119. sort.Sort(deleted)
  120. sort.Sort(added)
  121. stillDeleted := make(object.Changes, 0, deleted.Len())
  122. stillAdded := make(object.Changes, 0, added.Len())
  123. {
  124. a := 0
  125. d := 0
  126. for a < added.Len() && d < deleted.Len() {
  127. if added[a].hash == deleted[d].hash {
  128. reducedChanges = append(
  129. reducedChanges,
  130. &object.Change{From: deleted[d].change.From, To: added[a].change.To})
  131. a++
  132. d++
  133. } else if added[a].Less(&deleted[d]) {
  134. stillAdded = append(stillAdded, added[a].change)
  135. a++
  136. } else {
  137. stillDeleted = append(stillDeleted, deleted[d].change)
  138. d++
  139. }
  140. }
  141. for ; a < added.Len(); a++ {
  142. stillAdded = append(stillAdded, added[a].change)
  143. }
  144. for ; d < deleted.Len(); d++ {
  145. stillDeleted = append(stillDeleted, deleted[d].change)
  146. }
  147. }
  148. // Stage 2 - apply the similarity threshold
  149. // n^2 but actually linear
  150. // We sort the blobs by size and do the single linear scan.
  151. maxCandidates := RenameAnalysisMaxCandidates
  152. if len(stillAdded)+len(stillDeleted) > RenameAnalysisSetSizeLimit {
  153. maxCandidates = 1
  154. }
  155. addedBlobs := make(sortableBlobs, 0, stillAdded.Len())
  156. deletedBlobs := make(sortableBlobs, 0, stillDeleted.Len())
  157. var smallChanges []*object.Change
  158. for _, change := range stillAdded {
  159. blob := cache[change.To.TreeEntry.Hash]
  160. if blob.Size < RenameAnalysisMinimumSize {
  161. smallChanges = append(smallChanges, change)
  162. } else {
  163. addedBlobs = append(
  164. addedBlobs, sortableBlob{change: change, size: blob.Size})
  165. }
  166. }
  167. for _, change := range stillDeleted {
  168. blob := cache[change.From.TreeEntry.Hash]
  169. if blob.Size < RenameAnalysisMinimumSize {
  170. smallChanges = append(smallChanges, change)
  171. } else {
  172. deletedBlobs = append(
  173. deletedBlobs, sortableBlob{change: change, size: blob.Size})
  174. }
  175. }
  176. sort.Sort(addedBlobs)
  177. sort.Sort(deletedBlobs)
  178. var finished, finishedA, finishedB bool
  179. matchesA := make(object.Changes, 0, changes.Len())
  180. matchesB := make(object.Changes, 0, changes.Len())
  181. addedBlobsA := addedBlobs
  182. addedBlobsB := make(sortableBlobs, len(addedBlobs))
  183. copy(addedBlobsB, addedBlobs)
  184. deletedBlobsA := deletedBlobs
  185. deletedBlobsB := make(sortableBlobs, len(deletedBlobs))
  186. copy(deletedBlobsB, deletedBlobs)
  187. wg := sync.WaitGroup{}
  188. matchA := func() error {
  189. defer func() {
  190. finished = true
  191. wg.Done()
  192. }()
  193. aStart := 0
  194. // we will try to find a matching added blob for each deleted blob
  195. for d := 0; d < deletedBlobsA.Len(); d++ {
  196. myBlob := cache[deletedBlobsA[d].change.From.TreeEntry.Hash]
  197. mySize := deletedBlobsA[d].size
  198. myName := filepath.Base(deletedBlobsA[d].change.From.Name)
  199. var a int
  200. for a = aStart; a < addedBlobsA.Len() && !ra.sizesAreClose(mySize, addedBlobsA[a].size); a++ {
  201. }
  202. aStart = a
  203. foundMatch := false
  204. // get the list of possible candidates and sort by file name similarity
  205. var candidates []int
  206. for a = aStart; a < addedBlobsA.Len() && ra.sizesAreClose(mySize, addedBlobsA[a].size); a++ {
  207. candidates = append(candidates, a)
  208. }
  209. sortRenameCandidates(candidates, myName, func(a int) string {
  210. return addedBlobsA[a].change.To.Name
  211. })
  212. var ci int
  213. for ci, a = range candidates {
  214. if finished {
  215. return nil
  216. }
  217. if ci > maxCandidates {
  218. break
  219. }
  220. blobsAreClose, err := ra.blobsAreClose(
  221. myBlob, cache[addedBlobsA[a].change.To.TreeEntry.Hash])
  222. if err != nil {
  223. return err
  224. }
  225. if blobsAreClose {
  226. foundMatch = true
  227. matchesA = append(
  228. matchesA,
  229. &object.Change{
  230. From: deletedBlobsA[d].change.From,
  231. To: addedBlobsA[a].change.To})
  232. break
  233. }
  234. }
  235. if foundMatch {
  236. deletedBlobsA = append(deletedBlobsA[:d], deletedBlobsA[d+1:]...)
  237. d--
  238. addedBlobsA = append(addedBlobsA[:a], addedBlobsA[a+1:]...)
  239. }
  240. }
  241. finishedA = true
  242. return nil
  243. }
  244. matchB := func() error {
  245. defer func() {
  246. finished = true
  247. wg.Done()
  248. }()
  249. dStart := 0
  250. for a := 0; a < addedBlobsB.Len(); a++ {
  251. myBlob := cache[addedBlobsB[a].change.To.TreeEntry.Hash]
  252. mySize := addedBlobsB[a].size
  253. myName := filepath.Base(addedBlobsB[a].change.To.Name)
  254. var d int
  255. for d = dStart; d < deletedBlobsB.Len() && !ra.sizesAreClose(mySize, deletedBlobsB[d].size); d++ {
  256. }
  257. dStart = d
  258. foundMatch := false
  259. // get the list of possible candidates and sort by file name similarity
  260. var candidates []int
  261. for d = dStart; d < deletedBlobsB.Len() && ra.sizesAreClose(mySize, deletedBlobsB[d].size); d++ {
  262. candidates = append(candidates, d)
  263. }
  264. sortRenameCandidates(candidates, myName, func(d int) string {
  265. return deletedBlobsB[d].change.From.Name
  266. })
  267. var ci int
  268. for ci, d = range candidates {
  269. if finished {
  270. return nil
  271. }
  272. if ci > maxCandidates {
  273. break
  274. }
  275. blobsAreClose, err := ra.blobsAreClose(
  276. myBlob, cache[deletedBlobsB[d].change.From.TreeEntry.Hash])
  277. if err != nil {
  278. return err
  279. }
  280. if blobsAreClose {
  281. foundMatch = true
  282. matchesB = append(
  283. matchesB,
  284. &object.Change{
  285. From: deletedBlobsB[d].change.From,
  286. To: addedBlobsB[a].change.To})
  287. break
  288. }
  289. }
  290. if foundMatch {
  291. addedBlobsB = append(addedBlobsB[:a], addedBlobsB[a+1:]...)
  292. a--
  293. deletedBlobsB = append(deletedBlobsB[:d], deletedBlobsB[d+1:]...)
  294. }
  295. }
  296. finishedB = true
  297. return nil
  298. }
  299. // run two functions in parallel, and take the result from the one which finished earlier
  300. wg.Add(2)
  301. var err error
  302. go func() { err = matchA() }()
  303. go func() { err = matchB() }()
  304. wg.Wait()
  305. if err != nil {
  306. return nil, err
  307. }
  308. var matches object.Changes
  309. if finishedA {
  310. addedBlobs = addedBlobsA
  311. deletedBlobs = deletedBlobsA
  312. matches = matchesA
  313. } else {
  314. if !finishedB {
  315. panic("Impossible happened: two functions returned without an error " +
  316. "but no results from both")
  317. }
  318. addedBlobs = addedBlobsB
  319. deletedBlobs = deletedBlobsB
  320. matches = matchesB
  321. }
  322. // Stage 3 - we give up, everything left are independent additions and deletions
  323. for _, change := range matches {
  324. reducedChanges = append(reducedChanges, change)
  325. }
  326. for _, blob := range addedBlobs {
  327. reducedChanges = append(reducedChanges, blob.change)
  328. }
  329. for _, blob := range deletedBlobs {
  330. reducedChanges = append(reducedChanges, blob.change)
  331. }
  332. for _, change := range smallChanges {
  333. reducedChanges = append(reducedChanges, change)
  334. }
  335. return map[string]interface{}{DependencyTreeChanges: reducedChanges}, nil
  336. }
  337. // Fork clones this PipelineItem.
  338. func (ra *RenameAnalysis) Fork(n int) []core.PipelineItem {
  339. return core.ForkSamePipelineItem(ra, n)
  340. }
  341. func (ra *RenameAnalysis) sizesAreClose(size1 int64, size2 int64) bool {
  342. size := internal.Max64(1, internal.Max64(size1, size2))
  343. return (internal.Abs64(size1-size2)*10000)/size <= int64(100-ra.SimilarityThreshold)*100
  344. }
  345. func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (bool, error) {
  346. defer func() {
  347. if err := recover(); err != nil {
  348. log.Println()
  349. log.Println(blob1.Hash.String())
  350. log.Println(blob2.Hash.String())
  351. panic(err)
  352. }
  353. }()
  354. _, err1 := blob1.CountLines()
  355. _, err2 := blob2.CountLines()
  356. if err1 == ErrorBinary || err2 == ErrorBinary {
  357. // binary mode
  358. bsdifflen := DiffBytes(blob1.Data, blob2.Data)
  359. delta := int((int64(bsdifflen) * 100) / internal.Max64(
  360. internal.Min64(blob1.Size, blob2.Size), 1))
  361. return 100-delta >= ra.SimilarityThreshold, nil
  362. }
  363. src, dst := string(blob1.Data), string(blob2.Data)
  364. maxSize := internal.Max(1, internal.Max(utf8.RuneCountInString(src), utf8.RuneCountInString(dst)))
  365. // compute the line-by-line diff, then the char-level diffs of the del-ins blocks
  366. // yes, this algorithm is greedy and not exact
  367. dmp := diffmatchpatch.New()
  368. srcLines, dstLines, lines := dmp.DiffLinesToRunes(src, dst)
  369. diffs := dmp.DiffMainRunes(srcLines, dstLines, false)
  370. var common, posSrc, prevPosSrc, posDst int
  371. possibleDelInsBlock := false
  372. for _, edit := range diffs {
  373. switch edit.Type {
  374. case diffmatchpatch.DiffDelete:
  375. possibleDelInsBlock = true
  376. prevPosSrc = posSrc
  377. for _, lineno := range edit.Text {
  378. posSrc += len(lines[lineno])
  379. }
  380. case diffmatchpatch.DiffInsert:
  381. nextPosDst := posDst
  382. for _, lineno := range edit.Text {
  383. nextPosDst += len(lines[lineno])
  384. }
  385. if possibleDelInsBlock {
  386. possibleDelInsBlock = false
  387. localDmp := diffmatchpatch.New()
  388. localSrc := src[prevPosSrc:posSrc]
  389. localDst := dst[posDst:nextPosDst]
  390. localDiffs := localDmp.DiffMainRunes([]rune(localSrc), []rune(localDst), false)
  391. for _, localEdit := range localDiffs {
  392. if localEdit.Type == diffmatchpatch.DiffEqual {
  393. common += utf8.RuneCountInString(localEdit.Text)
  394. }
  395. }
  396. }
  397. posDst = nextPosDst
  398. case diffmatchpatch.DiffEqual:
  399. possibleDelInsBlock = false
  400. for _, lineno := range edit.Text {
  401. common += utf8.RuneCountInString(lines[lineno])
  402. step := len(lines[lineno])
  403. posSrc += step
  404. posDst += step
  405. }
  406. }
  407. if possibleDelInsBlock {
  408. continue
  409. }
  410. // supposing that the rest of the lines are the same (they are not - too optimistic),
  411. // estimate the maximum similarity and exit the loop if it lower than our threshold
  412. var srcPendingSize, dstPendingSize int
  413. if posSrc < len(src) {
  414. srcPendingSize = utf8.RuneCountInString(src[posSrc:])
  415. }
  416. if posDst < len(dst) {
  417. dstPendingSize = utf8.RuneCountInString(dst[posDst:])
  418. }
  419. maxCommon := common + internal.Min(srcPendingSize, dstPendingSize)
  420. similarity := (maxCommon * 100) / maxSize
  421. if similarity < ra.SimilarityThreshold {
  422. return false, nil
  423. }
  424. similarity = (common * 100) / maxSize
  425. if similarity >= ra.SimilarityThreshold {
  426. return true, nil
  427. }
  428. }
  429. // the very last "overly optimistic" estimate was actually precise, so since we are still here
  430. // the blobs are similar
  431. return true, nil
  432. }
  433. type sortableChange struct {
  434. change *object.Change
  435. hash plumbing.Hash
  436. }
  437. type sortableChanges []sortableChange
  438. func (change *sortableChange) Less(other *sortableChange) bool {
  439. for x := 0; x < 20; x++ {
  440. if change.hash[x] < other.hash[x] {
  441. return true
  442. }
  443. }
  444. return false
  445. }
  446. func (slice sortableChanges) Len() int {
  447. return len(slice)
  448. }
  449. func (slice sortableChanges) Less(i, j int) bool {
  450. return slice[i].Less(&slice[j])
  451. }
  452. func (slice sortableChanges) Swap(i, j int) {
  453. slice[i], slice[j] = slice[j], slice[i]
  454. }
  455. type sortableBlob struct {
  456. change *object.Change
  457. size int64
  458. }
  459. type sortableBlobs []sortableBlob
  460. func (change *sortableBlob) Less(other *sortableBlob) bool {
  461. return change.size < other.size
  462. }
  463. func (slice sortableBlobs) Len() int {
  464. return len(slice)
  465. }
  466. func (slice sortableBlobs) Less(i, j int) bool {
  467. return slice[i].Less(&slice[j])
  468. }
  469. func (slice sortableBlobs) Swap(i, j int) {
  470. slice[i], slice[j] = slice[j], slice[i]
  471. }
  472. type candidateDistance struct {
  473. Candidate int
  474. Distance int
  475. }
  476. func sortRenameCandidates(candidates []int, origin string, nameGetter func(int) string) {
  477. distances := make([]candidateDistance, len(candidates))
  478. ctx := LevenshteinContext{}
  479. for i, x := range candidates {
  480. name := filepath.Base(nameGetter(x))
  481. distances[i] = candidateDistance{x, ctx.Distance(origin, name)}
  482. }
  483. sort.Slice(distances, func(i, j int) bool {
  484. return distances[i].Distance < distances[j].Distance
  485. })
  486. for i, cd := range distances {
  487. candidates[i] = cd.Candidate
  488. }
  489. }
  490. func init() {
  491. core.Registry.Register(&RenameAnalysis{})
  492. }