package hercules

import (
	"fmt"
	"os"
	"sort"
	"unicode/utf8"

	"github.com/sergi/go-diff/diffmatchpatch"
	"gopkg.in/src-d/go-git.v4"
	"gopkg.in/src-d/go-git.v4/plumbing"
	"gopkg.in/src-d/go-git.v4/plumbing/object"
	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
)

// RenameAnalysis improves TreeDiff's results by searching for changed blobs under different
// paths which are likely to be the result of a rename with subsequent edits.
// RenameAnalysis is a PipelineItem.
type RenameAnalysis struct {
	// SimilarityThreshold adjusts the heuristic to determine file renames.
	// It has the same units as cgit's -X rename-threshold or -M. The default
	// value of 90 (90%) is recommended.
	SimilarityThreshold int

	repository *git.Repository
}

const (
	// RenameAnalysisDefaultThreshold specifies the default percentage of common lines in a pair
	// of files to consider them linked. The decision itself is made in sizesAreClose()
	// and blobsAreClose().
	RenameAnalysisDefaultThreshold = 90

	// ConfigRenameAnalysisSimilarityThreshold is the name of the configuration option
	// (RenameAnalysis.Configure()) which sets the similarity threshold.
	ConfigRenameAnalysisSimilarityThreshold = "RenameAnalysis.SimilarityThreshold"
)

// Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
func (ra *RenameAnalysis) Name() string {
	return "RenameAnalysis"
}

// Provides returns the list of names of entities which are produced by this PipelineItem.
func (ra *RenameAnalysis) Provides() []string {
	arr := [...]string{DependencyTreeChanges}
	return arr[:]
}

// Requires returns the list of names of entities which are needed by this PipelineItem.
func (ra *RenameAnalysis) Requires() []string {
	arr := [...]string{DependencyBlobCache, DependencyTreeChanges}
	return arr[:]
}

// ListConfigurationOptions returns the list of changeable public properties of this PipelineItem.
func (ra *RenameAnalysis) ListConfigurationOptions() []ConfigurationOption {
	options := [...]ConfigurationOption{{
		Name:        ConfigRenameAnalysisSimilarityThreshold,
		Description: "The threshold on the similarity index used to detect renames.",
		Flag:        "M",
		Type:        IntConfigurationOption,
		Default:     RenameAnalysisDefaultThreshold},
	}
	return options[:]
}

// Configure sets the properties previously published by ListConfigurationOptions().
func (ra *RenameAnalysis) Configure(facts map[string]interface{}) {
	if val, exists := facts[ConfigRenameAnalysisSimilarityThreshold].(int); exists {
		ra.SimilarityThreshold = val
	}
}

// Initialize prepares this PipelineItem for a series of Consume() invocations.
// The repository which is going to be analysed is supplied as an argument.
func (ra *RenameAnalysis) Initialize(repository *git.Repository) {
	if ra.SimilarityThreshold < 0 || ra.SimilarityThreshold > 100 {
		fmt.Fprintf(os.Stderr, "Warning: adjusted the similarity threshold to %d\n",
			RenameAnalysisDefaultThreshold)
		ra.SimilarityThreshold = RenameAnalysisDefaultThreshold
	}
	ra.repository = repository
}

// Consume runs this PipelineItem on the next commit data.
// `deps` contain all the results from the upstream PipelineItem-s as requested by Requires().
// This function returns the mapping with analysis results; the keys are the same as in Provides().
func (ra *RenameAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
	changes := deps[DependencyTreeChanges].(object.Changes)
	cache := deps[DependencyBlobCache].(map[plumbing.Hash]*object.Blob)

	reducedChanges := make(object.Changes, 0, changes.Len())

	// Stage 1 - find renames by matching the hashes
	// n log(n)
	// We sort additions and deletions by hash and then do the single scan along
	// both slices.
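	// For example, a commit that deletes "old/name.go" and adds "new/name.go" with an
	// identical blob hash produces one merged change here (From: the deleted entry,
	// To: the added entry), so the pair never reaches the more expensive Stage 2.
	// The file names above are only an illustration.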
	deleted := make(sortableChanges, 0, changes.Len())
	added := make(sortableChanges, 0, changes.Len())
	for _, change := range changes {
		action, err := change.Action()
		if err != nil {
			return nil, err
		}
		switch action {
		case merkletrie.Insert:
			added = append(added, sortableChange{change, change.To.TreeEntry.Hash})
		case merkletrie.Delete:
			deleted = append(deleted, sortableChange{change, change.From.TreeEntry.Hash})
		case merkletrie.Modify:
			reducedChanges = append(reducedChanges, change)
		}
	}
	sort.Sort(deleted)
	sort.Sort(added)
	a := 0
	d := 0
	stillDeleted := make(object.Changes, 0, deleted.Len())
	stillAdded := make(object.Changes, 0, added.Len())
	for a < added.Len() && d < deleted.Len() {
		if added[a].hash == deleted[d].hash {
			reducedChanges = append(
				reducedChanges,
				&object.Change{From: deleted[d].change.From, To: added[a].change.To})
			a++
			d++
		} else if added[a].Less(&deleted[d]) {
			stillAdded = append(stillAdded, added[a].change)
			a++
		} else {
			stillDeleted = append(stillDeleted, deleted[d].change)
			d++
		}
	}
	for ; a < added.Len(); a++ {
		stillAdded = append(stillAdded, added[a].change)
	}
	for ; d < deleted.Len(); d++ {
		stillDeleted = append(stillDeleted, deleted[d].change)
	}

	// Stage 2 - apply the similarity threshold
	// n^2 but actually linear
	// We sort the blobs by size and do the single linear scan.
	addedBlobs := make(sortableBlobs, 0, stillAdded.Len())
	deletedBlobs := make(sortableBlobs, 0, stillDeleted.Len())
	for _, change := range stillAdded {
		blob := cache[change.To.TreeEntry.Hash]
		addedBlobs = append(
			addedBlobs, sortableBlob{change: change, size: blob.Size})
	}
	for _, change := range stillDeleted {
		blob := cache[change.From.TreeEntry.Hash]
		deletedBlobs = append(
			deletedBlobs, sortableBlob{change: change, size: blob.Size})
	}
	sort.Sort(addedBlobs)
	sort.Sort(deletedBlobs)
	dStart := 0
	for a = 0; a < addedBlobs.Len(); a++ {
		myBlob := cache[addedBlobs[a].change.To.TreeEntry.Hash]
		mySize := addedBlobs[a].size
		for d = dStart; d < deletedBlobs.Len() && !ra.sizesAreClose(mySize, deletedBlobs[d].size); d++ {
		}
		dStart = d
		foundMatch := false
		for d = dStart; d < deletedBlobs.Len() && ra.sizesAreClose(mySize, deletedBlobs[d].size); d++ {
			blobsAreClose, err := ra.blobsAreClose(
				myBlob, cache[deletedBlobs[d].change.From.TreeEntry.Hash])
			if err != nil {
				return nil, err
			}
			if blobsAreClose {
				foundMatch = true
				reducedChanges = append(
					reducedChanges,
					&object.Change{From: deletedBlobs[d].change.From, To: addedBlobs[a].change.To})
				break
			}
		}
		if foundMatch {
			addedBlobs = append(addedBlobs[:a], addedBlobs[a+1:]...)
			a--
			deletedBlobs = append(deletedBlobs[:d], deletedBlobs[d+1:]...)
		}
	}

	// Stage 3 - we give up; everything left is treated as independent additions and deletions
	for _, blob := range addedBlobs {
		reducedChanges = append(reducedChanges, blob.change)
	}
	for _, blob := range deletedBlobs {
		reducedChanges = append(reducedChanges, blob.change)
	}
	return map[string]interface{}{DependencyTreeChanges: reducedChanges}, nil
}

// sizesAreClose is the cheap pre-filter: two blobs can only form a rename pair if their sizes
// differ by no more than (100 - SimilarityThreshold) percent of the smaller size.
func (ra *RenameAnalysis) sizesAreClose(size1 int64, size2 int64) bool {
	return abs64(size1-size2)*100/max64(1, min64(size1, size2)) <= int64(100-ra.SimilarityThreshold)
}

// blobsAreClose performs the expensive check: it diffs the two blobs line by line and requires
// the number of common lines to be at least SimilarityThreshold percent of the smaller blob.
func (ra *RenameAnalysis) blobsAreClose(
	blob1 *object.Blob, blob2 *object.Blob) (bool, error) {
	strFrom, err := BlobToString(blob1)
	if err != nil {
		return false, err
	}
	strTo, err := BlobToString(blob2)
	if err != nil {
		return false, err
	}
	dmp := diffmatchpatch.New()
	src, dst, _ := dmp.DiffLinesToRunes(strFrom, strTo)
	diffs := dmp.DiffMainRunes(src, dst, false)
	common := 0
	for _, edit := range diffs {
		if edit.Type == diffmatchpatch.DiffEqual {
			common += utf8.RuneCountInString(edit.Text)
		}
	}
	return common*100/max(1, min(len(src), len(dst))) >= ra.SimilarityThreshold, nil
}

type sortableChange struct {
	change *object.Change
	hash   plumbing.Hash
}

type sortableChanges []sortableChange

// Less compares the two hashes byte by byte; the first differing byte decides the order.
func (change *sortableChange) Less(other *sortableChange) bool {
	for x := 0; x < 20; x++ {
		if change.hash[x] != other.hash[x] {
			return change.hash[x] < other.hash[x]
		}
	}
	return false
}

func (slice sortableChanges) Len() int {
	return len(slice)
}

func (slice sortableChanges) Less(i, j int) bool {
	return slice[i].Less(&slice[j])
}

func (slice sortableChanges) Swap(i, j int) {
	slice[i], slice[j] = slice[j], slice[i]
}

type sortableBlob struct {
	change *object.Change
	size   int64
}

type sortableBlobs []sortableBlob

func (change *sortableBlob) Less(other *sortableBlob) bool {
	return change.size < other.size
}

func (slice sortableBlobs) Len() int {
	return len(slice)
}

func (slice sortableBlobs) Less(i, j int) bool {
	return slice[i].Less(&slice[j])
}

func (slice sortableBlobs) Swap(i, j int) {
	slice[i], slice[j] = slice[j], slice[i]
}

func init() {
	Registry.Register(&RenameAnalysis{})
}
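
// similarityExample is an illustrative sketch and is never called by the pipeline; the
// function name and the sizes are made up for the example. It shows how the default 90%
// threshold translates into the size pre-filter implemented by sizesAreClose() above.
func similarityExample() {
	ra := &RenameAnalysis{SimilarityThreshold: RenameAnalysisDefaultThreshold}
	// 1000 vs 1080 bytes: the difference is 8% of the smaller size, within the allowed
	// 100 - 90 = 10%, so the pair would proceed to the line-level blobsAreClose() check.
	fmt.Println(ra.sizesAreClose(1000, 1080)) // true
	// 1000 vs 1200 bytes: a 20% difference, rejected without diffing the contents.
	fmt.Println(ra.sizesAreClose(1000, 1200)) // false
}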