ソースを参照

Merge pull request #136 from vmarkovtsev/master

Optimize renames detection
Vadim Markovtsev 6 年 前
コミット
8b2349d7e6

+ 2 - 2
Gopkg.lock

@@ -214,11 +214,11 @@
 
 [[projects]]
   branch = "master"
-  digest = "1:11847ffa6c699272c5eec9a987a6a50e81fcab8e9379857a1c6998150bad7d72"
+  digest = "1:d0d418e1c02e6fc00259ef09d0d4f5135fc6aedac356ff0a11f4e5ef0c447270"
   name = "github.com/sergi/go-diff"
   packages = ["diffmatchpatch"]
   pruneopts = "UT"
-  revision = "feef008d51ad2b3778f85d387ccf91735543008d"
+  revision = "da645544ed44df016359bd4c0e3dc60ee3a0da43"
 
 [[projects]]
   digest = "1:645cabccbb4fa8aab25a956cbcbdf6a6845ca736b2c64e197ca7cbb9d210b939"

+ 1 - 1
Gopkg.toml

@@ -31,7 +31,7 @@
   name = "github.com/pkg/errors"
 
 [[constraint]]
-  branch = "master"
+  revision = "da645544ed44df016359bd4c0e3dc60ee3a0da43"
   name = "github.com/sergi/go-diff"
 
 [[constraint]]

+ 81 - 0
internal/plumbing/levenshtein.go

@@ -0,0 +1,81 @@
+// Copyright (c) 2015, Arbo von Monkiewitsch All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license.
+
+package plumbing
+
// LevenshteinContext is the object which allows to calculate the Levenshtein
// distance with the Distance() method. It is needed to ensure 0 memory
// allocations: the scratch column is cached and reused between calls.
// It is not safe for concurrent use.
type LevenshteinContext struct {
	// intSlice is the reusable scratch column for Distance.
	intSlice []int
}

// getIntSlice returns a scratch []int of length l, reusing the cached
// backing array whenever its capacity suffices.
func (c *LevenshteinContext) getIntSlice(l int) []int {
	if cap(c.intSlice) < l {
		c.intSlice = make([]int, l)
	}
	return c.intSlice[:l]
}

// Distance calculates the Levenshtein distance between two strings which
// is defined as the minimum number of edits needed to transform one string
// into the other, with the allowable edit operations being insertion, deletion,
// or substitution of a single character.
// http://en.wikipedia.org/wiki/Levenshtein_distance
//
// This implementation is optimized to use O(min(m,n)) space and operates on
// runes, so multi-byte UTF-8 characters count as single edits.
// It is based on the optimized C version found here:
// http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Levenshtein_distance#C
func (c *LevenshteinContext) Distance(str1, str2 string) int {
	s1 := []rune(str1)
	s2 := []rune(str2)

	// The distance is symmetric; keep the shorter string in s1 so the
	// scratch column really is O(min(m,n)) and not O(len(str1)).
	if len(s1) > len(s2) {
		s1, s2 = s2, s1
	}

	lenS1 := len(s1)
	lenS2 := len(s2)

	if lenS2 == 0 {
		// Because lenS1 <= lenS2, both strings are empty here.
		return lenS1
	}

	column := c.getIntSlice(lenS1 + 1)
	// column[0] will be initialised at the start of the first loop before it
	// is read, unless lenS2 is zero, which we deal with above.
	for i := 1; i <= lenS1; i++ {
		column[i] = i
	}

	for x := 0; x < lenS2; x++ {
		s2Rune := s2[x]
		column[0] = x + 1
		lastdiag := x

		for y := 0; y < lenS1; y++ {
			olddiag := column[y+1]
			cost := 0
			if s1[y] != s2Rune {
				cost = 1
			}
			column[y+1] = min(
				column[y+1]+1, // deletion
				column[y]+1,   // insertion
				lastdiag+cost, // substitution (free when the runes match)
			)
			lastdiag = olddiag
		}
	}

	return column[lenS1]
}

// min returns the smallest of the three arguments.
func min(a, b, c int) int {
	if a < b {
		if a < c {
			return a
		}
	} else {
		if b < c {
			return b
		}
	}
	return c
}

+ 79 - 0
internal/plumbing/levenshtein_test.go

@@ -0,0 +1,79 @@
+// Copyright (c) 2015, Arbo von Monkiewitsch All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license.
+
+package plumbing
+
+import (
+	"fmt"
+	"testing"
+)
+
// distanceTests is the table of string pairs and their expected Levenshtein
// distances, covering equal strings, empty strings, and multi-byte
// (non-ASCII) runes. The duplicated {"ab", "aa", 1} row was removed.
var distanceTests = []struct {
	first  string
	second string
	wanted int
}{
	{"a", "a", 0},
	{"ab", "ab", 0},
	{"ab", "aa", 1},
	{"ab", "aaa", 2},
	{"bbb", "a", 3},
	{"kitten", "sitting", 3},
	{"a", "", 1},
	{"", "a", 1},
	{"aa", "aü", 1},
	{"Fön", "Föm", 1},
}
+
+func TestDistance(t *testing.T) {
+
+	lev := &LevenshteinContext{}
+
+	for index, distanceTest := range distanceTests {
+		result := lev.Distance(distanceTest.first, distanceTest.second)
+		if result != distanceTest.wanted {
+			output := fmt.Sprintf("%v \t distance of %v and %v should be %v but was %v.",
+				index, distanceTest.first, distanceTest.second, distanceTest.wanted, result)
+			t.Errorf(output)
+		}
+	}
+}
+
+func BenchmarkDistance(b *testing.B) {
+	s1 := "frederick"
+	s2 := "fredelstick"
+	total := 0
+
+	b.ReportAllocs()
+	b.ResetTimer()
+
+	c := &LevenshteinContext{}
+
+	for i := 0; i < b.N; i++ {
+		total += c.Distance(s1, s2)
+	}
+
+	if total == 0 {
+		b.Logf("total is %d", total)
+	}
+}
+
+func BenchmarkDistanceOriginal(b *testing.B) {
+	s1 := "frederick"
+	s2 := "fredelstick"
+	total := 0
+
+	b.ReportAllocs()
+	b.ResetTimer()
+
+	ctx := LevenshteinContext{}
+	for i := 0; i < b.N; i++ {
+		total += ctx.Distance(s1, s2)
+	}
+
+	if total == 0 {
+		b.Logf("total is %d", total)
+	}
+}

+ 208 - 47
internal/plumbing/renames.go

@@ -2,7 +2,9 @@ package plumbing
 
 import (
 	"log"
+	"path/filepath"
 	"sort"
+	"sync"
 	"unicode/utf8"
 
 	"github.com/sergi/go-diff/diffmatchpatch"
@@ -36,6 +38,12 @@ const (
 	// ConfigRenameAnalysisSimilarityThreshold is the name of the configuration option
 	// (RenameAnalysis.Configure()) which sets the similarity threshold.
 	ConfigRenameAnalysisSimilarityThreshold = "RenameAnalysis.SimilarityThreshold"
+
+	// RenameAnalysisMinimumSize is the minimum size of a blob to be considered.
+	RenameAnalysisMinimumSize = 32
+
+	// RenameAnalysisMaxCandidates is the maximum number of rename candidates to consider per file.
+	RenameAnalysisMaxCandidates = 50
 )
 
 // Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
@@ -122,86 +130,219 @@ func (ra *RenameAnalysis) Consume(deps map[string]interface{}) (map[string]inter
 	}
 	sort.Sort(deleted)
 	sort.Sort(added)
-	a := 0
-	d := 0
 	stillDeleted := make(object.Changes, 0, deleted.Len())
 	stillAdded := make(object.Changes, 0, added.Len())
-	for a < added.Len() && d < deleted.Len() {
-		if added[a].hash == deleted[d].hash {
-			reducedChanges = append(
-				reducedChanges,
-				&object.Change{From: deleted[d].change.From, To: added[a].change.To})
-			a++
-			d++
-		} else if added[a].Less(&deleted[d]) {
+	{
+		a := 0
+		d := 0
+		for a < added.Len() && d < deleted.Len() {
+			if added[a].hash == deleted[d].hash {
+				reducedChanges = append(
+					reducedChanges,
+					&object.Change{From: deleted[d].change.From, To: added[a].change.To})
+				a++
+				d++
+			} else if added[a].Less(&deleted[d]) {
+				stillAdded = append(stillAdded, added[a].change)
+				a++
+			} else {
+				stillDeleted = append(stillDeleted, deleted[d].change)
+				d++
+			}
+		}
+		for ; a < added.Len(); a++ {
 			stillAdded = append(stillAdded, added[a].change)
-			a++
-		} else {
+		}
+		for ; d < deleted.Len(); d++ {
 			stillDeleted = append(stillDeleted, deleted[d].change)
-			d++
 		}
 	}
-	for ; a < added.Len(); a++ {
-		stillAdded = append(stillAdded, added[a].change)
-	}
-	for ; d < deleted.Len(); d++ {
-		stillDeleted = append(stillDeleted, deleted[d].change)
-	}
 
 	// Stage 2 - apply the similarity threshold
 	// n^2 but actually linear
 	// We sort the blobs by size and do the single linear scan.
 	addedBlobs := make(sortableBlobs, 0, stillAdded.Len())
 	deletedBlobs := make(sortableBlobs, 0, stillDeleted.Len())
+	var smallChanges []*object.Change
 	for _, change := range stillAdded {
 		blob := cache[change.To.TreeEntry.Hash]
-		addedBlobs = append(
-			addedBlobs, sortableBlob{change: change, size: blob.Size})
+		if blob.Size < RenameAnalysisMinimumSize {
+			smallChanges = append(smallChanges, change)
+		} else {
+			addedBlobs = append(
+				addedBlobs, sortableBlob{change: change, size: blob.Size})
+		}
 	}
 	for _, change := range stillDeleted {
 		blob := cache[change.From.TreeEntry.Hash]
-		deletedBlobs = append(
-			deletedBlobs, sortableBlob{change: change, size: blob.Size})
+		if blob.Size < RenameAnalysisMinimumSize {
+			smallChanges = append(smallChanges, change)
+		} else {
+			deletedBlobs = append(
+				deletedBlobs, sortableBlob{change: change, size: blob.Size})
+		}
 	}
 	sort.Sort(addedBlobs)
 	sort.Sort(deletedBlobs)
-	dStart := 0
-	for a = 0; a < addedBlobs.Len(); a++ {
-		myBlob := cache[addedBlobs[a].change.To.TreeEntry.Hash]
-		mySize := addedBlobs[a].size
-		for d = dStart; d < deletedBlobs.Len() && !ra.sizesAreClose(mySize, deletedBlobs[d].size); d++ {
+
+	var finished, finishedA, finishedB bool
+	matchesA := make(object.Changes, 0, changes.Len())
+	matchesB := make(object.Changes, 0, changes.Len())
+	addedBlobsA := addedBlobs
+	addedBlobsB := make(sortableBlobs, len(addedBlobs))
+	copy(addedBlobsB, addedBlobs)
+	deletedBlobsA := deletedBlobs
+	deletedBlobsB := make(sortableBlobs, len(deletedBlobs))
+	copy(deletedBlobsB, deletedBlobs)
+	wg := sync.WaitGroup{}
+	matchA := func() error {
+		defer func() {
+			finished = true
+			wg.Done()
+		}()
+		aStart := 0
+		// we will try to find a matching added blob for each deleted blob
+		for d := 0; d < deletedBlobsA.Len(); d++ {
+			myBlob := cache[deletedBlobsA[d].change.From.TreeEntry.Hash]
+			mySize := deletedBlobsA[d].size
+			myName := filepath.Base(deletedBlobsA[d].change.From.Name)
+			var a int
+			for a = aStart; a < addedBlobsA.Len() && !ra.sizesAreClose(mySize, addedBlobsA[a].size); a++ {
+			}
+			aStart = a
+			foundMatch := false
+			// get the list of possible candidates and sort by file name similarity
+			var candidates []int
+			for a = aStart; a < addedBlobsA.Len() && ra.sizesAreClose(mySize, addedBlobsA[a].size); a++ {
+				candidates = append(candidates, a)
+			}
+			sortRenameCandidates(candidates, myName, func(a int) string {
+				return addedBlobsA[a].change.To.Name
+			})
+			var ci int
+			for ci, a = range candidates {
+				if finished {
+					return nil
+				}
+				if ci > RenameAnalysisMaxCandidates {
+					break
+				}
+				blobsAreClose, err := ra.blobsAreClose(
+					myBlob, cache[addedBlobsA[a].change.To.TreeEntry.Hash])
+				if err != nil {
+					return err
+				}
+				if blobsAreClose {
+					foundMatch = true
+					matchesA = append(
+						matchesA,
+						&object.Change{
+							From: deletedBlobsA[d].change.From,
+							To:   addedBlobsA[a].change.To})
+					break
+				}
+			}
+			if foundMatch {
+				deletedBlobsA = append(deletedBlobsA[:d], deletedBlobsA[d+1:]...)
+				d--
+				addedBlobsA = append(addedBlobsA[:a], addedBlobsA[a+1:]...)
+			}
 		}
-		dStart = d
-		foundMatch := false
-		for d = dStart; d < deletedBlobs.Len() && ra.sizesAreClose(mySize, deletedBlobs[d].size); d++ {
-			blobsAreClose, err := ra.blobsAreClose(
-				myBlob, cache[deletedBlobs[d].change.From.TreeEntry.Hash])
-			if err != nil {
-				return nil, err
+		finishedA = true
+		return nil
+	}
+	matchB := func() error {
+		defer func() {
+			finished = true
+			wg.Done()
+		}()
+		dStart := 0
+		for a := 0; a < addedBlobsB.Len(); a++ {
+			myBlob := cache[addedBlobsB[a].change.To.TreeEntry.Hash]
+			mySize := addedBlobsB[a].size
+			myName := filepath.Base(addedBlobsB[a].change.To.Name)
+			var d int
+			for d = dStart; d < deletedBlobsB.Len() && !ra.sizesAreClose(mySize, deletedBlobsB[d].size); d++ {
 			}
-			if blobsAreClose {
-				foundMatch = true
-				reducedChanges = append(
-					reducedChanges,
-					&object.Change{From: deletedBlobs[d].change.From,
-						To: addedBlobs[a].change.To})
-				break
+			dStart = d
+			foundMatch := false
+			// get the list of possible candidates and sort by file name similarity
+			var candidates []int
+			for d = dStart; d < deletedBlobsB.Len() && ra.sizesAreClose(mySize, deletedBlobsB[d].size); d++ {
+				candidates = append(candidates, d)
+			}
+			sortRenameCandidates(candidates, myName, func(d int) string {
+				return deletedBlobsB[d].change.From.Name
+			})
+			var ci int
+			for ci, d = range candidates {
+				if finished {
+					return nil
+				}
+				if ci > RenameAnalysisMaxCandidates {
+					break
+				}
+				blobsAreClose, err := ra.blobsAreClose(
+					myBlob, cache[deletedBlobsB[d].change.From.TreeEntry.Hash])
+				if err != nil {
+					return err
+				}
+				if blobsAreClose {
+					foundMatch = true
+					matchesB = append(
+						matchesB,
+						&object.Change{
+							From: deletedBlobsB[d].change.From,
+							To: addedBlobsB[a].change.To})
+					break
+				}
+			}
+			if foundMatch {
+				addedBlobsB = append(addedBlobsB[:a], addedBlobsB[a+1:]...)
+				a--
+				deletedBlobsB = append(deletedBlobsB[:d], deletedBlobsB[d+1:]...)
 			}
 		}
-		if foundMatch {
-			addedBlobs = append(addedBlobs[:a], addedBlobs[a+1:]...)
-			a--
-			deletedBlobs = append(deletedBlobs[:d], deletedBlobs[d+1:]...)
+		finishedB = true
+		return nil
+	}
+	// run two functions in parallel, and take the result from the one which finished earlier
+	wg.Add(2)
+	var err error
+	go func() { err = matchA() }()
+	go func() { err = matchB() }()
+	wg.Wait()
+	if err != nil {
+		return nil, err
+	}
+	var matches object.Changes
+	if finishedA {
+		addedBlobs = addedBlobsA
+		deletedBlobs = deletedBlobsA
+		matches = matchesA
+	} else {
+		if !finishedB {
+			panic("Impossible happened: two functions returned without an error " +
+				"but no results from both")
 		}
+		addedBlobs = addedBlobsB
+		deletedBlobs = deletedBlobsB
+		matches = matchesB
 	}
 
 	// Stage 3 - we give up, everything left are independent additions and deletions
+	for _, change := range matches {
+		reducedChanges = append(reducedChanges, change)
+	}
 	for _, blob := range addedBlobs {
 		reducedChanges = append(reducedChanges, blob.change)
 	}
 	for _, blob := range deletedBlobs {
 		reducedChanges = append(reducedChanges, blob.change)
 	}
+	for _, change := range smallChanges {
+		reducedChanges = append(reducedChanges, change)
+	}
 	return map[string]interface{}{DependencyTreeChanges: reducedChanges}, nil
 }
 
@@ -212,7 +353,7 @@ func (ra *RenameAnalysis) Fork(n int) []core.PipelineItem {
 
 func (ra *RenameAnalysis) sizesAreClose(size1 int64, size2 int64) bool {
 	size := internal.Max64(1, internal.Max64(size1, size2))
-	return (internal.Abs64(size1-size2)*100)/size <= int64(100-ra.SimilarityThreshold)
+	return (internal.Abs64(size1-size2)*10000)/size <= int64(100-ra.SimilarityThreshold) * 100
 }
 
 func (ra *RenameAnalysis) blobsAreClose(
@@ -328,6 +469,26 @@ func (slice sortableBlobs) Swap(i, j int) {
 	slice[i], slice[j] = slice[j], slice[i]
 }
 
+type candidateDistance struct {
+	Candidate int
+	Distance  int
+}
+
+func sortRenameCandidates(candidates []int, origin string, nameGetter func(int) string) {
+	distances := make([]candidateDistance, len(candidates))
+	ctx := LevenshteinContext{}
+	for i, x := range candidates {
+		name := filepath.Base(nameGetter(x))
+		distances[i] = candidateDistance{x, ctx.Distance(origin, name)}
+	}
+	sort.Slice(distances, func(i, j int) bool {
+		return distances[i].Distance < distances[j].Distance
+	})
+	for i, cd := range distances {
+		candidates[i]  = cd.Candidate
+	}
+}
+
 func init() {
 	core.Registry.Register(&RenameAnalysis{})
 }

+ 18 - 1
internal/plumbing/renames_test.go

@@ -122,7 +122,7 @@ func TestRenameAnalysisConsume(t *testing.T) {
 	assert.Nil(t, err)
 	renamed := res[DependencyTreeChanges].(object.Changes)
 	assert.Equal(t, len(renamed), 2)
-	ra.SimilarityThreshold = 38
+	ra.SimilarityThreshold = 39
 	res, err = ra.Consume(deps)
 	assert.Nil(t, err)
 	renamed = res[DependencyTreeChanges].(object.Changes)
@@ -169,3 +169,20 @@ func TestRenameAnalysisFork(t *testing.T) {
 	assert.True(t, ra1 == ra2)
 	ra1.Merge([]core.PipelineItem{ra2})
 }
+
+func TestRenameAnalysisSizesAreClose(t *testing.T) {
+	ra := fixtureRenameAnalysis()
+	assert.True(t, ra.sizesAreClose(941, 963))
+	assert.True(t, ra.sizesAreClose(941, 1150))
+	assert.True(t, ra.sizesAreClose(941, 803))
+	assert.False(t, ra.sizesAreClose(1320, 1668))
+}
+
+func TestRenameAnalysisSortRenameCandidates(t *testing.T) {
+	candidates := []int{0, 1, 2, 3}
+	sortRenameCandidates(candidates, "test_regression.py", func(i int) string {
+		return []string{"gather_nd_op.h", "test.py", "test_file_system.cc", "regression.py"}[i]
+	})
+	assert.Equal(t, candidates[0], 3)
+	assert.Equal(t, candidates[1], 1)
+}