瀏覽代碼

Merge pull request #127 from vmarkovtsev/master

Optimize the renames detection
Vadim Markovtsev 6 年之前
父節點
當前提交
81a9c67b7a
共有 1 個文件被更改,包括 16 次插入和 6 次刪除
  1. 16 6
      internal/plumbing/renames.go

+ 16 - 6
internal/plumbing/renames.go

@@ -21,7 +21,7 @@ type RenameAnalysis struct {
 	core.NoopMerger
 	// SimilarityThreshold adjusts the heuristic to determine file renames.
 	// It has the same units as cgit's -X rename-threshold or -M. Better to
-	// set it to the default value of 50 (50%).
+	// set it to the default value of 80 (80%).
 	SimilarityThreshold int
 
 	repository *git.Repository
@@ -30,8 +30,8 @@ type RenameAnalysis struct {
 const (
 	// RenameAnalysisDefaultThreshold specifies the default percentage of common lines in a pair
 	// of files to consider them linked. The exact code of the decision is sizesAreClose().
-	// This defaults to CGit's 50%.
-	RenameAnalysisDefaultThreshold = 50
+	// CGit's default is 50%. Ours is 80% because 50% can be too computationally expensive.
+	RenameAnalysisDefaultThreshold = 80
 
 	// ConfigRenameAnalysisSimilarityThreshold is the name of the configuration option
 	// (RenameAnalysis.Configure()) which sets the similarity threshold.
@@ -218,6 +218,7 @@ func (ra *RenameAnalysis) sizesAreClose(size1 int64, size2 int64) bool {
 func (ra *RenameAnalysis) blobsAreClose(
 	blob1 *CachedBlob, blob2 *CachedBlob) (bool, error) {
 	src, dst := string(blob1.Data), string(blob2.Data)
+	maxSize := internal.Max(1, internal.Max(utf8.RuneCountInString(src), utf8.RuneCountInString(dst)))
 
 	// compute the line-by-line diff, then the char-level diffs of the del-ins blocks
 	// yes, this algorithm is greedy and not exact
@@ -261,10 +262,19 @@ func (ra *RenameAnalysis) blobsAreClose(
 				posDst += step
 			}
 		}
+		// supposing that the rest of the lines are the same (they are not - too optimistic),
+		// estimate the maximum similarity and exit the loop if it is lower than our threshold
+		maxCommon := common + internal.Min(
+			utf8.RuneCountInString(src[posSrc:]),
+			utf8.RuneCountInString(dst[posDst:]))
+		similarity := (maxCommon * 100) / maxSize
+		if similarity < ra.SimilarityThreshold {
+			return false, nil
+		}
 	}
-	size := internal.Max(1, internal.Max(utf8.RuneCountInString(src), utf8.RuneCountInString(dst)))
-	similarity := (common * 100) / size
-	return similarity >= ra.SimilarityThreshold, nil
+	// the very last "overly optimistic" estimate was actually precise, so since we are still here
+	// the blobs are similar
+	return true, nil
 }
 
 type sortableChange struct {