فهرست منبع

Merge pull request #195 from vmarkovtsev/master

Fix renames once again
Vadim Markovtsev 6 سال پیش
والد
کامیت
c1002f4265

+ 1 - 0
internal/core/forks.go

@@ -89,6 +89,7 @@ const (
 
 // planPrintFunc is used to print the execution plan in prepareRunPlan().
 var planPrintFunc = func(args ...interface{}) {
+	fmt.Fprintln(os.Stderr)
 	fmt.Fprintln(os.Stderr, args...)
 }
 

+ 0 - 1
internal/core/pipeline.go

@@ -757,7 +757,6 @@ func (pipeline *Pipeline) Run(commits []*object.Commit) (map[LeafPipelineItem]in
 			continue
 		}
 		if pipeline.PrintActions {
-			fmt.Fprintln(os.Stderr)
 			printAction(step)
 		}
 		if index > 0 && index%100 == 0 && pipeline.HibernationDistance > 0 {

+ 4 - 0
internal/core/pipeline_test.go

@@ -537,9 +537,13 @@ func TestPipelineDumpPlanConfigure(t *testing.T) {
 	pipeline.Initialize(map[string]interface{}{ConfigPipelineDumpPlan: true})
 	assert.True(t, pipeline.DumpPlan)
 	stream := &bytes.Buffer{}
+	backupPlanPrintFunc := planPrintFunc
 	planPrintFunc = func(args ...interface{}) {
 		fmt.Fprintln(stream, args...)
 	}
+	defer func() {
+		planPrintFunc = backupPlanPrintFunc
+	}()
 	commits := make([]*object.Commit, 1)
 	commits[0], _ = test.Repository.CommitObject(plumbing.NewHash(
 		"af9ddc0db70f09f3f27b4b98e415592a7485171c"))

+ 11 - 2
internal/plumbing/day.go

@@ -1,6 +1,7 @@
 package plumbing
 
 import (
+	"log"
 	"time"
 
 	"gopkg.in/src-d/go-git.v4"
@@ -13,6 +14,7 @@ import (
 // It is a PipelineItem.
 type DaysSinceStart struct {
 	core.NoopMerger
+	remote      string
 	day0        *time.Time
 	previousDay int
 	commits     map[int][]plumbing.Hash
@@ -75,6 +77,9 @@ func (days *DaysSinceStart) Initialize(repository *git.Repository) error {
 			delete(days.commits, key)
 		}
 	}
+	if r, err := repository.Remotes(); err == nil && len(r) > 0 {
+		days.remote = r[0].Config().URLs[0]
+	}
 	return nil
 }
 
@@ -88,9 +93,13 @@ func (days *DaysSinceStart) Consume(deps map[string]interface{}) (map[string]int
 	index := deps[core.DependencyIndex].(int)
 	if index == 0 {
 		// first iteration - initialize the file objects from the tree
-		*days.day0 = commit.Committer.When
 		// our precision is 1 day
-		*days.day0 = days.day0.Truncate(24 * time.Hour)
+		*days.day0 = commit.Committer.When.Truncate(24 * time.Hour)
+		if days.day0.Unix() < 631152000 { // before 1990-01-01 UTC — clearly a bogus committer timestamp
+			log.Println()
+			log.Printf("Warning: suspicious committer timestamp in %s > %s",
+				days.remote, commit.Hash.String())
+		}
 	}
 	day := int(commit.Committer.When.Sub(*days.day0).Hours() / 24)
 	if day < days.previousDay {

+ 31 - 0
internal/plumbing/day_test.go

@@ -1,7 +1,11 @@
 package plumbing
 
 import (
+	"bytes"
+	"log"
+	"os"
 	"testing"
+	"time"
 
 	"github.com/stretchr/testify/assert"
 	"gopkg.in/src-d/go-git.v4/plumbing"
@@ -123,3 +127,30 @@ func TestDaysSinceStartFork(t *testing.T) {
 	// just for the sake of it
 	dss1.Merge([]core.PipelineItem{dss2})
 }
+
+func TestDaysSinceStartConsumeZero(t *testing.T) {
+	dss := fixtureDaysSinceStart()
+	deps := map[string]interface{}{}
+	commit, _ := test.Repository.CommitObject(plumbing.NewHash(
+		"cce947b98a050c6d356bc6ba95030254914027b1"))
+	commit.Committer.When = time.Unix(0, 0)
+	deps[core.DependencyCommit] = commit
+	deps[core.DependencyIndex] = 0
+	// print warning to log
+	myOutput := &bytes.Buffer{}
+	log.SetOutput(myOutput)
+	defer func() {
+		log.SetOutput(os.Stderr)
+	}()
+	res, err := dss.Consume(deps)
+	assert.Nil(t, err)
+	assert.Contains(t, myOutput.String(), "Warning")
+	assert.Contains(t, myOutput.String(), "cce947b98a050c6d356bc6ba95030254914027b1")
+	assert.Contains(t, myOutput.String(), "hercules")
+	assert.Contains(t, myOutput.String(), "github.com")
+	assert.Equal(t, res[DependencyDay].(int), 0)
+	assert.Equal(t, dss.previousDay, 0)
+	assert.Equal(t, dss.day0.Year(), 1970)
+	assert.Equal(t, dss.day0.Minute(), 0)
+	assert.Equal(t, dss.day0.Second(), 0)
+}

+ 67 - 29
internal/plumbing/renames.go

@@ -4,6 +4,7 @@ import (
 	"log"
 	"path/filepath"
 	"sort"
+	"strings"
 	"sync"
 	"unicode/utf8"
 
@@ -48,6 +49,10 @@ const (
 	// RenameAnalysisSetSizeLimit is the maximum number of added + removed files for
 	// RenameAnalysisMaxCandidates to be active; the bigger numbers set it to 1.
 	RenameAnalysisSetSizeLimit = 1000
+
+	// RenameAnalysisByteDiffSizeThreshold is the maximum size of each of the compared parts
+	// to be diff-ed on byte level.
+	RenameAnalysisByteDiffSizeThreshold = 100000
 )
 
 // Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
@@ -367,12 +372,12 @@ func (ra *RenameAnalysis) sizesAreClose(size1 int64, size2 int64) bool {
 }
 
 func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (bool, error) {
+	cleanReturn := false
 	defer func() {
-		if err := recover(); err != nil {
+		if !cleanReturn {
 			log.Println()
 			log.Println(blob1.Hash.String())
 			log.Println(blob2.Hash.String())
-			panic(err)
 		}
 	}()
 	_, err1 := blob1.CountLines()
@@ -382,6 +387,7 @@ func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (b
 		bsdifflen := DiffBytes(blob1.Data, blob2.Data)
 		delta := int((int64(bsdifflen) * 100) / internal.Max64(
 			internal.Min64(blob1.Size, blob2.Size), 1))
+		cleanReturn = true
 		return 100-delta >= ra.SimilarityThreshold, nil
 	}
 	src, dst := string(blob1.Data), string(blob2.Data)
@@ -390,8 +396,14 @@ func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (b
 	// compute the line-by-line diff, then the char-level diffs of the del-ins blocks
 	// yes, this algorithm is greedy and not exact
 	dmp := diffmatchpatch.New()
-	srcLines, dstLines, lines := dmp.DiffLinesToRunes(src, dst)
-	diffs := dmp.DiffMainRunes(srcLines, dstLines, false)
+	srcLineRunes, dstLineRunes, _ := dmp.DiffLinesToRunes(src, dst)
+	// the third returned value, []string, maps rune codes back to line contents;
+	// we cannot use it: distinct lines may collide onto the same rune code,
+	// so the reverse mapping is wrong for files with very many unique lines
+	diffs := dmp.DiffMainRunes(srcLineRunes, dstLineRunes, false)
+
+	srcPositions := calcLinePositions(src)
+	dstPositions := calcLinePositions(dst)
 	var common, posSrc, prevPosSrc, posDst int
 	possibleDelInsBlock := false
 	for _, edit := range diffs {
@@ -399,35 +411,36 @@ func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (b
 		case diffmatchpatch.DiffDelete:
 			possibleDelInsBlock = true
 			prevPosSrc = posSrc
-			for _, lineno := range edit.Text {
-				posSrc += len(lines[lineno])
-			}
+			posSrc += utf8.RuneCountInString(edit.Text)
 		case diffmatchpatch.DiffInsert:
-			nextPosDst := posDst
-			for _, lineno := range edit.Text {
-				nextPosDst += len(lines[lineno])
-			}
+			nextPosDst := posDst + utf8.RuneCountInString(edit.Text)
 			if possibleDelInsBlock {
 				possibleDelInsBlock = false
-				localDmp := diffmatchpatch.New()
-				localSrc := src[prevPosSrc:posSrc]
-				localDst := dst[posDst:nextPosDst]
-				localDiffs := localDmp.DiffMainRunes([]rune(localSrc), []rune(localDst), false)
-				for _, localEdit := range localDiffs {
-					if localEdit.Type == diffmatchpatch.DiffEqual {
-						common += utf8.RuneCountInString(localEdit.Text)
+				if internal.Max(srcPositions[posSrc]-srcPositions[prevPosSrc],
+					dstPositions[nextPosDst]-dstPositions[posDst]) < RenameAnalysisByteDiffSizeThreshold {
+					localDmp := diffmatchpatch.New()
+					localSrc := src[srcPositions[prevPosSrc]:srcPositions[posSrc]]
+					localDst := dst[dstPositions[posDst]:dstPositions[nextPosDst]]
+					localDiffs := localDmp.DiffMainRunes(
+						strToLiteralRunes(localSrc), strToLiteralRunes(localDst), false)
+					for _, localEdit := range localDiffs {
+						if localEdit.Type == diffmatchpatch.DiffEqual {
+							common += utf8.RuneCountInString(localEdit.Text)
+						}
 					}
 				}
 			}
 			posDst = nextPosDst
 		case diffmatchpatch.DiffEqual:
 			possibleDelInsBlock = false
-			for _, lineno := range edit.Text {
-				common += utf8.RuneCountInString(lines[lineno])
-				step := len(lines[lineno])
-				posSrc += step
-				posDst += step
+			step := utf8.RuneCountInString(edit.Text)
+			// `for i := range edit.Text` yields the byte offset of each rune, not a
+			// sequential rune index, so `i` can exceed the rune count; use a counted loop
+			for i := 0; i < step; i++ {
+				common += srcPositions[posSrc+i+1] - srcPositions[posSrc+i]
 			}
+			posSrc += step
+			posDst += step
 		}
 		if possibleDelInsBlock {
 			continue
@@ -435,27 +448,52 @@ func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (b
 		// supposing that the rest of the lines are the same (they are not - too optimistic),
 		// estimate the maximum similarity and exit the loop if it lower than our threshold
 		var srcPendingSize, dstPendingSize int
-		if posSrc < len(src) {
-			srcPendingSize = utf8.RuneCountInString(src[posSrc:])
-		}
-		if posDst < len(dst) {
-			dstPendingSize = utf8.RuneCountInString(dst[posDst:])
-		}
+		srcPendingSize = len(src) - srcPositions[posSrc]
+		dstPendingSize = len(dst) - dstPositions[posDst]
 		maxCommon := common + internal.Min(srcPendingSize, dstPendingSize)
 		similarity := (maxCommon * 100) / maxSize
 		if similarity < ra.SimilarityThreshold {
+			cleanReturn = true
 			return false, nil
 		}
 		similarity = (common * 100) / maxSize
 		if similarity >= ra.SimilarityThreshold {
+			cleanReturn = true
 			return true, nil
 		}
 	}
 	// the very last "overly optimistic" estimate was actually precise, so since we are still here
 	// the blobs are similar
+	cleanReturn = true
 	return true, nil
 }
 
+func calcLinePositions(text string) []int {
+	if text == "" {
+		return []int{0}
+	}
+	lines := strings.Split(text, "\n")
+	positions := make([]int, len(lines)+1)
+	accum := 0
+	for i, l := range lines {
+		positions[i] = accum
+		accum += len(l) + 1 // +1 for \n
+	}
+	if len(lines) > 0 && lines[len(lines)-1] != "\n" {
+		accum--
+	}
+	positions[len(lines)] = accum
+	return positions
+}
+
+func strToLiteralRunes(s string) []rune {
+	lrunes := make([]rune, len(s))
+	for i, b := range []byte(s) {
+		lrunes[i] = rune(b)
+	}
+	return lrunes
+}
+
 type sortableChange struct {
 	change *object.Change
 	hash   plumbing.Hash

+ 23 - 5
internal/plumbing/renames_test.go

@@ -264,20 +264,25 @@ func TestBlobsAreCloseBinary(t *testing.T) {
 	assert.False(t, result)
 }
 
-func TestBlobsAreCloseBug(t *testing.T) {
-	gzsource, err := os.Open(path.Join("..", "test_data", "rename_bug.xml.gz"))
+func loadData(t *testing.T, name string) []byte {
+	gzsource, err := os.Open(path.Join("..", "test_data", name))
 	defer gzsource.Close()
 	if err != nil {
-		t.Errorf("open ../test_data/rename_bug.xml.gz: %v", err)
+		t.Errorf("open ../test_data/%s: %v", name, err)
 	}
 	gzreader, err := gzip.NewReader(gzsource)
 	if err != nil {
-		t.Errorf("gzip ../test_data/rename_bug.xml.gz: %v", err)
+		t.Errorf("gzip ../test_data/%s: %v", name, err)
 	}
 	data, err := ioutil.ReadAll(gzreader)
 	if err != nil {
-		t.Errorf("gzip ../test_data/rename_bug.xml.gz: %v", err)
+		t.Errorf("gzip ../test_data/%s: %v", name, err)
 	}
+	return data
+}
+
+func TestBlobsAreCloseBug1(t *testing.T) {
+	data := loadData(t, "rename_bug1.xml.gz")
 	blob1 := &CachedBlob{Data: data}
 	blob2 := &CachedBlob{Data: data}
 	blob1.Size = int64(len(data))
@@ -287,3 +292,16 @@ func TestBlobsAreCloseBug(t *testing.T) {
 	assert.Nil(t, err)
 	assert.True(t, result)
 }
+
+func TestBlobsAreCloseBug2(t *testing.T) {
+	data1 := loadData(t, "rename_bug2.base.gz")
+	data2 := loadData(t, "rename_bug2.head.gz")
+	blob1 := &CachedBlob{Data: data1}
+	blob2 := &CachedBlob{Data: data2}
+	blob1.Size = int64(len(data1))
+	blob2.Size = int64(len(data2))
+	ra := fixtureRenameAnalysis()
+	result, err := ra.blobsAreClose(blob1, blob2)
+	assert.Nil(t, err)
+	assert.False(t, result)
+}

+ 3 - 3
internal/test/repository.go

@@ -2,14 +2,14 @@ package test
 
 import (
 	"io"
+	"io/ioutil"
 	"os"
+	"path"
 
-	git "gopkg.in/src-d/go-git.v4"
+	"gopkg.in/src-d/go-git.v4"
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
 	"gopkg.in/src-d/go-git.v4/storage/memory"
-	"io/ioutil"
-	"path"
 )
 
 // Repository is a boilerplate sample repository (Hercules itself).

internal/test_data/rename_bug.xml.gz → internal/test_data/rename_bug1.xml.gz


BIN
internal/test_data/rename_bug2.base.gz


BIN
internal/test_data/rename_bug2.head.gz