Browse Source

Fix crash on zipline

Signed-off-by: Vadim Markovtsev <vadim@sourced.tech>
Vadim Markovtsev 6 years ago
parent
commit
088e453a17

+ 67 - 29
internal/plumbing/renames.go

@@ -4,6 +4,7 @@ import (
 	"log"
 	"log"
 	"path/filepath"
 	"path/filepath"
 	"sort"
 	"sort"
+	"strings"
 	"sync"
 	"sync"
 	"unicode/utf8"
 	"unicode/utf8"
 
 
@@ -48,6 +49,10 @@ const (
 	// RenameAnalysisSetSizeLimit is the maximum number of added + removed files for
 	// RenameAnalysisSetSizeLimit is the maximum number of added + removed files for
 	// RenameAnalysisMaxCandidates to be active; the bigger numbers set it to 1.
 	// RenameAnalysisMaxCandidates to be active; the bigger numbers set it to 1.
 	RenameAnalysisSetSizeLimit = 1000
 	RenameAnalysisSetSizeLimit = 1000
+
+	// RenameAnalysisByteDiffSizeThreshold is the maximum size of each of the compared parts
+	// to be diff-ed on byte level.
+	RenameAnalysisByteDiffSizeThreshold = 100000
 )
 )
 
 
 // Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
 // Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
@@ -367,12 +372,12 @@ func (ra *RenameAnalysis) sizesAreClose(size1 int64, size2 int64) bool {
 }
 }
 
 
 func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (bool, error) {
 func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (bool, error) {
+	cleanReturn := false
 	defer func() {
 	defer func() {
-		if err := recover(); err != nil {
+		if !cleanReturn {
 			log.Println()
 			log.Println()
 			log.Println(blob1.Hash.String())
 			log.Println(blob1.Hash.String())
 			log.Println(blob2.Hash.String())
 			log.Println(blob2.Hash.String())
-			panic(err)
 		}
 		}
 	}()
 	}()
 	_, err1 := blob1.CountLines()
 	_, err1 := blob1.CountLines()
@@ -382,6 +387,7 @@ func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (b
 		bsdifflen := DiffBytes(blob1.Data, blob2.Data)
 		bsdifflen := DiffBytes(blob1.Data, blob2.Data)
 		delta := int((int64(bsdifflen) * 100) / internal.Max64(
 		delta := int((int64(bsdifflen) * 100) / internal.Max64(
 			internal.Min64(blob1.Size, blob2.Size), 1))
 			internal.Min64(blob1.Size, blob2.Size), 1))
+		cleanReturn = true
 		return 100-delta >= ra.SimilarityThreshold, nil
 		return 100-delta >= ra.SimilarityThreshold, nil
 	}
 	}
 	src, dst := string(blob1.Data), string(blob2.Data)
 	src, dst := string(blob1.Data), string(blob2.Data)
@@ -390,8 +396,14 @@ func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (b
 	// compute the line-by-line diff, then the char-level diffs of the del-ins blocks
 	// compute the line-by-line diff, then the char-level diffs of the del-ins blocks
 	// yes, this algorithm is greedy and not exact
 	// yes, this algorithm is greedy and not exact
 	dmp := diffmatchpatch.New()
 	dmp := diffmatchpatch.New()
-	srcLines, dstLines, lines := dmp.DiffLinesToRunes(src, dst)
-	diffs := dmp.DiffMainRunes(srcLines, dstLines, false)
+	srcLineRunes, dstLineRunes, _ := dmp.DiffLinesToRunes(src, dst)
+	// the third returned value, []string, is the mapping from runes to lines
+	// we cannot use it because it is approximate and has string collisions
+	// that is, the mapping is wrong for huge files
+	diffs := dmp.DiffMainRunes(srcLineRunes, dstLineRunes, false)
+
+	srcPositions := calcLinePositions(src)
+	dstPositions := calcLinePositions(dst)
 	var common, posSrc, prevPosSrc, posDst int
 	var common, posSrc, prevPosSrc, posDst int
 	possibleDelInsBlock := false
 	possibleDelInsBlock := false
 	for _, edit := range diffs {
 	for _, edit := range diffs {
@@ -399,35 +411,36 @@ func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (b
 		case diffmatchpatch.DiffDelete:
 		case diffmatchpatch.DiffDelete:
 			possibleDelInsBlock = true
 			possibleDelInsBlock = true
 			prevPosSrc = posSrc
 			prevPosSrc = posSrc
-			for _, lineno := range edit.Text {
-				posSrc += len(lines[lineno])
-			}
+			posSrc += utf8.RuneCountInString(edit.Text)
 		case diffmatchpatch.DiffInsert:
 		case diffmatchpatch.DiffInsert:
-			nextPosDst := posDst
-			for _, lineno := range edit.Text {
-				nextPosDst += len(lines[lineno])
-			}
+			nextPosDst := posDst + utf8.RuneCountInString(edit.Text)
 			if possibleDelInsBlock {
 			if possibleDelInsBlock {
 				possibleDelInsBlock = false
 				possibleDelInsBlock = false
-				localDmp := diffmatchpatch.New()
-				localSrc := src[prevPosSrc:posSrc]
-				localDst := dst[posDst:nextPosDst]
-				localDiffs := localDmp.DiffMainRunes([]rune(localSrc), []rune(localDst), false)
-				for _, localEdit := range localDiffs {
-					if localEdit.Type == diffmatchpatch.DiffEqual {
-						common += utf8.RuneCountInString(localEdit.Text)
+				if internal.Max(srcPositions[posSrc]-srcPositions[prevPosSrc],
+					dstPositions[nextPosDst]-dstPositions[posDst]) < RenameAnalysisByteDiffSizeThreshold {
+					localDmp := diffmatchpatch.New()
+					localSrc := src[srcPositions[prevPosSrc]:srcPositions[posSrc]]
+					localDst := dst[dstPositions[posDst]:dstPositions[nextPosDst]]
+					localDiffs := localDmp.DiffMainRunes(
+						strToLiteralRunes(localSrc), strToLiteralRunes(localDst), false)
+					for _, localEdit := range localDiffs {
+						if localEdit.Type == diffmatchpatch.DiffEqual {
+							common += utf8.RuneCountInString(localEdit.Text)
+						}
 					}
 					}
 				}
 				}
 			}
 			}
 			posDst = nextPosDst
 			posDst = nextPosDst
 		case diffmatchpatch.DiffEqual:
 		case diffmatchpatch.DiffEqual:
 			possibleDelInsBlock = false
 			possibleDelInsBlock = false
-			for _, lineno := range edit.Text {
-				common += utf8.RuneCountInString(lines[lineno])
-				step := len(lines[lineno])
-				posSrc += step
-				posDst += step
+			step := utf8.RuneCountInString(edit.Text)
+			// for i := range edit.Text does *not* work
+			// idk why, but `i` appears to be bigger than the number of runes
+			for i := 0; i < step; i++ {
+				common += srcPositions[posSrc+i+1] - srcPositions[posSrc+i]
 			}
 			}
+			posSrc += step
+			posDst += step
 		}
 		}
 		if possibleDelInsBlock {
 		if possibleDelInsBlock {
 			continue
 			continue
@@ -435,27 +448,52 @@ func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (b
 		// supposing that the rest of the lines are the same (they are not - too optimistic),
 		// supposing that the rest of the lines are the same (they are not - too optimistic),
 		// estimate the maximum similarity and exit the loop if it lower than our threshold
 		// estimate the maximum similarity and exit the loop if it lower than our threshold
 		var srcPendingSize, dstPendingSize int
 		var srcPendingSize, dstPendingSize int
-		if posSrc < len(src) {
-			srcPendingSize = utf8.RuneCountInString(src[posSrc:])
-		}
-		if posDst < len(dst) {
-			dstPendingSize = utf8.RuneCountInString(dst[posDst:])
-		}
+		srcPendingSize = len(src) - srcPositions[posSrc]
+		dstPendingSize = len(dst) - dstPositions[posDst]
 		maxCommon := common + internal.Min(srcPendingSize, dstPendingSize)
 		maxCommon := common + internal.Min(srcPendingSize, dstPendingSize)
 		similarity := (maxCommon * 100) / maxSize
 		similarity := (maxCommon * 100) / maxSize
 		if similarity < ra.SimilarityThreshold {
 		if similarity < ra.SimilarityThreshold {
+			cleanReturn = true
 			return false, nil
 			return false, nil
 		}
 		}
 		similarity = (common * 100) / maxSize
 		similarity = (common * 100) / maxSize
 		if similarity >= ra.SimilarityThreshold {
 		if similarity >= ra.SimilarityThreshold {
+			cleanReturn = true
 			return true, nil
 			return true, nil
 		}
 		}
 	}
 	}
 	// the very last "overly optimistic" estimate was actually precise, so since we are still here
 	// the very last "overly optimistic" estimate was actually precise, so since we are still here
 	// the blobs are similar
 	// the blobs are similar
+	cleanReturn = true
 	return true, nil
 	return true, nil
 }
 }
 
 
// calcLinePositions returns the byte offset of the start of each line in text,
// with one extra trailing element equal to len(text) acting as a sentinel, so
// that positions[i+1]-positions[i] is the byte length of line i (including its
// terminating "\n", if any). An empty text yields []int{0}.
func calcLinePositions(text string) []int {
	if text == "" {
		return []int{0}
	}
	lines := strings.Split(text, "\n")
	positions := make([]int, len(lines)+1)
	accum := 0
	for i, l := range lines {
		positions[i] = accum
		accum += len(l) + 1 // +1 for the "\n" that Split removed
	}
	// The loop over-counts by exactly one: strings.Split never leaves the
	// separator in its results, so the last element has no trailing "\n".
	// (The original checked `lines[len(lines)-1] != "\n"`, which is always
	// true for the same reason — the decrement is unconditional.)
	positions[len(lines)] = len(text)
	return positions
}
+
// strToLiteralRunes maps every single byte of s to its own rune, deliberately
// ignoring UTF-8 grouping. The resulting slice always has len(s) elements, so
// rune positions stay aligned with byte offsets in the original string.
func strToLiteralRunes(s string) []rune {
	out := make([]rune, 0, len(s))
	for i := 0; i < len(s); i++ {
		out = append(out, rune(s[i]))
	}
	return out
}
+
 type sortableChange struct {
 type sortableChange struct {
 	change *object.Change
 	change *object.Change
 	hash   plumbing.Hash
 	hash   plumbing.Hash

+ 23 - 5
internal/plumbing/renames_test.go

@@ -264,20 +264,25 @@ func TestBlobsAreCloseBinary(t *testing.T) {
 	assert.False(t, result)
 	assert.False(t, result)
 }
 }
 
 
-func TestBlobsAreCloseBug(t *testing.T) {
-	gzsource, err := os.Open(path.Join("..", "test_data", "rename_bug.xml.gz"))
+func loadData(t *testing.T, name string) []byte {
+	gzsource, err := os.Open(path.Join("..", "test_data", name))
 	defer gzsource.Close()
 	defer gzsource.Close()
 	if err != nil {
 	if err != nil {
-		t.Errorf("open ../test_data/rename_bug.xml.gz: %v", err)
+		t.Errorf("open ../test_data/%s: %v", name, err)
 	}
 	}
 	gzreader, err := gzip.NewReader(gzsource)
 	gzreader, err := gzip.NewReader(gzsource)
 	if err != nil {
 	if err != nil {
-		t.Errorf("gzip ../test_data/rename_bug.xml.gz: %v", err)
+		t.Errorf("gzip ../test_data/%s: %v", name, err)
 	}
 	}
 	data, err := ioutil.ReadAll(gzreader)
 	data, err := ioutil.ReadAll(gzreader)
 	if err != nil {
 	if err != nil {
-		t.Errorf("gzip ../test_data/rename_bug.xml.gz: %v", err)
+		t.Errorf("gzip ../test_data/%s: %v", name, err)
 	}
 	}
+	return data
+}
+
+func TestBlobsAreCloseBug1(t *testing.T) {
+	data := loadData(t, "rename_bug1.xml.gz")
 	blob1 := &CachedBlob{Data: data}
 	blob1 := &CachedBlob{Data: data}
 	blob2 := &CachedBlob{Data: data}
 	blob2 := &CachedBlob{Data: data}
 	blob1.Size = int64(len(data))
 	blob1.Size = int64(len(data))
@@ -287,3 +292,16 @@ func TestBlobsAreCloseBug(t *testing.T) {
 	assert.Nil(t, err)
 	assert.Nil(t, err)
 	assert.True(t, result)
 	assert.True(t, result)
 }
 }
+
+func TestBlobsAreCloseBug2(t *testing.T) {
+	data1 := loadData(t, "rename_bug2.base.gz")
+	data2 := loadData(t, "rename_bug2.head.gz")
+	blob1 := &CachedBlob{Data: data1}
+	blob2 := &CachedBlob{Data: data2}
+	blob1.Size = int64(len(data1))
+	blob2.Size = int64(len(data2))
+	ra := fixtureRenameAnalysis()
+	result, err := ra.blobsAreClose(blob1, blob2)
+	assert.Nil(t, err)
+	assert.False(t, result)
+}

internal/test_data/rename_bug.xml.gz → internal/test_data/rename_bug1.xml.gz


BIN
internal/test_data/rename_bug2.base.gz


BIN
internal/test_data/rename_bug2.head.gz