瀏覽代碼

Add explicit caching and fix renames detection

Vadim Markovtsev 6 年之前
父節點
當前提交
e62972bd8d

+ 3 - 9
contrib/_plugin_example/churn_analysis.go

@@ -126,7 +126,7 @@ func (churn *ChurnAnalysis) Consume(deps map[string]interface{}) (map[string]int
 	}
 	fileDiffs := deps[hercules.DependencyFileDiff].(map[string]hercules.FileDiffData)
 	treeDiffs := deps[hercules.DependencyTreeChanges].(object.Changes)
-	cache := deps[hercules.DependencyBlobCache].(map[plumbing.Hash]*object.Blob)
+	cache := deps[hercules.DependencyBlobCache].(map[plumbing.Hash]*hercules.CachedBlob)
 	day := deps[hercules.DependencyDay].(int)
 	author := deps[hercules.DependencyAuthor].(int)
 	for _, change := range treeDiffs {
@@ -138,15 +138,9 @@ func (churn *ChurnAnalysis) Consume(deps map[string]interface{}) (map[string]int
 		removed := 0
 		switch action {
 		case merkletrie.Insert:
-			added, err = hercules.CountLines(cache[change.To.TreeEntry.Hash])
-			if err != nil && err.Error() == "binary" {
-				err = nil
-			}
+			added, _ = cache[change.To.TreeEntry.Hash].CountLines()
 		case merkletrie.Delete:
-			removed, err = hercules.CountLines(cache[change.From.TreeEntry.Hash])
-			if err != nil && err.Error() == "binary" {
-				err = nil
-			}
+			removed, _ = cache[change.From.TreeEntry.Hash].CountLines()
 		case merkletrie.Modify:
 			diffs := fileDiffs[change.To.Name]
 			for _, edit := range diffs.Diffs {

+ 5 - 6
core.go

@@ -1,14 +1,14 @@
 package hercules
 
 import (
-	git "gopkg.in/src-d/go-git.v4"
+	"gopkg.in/src-d/go-git.v4"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
 	"gopkg.in/src-d/hercules.v4/internal/core"
 	"gopkg.in/src-d/hercules.v4/internal/plumbing"
 	"gopkg.in/src-d/hercules.v4/internal/plumbing/identity"
 	"gopkg.in/src-d/hercules.v4/internal/plumbing/uast"
-	"gopkg.in/src-d/hercules.v4/leaves"
 	"gopkg.in/src-d/hercules.v4/internal/yaml"
+	"gopkg.in/src-d/hercules.v4/leaves"
 )
 
 // ConfigurationOptionType represents the possible types of a ConfigurationOption's value.
@@ -146,10 +146,9 @@ const (
 // FileDiffData is the type of the dependency provided by plumbing.FileDiff.
 type FileDiffData = plumbing.FileDiffData
 
-// CountLines returns the number of lines in a *object.Blob.
-func CountLines(file *object.Blob) (int, error) {
-	return plumbing.CountLines(file)
-}
+// CachedBlob allows to explicitly cache the binary data associated with the Blob object.
+// Such structs are returned by DependencyBlobCache.
+type CachedBlob = plumbing.CachedBlob
 
 // SafeYamlString escapes the string so that it can be reliably used in YAML.
 func SafeYamlString(str string) string {

+ 1 - 1
internal/core/pipeline.go

@@ -16,9 +16,9 @@ import (
 	"gopkg.in/src-d/go-git.v4"
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
+	"gopkg.in/src-d/go-git.v4/plumbing/storer"
 	"gopkg.in/src-d/hercules.v4/internal/pb"
 	"gopkg.in/src-d/hercules.v4/internal/toposort"
-	"gopkg.in/src-d/go-git.v4/plumbing/storer"
 )
 
 // ConfigurationOptionType represents the possible types of a ConfigurationOption's value.

+ 2 - 2
internal/core/registry.go

@@ -42,7 +42,7 @@ func (registry *PipelineItemRegistry) Summon(providesOrName string) []PipelineIt
 		return []PipelineItem{}
 	}
 	ts := registry.provided[providesOrName]
-	items := []PipelineItem{}
+	var items []PipelineItem
 	for _, t := range ts {
 		items = append(items, reflect.New(t.Elem()).Interface().(PipelineItem))
 	}
@@ -209,7 +209,7 @@ func (registry *PipelineItemRegistry) AddFlags(flagSet *pflag.FlagSet) (
 			"Useful for --dump-dag.")
 		flags[ConfigPipelineDryRun] = iface
 	}
-	features := []string{}
+	var features []string
 	for f := range registry.featureFlags.Choices {
 		features = append(features, f)
 	}

+ 108 - 21
internal/plumbing/blob_cache.go

@@ -1,8 +1,13 @@
 package plumbing
 
 import (
+	"bytes"
+	"fmt"
+	"io"
+	"io/ioutil"
 	"log"
 
+	"github.com/pkg/errors"
 	"gopkg.in/src-d/go-git.v4"
 	"gopkg.in/src-d/go-git.v4/config"
 	"gopkg.in/src-d/go-git.v4/plumbing"
@@ -12,6 +17,55 @@ import (
 	"gopkg.in/src-d/hercules.v4/internal/core"
 )
 
+
+var ErrorBinary = errors.New("binary")
+
+// CachedBlob allows to explicitly cache the binary data associated with the Blob object.
+type CachedBlob struct {
+	object.Blob
+	Data []byte
+}
+
+// Reader returns a reader allow the access to the content of the blob
+func (b *CachedBlob) Reader() (io.ReadCloser, error) {
+	return ioutil.NopCloser(bytes.NewReader(b.Data)), nil
+}
+
+func (b *CachedBlob) Cache() error {
+	reader, err := b.Blob.Reader()
+	if err != nil {
+		return err
+	}
+	defer reader.Close()
+	buf := new(bytes.Buffer)
+	buf.Grow(int(b.Size))
+	size, err := buf.ReadFrom(reader)
+	if err != nil {
+		return err
+	}
+	if size != b.Size {
+		return fmt.Errorf("incomplete read of %s: %d while the declared size is %d",
+			b.Hash.String(), size, b.Size)
+	}
+	b.Data = buf.Bytes()
+	return nil
+}
+
+// CountLines returns the number of lines in the blob or (0, ErrorBinary) if it is binary.
+func (b *CachedBlob) CountLines() (int, error) {
+	if len(b.Data) == 0 {
+		return 0, nil
+	}
+	if bytes.IndexByte(b.Data, 0) >= 0 {
+		return 0, ErrorBinary
+	}
+	lines := bytes.Count(b.Data, []byte{'\n'})
+	if b.Data[len(b.Data)-1] != '\n' {
+		lines++
+	}
+	return lines, nil
+}
+
 // BlobCache loads the blobs which correspond to the changed files in a commit.
 // It is a PipelineItem.
 // It must provide the old and the new objects; "blobCache" rotates and allows to not load
@@ -24,7 +78,7 @@ type BlobCache struct {
 	FailOnMissingSubmodules bool
 
 	repository *git.Repository
-	cache      map[plumbing.Hash]*object.Blob
+	cache      map[plumbing.Hash]*CachedBlob
 }
 
 const (
@@ -80,7 +134,7 @@ func (blobCache *BlobCache) Configure(facts map[string]interface{}) {
 // calls. The repository which is going to be analysed is supplied as an argument.
 func (blobCache *BlobCache) Initialize(repository *git.Repository) {
 	blobCache.repository = repository
-	blobCache.cache = map[plumbing.Hash]*object.Blob{}
+	blobCache.cache = map[plumbing.Hash]*CachedBlob{}
 }
 
 // Consume runs this PipelineItem on the next commit data.
@@ -92,8 +146,8 @@ func (blobCache *BlobCache) Initialize(repository *git.Repository) {
 func (blobCache *BlobCache) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
 	commit := deps[core.DependencyCommit].(*object.Commit)
 	changes := deps[DependencyTreeChanges].(object.Changes)
-	cache := map[plumbing.Hash]*object.Blob{}
-	newCache := map[plumbing.Hash]*object.Blob{}
+	cache := map[plumbing.Hash]*CachedBlob{}
+	newCache := map[plumbing.Hash]*CachedBlob{}
 	for _, change := range changes {
 		action, err := change.Action()
 		if err != nil {
@@ -104,45 +158,77 @@ func (blobCache *BlobCache) Consume(deps map[string]interface{}) (map[string]int
 		var blob *object.Blob
 		switch action {
 		case merkletrie.Insert:
+			cache[change.To.TreeEntry.Hash] = &CachedBlob{}
+			newCache[change.To.TreeEntry.Hash] = &CachedBlob{}
 			blob, err = blobCache.getBlob(&change.To, commit.File)
 			if err != nil {
-				log.Printf("file to %s %s\n",
-					change.To.Name, change.To.TreeEntry.Hash)
+				log.Printf("file to %s %s: %v\n", change.To.Name, change.To.TreeEntry.Hash, err)
 			} else {
-				cache[change.To.TreeEntry.Hash] = blob
-				newCache[change.To.TreeEntry.Hash] = blob
+				cb := &CachedBlob{Blob: *blob}
+				err = cb.Cache()
+				if err == nil {
+					cache[change.To.TreeEntry.Hash] = cb
+					newCache[change.To.TreeEntry.Hash] = cb
+				} else {
+					log.Printf("file to %s %s: %v\n", change.To.Name, change.To.TreeEntry.Hash, err)
+				}
 			}
 		case merkletrie.Delete:
 			cache[change.From.TreeEntry.Hash], exists =
 				blobCache.cache[change.From.TreeEntry.Hash]
 			if !exists {
-				cache[change.From.TreeEntry.Hash], err =
-					blobCache.getBlob(&change.From, commit.File)
+				cache[change.From.TreeEntry.Hash] = &CachedBlob{}
+				blob, err = blobCache.getBlob(&change.From, commit.File)
 				if err != nil {
 					if err.Error() != plumbing.ErrObjectNotFound.Error() {
-						log.Printf("file from %s %s\n", change.From.Name,
-							change.From.TreeEntry.Hash)
+						log.Printf("file from %s %s: %v\n", change.From.Name,
+							change.From.TreeEntry.Hash, err)
 					} else {
-						cache[change.From.TreeEntry.Hash], err =
-							internal.CreateDummyBlob(change.From.TreeEntry.Hash)
+						blob, err = internal.CreateDummyBlob(change.From.TreeEntry.Hash)
+						cache[change.From.TreeEntry.Hash] = &CachedBlob{Blob: *blob}
+					}
+				} else {
+					cb := &CachedBlob{Blob: *blob}
+					err = cb.Cache()
+					if err == nil {
+						cache[change.From.TreeEntry.Hash] = cb
+					} else {
+						log.Printf("file from %s %s: %v\n", change.From.Name,
+							change.From.TreeEntry.Hash, err)
 					}
 				}
 			}
 		case merkletrie.Modify:
 			blob, err = blobCache.getBlob(&change.To, commit.File)
+			cache[change.To.TreeEntry.Hash] = &CachedBlob{}
+			newCache[change.To.TreeEntry.Hash] = &CachedBlob{}
 			if err != nil {
-				log.Printf("file to %s\n", change.To.Name)
+				log.Printf("file to %s: %v\n", change.To.Name, err)
 			} else {
-				cache[change.To.TreeEntry.Hash] = blob
-				newCache[change.To.TreeEntry.Hash] = blob
+				cb := &CachedBlob{Blob: *blob}
+				err = cb.Cache()
+				if err == nil {
+					cache[change.To.TreeEntry.Hash] = cb
+					newCache[change.To.TreeEntry.Hash] = cb
+				} else {
+					log.Printf("file to %s: %v\n", change.To.Name, err)
+				}
 			}
 			cache[change.From.TreeEntry.Hash], exists =
 				blobCache.cache[change.From.TreeEntry.Hash]
 			if !exists {
-				cache[change.From.TreeEntry.Hash], err =
-					blobCache.getBlob(&change.From, commit.File)
+				cache[change.From.TreeEntry.Hash] = &CachedBlob{}
+				blob, err = blobCache.getBlob(&change.From, commit.File)
 				if err != nil {
-					log.Printf("file from %s\n", change.From.Name)
+					log.Printf("file from %s: %v\n", change.From.Name, err)
+				} else {
+					cb := &CachedBlob{Blob: *blob}
+					err = cb.Cache()
+					if err == nil {
+						cache[change.From.TreeEntry.Hash] = cb
+					} else {
+						log.Printf("file from %s: %v\n", change.From.Name, err)
+					}
 				}
 			}
 		}
@@ -158,7 +244,7 @@ func (blobCache *BlobCache) Consume(deps map[string]interface{}) (map[string]int
 func (blobCache *BlobCache) Fork(n int) []core.PipelineItem {
 	caches := make([]core.PipelineItem, n)
 	for i := 0; i < n; i++ {
-		cache := map[plumbing.Hash]*object.Blob{}
+		cache := map[plumbing.Hash]*CachedBlob{}
 		for k, v := range blobCache.cache {
 			cache[k] = v
 		}
@@ -180,6 +266,7 @@ type FileGetter func(path string) (*object.File, error)
 func (blobCache *BlobCache) getBlob(entry *object.ChangeEntry, fileGetter FileGetter) (
 	*object.Blob, error) {
 	blob, err := blobCache.repository.BlobObject(entry.TreeEntry.Hash)
+
 	if err != nil {
 		if err.Error() != plumbing.ErrObjectNotFound.Error() {
 			log.Printf("getBlob(%s)\n", entry.TreeEntry.Hash.String())

+ 13 - 3
internal/plumbing/blob_cache_test.go

@@ -17,6 +17,16 @@ func fixtureBlobCache() *BlobCache {
 	return cache
 }
 
+func AddHash(t *testing.T, cache map[plumbing.Hash]*CachedBlob, hash string) {
+	objhash := plumbing.NewHash(hash)
+	blob, err := test.Repository.BlobObject(objhash)
+	assert.Nil(t, err)
+	cb := &CachedBlob{Blob: *blob}
+	err = cb.Cache()
+	assert.Nil(t, err)
+	cache[objhash] = cb
+}
+
 func TestBlobCacheConfigureInitialize(t *testing.T) {
 	cache := fixtureBlobCache()
 	assert.Equal(t, test.Repository, cache.repository)
@@ -85,7 +95,7 @@ func TestBlobCacheConsumeModification(t *testing.T) {
 	assert.Equal(t, len(result), 1)
 	cacheIface, exists := result[DependencyBlobCache]
 	assert.True(t, exists)
-	cache := cacheIface.(map[plumbing.Hash]*object.Blob)
+	cache := cacheIface.(map[plumbing.Hash]*CachedBlob)
 	assert.Equal(t, len(cache), 2)
 	blobFrom, exists := cache[plumbing.NewHash("1cacfc1bf0f048eb2f31973750983ae5d8de647a")]
 	assert.True(t, exists)
@@ -131,7 +141,7 @@ func TestBlobCacheConsumeInsertionDeletion(t *testing.T) {
 	assert.Equal(t, len(result), 1)
 	cacheIface, exists := result[DependencyBlobCache]
 	assert.True(t, exists)
-	cache := cacheIface.(map[plumbing.Hash]*object.Blob)
+	cache := cacheIface.(map[plumbing.Hash]*CachedBlob)
 	assert.Equal(t, len(cache), 2)
 	blobFrom, exists := cache[plumbing.NewHash("baa64828831d174f40140e4b3cfa77d1e917a2c1")]
 	assert.True(t, exists)
@@ -301,7 +311,7 @@ func TestBlobCacheDeleteInvalidBlob(t *testing.T) {
 	assert.Equal(t, len(result), 1)
 	cacheIface, exists := result[DependencyBlobCache]
 	assert.True(t, exists)
-	cache := cacheIface.(map[plumbing.Hash]*object.Blob)
+	cache := cacheIface.(map[plumbing.Hash]*CachedBlob)
 	assert.Equal(t, len(cache), 1)
 	blobFrom, exists := cache[plumbing.NewHash("ffffffffffffffffffffffffffffffffffffffff")]
 	assert.True(t, exists)

+ 2 - 79
internal/plumbing/diff.go

@@ -1,12 +1,6 @@
 package plumbing
 
 import (
-	"bufio"
-	"bytes"
-	"errors"
-	"io"
-	"unicode/utf8"
-
 	"github.com/sergi/go-diff/diffmatchpatch"
 	"gopkg.in/src-d/go-git.v4"
 	"gopkg.in/src-d/go-git.v4/plumbing"
@@ -90,7 +84,7 @@ func (diff *FileDiff) Initialize(repository *git.Repository) {}
 // in Provides(). If there was an error, nil is returned.
 func (diff *FileDiff) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
 	result := map[string]FileDiffData{}
-	cache := deps[DependencyBlobCache].(map[plumbing.Hash]*object.Blob)
+	cache := deps[DependencyBlobCache].(map[plumbing.Hash]*CachedBlob)
 	treeDiff := deps[DependencyTreeChanges].(object.Changes)
 	for _, change := range treeDiff {
 		action, err := change.Action()
@@ -103,14 +97,7 @@ func (diff *FileDiff) Consume(deps map[string]interface{}) (map[string]interface
 			blobTo := cache[change.To.TreeEntry.Hash]
 			// we are not validating UTF-8 here because for example
 			// git/git 4f7770c87ce3c302e1639a7737a6d2531fe4b160 fetch-pack.c is invalid UTF-8
-			strFrom, err := BlobToString(blobFrom)
-			if err != nil {
-				return nil, err
-			}
-			strTo, err := BlobToString(blobTo)
-			if err != nil {
-				return nil, err
-			}
+			strFrom, strTo := string(blobFrom.Data), string(blobTo.Data)
 			dmp := diffmatchpatch.New()
 			src, dst, _ := dmp.DiffLinesToRunes(strFrom, strTo)
 			diffs := dmp.DiffMainRunes(src, dst, false)
@@ -134,70 +121,6 @@ func (diff *FileDiff) Fork(n int) []core.PipelineItem {
 	return core.ForkSamePipelineItem(diff, n)
 }
 
-// CountLines returns the number of lines in a *object.Blob.
-func CountLines(file *object.Blob) (int, error) {
-	if file == nil {
-		return -1, errors.New("blob is nil: probably not cached")
-	}
-	reader, err := file.Reader()
-	if err != nil {
-		return -1, err
-	}
-	defer checkClose(reader)
-	var scanner *bufio.Scanner
-	buffer := make([]byte, bufio.MaxScanTokenSize)
-	counter := 0
-	utf8Errors := 0
-	for scanner == nil || scanner.Err() == bufio.ErrTooLong {
-		if scanner != nil {
-			chunk := scanner.Bytes()
-			if !utf8.Valid(chunk) {
-				utf8Errors++
-			}
-			if bytes.IndexByte(chunk, 0) >= 0 {
-				return -1, errors.New("binary")
-			}
-		}
-		scanner = bufio.NewScanner(reader)
-		scanner.Buffer(buffer, 0)
-		for scanner.Scan() {
-			chunk := scanner.Bytes()
-			if !utf8.Valid(chunk) {
-				utf8Errors++
-			}
-			if bytes.IndexByte(chunk, 0) >= 0 {
-				return -1, errors.New("binary")
-			}
-			counter++
-		}
-	}
-	if float32(utf8Errors) / float32(counter) >= 0.01 {
-		return -1, errors.New("binary")
-	}
-	return counter, nil
-}
-
-// BlobToString reads *object.Blob and returns its contents as a string.
-func BlobToString(file *object.Blob) (string, error) {
-	if file == nil {
-		return "", errors.New("blob is nil: probably not cached")
-	}
-	reader, err := file.Reader()
-	if err != nil {
-		return "", err
-	}
-	defer checkClose(reader)
-	buf := new(bytes.Buffer)
-	buf.ReadFrom(reader)
-	return buf.String(), nil
-}
-
-func checkClose(c io.Closer) {
-	if err := c.Close(); err != nil {
-		panic(err)
-	}
-}
-
 func init() {
 	core.Registry.Register(&FileDiff{})
 }

+ 39 - 42
internal/plumbing/diff_test.go

@@ -47,13 +47,10 @@ func TestFileDiffRegistration(t *testing.T) {
 func TestFileDiffConsume(t *testing.T) {
 	fd := fixtures.FileDiff()
 	deps := map[string]interface{}{}
-	cache := map[plumbing.Hash]*object.Blob{}
-	hash := plumbing.NewHash("291286b4ac41952cbd1389fda66420ec03c1a9fe")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("334cde09da4afcb74f8d2b3e6fd6cce61228b485")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("dc248ba2b22048cc730c571a748e8ffcf7085ab9")
-	cache[hash], _ = test.Repository.BlobObject(hash)
+	cache := map[plumbing.Hash]*items.CachedBlob{}
+	items.AddHash(t, cache, "291286b4ac41952cbd1389fda66420ec03c1a9fe")
+	items.AddHash(t, cache, "334cde09da4afcb74f8d2b3e6fd6cce61228b485")
+	items.AddHash(t, cache, "dc248ba2b22048cc730c571a748e8ffcf7085ab9")
 	deps[items.DependencyBlobCache] = cache
 	changes := make(object.Changes, 3)
 	treeFrom, _ := test.Repository.TreeObject(plumbing.NewHash(
@@ -124,13 +121,11 @@ func TestFileDiffConsume(t *testing.T) {
 func TestFileDiffConsumeInvalidBlob(t *testing.T) {
 	fd := fixtures.FileDiff()
 	deps := map[string]interface{}{}
-	cache := map[plumbing.Hash]*object.Blob{}
-	hash := plumbing.NewHash("291286b4ac41952cbd1389fda66420ec03c1a9fe")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("334cde09da4afcb74f8d2b3e6fd6cce61228b485")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("dc248ba2b22048cc730c571a748e8ffcf7085ab9")
-	cache[hash], _ = test.Repository.BlobObject(hash)
+	cache := map[plumbing.Hash]*items.CachedBlob{}
+	items.AddHash(t, cache, "291286b4ac41952cbd1389fda66420ec03c1a9fe")
+	items.AddHash(t, cache, "334cde09da4afcb74f8d2b3e6fd6cce61228b485")
+	items.AddHash(t, cache, "dc248ba2b22048cc730c571a748e8ffcf7085ab9")
+	cache[plumbing.NewHash("ffffffffffffffffffffffffffffffffffffffff")] = &items.CachedBlob{}
 	deps[items.DependencyBlobCache] = cache
 	changes := make(object.Changes, 1)
 	treeFrom, _ := test.Repository.TreeObject(plumbing.NewHash(
@@ -156,8 +151,8 @@ func TestFileDiffConsumeInvalidBlob(t *testing.T) {
 	}}
 	deps[items.DependencyTreeChanges] = changes
 	res, err := fd.Consume(deps)
-	assert.Nil(t, res)
-	assert.NotNil(t, err)
+	assert.Len(t, res[hercules.DependencyFileDiff].(map[string]items.FileDiffData), 1)
+	assert.Nil(t, err)
 	changes[0] = &object.Change{From: object.ChangeEntry{
 		Name: "analyser.go",
 		Tree: treeFrom,
@@ -176,38 +171,44 @@ func TestFileDiffConsumeInvalidBlob(t *testing.T) {
 		},
 	}}
 	res, err = fd.Consume(deps)
-	assert.Nil(t, res)
-	assert.NotNil(t, err)
+	assert.Len(t, res[hercules.DependencyFileDiff].(map[string]items.FileDiffData), 1)
+	assert.Nil(t, err)
 }
 
 func TestCountLines(t *testing.T) {
-	blob, _ := test.Repository.BlobObject(
+	blob, err := test.Repository.BlobObject(
 		plumbing.NewHash("291286b4ac41952cbd1389fda66420ec03c1a9fe"))
-	lines, err := items.CountLines(blob)
+	assert.Nil(t, err)
+	cb := &items.CachedBlob{Blob: *blob}
+	cb.Cache()
+	lines, err := cb.CountLines()
 	assert.Equal(t, lines, 12)
 	assert.Nil(t, err)
-	lines, err = items.CountLines(nil)
-	assert.Equal(t, lines, -1)
-	assert.NotNil(t, err)
-	blob, _ = internal.CreateDummyBlob(plumbing.NewHash("291286b4ac41952cbd1389fda66420ec03c1a9fe"), true)
-	lines, err = items.CountLines(blob)
-	assert.Equal(t, lines, -1)
-	assert.NotNil(t, err)
+	blob, err = internal.CreateDummyBlob(
+		plumbing.NewHash("291286b4ac41952cbd1389fda66420ec03c1a9fe"), true)
+	assert.Nil(t, err)
+	cb = &items.CachedBlob{Blob: *blob}
+	err = cb.Cache()
+	assert.Equal(t, err.Error(), "dummy failure")
 	// test_data/blob
 	blob, err = test.Repository.BlobObject(
 		plumbing.NewHash("c86626638e0bc8cf47ca49bb1525b40e9737ee64"))
 	assert.Nil(t, err)
-	lines, err = items.CountLines(blob)
-	assert.Equal(t, lines, -1)
+	cb = &items.CachedBlob{Blob: *blob}
+	cb.Cache()
+	lines, err = cb.CountLines()
+	assert.Equal(t, lines, 0)
 	assert.NotNil(t, err)
-	assert.EqualError(t, err, "binary")
+	assert.Equal(t, err.Error(), items.ErrorBinary.Error())
 }
 
 func TestBlobToString(t *testing.T) {
 	blob, _ := test.Repository.BlobObject(
 		plumbing.NewHash("291286b4ac41952cbd1389fda66420ec03c1a9fe"))
-	str, err := items.BlobToString(blob)
+	cb := &items.CachedBlob{Blob: *blob}
+	err := cb.Cache()
 	assert.Nil(t, err)
+	str := string(cb.Data)
 	assert.Equal(t, str, `language: go
 
 go:
@@ -221,23 +222,19 @@ script:
 notifications:
   email: false
 `)
-	str, err = items.BlobToString(nil)
-	assert.Equal(t, str, "")
-	assert.NotNil(t, err)
-	blob, _ = internal.CreateDummyBlob(plumbing.NewHash("291286b4ac41952cbd1389fda66420ec03c1a9fe"), true)
-	str, err = items.BlobToString(blob)
-	assert.Equal(t, str, "")
+	blob, _ = internal.CreateDummyBlob(
+		plumbing.NewHash("291286b4ac41952cbd1389fda66420ec03c1a9fe"), true)
+	cb = &items.CachedBlob{Blob: *blob}
+	err = cb.Cache()
 	assert.NotNil(t, err)
 }
 
 func TestFileDiffDarkMagic(t *testing.T) {
 	fd := fixtures.FileDiff()
 	deps := map[string]interface{}{}
-	cache := map[plumbing.Hash]*object.Blob{}
-	hash := plumbing.NewHash("448eb3f312849b0ca766063d06b09481c987b309")
-	cache[hash], _ = test.Repository.BlobObject(hash) // 1.java
-	hash = plumbing.NewHash("3312c92f3e8bdfbbdb30bccb6acd1b85bc338dfc")
-	cache[hash], _ = test.Repository.BlobObject(hash) // 2.java
+	cache := map[plumbing.Hash]*items.CachedBlob{}
+	items.AddHash(t, cache, "448eb3f312849b0ca766063d06b09481c987b309") // 1.java
+	items.AddHash(t, cache, "3312c92f3e8bdfbbdb30bccb6acd1b85bc338dfc") // 2.java
 	deps[items.DependencyBlobCache] = cache
 	changes := make(object.Changes, 1)
 	treeFrom, _ := test.Repository.TreeObject(plumbing.NewHash(

+ 48 - 19
internal/plumbing/renames.go

@@ -21,7 +21,7 @@ type RenameAnalysis struct {
 	core.NoopMerger
 	// SimilarityThreshold adjusts the heuristic to determine file renames.
 	// It has the same units as cgit's -X rename-threshold or -M. Better to
-	// set it to the default value of 90 (90%).
+	// set it to the default value of 50 (50%).
 	SimilarityThreshold int
 
 	repository *git.Repository
@@ -30,7 +30,8 @@ type RenameAnalysis struct {
 const (
 	// RenameAnalysisDefaultThreshold specifies the default percentage of common lines in a pair
 	// of files to consider them linked. The exact code of the decision is sizesAreClose().
-	RenameAnalysisDefaultThreshold = 90
+	// This defaults to CGit's 50%.
+	RenameAnalysisDefaultThreshold = 50
 
 	// ConfigRenameAnalysisSimilarityThreshold is the name of the configuration option
 	// (RenameAnalysis.Configure()) which sets the similarity threshold.
@@ -95,7 +96,7 @@ func (ra *RenameAnalysis) Initialize(repository *git.Repository) {
 // in Provides(). If there was an error, nil is returned.
 func (ra *RenameAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
 	changes := deps[DependencyTreeChanges].(object.Changes)
-	cache := deps[DependencyBlobCache].(map[plumbing.Hash]*object.Blob)
+	cache := deps[DependencyBlobCache].(map[plumbing.Hash]*CachedBlob)
 
 	reducedChanges := make(object.Changes, 0, changes.Len())
 
@@ -210,30 +211,58 @@ func (ra *RenameAnalysis) Fork(n int) []core.PipelineItem {
 }
 
 func (ra *RenameAnalysis) sizesAreClose(size1 int64, size2 int64) bool {
-	return internal.Abs64(size1-size2)*100/internal.Max64(1, internal.Min64(size1, size2)) <=
+	return (internal.Abs64(size1-size2)*100)/internal.Max64(size1, size2) <=
 		int64(100-ra.SimilarityThreshold)
 }
 
 func (ra *RenameAnalysis) blobsAreClose(
-	blob1 *object.Blob, blob2 *object.Blob) (bool, error) {
-	strFrom, err := BlobToString(blob1)
-	if err != nil {
-		return false, err
-	}
-	strTo, err := BlobToString(blob2)
-	if err != nil {
-		return false, err
-	}
+	blob1 *CachedBlob, blob2 *CachedBlob) (bool, error) {
+	src, dst := string(blob1.Data), string(blob2.Data)
+
+	// compute the line-by-line diff, then the char-level diffs of the del-ins blocks
+	// yes, this algorithm is greedy and not exact
 	dmp := diffmatchpatch.New()
-	src, dst, _ := dmp.DiffLinesToRunes(strFrom, strTo)
-	diffs := dmp.DiffMainRunes(src, dst, false)
-	common := 0
+	srcLines, dstLines, lines := dmp.DiffLinesToRunes(src, dst)
+	diffs := dmp.DiffMainRunes(srcLines, dstLines, false)
+	var common, posSrc, prevPosSrc, posDst int
+	possibleDelInsBlock := false
 	for _, edit := range diffs {
-		if edit.Type == diffmatchpatch.DiffEqual {
-			common += utf8.RuneCountInString(edit.Text)
+		switch edit.Type {
+		case diffmatchpatch.DiffDelete:
+			possibleDelInsBlock = true
+			prevPosSrc = posSrc
+			for _, lineno := range edit.Text {
+				posSrc += len(lines[lineno])
+			}
+		case diffmatchpatch.DiffInsert:
+			nextPosDst := posDst
+			for _, lineno := range edit.Text {
+				nextPosDst += len(lines[lineno])
+			}
+			if possibleDelInsBlock {
+				possibleDelInsBlock = false
+				localDmp := diffmatchpatch.New()
+				localSrc := src[prevPosSrc:posSrc]
+				localDst := dst[posDst:nextPosDst]
+				localDiffs := localDmp.DiffMainRunes([]rune(localSrc), []rune(localDst), false)
+				for _, localEdit := range localDiffs {
+					if localEdit.Type == diffmatchpatch.DiffEqual {
+						common += utf8.RuneCountInString(localEdit.Text)
+					}
+				}
+			}
+			posDst = nextPosDst
+		case diffmatchpatch.DiffEqual:
+			possibleDelInsBlock = false
+			for _, lineno := range edit.Text {
+				common += utf8.RuneCountInString(lines[lineno])
+				step := len(lines[lineno])
+				posSrc += step
+				posDst += step
+			}
 		}
 	}
-	similarity := common * 100 / internal.Max(1, internal.Min(len(src), len(dst)))
+	similarity := (common*100)/internal.Max(utf8.RuneCountInString(src), utf8.RuneCountInString(dst))
 	return similarity >= ra.SimilarityThreshold, nil
 }
 

+ 7 - 11
internal/plumbing/renames_test.go

@@ -109,24 +109,20 @@ func TestRenameAnalysisConsume(t *testing.T) {
 		},
 	},
 	}
-	cache := map[plumbing.Hash]*object.Blob{}
-	hash := plumbing.NewHash("baa64828831d174f40140e4b3cfa77d1e917a2c1")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("29c9fafd6a2fae8cd20298c3f60115bc31a4c0f2")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("c29112dbd697ad9b401333b80c18a63951bc18d9")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("f7d918ec500e2f925ecde79b51cc007bac27de72")
-	cache[hash], _ = test.Repository.BlobObject(hash)
+	cache := map[plumbing.Hash]*CachedBlob{}
+	AddHash(t, cache, "baa64828831d174f40140e4b3cfa77d1e917a2c1")
+	AddHash(t, cache, "29c9fafd6a2fae8cd20298c3f60115bc31a4c0f2")
+	AddHash(t, cache, "c29112dbd697ad9b401333b80c18a63951bc18d9")
+	AddHash(t, cache, "f7d918ec500e2f925ecde79b51cc007bac27de72")
 	deps := map[string]interface{}{}
 	deps[DependencyBlobCache] = cache
 	deps[DependencyTreeChanges] = changes
-	ra.SimilarityThreshold = 33
+	ra.SimilarityThreshold = 37
 	res, err := ra.Consume(deps)
 	assert.Nil(t, err)
 	renamed := res[DependencyTreeChanges].(object.Changes)
 	assert.Equal(t, len(renamed), 2)
-	ra.SimilarityThreshold = 35
+	ra.SimilarityThreshold = 38
 	res, err = ra.Consume(deps)
 	assert.Nil(t, err)
 	renamed = res[DependencyTreeChanges].(object.Changes)

+ 19 - 36
internal/plumbing/uast/uast.go

@@ -1,12 +1,11 @@
 package uast
 
 import (
-	"bytes"
 	"context"
 	"errors"
 	"fmt"
 	"io"
-	goioutil "io/ioutil"
+	"io/ioutil"
 	"os"
 	"path"
 	"runtime"
@@ -22,7 +21,6 @@ import (
 	"gopkg.in/src-d/go-git.v4"
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
-	"gopkg.in/src-d/go-git.v4/utils/ioutil"
 	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
 	"gopkg.in/src-d/hercules.v4/internal/core"
 	"gopkg.in/src-d/hercules.v4/internal/pb"
@@ -44,8 +42,6 @@ type Extractor struct {
 }
 
 const (
-	uastExtractionSkipped = -(1 << 31)
-
 	// ConfigUASTEndpoint is the name of the configuration option (Extractor.Configure())
 	// which sets the Babelfish server address.
 	ConfigUASTEndpoint = "ConfigUASTEndpoint"
@@ -67,7 +63,9 @@ const (
 type uastTask struct {
 	Lock   *sync.RWMutex
 	Dest   map[plumbing.Hash]*uast.Node
-	File   *object.File
+	Name   string
+	Hash   plumbing.Hash
+	Data   []byte
 	Errors *[]error
 }
 
@@ -200,27 +198,14 @@ func (exr *Extractor) Initialize(repository *git.Repository) {
 // This function returns the mapping with analysis results. The keys must be the same as
 // in Provides(). If there was an error, nil is returned.
 func (exr *Extractor) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
-	cache := deps[items.DependencyBlobCache].(map[plumbing.Hash]*object.Blob)
+	cache := deps[items.DependencyBlobCache].(map[plumbing.Hash]*items.CachedBlob)
 	treeDiffs := deps[items.DependencyTreeChanges].(object.Changes)
 	uasts := map[plumbing.Hash]*uast.Node{}
 	lock := sync.RWMutex{}
 	errs := make([]error, 0)
 	wg := sync.WaitGroup{}
 	submit := func(change *object.Change) {
-		{
-			reader, err := cache[change.To.TreeEntry.Hash].Reader()
-			if err != nil {
-				errs = append(errs, err)
-				return
-			}
-			defer ioutil.CheckClose(reader, &err)
-			buf := new(bytes.Buffer)
-			if _, err := buf.ReadFrom(reader); err != nil {
-				errs = append(errs, err)
-				return
-			}
-			exr.ProcessedFiles[change.To.Name]++
-		}
+		exr.ProcessedFiles[change.To.Name]++
 		wg.Add(1)
 		go func(task interface{}) {
 			exr.pool.Process(task)
@@ -228,7 +213,9 @@ func (exr *Extractor) Consume(deps map[string]interface{}) (map[string]interface
 		}(uastTask{
 			Lock:   &lock,
 			Dest:   uasts,
-			File:   &object.File{Name: change.To.Name, Blob: *cache[change.To.TreeEntry.Hash]},
+			Name:   change.To.Name,
+			Hash:   change.To.TreeEntry.Hash,
+			Data:   cache[change.To.TreeEntry.Hash].Data,
 			Errors: &errs,
 		})
 	}
@@ -267,14 +254,10 @@ func (exr *Extractor) Fork(n int) []core.PipelineItem {
 }
 
 func (exr *Extractor) extractUAST(
-	client *bblfsh.Client, file *object.File) (*uast.Node, error) {
+	client *bblfsh.Client, name string, data []byte) (*uast.Node, error) {
 	request := client.NewParseRequest()
-	contents, err := file.Contents()
-	if err != nil {
-		return nil, err
-	}
-	request.Content(contents)
-	request.Filename(file.Name)
+	request.Content(string(data))
+	request.Filename(name)
 	ctx, cancel := exr.Context()
 	if cancel != nil {
 		defer cancel()
@@ -297,16 +280,16 @@ func (exr *Extractor) extractUAST(
 
 func (exr *Extractor) extractTask(client *bblfsh.Client, data interface{}) interface{} {
 	task := data.(uastTask)
-	node, err := exr.extractUAST(client, task.File)
+	node, err := exr.extractUAST(client, task.Name, task.Data)
 	task.Lock.Lock()
 	defer task.Lock.Unlock()
 	if err != nil {
 		*task.Errors = append(*task.Errors,
-			fmt.Errorf("\nfile %s, blob %s: %v", task.File.Name, task.File.Hash.String(), err))
+			fmt.Errorf("\nfile %s, blob %s: %v", task.Name, task.Hash.String(), err))
 		return nil
 	}
 	if node != nil {
-		task.Dest[task.File.Hash] = node
+		task.Dest[task.Hash] = node
 	}
 	return nil
 }
@@ -550,21 +533,21 @@ func (saver *ChangesSaver) dumpFiles(result [][]Change) []*pb.UASTChange {
 			bs, _ := change.Before.Marshal()
 			record.UastBefore = path.Join(saver.OutputPath, fmt.Sprintf(
 				"%d_%d_before_%s.pb", i, j, change.Change.From.TreeEntry.Hash.String()))
-			goioutil.WriteFile(record.UastBefore, bs, 0666)
+			ioutil.WriteFile(record.UastBefore, bs, 0666)
 			blob, _ := saver.repository.BlobObject(change.Change.From.TreeEntry.Hash)
 			s, _ := (&object.File{Blob: *blob}).Contents()
 			record.SrcBefore = path.Join(saver.OutputPath, fmt.Sprintf(
 				"%d_%d_before_%s.src", i, j, change.Change.From.TreeEntry.Hash.String()))
-			goioutil.WriteFile(record.SrcBefore, []byte(s), 0666)
+			ioutil.WriteFile(record.SrcBefore, []byte(s), 0666)
 			bs, _ = change.After.Marshal()
 			record.UastAfter = path.Join(saver.OutputPath, fmt.Sprintf(
 				"%d_%d_after_%s.pb", i, j, change.Change.To.TreeEntry.Hash.String()))
-			goioutil.WriteFile(record.UastAfter, bs, 0666)
+			ioutil.WriteFile(record.UastAfter, bs, 0666)
 			blob, _ = saver.repository.BlobObject(change.Change.To.TreeEntry.Hash)
 			s, _ = (&object.File{Blob: *blob}).Contents()
 			record.SrcAfter = path.Join(saver.OutputPath, fmt.Sprintf(
 				"%d_%d_after_%s.src", i, j, change.Change.To.TreeEntry.Hash.String()))
-			goioutil.WriteFile(record.SrcAfter, []byte(s), 0666)
+			ioutil.WriteFile(record.SrcAfter, []byte(s), 0666)
 			fileNames = append(fileNames, record)
 		}
 	}

+ 12 - 2
internal/plumbing/uast/uast_test.go

@@ -28,6 +28,16 @@ func fixtureUASTExtractor() *Extractor {
 	return &exr
 }
 
+func AddHash(t *testing.T, cache map[plumbing.Hash]*items.CachedBlob, hash string) {
+	objhash := plumbing.NewHash(hash)
+	blob, err := test.Repository.BlobObject(objhash)
+	assert.Nil(t, err)
+	cb := &items.CachedBlob{Blob: *blob}
+	err = cb.Cache()
+	assert.Nil(t, err)
+	cache[objhash] = cb
+}
+
 func TestUASTExtractorMeta(t *testing.T) {
 	exr := fixtureUASTExtractor()
 	assert.Equal(t, exr.Name(), "UAST")
@@ -117,7 +127,7 @@ func TestUASTExtractorConsume(t *testing.T) {
 		},
 	},
 	}
-	cache := map[plumbing.Hash]*object.Blob{}
+	cache := map[plumbing.Hash]*items.CachedBlob{}
 	for _, hash := range []string{
 		"baa64828831d174f40140e4b3cfa77d1e917a2c1",
 		"5d78f57d732aed825764347ec6f3ab74d50d0619",
@@ -125,7 +135,7 @@ func TestUASTExtractorConsume(t *testing.T) {
 		"f7d918ec500e2f925ecde79b51cc007bac27de72",
 		"81f2b6d1fa5357f90e9dead150cd515720897545",
 	} {
-		cache[plumbing.NewHash(hash)], _ = test.Repository.BlobObject(plumbing.NewHash(hash))
+		AddHash(t, cache, hash)
 	}
 	deps := map[string]interface{}{}
 	deps[items.DependencyBlobCache] = cache

+ 36 - 17
leaves/burndown.go

@@ -276,11 +276,11 @@ func (analyser *BurndownAnalysis) Consume(deps map[string]interface{}) (map[stri
 		analyser.mergedFiles = map[string]bool{}
 		analyser.mergedAuthor = author
 	}
-	cache := deps[items.DependencyBlobCache].(map[plumbing.Hash]*object.Blob)
+	cache := deps[items.DependencyBlobCache].(map[plumbing.Hash]*items.CachedBlob)
 	treeDiffs := deps[items.DependencyTreeChanges].(object.Changes)
 	fileDiffs := deps[items.DependencyFileDiff].(map[string]items.FileDiffData)
 	for _, change := range treeDiffs {
 		action, _ := change.Action()
 		var err error
 		switch action {
 		case merkletrie.Insert:
@@ -1005,18 +1008,16 @@ func (analyser *BurndownAnalysis) newFile(
 }
 
 func (analyser *BurndownAnalysis) handleInsertion(
-	change *object.Change, author int, cache map[plumbing.Hash]*object.Blob) error {
+	change *object.Change, author int, cache map[plumbing.Hash]*items.CachedBlob) error {
 	blob := cache[change.To.TreeEntry.Hash]
-	lines, err := items.CountLines(blob)
+	lines, err := blob.CountLines()
 	if err != nil {
-		if err.Error() == "binary" {
-			return nil
-		}
-		return err
+		// binary
+		return nil
 	}
 	name := change.To.Name
 	file, exists := analyser.files[name]
 	if exists {
 		return fmt.Errorf("file %s already exists", name)
 	}
 	var hash plumbing.Hash
@@ -1032,18 +1034,18 @@ func (analyser *BurndownAnalysis) handleInsertion(
 }
 
 func (analyser *BurndownAnalysis) handleDeletion(
-	change *object.Change, author int, cache map[plumbing.Hash]*object.Blob) error {
+	change *object.Change, author int, cache map[plumbing.Hash]*items.CachedBlob) error {
 
+	name := change.From.Name
+	file, exists := analyser.files[name]
 	blob := cache[change.From.TreeEntry.Hash]
-	lines, err := items.CountLines(blob)
-	if err != nil {
-		if err.Error() == "binary" {
-			return nil
-		}
-		return err
+	lines, err := blob.CountLines()
+	if exists && err != nil {
+		return fmt.Errorf("file %s unexpectedly became binary", name)
+	}
+	if !exists {
+		return nil
 	}
-	name := change.From.Name
-	file := analyser.files[name]
 	file.Update(analyser.packPersonWithDay(author, analyser.day), 0, 0, lines)
 	delete(analyser.files, name)
 	delete(analyser.fileHistories, name)
@@ -1055,7 +1057,7 @@ func (analyser *BurndownAnalysis) handleDeletion(
 }
 
 func (analyser *BurndownAnalysis) handleModification(
-	change *object.Change, author int, cache map[plumbing.Hash]*object.Blob,
+	change *object.Change, author int, cache map[plumbing.Hash]*items.CachedBlob,
 	diffs map[string]items.FileDiffData) error {
 
 	if analyser.day == burndown.TreeMergeMark {
@@ -1075,6 +1077,23 @@ func (analyser *BurndownAnalysis) handleModification(
 		}
 	}
 
+	// Check for binary changes
+	blobFrom := cache[change.From.TreeEntry.Hash]
+	_, errFrom := blobFrom.CountLines()
+	blobTo := cache[change.To.TreeEntry.Hash]
+	_, errTo := blobTo.CountLines()
+	if errFrom != errTo {
+		if errFrom != nil {
+			// the file is no longer binary
+			return analyser.handleInsertion(change, author, cache)
+		}
+		// the file became binary
+		return analyser.handleDeletion(change, author, cache)
+	} else if errFrom != nil {
+		// both versions are binary: no lines to diff, nothing to do
+		return nil
+	}
+
 	thisDiffs := diffs[change.To.Name]
 	if file.Len() != thisDiffs.OldLinesOfCode {
 		log.Printf("====TREE====\n%s", file.Dump())

+ 32 - 40
leaves/burndown_test.go

@@ -20,6 +20,16 @@ import (
 	"gopkg.in/src-d/hercules.v4/internal/test"
 )
 
+func AddHash(t *testing.T, cache map[plumbing.Hash]*items.CachedBlob, hash string) {
+	objhash := plumbing.NewHash(hash)
+	blob, err := test.Repository.BlobObject(objhash)
+	assert.Nil(t, err)
+	cb := &items.CachedBlob{Blob: *blob}
+	err = cb.Cache()
+	assert.Nil(t, err)
+	cache[objhash] = cb
+}
+
 func TestBurndownMeta(t *testing.T) {
 	burndown := BurndownAnalysis{}
 	assert.Equal(t, burndown.Name(), "Burndown")
@@ -121,15 +131,11 @@ func TestBurndownConsumeFinalize(t *testing.T) {
 	// stage 1
 	deps[identity.DependencyAuthor] = 0
 	deps[items.DependencyDay] = 0
-	cache := map[plumbing.Hash]*object.Blob{}
-	hash := plumbing.NewHash("291286b4ac41952cbd1389fda66420ec03c1a9fe")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("c29112dbd697ad9b401333b80c18a63951bc18d9")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("baa64828831d174f40140e4b3cfa77d1e917a2c1")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("dc248ba2b22048cc730c571a748e8ffcf7085ab9")
-	cache[hash], _ = test.Repository.BlobObject(hash)
+	cache := map[plumbing.Hash]*items.CachedBlob{}
+	AddHash(t, cache, "291286b4ac41952cbd1389fda66420ec03c1a9fe")
+	AddHash(t, cache, "c29112dbd697ad9b401333b80c18a63951bc18d9")
+	AddHash(t, cache, "baa64828831d174f40140e4b3cfa77d1e917a2c1")
+	AddHash(t, cache, "dc248ba2b22048cc730c571a748e8ffcf7085ab9")
 	deps[items.DependencyBlobCache] = cache
 	changes := make(object.Changes, 3)
 	treeFrom, _ := test.Repository.TreeObject(plumbing.NewHash(
@@ -220,17 +226,12 @@ func TestBurndownConsumeFinalize(t *testing.T) {
 	// 2b1ed978194a94edeabbca6de7ff3b5771d4d665
 	deps[core.DependencyIsMerge] = false
 	deps[items.DependencyDay] = 30
-	cache = map[plumbing.Hash]*object.Blob{}
-	hash = plumbing.NewHash("291286b4ac41952cbd1389fda66420ec03c1a9fe")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("baa64828831d174f40140e4b3cfa77d1e917a2c1")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("29c9fafd6a2fae8cd20298c3f60115bc31a4c0f2")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("c29112dbd697ad9b401333b80c18a63951bc18d9")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("f7d918ec500e2f925ecde79b51cc007bac27de72")
-	cache[hash], _ = test.Repository.BlobObject(hash)
+	cache = map[plumbing.Hash]*items.CachedBlob{}
+	AddHash(t, cache, "291286b4ac41952cbd1389fda66420ec03c1a9fe")
+	AddHash(t, cache, "baa64828831d174f40140e4b3cfa77d1e917a2c1")
+	AddHash(t, cache, "29c9fafd6a2fae8cd20298c3f60115bc31a4c0f2")
+	AddHash(t, cache, "c29112dbd697ad9b401333b80c18a63951bc18d9")
+	AddHash(t, cache, "f7d918ec500e2f925ecde79b51cc007bac27de72")
 	deps[items.DependencyBlobCache] = cache
 	changes = make(object.Changes, 3)
 	treeFrom, _ = test.Repository.TreeObject(plumbing.NewHash(
@@ -353,15 +354,11 @@ func TestBurndownSerialize(t *testing.T) {
 	// stage 1
 	deps[identity.DependencyAuthor] = 0
 	deps[items.DependencyDay] = 0
-	cache := map[plumbing.Hash]*object.Blob{}
-	hash := plumbing.NewHash("291286b4ac41952cbd1389fda66420ec03c1a9fe")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("c29112dbd697ad9b401333b80c18a63951bc18d9")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("baa64828831d174f40140e4b3cfa77d1e917a2c1")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("dc248ba2b22048cc730c571a748e8ffcf7085ab9")
-	cache[hash], _ = test.Repository.BlobObject(hash)
+	cache := map[plumbing.Hash]*items.CachedBlob{}
+	AddHash(t, cache, "291286b4ac41952cbd1389fda66420ec03c1a9fe")
+	AddHash(t, cache, "c29112dbd697ad9b401333b80c18a63951bc18d9")
+	AddHash(t, cache, "baa64828831d174f40140e4b3cfa77d1e917a2c1")
+	AddHash(t, cache, "dc248ba2b22048cc730c571a748e8ffcf7085ab9")
 	deps[items.DependencyBlobCache] = cache
 	changes := make(object.Changes, 3)
 	treeFrom, _ := test.Repository.TreeObject(plumbing.NewHash(
@@ -418,17 +415,12 @@ func TestBurndownSerialize(t *testing.T) {
 	// 2b1ed978194a94edeabbca6de7ff3b5771d4d665
 	deps[identity.DependencyAuthor] = 1
 	deps[items.DependencyDay] = 30
-	cache = map[plumbing.Hash]*object.Blob{}
-	hash = plumbing.NewHash("291286b4ac41952cbd1389fda66420ec03c1a9fe")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("baa64828831d174f40140e4b3cfa77d1e917a2c1")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("29c9fafd6a2fae8cd20298c3f60115bc31a4c0f2")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("c29112dbd697ad9b401333b80c18a63951bc18d9")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("f7d918ec500e2f925ecde79b51cc007bac27de72")
-	cache[hash], _ = test.Repository.BlobObject(hash)
+	cache = map[plumbing.Hash]*items.CachedBlob{}
+	AddHash(t, cache, "291286b4ac41952cbd1389fda66420ec03c1a9fe")
+	AddHash(t, cache, "baa64828831d174f40140e4b3cfa77d1e917a2c1")
+	AddHash(t, cache, "29c9fafd6a2fae8cd20298c3f60115bc31a4c0f2")
+	AddHash(t, cache, "c29112dbd697ad9b401333b80c18a63951bc18d9")
+	AddHash(t, cache, "f7d918ec500e2f925ecde79b51cc007bac27de72")
 	deps[items.DependencyBlobCache] = cache
 	changes = make(object.Changes, 3)
 	treeFrom, _ = test.Repository.TreeObject(plumbing.NewHash(