Ver código fonte

Merge pull request #114 from vmarkovtsev/master

v5
Vadim Markovtsev 6 anos atrás
pai
commit
28fe0f7c15
45 arquivos alterados com 469 adições e 420 exclusões
  1. 5 5
      .appveyor.yml
  2. 6 6
      .travis.yml
  3. 2 2
      Makefile
  4. 2 2
      cmd/hercules/combine.go
  5. 1 1
      cmd/hercules/plugin.template
  6. 2 2
      cmd/hercules/root.go
  7. 4 10
      contrib/_plugin_example/churn_analysis.go
  8. 10 11
      core.go
  9. 2 2
      internal/burndown/file.go
  10. 2 2
      internal/burndown/file_test.go
  11. 1 1
      internal/core/forks.go
  12. 2 2
      internal/core/pipeline.go
  13. 2 2
      internal/core/pipeline_test.go
  14. 2 2
      internal/core/registry.go
  15. 1 1
      internal/core/registry_test.go
  16. 4 4
      internal/global_test.go
  17. 118 23
      internal/plumbing/blob_cache.go
  18. 16 6
      internal/plumbing/blob_cache_test.go
  19. 1 1
      internal/plumbing/day.go
  20. 2 2
      internal/plumbing/day_test.go
  21. 3 80
      internal/plumbing/diff.go
  22. 45 47
      internal/plumbing/diff_test.go
  23. 1 1
      internal/plumbing/identity/identity.go
  24. 2 2
      internal/plumbing/identity/identity_test.go
  25. 52 22
      internal/plumbing/renames.go
  26. 9 13
      internal/plumbing/renames_test.go
  27. 1 1
      internal/plumbing/tree_diff.go
  28. 2 2
      internal/plumbing/tree_diff_test.go
  29. 2 2
      internal/plumbing/uast/changes_xpather_test.go
  30. 2 2
      internal/plumbing/uast/diff_refiner.go
  31. 3 3
      internal/plumbing/uast/diff_refiner_test.go
  32. 1 1
      internal/plumbing/uast/test/utils.go
  33. 22 39
      internal/plumbing/uast/uast.go
  34. 18 9
      internal/plumbing/uast/uast_test.go
  35. 2 2
      internal/test/fixtures/fixtures.go
  36. 45 25
      leaves/burndown.go
  37. 38 46
      leaves/burndown_test.go
  38. 4 4
      leaves/comment_sentiment.go
  39. 6 6
      leaves/comment_sentiment_test.go
  40. 5 5
      leaves/couples.go
  41. 5 5
      leaves/couples_test.go
  42. 3 3
      leaves/file_history.go
  43. 4 4
      leaves/file_history_test.go
  44. 4 4
      leaves/shotness.go
  45. 5 5
      leaves/shotness_test.go

+ 5 - 5
.appveyor.yml

@@ -2,7 +2,7 @@ version: "{build}"
 platform: x64
 image: Visual Studio 2017
 
-clone_folder: c:\gopath\src\gopkg.in\src-d\hercules.v4
+clone_folder: c:\gopath\src\gopkg.in\src-d\hercules.v5
 
 environment:
   GOPATH: c:\gopath
@@ -17,14 +17,14 @@ build_script:
   - set PATH=C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;C:\msys64\usr\bin;C:\msys64\mingw64\bin;%PATH%
   - set PATH=%PATH:C:\Program Files\Git\usr\bin;=%
   - git clone --depth 1 --single-branch --branch %GOGIT_TAG% https://github.com/src-d/go-git %GOPATH%/src/gopkg.in/src-d/go-git.v4
-  - cd %GOPATH%\src\gopkg.in\src-d\hercules.v4
+  - cd %GOPATH%\src\gopkg.in\src-d\hercules.v5
   - set DISABLE_TENSORFLOW=1
   - make
-  - 7z a c:\gopath\src\gopkg.in\src-d\hercules.v4\hercules.win64.zip %GOPATH%\bin\hercules.exe
+  - 7z a c:\gopath\src\gopkg.in\src-d\hercules.v5\hercules.win64.zip %GOPATH%\bin\hercules.exe
 
 test_script:
-  - go get -v -t -d gopkg.in/src-d/hercules.v4/...
-  - go test -v -tags disable_babelfish gopkg.in/src-d/hercules.v4/...
+  - go get -v -t -d gopkg.in/src-d/hercules.v5/...
+  - go test -v -tags disable_babelfish gopkg.in/src-d/hercules.v5/...
 
 artifacts:
   - name: hercules.win64.zip

+ 6 - 6
.travis.yml

@@ -13,7 +13,7 @@ addons:
     packages:
     - libxml2-dev
 
-go_import_path: gopkg.in/src-d/hercules.v4
+go_import_path: gopkg.in/src-d/hercules.v5
 go:
   - 1.9.x
   - 1.10.x
@@ -25,7 +25,7 @@ cache:
     - $HOME/gopath/src
 before_cache:
   - cd $HOME/gopath
-  - rm -rf $HOME/gopath/src/gopkg.in/src-d/hercules.v4
+  - rm -rf $HOME/gopath/src/gopkg.in/src-d/hercules.v5
 
 matrix:
   fast_finish: true
@@ -65,8 +65,8 @@ script:
   - go vet -tags tensorflow ./...
   - golint -set_exit_status ./...
   - flake8
-  - if [[ $TRAVIS_GO_VERSION != 1.9.* ]]; then go test -coverpkg=all -v -cpu=1,2 -coverprofile=coverage.txt -covermode=count gopkg.in/src-d/hercules.v4/... && sed -i '/cmd\/hercules\|core.go/d' coverage.txt; fi
-  - if [[ $TRAVIS_GO_VERSION = 1.9.* ]]; then go test -v -cpu=1,2 gopkg.in/src-d/hercules.v4/...; fi
+  - if [[ $TRAVIS_GO_VERSION != 1.9.* ]]; then go test -coverpkg=all -v -cpu=1,2 -coverprofile=coverage.txt -covermode=count gopkg.in/src-d/hercules.v5/... && sed -i '/cmd\/hercules\|core.go/d' coverage.txt; fi
+  - if [[ $TRAVIS_GO_VERSION = 1.9.* ]]; then go test -v -cpu=1,2 gopkg.in/src-d/hercules.v5/...; fi
   - $GOPATH/bin/hercules version
   - $GOPATH/bin/hercules --burndown --couples --quiet --pb https://github.com/src-d/hercules > 1.pb
   - cp 1.pb 2.pb
@@ -85,7 +85,7 @@ jobs:
       os: osx
       osx_image: xcode9.3
       go: 1.11.x
-      go_import_path: gopkg.in/src-d/hercules.v4
+      go_import_path: gopkg.in/src-d/hercules.v5
       before_install:
         - wget -O protoc.zip https://github.com/google/protobuf/releases/download/v$PROTOC_VERSION/protoc-$PROTOC_VERSION-osx-x86_64.zip
         - unzip -d ~/.local protoc.zip && rm protoc.zip
@@ -105,7 +105,7 @@ jobs:
     - stage: deploy
       os: linux
       go: 1.11.x
-      go_import_path: gopkg.in/src-d/hercules.v4
+      go_import_path: gopkg.in/src-d/hercules.v5
       before_install:
         - wget -O protoc.zip https://github.com/google/protobuf/releases/download/v$PROTOC_VERSION/protoc-$PROTOC_VERSION-linux-x86_64.zip
         - unzip -d ~/.local protoc.zip && rm protoc.zip

+ 2 - 2
Makefile

@@ -12,7 +12,7 @@ endif
 all: ${GOPATH}/bin/hercules${EXE}
 
 test: all
-	go test gopkg.in/src-d/hercules.v4
+	go test gopkg.in/src-d/hercules.v5
 
 ${GOPATH}/bin/protoc-gen-gogo${EXE}:
 	go get -v github.com/gogo/protobuf/protoc-gen-gogo
@@ -40,4 +40,4 @@ ${GOPATH}/pkg/$(PKG)/gopkg.in/bblfsh/client-go.v2: ${GOPATH}/src/gopkg.in/bblfsh
 	make dependencies
 
 ${GOPATH}/bin/hercules${EXE}: *.go */*.go */*/*.go */*/*/*.go ${GOPATH}/pkg/$(PKG)/gopkg.in/bblfsh/client-go.v2 internal/pb/pb.pb.go internal/pb/pb_pb2.py cmd/hercules/plugin_template_source.go
-	go get -tags "$(TAGS)" -ldflags "-X gopkg.in/src-d/hercules.v4.BinaryGitHash=$(shell git rev-parse HEAD)" gopkg.in/src-d/hercules.v4/cmd/hercules
+	go get -tags "$(TAGS)" -ldflags "-X gopkg.in/src-d/hercules.v5.BinaryGitHash=$(shell git rev-parse HEAD)" gopkg.in/src-d/hercules.v5/cmd/hercules

+ 2 - 2
cmd/hercules/combine.go

@@ -12,8 +12,8 @@ import (
 
 	"github.com/gogo/protobuf/proto"
 	"github.com/spf13/cobra"
-	"gopkg.in/src-d/hercules.v4"
-	"gopkg.in/src-d/hercules.v4/internal/pb"
+	"gopkg.in/src-d/hercules.v5"
+	"gopkg.in/src-d/hercules.v5/internal/pb"
 )
 
 // combineCmd represents the combine command

+ 1 - 1
cmd/hercules/plugin.template

@@ -24,7 +24,7 @@ import (
 
   "github.com/gogo/protobuf/proto"
   "gopkg.in/src-d/go-git.v4"
-  "gopkg.in/src-d/hercules.v4"
+  "gopkg.in/src-d/hercules.v5"
 )
 
 // {{.name}} contains the intermediate state which is mutated by Consume(). It should implement

+ 2 - 2
cmd/hercules/root.go

@@ -29,8 +29,8 @@ import (
 	"gopkg.in/src-d/go-git.v4/storage"
 	"gopkg.in/src-d/go-git.v4/storage/filesystem"
 	"gopkg.in/src-d/go-git.v4/storage/memory"
-	"gopkg.in/src-d/hercules.v4"
-	"gopkg.in/src-d/hercules.v4/internal/pb"
+	"gopkg.in/src-d/hercules.v5"
+	"gopkg.in/src-d/hercules.v5/internal/pb"
 )
 
 // oneLineWriter splits the output data by lines and outputs one on top of another using '\r'.

+ 4 - 10
contrib/_plugin_example/churn_analysis.go

@@ -13,7 +13,7 @@ import (
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
 	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
-	"gopkg.in/src-d/hercules.v4"
+	"gopkg.in/src-d/hercules.v5"
 )
 
 // ChurnAnalysis contains the intermediate state which is mutated by Consume(). It should implement
@@ -126,7 +126,7 @@ func (churn *ChurnAnalysis) Consume(deps map[string]interface{}) (map[string]int
 	}
 	fileDiffs := deps[hercules.DependencyFileDiff].(map[string]hercules.FileDiffData)
 	treeDiffs := deps[hercules.DependencyTreeChanges].(object.Changes)
-	cache := deps[hercules.DependencyBlobCache].(map[plumbing.Hash]*object.Blob)
+	cache := deps[hercules.DependencyBlobCache].(map[plumbing.Hash]*hercules.CachedBlob)
 	day := deps[hercules.DependencyDay].(int)
 	author := deps[hercules.DependencyAuthor].(int)
 	for _, change := range treeDiffs {
@@ -138,15 +138,9 @@ func (churn *ChurnAnalysis) Consume(deps map[string]interface{}) (map[string]int
 		removed := 0
 		switch action {
 		case merkletrie.Insert:
-			added, err = hercules.CountLines(cache[change.To.TreeEntry.Hash])
-			if err != nil && err.Error() == "binary" {
-				err = nil
-			}
+			added, _ = cache[change.To.TreeEntry.Hash].CountLines()
 		case merkletrie.Delete:
-			removed, err = hercules.CountLines(cache[change.From.TreeEntry.Hash])
-			if err != nil && err.Error() == "binary" {
-				err = nil
-			}
+			removed, _ = cache[change.From.TreeEntry.Hash].CountLines()
 		case merkletrie.Modify:
 			diffs := fileDiffs[change.To.Name]
 			for _, edit := range diffs.Diffs {

+ 10 - 11
core.go

@@ -1,14 +1,14 @@
 package hercules
 
 import (
-	git "gopkg.in/src-d/go-git.v4"
+	"gopkg.in/src-d/go-git.v4"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
-	"gopkg.in/src-d/hercules.v4/internal/core"
-	"gopkg.in/src-d/hercules.v4/internal/plumbing"
-	"gopkg.in/src-d/hercules.v4/internal/plumbing/identity"
-	"gopkg.in/src-d/hercules.v4/internal/plumbing/uast"
-	"gopkg.in/src-d/hercules.v4/leaves"
-	"gopkg.in/src-d/hercules.v4/internal/yaml"
+	"gopkg.in/src-d/hercules.v5/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal/plumbing"
+	"gopkg.in/src-d/hercules.v5/internal/plumbing/identity"
+	"gopkg.in/src-d/hercules.v5/internal/plumbing/uast"
+	"gopkg.in/src-d/hercules.v5/internal/yaml"
+	"gopkg.in/src-d/hercules.v5/leaves"
 )
 
 // ConfigurationOptionType represents the possible types of a ConfigurationOption's value.
@@ -146,10 +146,9 @@ const (
 // FileDiffData is the type of the dependency provided by plumbing.FileDiff.
 type FileDiffData = plumbing.FileDiffData
 
-// CountLines returns the number of lines in a *object.Blob.
-func CountLines(file *object.Blob) (int, error) {
-	return plumbing.CountLines(file)
-}
+// CachedBlob allows to explicitly cache the binary data associated with the Blob object.
+// Such structs are returned by DependencyBlobCache.
+type CachedBlob = plumbing.CachedBlob
 
 // SafeYamlString escapes the string so that it can be reliably used in YAML.
 func SafeYamlString(str string) string {

+ 2 - 2
internal/burndown/file.go

@@ -4,8 +4,8 @@ import (
 	"fmt"
 	"log"
 
-	"gopkg.in/src-d/hercules.v4/internal"
-	"gopkg.in/src-d/hercules.v4/internal/rbtree"
+	"gopkg.in/src-d/hercules.v5/internal"
+	"gopkg.in/src-d/hercules.v5/internal/rbtree"
 )
 
 // Updater is the function which is called back on File.Update().

+ 2 - 2
internal/burndown/file_test.go

@@ -1,11 +1,11 @@
 package burndown
 
 import (
+	"fmt"
 	"testing"
 
 	"github.com/stretchr/testify/assert"
-	"gopkg.in/src-d/hercules.v4/internal/rbtree"
-	"fmt"
+	"gopkg.in/src-d/hercules.v5/internal/rbtree"
 )
 
 func updateStatusFile(status map[int]int64, _, previousTime, delta int) {

+ 1 - 1
internal/core/forks.go

@@ -7,7 +7,7 @@ import (
 
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
 	"gopkg.in/src-d/go-git.v4/plumbing"
-	"gopkg.in/src-d/hercules.v4/internal/toposort"
+	"gopkg.in/src-d/hercules.v5/internal/toposort"
 )
 
 // OneShotMergeProcessor provides the convenience method to consume merges only once.

+ 2 - 2
internal/core/pipeline.go

@@ -16,9 +16,9 @@ import (
 	"gopkg.in/src-d/go-git.v4"
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
-	"gopkg.in/src-d/hercules.v4/internal/pb"
-	"gopkg.in/src-d/hercules.v4/internal/toposort"
 	"gopkg.in/src-d/go-git.v4/plumbing/storer"
+	"gopkg.in/src-d/hercules.v5/internal/pb"
+	"gopkg.in/src-d/hercules.v5/internal/toposort"
 )
 
 // ConfigurationOptionType represents the possible types of a ConfigurationOption's value.

+ 2 - 2
internal/core/pipeline_test.go

@@ -13,8 +13,8 @@ import (
 	"gopkg.in/src-d/go-git.v4"
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
-	"gopkg.in/src-d/hercules.v4/internal/pb"
-	"gopkg.in/src-d/hercules.v4/internal/test"
+	"gopkg.in/src-d/hercules.v5/internal/pb"
+	"gopkg.in/src-d/hercules.v5/internal/test"
 )
 
 type testPipelineItem struct {

+ 2 - 2
internal/core/registry.go

@@ -42,7 +42,7 @@ func (registry *PipelineItemRegistry) Summon(providesOrName string) []PipelineIt
 		return []PipelineItem{}
 	}
 	ts := registry.provided[providesOrName]
-	items := []PipelineItem{}
+	var items []PipelineItem
 	for _, t := range ts {
 		items = append(items, reflect.New(t.Elem()).Interface().(PipelineItem))
 	}
@@ -209,7 +209,7 @@ func (registry *PipelineItemRegistry) AddFlags(flagSet *pflag.FlagSet) (
 			"Useful for --dump-dag.")
 		flags[ConfigPipelineDryRun] = iface
 	}
-	features := []string{}
+	var features []string
 	for f := range registry.featureFlags.Choices {
 		features = append(features, f)
 	}

+ 1 - 1
internal/core/registry_test.go

@@ -7,7 +7,7 @@ import (
 	"github.com/spf13/cobra"
 	"github.com/stretchr/testify/assert"
 	"gopkg.in/src-d/go-git.v4"
-	"gopkg.in/src-d/hercules.v4/internal/test"
+	"gopkg.in/src-d/hercules.v5/internal/test"
 )
 
 func getRegistry() *PipelineItemRegistry {

+ 4 - 4
internal/global_test.go

@@ -7,10 +7,10 @@ import (
 	"testing"
 
 	"github.com/stretchr/testify/assert"
-	"gopkg.in/src-d/hercules.v4/internal/core"
-	uast_items "gopkg.in/src-d/hercules.v4/internal/plumbing/uast"
-	"gopkg.in/src-d/hercules.v4/internal/test"
-	"gopkg.in/src-d/hercules.v4/leaves"
+	"gopkg.in/src-d/hercules.v5/internal/core"
+	uast_items "gopkg.in/src-d/hercules.v5/internal/plumbing/uast"
+	"gopkg.in/src-d/hercules.v5/internal/test"
+	"gopkg.in/src-d/hercules.v5/leaves"
 )
 
 func TestPipelineSerialize(t *testing.T) {

+ 118 - 23
internal/plumbing/blob_cache.go

@@ -1,17 +1,79 @@
 package plumbing
 
 import (
+	"bytes"
+	"fmt"
+	"io"
+	"io/ioutil"
 	"log"
 
+	"github.com/pkg/errors"
 	"gopkg.in/src-d/go-git.v4"
 	"gopkg.in/src-d/go-git.v4/config"
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
 	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
-	"gopkg.in/src-d/hercules.v4/internal"
-	"gopkg.in/src-d/hercules.v4/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal"
+	"gopkg.in/src-d/hercules.v5/internal/core"
 )
 
+// ErrorBinary is raised in CachedBlob.CountLines() if the file is binary.
+var ErrorBinary = errors.New("binary")
+
+// CachedBlob allows to explicitly cache the binary data associated with the Blob object.
+type CachedBlob struct {
+	object.Blob
+	// Data is the read contents of the blob object.
+	Data []byte
+}
+
+// Reader returns a reader allow the access to the content of the blob
+func (b *CachedBlob) Reader() (io.ReadCloser, error) {
+	return ioutil.NopCloser(bytes.NewReader(b.Data)), nil
+}
+
+// Cache reads the underlying blob object and sets CachedBlob.Data.
+func (b *CachedBlob) Cache() error {
+	reader, err := b.Blob.Reader()
+	if err != nil {
+		return err
+	}
+	defer reader.Close()
+	buf := new(bytes.Buffer)
+	buf.Grow(int(b.Size))
+	size, err := buf.ReadFrom(reader)
+	if err != nil {
+		return err
+	}
+	if size != b.Size {
+		return fmt.Errorf("incomplete read of %s: %d while the declared size is %d",
+			b.Hash.String(), size, b.Size)
+	}
+	b.Data = buf.Bytes()
+	return nil
+}
+
+// CountLines returns the number of lines in the blob or (0, ErrorBinary) if it is binary.
+func (b *CachedBlob) CountLines() (int, error) {
+	if len(b.Data) == 0 {
+		return 0, nil
+	}
+	// 8000 was taken from go-git's utils/binary.IsBinary()
+	sniffLen := 8000
+	sniff := b.Data
+	if len(sniff) > sniffLen {
+		sniff = sniff[:sniffLen]
+	}
+	if bytes.IndexByte(sniff, 0) >= 0 {
+		return 0, ErrorBinary
+	}
+	lines := bytes.Count(b.Data, []byte{'\n'})
+	if b.Data[len(b.Data)-1] != '\n' {
+		lines++
+	}
+	return lines, nil
+}
+
 // BlobCache loads the blobs which correspond to the changed files in a commit.
 // It is a PipelineItem.
 // It must provide the old and the new objects; "blobCache" rotates and allows to not load
@@ -24,7 +86,7 @@ type BlobCache struct {
 	FailOnMissingSubmodules bool
 
 	repository *git.Repository
-	cache      map[plumbing.Hash]*object.Blob
+	cache      map[plumbing.Hash]*CachedBlob
 }
 
 const (
@@ -80,7 +142,7 @@ func (blobCache *BlobCache) Configure(facts map[string]interface{}) {
 // calls. The repository which is going to be analysed is supplied as an argument.
 func (blobCache *BlobCache) Initialize(repository *git.Repository) {
 	blobCache.repository = repository
-	blobCache.cache = map[plumbing.Hash]*object.Blob{}
+	blobCache.cache = map[plumbing.Hash]*CachedBlob{}
 }
 
 // Consume runs this PipelineItem on the next commit data.
@@ -92,8 +154,8 @@ func (blobCache *BlobCache) Initialize(repository *git.Repository) {
 func (blobCache *BlobCache) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
 	commit := deps[core.DependencyCommit].(*object.Commit)
 	changes := deps[DependencyTreeChanges].(object.Changes)
-	cache := map[plumbing.Hash]*object.Blob{}
-	newCache := map[plumbing.Hash]*object.Blob{}
+	cache := map[plumbing.Hash]*CachedBlob{}
+	newCache := map[plumbing.Hash]*CachedBlob{}
 	for _, change := range changes {
 		action, err := change.Action()
 		if err != nil {
@@ -104,45 +166,77 @@ func (blobCache *BlobCache) Consume(deps map[string]interface{}) (map[string]int
 		var blob *object.Blob
 		switch action {
 		case merkletrie.Insert:
+			cache[change.To.TreeEntry.Hash] = &CachedBlob{}
+			newCache[change.To.TreeEntry.Hash] = &CachedBlob{}
 			blob, err = blobCache.getBlob(&change.To, commit.File)
 			if err != nil {
-				log.Printf("file to %s %s\n",
-					change.To.Name, change.To.TreeEntry.Hash)
+				log.Printf("file to %s %s: %v\n", change.To.Name, change.To.TreeEntry.Hash, err)
 			} else {
-				cache[change.To.TreeEntry.Hash] = blob
-				newCache[change.To.TreeEntry.Hash] = blob
+				cb := &CachedBlob{Blob: *blob}
+				err = cb.Cache()
+				if err == nil {
+					cache[change.To.TreeEntry.Hash] = cb
+					newCache[change.To.TreeEntry.Hash] = cb
+				} else {
+					log.Printf("file to %s %s: %v\n", change.To.Name, change.To.TreeEntry.Hash, err)
+				}
 			}
 		case merkletrie.Delete:
 			cache[change.From.TreeEntry.Hash], exists =
 				blobCache.cache[change.From.TreeEntry.Hash]
 			if !exists {
-				cache[change.From.TreeEntry.Hash], err =
-					blobCache.getBlob(&change.From, commit.File)
+				cache[change.From.TreeEntry.Hash] = &CachedBlob{}
+				blob, err = blobCache.getBlob(&change.From, commit.File)
 				if err != nil {
 					if err.Error() != plumbing.ErrObjectNotFound.Error() {
-						log.Printf("file from %s %s\n", change.From.Name,
-							change.From.TreeEntry.Hash)
+						log.Printf("file from %s %s: %v\n", change.From.Name,
+							change.From.TreeEntry.Hash, err)
 					} else {
-						cache[change.From.TreeEntry.Hash], err =
-							internal.CreateDummyBlob(change.From.TreeEntry.Hash)
+						blob, err = internal.CreateDummyBlob(change.From.TreeEntry.Hash)
+						cache[change.From.TreeEntry.Hash] = &CachedBlob{Blob: *blob}
+					}
+				} else {
+					cb := &CachedBlob{Blob: *blob}
+					err = cb.Cache()
+					if err == nil {
+						cache[change.From.TreeEntry.Hash] = cb
+					} else {
+						log.Printf("file from %s %s: %v\n", change.From.Name,
+							change.From.TreeEntry.Hash, err)
 					}
 				}
 			}
 		case merkletrie.Modify:
 			blob, err = blobCache.getBlob(&change.To, commit.File)
+			cache[change.To.TreeEntry.Hash] = &CachedBlob{}
+			newCache[change.To.TreeEntry.Hash] = &CachedBlob{}
 			if err != nil {
-				log.Printf("file to %s\n", change.To.Name)
+				log.Printf("file to %s: %v\n", change.To.Name, err)
 			} else {
-				cache[change.To.TreeEntry.Hash] = blob
-				newCache[change.To.TreeEntry.Hash] = blob
+				cb := &CachedBlob{Blob: *blob}
+				err = cb.Cache()
+				if err == nil {
+					cache[change.To.TreeEntry.Hash] = cb
+					newCache[change.To.TreeEntry.Hash] = cb
+				} else {
+					log.Printf("file to %s: %v\n", change.To.Name, err)
+				}
 			}
 			cache[change.From.TreeEntry.Hash], exists =
 				blobCache.cache[change.From.TreeEntry.Hash]
 			if !exists {
-				cache[change.From.TreeEntry.Hash], err =
-					blobCache.getBlob(&change.From, commit.File)
+				cache[change.From.TreeEntry.Hash] = &CachedBlob{}
+				blob, err = blobCache.getBlob(&change.From, commit.File)
 				if err != nil {
-					log.Printf("file from %s\n", change.From.Name)
+					log.Printf("file from %s: %v\n", change.From.Name, err)
+				} else {
+					cb := &CachedBlob{Blob: *blob}
+					err = cb.Cache()
+					if err == nil {
+						cache[change.From.TreeEntry.Hash] = cb
+					} else {
+						log.Printf("file from %s: %v\n", change.From.Name, err)
+					}
 				}
 			}
 		}
@@ -158,7 +252,7 @@ func (blobCache *BlobCache) Consume(deps map[string]interface{}) (map[string]int
 func (blobCache *BlobCache) Fork(n int) []core.PipelineItem {
 	caches := make([]core.PipelineItem, n)
 	for i := 0; i < n; i++ {
-		cache := map[plumbing.Hash]*object.Blob{}
+		cache := map[plumbing.Hash]*CachedBlob{}
 		for k, v := range blobCache.cache {
 			cache[k] = v
 		}
@@ -180,6 +274,7 @@ type FileGetter func(path string) (*object.File, error)
 func (blobCache *BlobCache) getBlob(entry *object.ChangeEntry, fileGetter FileGetter) (
 	*object.Blob, error) {
 	blob, err := blobCache.repository.BlobObject(entry.TreeEntry.Hash)
+
 	if err != nil {
 		if err.Error() != plumbing.ErrObjectNotFound.Error() {
 			log.Printf("getBlob(%s)\n", entry.TreeEntry.Hash.String())

+ 16 - 6
internal/plumbing/blob_cache_test.go

@@ -6,9 +6,9 @@ import (
 	"github.com/stretchr/testify/assert"
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
-	"gopkg.in/src-d/hercules.v4/internal"
-	"gopkg.in/src-d/hercules.v4/internal/core"
-	"gopkg.in/src-d/hercules.v4/internal/test"
+	"gopkg.in/src-d/hercules.v5/internal"
+	"gopkg.in/src-d/hercules.v5/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal/test"
 )
 
 func fixtureBlobCache() *BlobCache {
@@ -17,6 +17,16 @@ func fixtureBlobCache() *BlobCache {
 	return cache
 }
 
+func AddHash(t *testing.T, cache map[plumbing.Hash]*CachedBlob, hash string) {
+	objhash := plumbing.NewHash(hash)
+	blob, err := test.Repository.BlobObject(objhash)
+	assert.Nil(t, err)
+	cb := &CachedBlob{Blob: *blob}
+	err = cb.Cache()
+	assert.Nil(t, err)
+	cache[objhash] = cb
+}
+
 func TestBlobCacheConfigureInitialize(t *testing.T) {
 	cache := fixtureBlobCache()
 	assert.Equal(t, test.Repository, cache.repository)
@@ -85,7 +95,7 @@ func TestBlobCacheConsumeModification(t *testing.T) {
 	assert.Equal(t, len(result), 1)
 	cacheIface, exists := result[DependencyBlobCache]
 	assert.True(t, exists)
-	cache := cacheIface.(map[plumbing.Hash]*object.Blob)
+	cache := cacheIface.(map[plumbing.Hash]*CachedBlob)
 	assert.Equal(t, len(cache), 2)
 	blobFrom, exists := cache[plumbing.NewHash("1cacfc1bf0f048eb2f31973750983ae5d8de647a")]
 	assert.True(t, exists)
@@ -131,7 +141,7 @@ func TestBlobCacheConsumeInsertionDeletion(t *testing.T) {
 	assert.Equal(t, len(result), 1)
 	cacheIface, exists := result[DependencyBlobCache]
 	assert.True(t, exists)
-	cache := cacheIface.(map[plumbing.Hash]*object.Blob)
+	cache := cacheIface.(map[plumbing.Hash]*CachedBlob)
 	assert.Equal(t, len(cache), 2)
 	blobFrom, exists := cache[plumbing.NewHash("baa64828831d174f40140e4b3cfa77d1e917a2c1")]
 	assert.True(t, exists)
@@ -301,7 +311,7 @@ func TestBlobCacheDeleteInvalidBlob(t *testing.T) {
 	assert.Equal(t, len(result), 1)
 	cacheIface, exists := result[DependencyBlobCache]
 	assert.True(t, exists)
-	cache := cacheIface.(map[plumbing.Hash]*object.Blob)
+	cache := cacheIface.(map[plumbing.Hash]*CachedBlob)
 	assert.Equal(t, len(cache), 1)
 	blobFrom, exists := cache[plumbing.NewHash("ffffffffffffffffffffffffffffffffffffffff")]
 	assert.True(t, exists)

+ 1 - 1
internal/plumbing/day.go

@@ -6,7 +6,7 @@ import (
 	"gopkg.in/src-d/go-git.v4"
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
-	"gopkg.in/src-d/hercules.v4/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal/core"
 )
 
 // DaysSinceStart provides the relative date information for every commit.

+ 2 - 2
internal/plumbing/day_test.go

@@ -5,8 +5,8 @@ import (
 
 	"github.com/stretchr/testify/assert"
 	"gopkg.in/src-d/go-git.v4/plumbing"
-	"gopkg.in/src-d/hercules.v4/internal/core"
-	"gopkg.in/src-d/hercules.v4/internal/test"
+	"gopkg.in/src-d/hercules.v5/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal/test"
 )
 
 func fixtureDaysSinceStart() *DaysSinceStart {

+ 3 - 80
internal/plumbing/diff.go

@@ -1,18 +1,12 @@
 package plumbing
 
 import (
-	"bufio"
-	"bytes"
-	"errors"
-	"io"
-	"unicode/utf8"
-
 	"github.com/sergi/go-diff/diffmatchpatch"
 	"gopkg.in/src-d/go-git.v4"
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
 	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
-	"gopkg.in/src-d/hercules.v4/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal/core"
 )
 
 // FileDiff calculates the difference of files which were modified.
@@ -90,7 +84,7 @@ func (diff *FileDiff) Initialize(repository *git.Repository) {}
 // in Provides(). If there was an error, nil is returned.
 func (diff *FileDiff) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
 	result := map[string]FileDiffData{}
-	cache := deps[DependencyBlobCache].(map[plumbing.Hash]*object.Blob)
+	cache := deps[DependencyBlobCache].(map[plumbing.Hash]*CachedBlob)
 	treeDiff := deps[DependencyTreeChanges].(object.Changes)
 	for _, change := range treeDiff {
 		action, err := change.Action()
@@ -103,14 +97,7 @@ func (diff *FileDiff) Consume(deps map[string]interface{}) (map[string]interface
 			blobTo := cache[change.To.TreeEntry.Hash]
 			// we are not validating UTF-8 here because for example
 			// git/git 4f7770c87ce3c302e1639a7737a6d2531fe4b160 fetch-pack.c is invalid UTF-8
-			strFrom, err := BlobToString(blobFrom)
-			if err != nil {
-				return nil, err
-			}
-			strTo, err := BlobToString(blobTo)
-			if err != nil {
-				return nil, err
-			}
+			strFrom, strTo := string(blobFrom.Data), string(blobTo.Data)
 			dmp := diffmatchpatch.New()
 			src, dst, _ := dmp.DiffLinesToRunes(strFrom, strTo)
 			diffs := dmp.DiffMainRunes(src, dst, false)
@@ -134,70 +121,6 @@ func (diff *FileDiff) Fork(n int) []core.PipelineItem {
 	return core.ForkSamePipelineItem(diff, n)
 }
 
-// CountLines returns the number of lines in a *object.Blob.
-func CountLines(file *object.Blob) (int, error) {
-	if file == nil {
-		return -1, errors.New("blob is nil: probably not cached")
-	}
-	reader, err := file.Reader()
-	if err != nil {
-		return -1, err
-	}
-	defer checkClose(reader)
-	var scanner *bufio.Scanner
-	buffer := make([]byte, bufio.MaxScanTokenSize)
-	counter := 0
-	utf8Errors := 0
-	for scanner == nil || scanner.Err() == bufio.ErrTooLong {
-		if scanner != nil {
-			chunk := scanner.Bytes()
-			if !utf8.Valid(chunk) {
-				utf8Errors++
-			}
-			if bytes.IndexByte(chunk, 0) >= 0 {
-				return -1, errors.New("binary")
-			}
-		}
-		scanner = bufio.NewScanner(reader)
-		scanner.Buffer(buffer, 0)
-		for scanner.Scan() {
-			chunk := scanner.Bytes()
-			if !utf8.Valid(chunk) {
-				utf8Errors++
-			}
-			if bytes.IndexByte(chunk, 0) >= 0 {
-				return -1, errors.New("binary")
-			}
-			counter++
-		}
-	}
-	if float32(utf8Errors) / float32(counter) >= 0.01 {
-		return -1, errors.New("binary")
-	}
-	return counter, nil
-}
-
-// BlobToString reads *object.Blob and returns its contents as a string.
-func BlobToString(file *object.Blob) (string, error) {
-	if file == nil {
-		return "", errors.New("blob is nil: probably not cached")
-	}
-	reader, err := file.Reader()
-	if err != nil {
-		return "", err
-	}
-	defer checkClose(reader)
-	buf := new(bytes.Buffer)
-	buf.ReadFrom(reader)
-	return buf.String(), nil
-}
-
-func checkClose(c io.Closer) {
-	if err := c.Close(); err != nil {
-		panic(err)
-	}
-}
-
 func init() {
 	core.Registry.Register(&FileDiff{})
 }

+ 45 - 47
internal/plumbing/diff_test.go

@@ -8,11 +8,12 @@ import (
 	"github.com/stretchr/testify/assert"
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
-	"gopkg.in/src-d/hercules.v4/internal"
-	"gopkg.in/src-d/hercules.v4/internal/core"
-	items "gopkg.in/src-d/hercules.v4/internal/plumbing"
-	"gopkg.in/src-d/hercules.v4/internal/test"
-	"gopkg.in/src-d/hercules.v4/internal/test/fixtures"
+	"gopkg.in/src-d/hercules.v5"
+	"gopkg.in/src-d/hercules.v5/internal"
+	"gopkg.in/src-d/hercules.v5/internal/core"
+	items "gopkg.in/src-d/hercules.v5/internal/plumbing"
+	"gopkg.in/src-d/hercules.v5/internal/test"
+	"gopkg.in/src-d/hercules.v5/internal/test/fixtures"
 )
 
 func TestFileDiffMeta(t *testing.T) {
@@ -47,13 +48,10 @@ func TestFileDiffRegistration(t *testing.T) {
 func TestFileDiffConsume(t *testing.T) {
 	fd := fixtures.FileDiff()
 	deps := map[string]interface{}{}
-	cache := map[plumbing.Hash]*object.Blob{}
-	hash := plumbing.NewHash("291286b4ac41952cbd1389fda66420ec03c1a9fe")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("334cde09da4afcb74f8d2b3e6fd6cce61228b485")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("dc248ba2b22048cc730c571a748e8ffcf7085ab9")
-	cache[hash], _ = test.Repository.BlobObject(hash)
+	cache := map[plumbing.Hash]*items.CachedBlob{}
+	items.AddHash(t, cache, "291286b4ac41952cbd1389fda66420ec03c1a9fe")
+	items.AddHash(t, cache, "334cde09da4afcb74f8d2b3e6fd6cce61228b485")
+	items.AddHash(t, cache, "dc248ba2b22048cc730c571a748e8ffcf7085ab9")
 	deps[items.DependencyBlobCache] = cache
 	changes := make(object.Changes, 3)
 	treeFrom, _ := test.Repository.TreeObject(plumbing.NewHash(
@@ -124,13 +122,11 @@ func TestFileDiffConsume(t *testing.T) {
 func TestFileDiffConsumeInvalidBlob(t *testing.T) {
 	fd := fixtures.FileDiff()
 	deps := map[string]interface{}{}
-	cache := map[plumbing.Hash]*object.Blob{}
-	hash := plumbing.NewHash("291286b4ac41952cbd1389fda66420ec03c1a9fe")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("334cde09da4afcb74f8d2b3e6fd6cce61228b485")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("dc248ba2b22048cc730c571a748e8ffcf7085ab9")
-	cache[hash], _ = test.Repository.BlobObject(hash)
+	cache := map[plumbing.Hash]*items.CachedBlob{}
+	items.AddHash(t, cache, "291286b4ac41952cbd1389fda66420ec03c1a9fe")
+	items.AddHash(t, cache, "334cde09da4afcb74f8d2b3e6fd6cce61228b485")
+	items.AddHash(t, cache, "dc248ba2b22048cc730c571a748e8ffcf7085ab9")
+	cache[plumbing.NewHash("ffffffffffffffffffffffffffffffffffffffff")] = &items.CachedBlob{}
 	deps[items.DependencyBlobCache] = cache
 	changes := make(object.Changes, 1)
 	treeFrom, _ := test.Repository.TreeObject(plumbing.NewHash(
@@ -156,8 +152,8 @@ func TestFileDiffConsumeInvalidBlob(t *testing.T) {
 	}}
 	deps[items.DependencyTreeChanges] = changes
 	res, err := fd.Consume(deps)
-	assert.Nil(t, res)
-	assert.NotNil(t, err)
+	assert.Len(t, res[hercules.DependencyFileDiff].(map[string]items.FileDiffData), 1)
+	assert.Nil(t, err)
 	changes[0] = &object.Change{From: object.ChangeEntry{
 		Name: "analyser.go",
 		Tree: treeFrom,
@@ -176,38 +172,44 @@ func TestFileDiffConsumeInvalidBlob(t *testing.T) {
 		},
 	}}
 	res, err = fd.Consume(deps)
-	assert.Nil(t, res)
-	assert.NotNil(t, err)
+	assert.Len(t, res[hercules.DependencyFileDiff].(map[string]items.FileDiffData), 1)
+	assert.Nil(t, err)
 }
 
 func TestCountLines(t *testing.T) {
-	blob, _ := test.Repository.BlobObject(
+	blob, err := test.Repository.BlobObject(
 		plumbing.NewHash("291286b4ac41952cbd1389fda66420ec03c1a9fe"))
-	lines, err := items.CountLines(blob)
+	assert.Nil(t, err)
+	cb := &items.CachedBlob{Blob: *blob}
+	cb.Cache()
+	lines, err := cb.CountLines()
 	assert.Equal(t, lines, 12)
 	assert.Nil(t, err)
-	lines, err = items.CountLines(nil)
-	assert.Equal(t, lines, -1)
-	assert.NotNil(t, err)
-	blob, _ = internal.CreateDummyBlob(plumbing.NewHash("291286b4ac41952cbd1389fda66420ec03c1a9fe"), true)
-	lines, err = items.CountLines(blob)
-	assert.Equal(t, lines, -1)
-	assert.NotNil(t, err)
+	blob, err = internal.CreateDummyBlob(
+		plumbing.NewHash("291286b4ac41952cbd1389fda66420ec03c1a9fe"), true)
+	assert.Nil(t, err)
+	cb = &items.CachedBlob{Blob: *blob}
+	err = cb.Cache()
+	assert.Equal(t, err.Error(), "dummy failure")
 	// test_data/blob
 	blob, err = test.Repository.BlobObject(
 		plumbing.NewHash("c86626638e0bc8cf47ca49bb1525b40e9737ee64"))
 	assert.Nil(t, err)
-	lines, err = items.CountLines(blob)
-	assert.Equal(t, lines, -1)
+	cb = &items.CachedBlob{Blob: *blob}
+	cb.Cache()
+	lines, err = cb.CountLines()
+	assert.Equal(t, lines, 0)
 	assert.NotNil(t, err)
-	assert.EqualError(t, err, "binary")
+	assert.Equal(t, err.Error(), items.ErrorBinary.Error())
 }
 
 func TestBlobToString(t *testing.T) {
 	blob, _ := test.Repository.BlobObject(
 		plumbing.NewHash("291286b4ac41952cbd1389fda66420ec03c1a9fe"))
-	str, err := items.BlobToString(blob)
+	cb := &items.CachedBlob{Blob: *blob}
+	err := cb.Cache()
 	assert.Nil(t, err)
+	str := string(cb.Data)
 	assert.Equal(t, str, `language: go
 
 go:
@@ -221,23 +223,19 @@ script:
 notifications:
   email: false
 `)
-	str, err = items.BlobToString(nil)
-	assert.Equal(t, str, "")
-	assert.NotNil(t, err)
-	blob, _ = internal.CreateDummyBlob(plumbing.NewHash("291286b4ac41952cbd1389fda66420ec03c1a9fe"), true)
-	str, err = items.BlobToString(blob)
-	assert.Equal(t, str, "")
+	blob, _ = internal.CreateDummyBlob(
+		plumbing.NewHash("291286b4ac41952cbd1389fda66420ec03c1a9fe"), true)
+	cb = &items.CachedBlob{Blob: *blob}
+	err = cb.Cache()
 	assert.NotNil(t, err)
 }
 
 func TestFileDiffDarkMagic(t *testing.T) {
 	fd := fixtures.FileDiff()
 	deps := map[string]interface{}{}
-	cache := map[plumbing.Hash]*object.Blob{}
-	hash := plumbing.NewHash("448eb3f312849b0ca766063d06b09481c987b309")
-	cache[hash], _ = test.Repository.BlobObject(hash) // 1.java
-	hash = plumbing.NewHash("3312c92f3e8bdfbbdb30bccb6acd1b85bc338dfc")
-	cache[hash], _ = test.Repository.BlobObject(hash) // 2.java
+	cache := map[plumbing.Hash]*items.CachedBlob{}
+	items.AddHash(t, cache, "448eb3f312849b0ca766063d06b09481c987b309") // 1.java
+	items.AddHash(t, cache, "3312c92f3e8bdfbbdb30bccb6acd1b85bc338dfc") // 2.java
 	deps[items.DependencyBlobCache] = cache
 	changes := make(object.Changes, 1)
 	treeFrom, _ := test.Repository.TreeObject(plumbing.NewHash(

+ 1 - 1
internal/plumbing/identity/identity.go

@@ -8,7 +8,7 @@ import (
 
 	"gopkg.in/src-d/go-git.v4"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
-	"gopkg.in/src-d/hercules.v4/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal/core"
 	)
 
 // Detector determines the author of a commit. Same person can commit under different

+ 2 - 2
internal/plumbing/identity/identity_test.go

@@ -14,8 +14,8 @@ import (
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
 	"gopkg.in/src-d/go-git.v4/plumbing/storer"
-	"gopkg.in/src-d/hercules.v4/internal/core"
-	"gopkg.in/src-d/hercules.v4/internal/test"
+	"gopkg.in/src-d/hercules.v5/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal/test"
 )
 
 func fixtureIdentityDetector() *Detector {

+ 52 - 22
internal/plumbing/renames.go

@@ -10,8 +10,8 @@ import (
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
 	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
-	"gopkg.in/src-d/hercules.v4/internal"
-	"gopkg.in/src-d/hercules.v4/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal"
+	"gopkg.in/src-d/hercules.v5/internal/core"
 )
 
 // RenameAnalysis improves TreeDiff's results by searching for changed blobs under different
@@ -21,7 +21,7 @@ type RenameAnalysis struct {
 	core.NoopMerger
 	// SimilarityThreshold adjusts the heuristic to determine file renames.
 	// It has the same units as cgit's -X rename-threshold or -M. Better to
-	// set it to the default value of 90 (90%).
+	// set it to the default value of 50 (50%).
 	SimilarityThreshold int
 
 	repository *git.Repository
@@ -30,7 +30,8 @@ type RenameAnalysis struct {
 const (
 	// RenameAnalysisDefaultThreshold specifies the default percentage of common lines in a pair
 	// of files to consider them linked. The exact code of the decision is sizesAreClose().
-	RenameAnalysisDefaultThreshold = 90
+	// This defaults to CGit's 50%.
+	RenameAnalysisDefaultThreshold = 50
 
 	// ConfigRenameAnalysisSimilarityThreshold is the name of the configuration option
 	// (RenameAnalysis.Configure()) which sets the similarity threshold.
@@ -95,7 +96,7 @@ func (ra *RenameAnalysis) Initialize(repository *git.Repository) {
 // in Provides(). If there was an error, nil is returned.
 func (ra *RenameAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
 	changes := deps[DependencyTreeChanges].(object.Changes)
-	cache := deps[DependencyBlobCache].(map[plumbing.Hash]*object.Blob)
+	cache := deps[DependencyBlobCache].(map[plumbing.Hash]*CachedBlob)
 
 	reducedChanges := make(object.Changes, 0, changes.Len())
 
@@ -210,30 +211,59 @@ func (ra *RenameAnalysis) Fork(n int) []core.PipelineItem {
 }
 
 func (ra *RenameAnalysis) sizesAreClose(size1 int64, size2 int64) bool {
-	return internal.Abs64(size1-size2)*100/internal.Max64(1, internal.Min64(size1, size2)) <=
-		int64(100-ra.SimilarityThreshold)
+	size := internal.Max64(1, internal.Max64(size1, size2))
+	return (internal.Abs64(size1-size2)*100)/size <= int64(100-ra.SimilarityThreshold)
 }
 
 func (ra *RenameAnalysis) blobsAreClose(
-	blob1 *object.Blob, blob2 *object.Blob) (bool, error) {
-	strFrom, err := BlobToString(blob1)
-	if err != nil {
-		return false, err
-	}
-	strTo, err := BlobToString(blob2)
-	if err != nil {
-		return false, err
-	}
+	blob1 *CachedBlob, blob2 *CachedBlob) (bool, error) {
+	src, dst := string(blob1.Data), string(blob2.Data)
+
+	// compute the line-by-line diff, then the char-level diffs of the del-ins blocks
+	// yes, this algorithm is greedy and not exact
 	dmp := diffmatchpatch.New()
-	src, dst, _ := dmp.DiffLinesToRunes(strFrom, strTo)
-	diffs := dmp.DiffMainRunes(src, dst, false)
-	common := 0
+	srcLines, dstLines, lines := dmp.DiffLinesToRunes(src, dst)
+	diffs := dmp.DiffMainRunes(srcLines, dstLines, false)
+	var common, posSrc, prevPosSrc, posDst int
+	possibleDelInsBlock := false
 	for _, edit := range diffs {
-		if edit.Type == diffmatchpatch.DiffEqual {
-			common += utf8.RuneCountInString(edit.Text)
+		switch edit.Type {
+		case diffmatchpatch.DiffDelete:
+			possibleDelInsBlock = true
+			prevPosSrc = posSrc
+			for _, lineno := range edit.Text {
+				posSrc += len(lines[lineno])
+			}
+		case diffmatchpatch.DiffInsert:
+			nextPosDst := posDst
+			for _, lineno := range edit.Text {
+				nextPosDst += len(lines[lineno])
+			}
+			if possibleDelInsBlock {
+				possibleDelInsBlock = false
+				localDmp := diffmatchpatch.New()
+				localSrc := src[prevPosSrc:posSrc]
+				localDst := dst[posDst:nextPosDst]
+				localDiffs := localDmp.DiffMainRunes([]rune(localSrc), []rune(localDst), false)
+				for _, localEdit := range localDiffs {
+					if localEdit.Type == diffmatchpatch.DiffEqual {
+						common += utf8.RuneCountInString(localEdit.Text)
+					}
+				}
+			}
+			posDst = nextPosDst
+		case diffmatchpatch.DiffEqual:
+			possibleDelInsBlock = false
+			for _, lineno := range edit.Text {
+				common += utf8.RuneCountInString(lines[lineno])
+				step := len(lines[lineno])
+				posSrc += step
+				posDst += step
+			}
 		}
 	}
-	similarity := common * 100 / internal.Max(1, internal.Min(len(src), len(dst)))
+	size := internal.Max(1, internal.Max(utf8.RuneCountInString(src), utf8.RuneCountInString(dst)))
+	similarity := (common*100) / size
 	return similarity >= ra.SimilarityThreshold, nil
 }
 

+ 9 - 13
internal/plumbing/renames_test.go

@@ -6,8 +6,8 @@ import (
 	"github.com/stretchr/testify/assert"
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
-	"gopkg.in/src-d/hercules.v4/internal/core"
-	"gopkg.in/src-d/hercules.v4/internal/test"
+	"gopkg.in/src-d/hercules.v5/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal/test"
 )
 
 func fixtureRenameAnalysis() *RenameAnalysis {
@@ -109,24 +109,20 @@ func TestRenameAnalysisConsume(t *testing.T) {
 		},
 	},
 	}
-	cache := map[plumbing.Hash]*object.Blob{}
-	hash := plumbing.NewHash("baa64828831d174f40140e4b3cfa77d1e917a2c1")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("29c9fafd6a2fae8cd20298c3f60115bc31a4c0f2")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("c29112dbd697ad9b401333b80c18a63951bc18d9")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("f7d918ec500e2f925ecde79b51cc007bac27de72")
-	cache[hash], _ = test.Repository.BlobObject(hash)
+	cache := map[plumbing.Hash]*CachedBlob{}
+	AddHash(t, cache, "baa64828831d174f40140e4b3cfa77d1e917a2c1")
+	AddHash(t, cache, "29c9fafd6a2fae8cd20298c3f60115bc31a4c0f2")
+	AddHash(t, cache, "c29112dbd697ad9b401333b80c18a63951bc18d9")
+	AddHash(t, cache, "f7d918ec500e2f925ecde79b51cc007bac27de72")
 	deps := map[string]interface{}{}
 	deps[DependencyBlobCache] = cache
 	deps[DependencyTreeChanges] = changes
-	ra.SimilarityThreshold = 33
+	ra.SimilarityThreshold = 37
 	res, err := ra.Consume(deps)
 	assert.Nil(t, err)
 	renamed := res[DependencyTreeChanges].(object.Changes)
 	assert.Equal(t, len(renamed), 2)
-	ra.SimilarityThreshold = 35
+	ra.SimilarityThreshold = 38
 	res, err = ra.Consume(deps)
 	assert.Nil(t, err)
 	renamed = res[DependencyTreeChanges].(object.Changes)

+ 1 - 1
internal/plumbing/tree_diff.go

@@ -9,7 +9,7 @@ import (
 
 	"gopkg.in/src-d/go-git.v4"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
-	"gopkg.in/src-d/hercules.v4/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal/core"
 	"gopkg.in/src-d/go-git.v4/plumbing"
 )
 

+ 2 - 2
internal/plumbing/tree_diff_test.go

@@ -7,8 +7,8 @@ import (
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
 	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
-	"gopkg.in/src-d/hercules.v4/internal/core"
-	"gopkg.in/src-d/hercules.v4/internal/test"
+	"gopkg.in/src-d/hercules.v5/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal/test"
 )
 
 func fixtureTreeDiff() *TreeDiff {

+ 2 - 2
internal/plumbing/uast/changes_xpather_test.go

@@ -8,8 +8,8 @@ import (
 
 	"github.com/stretchr/testify/assert"
 	"gopkg.in/bblfsh/client-go.v2"
-	uast_test "gopkg.in/src-d/hercules.v4/internal/plumbing/uast/test"
-	"gopkg.in/src-d/hercules.v4/internal/test"
+	uast_test "gopkg.in/src-d/hercules.v5/internal/plumbing/uast/test"
+	"gopkg.in/src-d/hercules.v5/internal/test"
 )
 
 func TestChangesXPatherExtractChanged(t *testing.T) {

+ 2 - 2
internal/plumbing/uast/diff_refiner.go

@@ -6,8 +6,8 @@ import (
 	"github.com/sergi/go-diff/diffmatchpatch"
 	"gopkg.in/bblfsh/sdk.v1/uast"
 	"gopkg.in/src-d/go-git.v4"
-	"gopkg.in/src-d/hercules.v4/internal/core"
-	"gopkg.in/src-d/hercules.v4/internal/plumbing"
+	"gopkg.in/src-d/hercules.v5/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal/plumbing"
 )
 
 // FileDiffRefiner uses UASTs to improve the human interpretability of diffs.

+ 3 - 3
internal/plumbing/uast/diff_refiner_test.go

@@ -11,9 +11,9 @@ import (
 	"github.com/stretchr/testify/assert"
 	"gopkg.in/bblfsh/sdk.v1/uast"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
-	"gopkg.in/src-d/hercules.v4/internal/core"
-	"gopkg.in/src-d/hercules.v4/internal/plumbing"
-	"gopkg.in/src-d/hercules.v4/internal/test"
+	"gopkg.in/src-d/hercules.v5/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal/plumbing"
+	"gopkg.in/src-d/hercules.v5/internal/test"
 )
 
 func fixtureFileDiffRefiner() *FileDiffRefiner {

+ 1 - 1
internal/plumbing/uast/test/utils.go

@@ -7,7 +7,7 @@ import (
 	bblfsh "gopkg.in/bblfsh/client-go.v2"
 	"gopkg.in/bblfsh/sdk.v1/uast"
 	"gopkg.in/src-d/go-git.v4/plumbing"
-	core_test "gopkg.in/src-d/hercules.v4/internal/test"
+	core_test "gopkg.in/src-d/hercules.v5/internal/test"
 )
 
 // ParseBlobFromTestRepo extracts the UAST from the file by it's hash and name.

+ 22 - 39
internal/plumbing/uast/uast.go

@@ -1,12 +1,11 @@
 package uast
 
 import (
-	"bytes"
 	"context"
 	"errors"
 	"fmt"
 	"io"
-	goioutil "io/ioutil"
+	"io/ioutil"
 	"os"
 	"path"
 	"runtime"
@@ -22,11 +21,10 @@ import (
 	"gopkg.in/src-d/go-git.v4"
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
-	"gopkg.in/src-d/go-git.v4/utils/ioutil"
 	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
-	"gopkg.in/src-d/hercules.v4/internal/core"
-	"gopkg.in/src-d/hercules.v4/internal/pb"
-	items "gopkg.in/src-d/hercules.v4/internal/plumbing"
+	"gopkg.in/src-d/hercules.v5/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal/pb"
+	items "gopkg.in/src-d/hercules.v5/internal/plumbing"
 )
 
 // Extractor retrieves UASTs from Babelfish server which correspond to changed files in a commit.
@@ -44,8 +42,6 @@ type Extractor struct {
 }
 
 const (
-	uastExtractionSkipped = -(1 << 31)
-
 	// ConfigUASTEndpoint is the name of the configuration option (Extractor.Configure())
 	// which sets the Babelfish server address.
 	ConfigUASTEndpoint = "ConfigUASTEndpoint"
@@ -67,7 +63,9 @@ const (
 type uastTask struct {
 	Lock   *sync.RWMutex
 	Dest   map[plumbing.Hash]*uast.Node
-	File   *object.File
+	Name   string
+	Hash   plumbing.Hash
+	Data   []byte
 	Errors *[]error
 }
 
@@ -200,27 +198,14 @@ func (exr *Extractor) Initialize(repository *git.Repository) {
 // This function returns the mapping with analysis results. The keys must be the same as
 // in Provides(). If there was an error, nil is returned.
 func (exr *Extractor) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
-	cache := deps[items.DependencyBlobCache].(map[plumbing.Hash]*object.Blob)
+	cache := deps[items.DependencyBlobCache].(map[plumbing.Hash]*items.CachedBlob)
 	treeDiffs := deps[items.DependencyTreeChanges].(object.Changes)
 	uasts := map[plumbing.Hash]*uast.Node{}
 	lock := sync.RWMutex{}
 	errs := make([]error, 0)
 	wg := sync.WaitGroup{}
 	submit := func(change *object.Change) {
-		{
-			reader, err := cache[change.To.TreeEntry.Hash].Reader()
-			if err != nil {
-				errs = append(errs, err)
-				return
-			}
-			defer ioutil.CheckClose(reader, &err)
-			buf := new(bytes.Buffer)
-			if _, err := buf.ReadFrom(reader); err != nil {
-				errs = append(errs, err)
-				return
-			}
-			exr.ProcessedFiles[change.To.Name]++
-		}
+		exr.ProcessedFiles[change.To.Name]++
 		wg.Add(1)
 		go func(task interface{}) {
 			exr.pool.Process(task)
@@ -228,7 +213,9 @@ func (exr *Extractor) Consume(deps map[string]interface{}) (map[string]interface
 		}(uastTask{
 			Lock:   &lock,
 			Dest:   uasts,
-			File:   &object.File{Name: change.To.Name, Blob: *cache[change.To.TreeEntry.Hash]},
+			Name:   change.To.Name,
+			Hash:   change.To.TreeEntry.Hash,
+			Data:   cache[change.To.TreeEntry.Hash].Data,
 			Errors: &errs,
 		})
 	}
@@ -267,14 +254,10 @@ func (exr *Extractor) Fork(n int) []core.PipelineItem {
 }
 
 func (exr *Extractor) extractUAST(
-	client *bblfsh.Client, file *object.File) (*uast.Node, error) {
+	client *bblfsh.Client, name string, data []byte) (*uast.Node, error) {
 	request := client.NewParseRequest()
-	contents, err := file.Contents()
-	if err != nil {
-		return nil, err
-	}
-	request.Content(contents)
-	request.Filename(file.Name)
+	request.Content(string(data))
+	request.Filename(name)
 	ctx, cancel := exr.Context()
 	if cancel != nil {
 		defer cancel()
@@ -297,16 +280,16 @@ func (exr *Extractor) extractUAST(
 
 func (exr *Extractor) extractTask(client *bblfsh.Client, data interface{}) interface{} {
 	task := data.(uastTask)
-	node, err := exr.extractUAST(client, task.File)
+	node, err := exr.extractUAST(client, task.Name, task.Data)
 	task.Lock.Lock()
 	defer task.Lock.Unlock()
 	if err != nil {
 		*task.Errors = append(*task.Errors,
-			fmt.Errorf("\nfile %s, blob %s: %v", task.File.Name, task.File.Hash.String(), err))
+			fmt.Errorf("\nfile %s, blob %s: %v", task.Name, task.Hash.String(), err))
 		return nil
 	}
 	if node != nil {
-		task.Dest[task.File.Hash] = node
+		task.Dest[task.Hash] = node
 	}
 	return nil
 }
@@ -550,21 +533,21 @@ func (saver *ChangesSaver) dumpFiles(result [][]Change) []*pb.UASTChange {
 			bs, _ := change.Before.Marshal()
 			record.UastBefore = path.Join(saver.OutputPath, fmt.Sprintf(
 				"%d_%d_before_%s.pb", i, j, change.Change.From.TreeEntry.Hash.String()))
-			goioutil.WriteFile(record.UastBefore, bs, 0666)
+			ioutil.WriteFile(record.UastBefore, bs, 0666)
 			blob, _ := saver.repository.BlobObject(change.Change.From.TreeEntry.Hash)
 			s, _ := (&object.File{Blob: *blob}).Contents()
 			record.SrcBefore = path.Join(saver.OutputPath, fmt.Sprintf(
 				"%d_%d_before_%s.src", i, j, change.Change.From.TreeEntry.Hash.String()))
-			goioutil.WriteFile(record.SrcBefore, []byte(s), 0666)
+			ioutil.WriteFile(record.SrcBefore, []byte(s), 0666)
 			bs, _ = change.After.Marshal()
 			record.UastAfter = path.Join(saver.OutputPath, fmt.Sprintf(
 				"%d_%d_after_%s.pb", i, j, change.Change.To.TreeEntry.Hash.String()))
-			goioutil.WriteFile(record.UastAfter, bs, 0666)
+			ioutil.WriteFile(record.UastAfter, bs, 0666)
 			blob, _ = saver.repository.BlobObject(change.Change.To.TreeEntry.Hash)
 			s, _ = (&object.File{Blob: *blob}).Contents()
 			record.SrcAfter = path.Join(saver.OutputPath, fmt.Sprintf(
 				"%d_%d_after_%s.src", i, j, change.Change.To.TreeEntry.Hash.String()))
-			goioutil.WriteFile(record.SrcAfter, []byte(s), 0666)
+			ioutil.WriteFile(record.SrcAfter, []byte(s), 0666)
 			fileNames = append(fileNames, record)
 		}
 	}

+ 18 - 9
internal/plumbing/uast/uast_test.go

@@ -4,22 +4,21 @@ package uast
 
 import (
 	"bytes"
+	"fmt"
 	"io/ioutil"
 	"os"
-	"testing"
-
-	"fmt"
 	"path"
+	"testing"
 
 	"github.com/gogo/protobuf/proto"
 	"github.com/stretchr/testify/assert"
 	"gopkg.in/bblfsh/sdk.v1/uast"
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
-	"gopkg.in/src-d/hercules.v4/internal/core"
-	"gopkg.in/src-d/hercules.v4/internal/pb"
-	items "gopkg.in/src-d/hercules.v4/internal/plumbing"
-	"gopkg.in/src-d/hercules.v4/internal/test"
+	"gopkg.in/src-d/hercules.v5/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal/pb"
+	items "gopkg.in/src-d/hercules.v5/internal/plumbing"
+	"gopkg.in/src-d/hercules.v5/internal/test"
 )
 
 func fixtureUASTExtractor() *Extractor {
@@ -28,6 +27,16 @@ func fixtureUASTExtractor() *Extractor {
 	return &exr
 }
 
+func AddHash(t *testing.T, cache map[plumbing.Hash]*items.CachedBlob, hash string) {
+	objhash := plumbing.NewHash(hash)
+	blob, err := test.Repository.BlobObject(objhash)
+	assert.Nil(t, err)
+	cb := &items.CachedBlob{Blob: *blob}
+	err = cb.Cache()
+	assert.Nil(t, err)
+	cache[objhash] = cb
+}
+
 func TestUASTExtractorMeta(t *testing.T) {
 	exr := fixtureUASTExtractor()
 	assert.Equal(t, exr.Name(), "UAST")
@@ -117,7 +126,7 @@ func TestUASTExtractorConsume(t *testing.T) {
 		},
 	},
 	}
-	cache := map[plumbing.Hash]*object.Blob{}
+	cache := map[plumbing.Hash]*items.CachedBlob{}
 	for _, hash := range []string{
 		"baa64828831d174f40140e4b3cfa77d1e917a2c1",
 		"5d78f57d732aed825764347ec6f3ab74d50d0619",
@@ -125,7 +134,7 @@ func TestUASTExtractorConsume(t *testing.T) {
 		"f7d918ec500e2f925ecde79b51cc007bac27de72",
 		"81f2b6d1fa5357f90e9dead150cd515720897545",
 	} {
-		cache[plumbing.NewHash(hash)], _ = test.Repository.BlobObject(plumbing.NewHash(hash))
+		AddHash(t, cache, hash)
 	}
 	deps := map[string]interface{}{}
 	deps[items.DependencyBlobCache] = cache

+ 2 - 2
internal/test/fixtures/fixtures.go

@@ -1,8 +1,8 @@
 package fixtures
 
 import (
-	"gopkg.in/src-d/hercules.v4/internal/plumbing"
-	"gopkg.in/src-d/hercules.v4/internal/test"
+	"gopkg.in/src-d/hercules.v5/internal/plumbing"
+	"gopkg.in/src-d/hercules.v5/internal/test"
 )
 
 // FileDiff initializes a new plumbing.FileDiff item for testing.

+ 45 - 25
leaves/burndown.go

@@ -15,12 +15,12 @@ import (
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
 	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
-	"gopkg.in/src-d/hercules.v4/internal/burndown"
-	"gopkg.in/src-d/hercules.v4/internal/core"
-	"gopkg.in/src-d/hercules.v4/internal/pb"
-	items "gopkg.in/src-d/hercules.v4/internal/plumbing"
-	"gopkg.in/src-d/hercules.v4/internal/plumbing/identity"
-	"gopkg.in/src-d/hercules.v4/internal/yaml"
+	"gopkg.in/src-d/hercules.v5/internal/burndown"
+	"gopkg.in/src-d/hercules.v5/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal/pb"
+	items "gopkg.in/src-d/hercules.v5/internal/plumbing"
+	"gopkg.in/src-d/hercules.v5/internal/plumbing/identity"
+	"gopkg.in/src-d/hercules.v5/internal/yaml"
 )
 
 // BurndownAnalysis allows to gather the line burndown statistics for a Git repository.
@@ -276,11 +276,14 @@ func (analyser *BurndownAnalysis) Consume(deps map[string]interface{}) (map[stri
 		analyser.mergedFiles = map[string]bool{}
 		analyser.mergedAuthor = author
 	}
-	cache := deps[items.DependencyBlobCache].(map[plumbing.Hash]*object.Blob)
+	cache := deps[items.DependencyBlobCache].(map[plumbing.Hash]*items.CachedBlob)
 	treeDiffs := deps[items.DependencyTreeChanges].(object.Changes)
 	fileDiffs := deps[items.DependencyFileDiff].(map[string]items.FileDiffData)
 	for _, change := range treeDiffs {
 		action, _ := change.Action()
+		if deps["commit"].(*object.Commit).Hash.String() == "a6667d96c5e4aca92612295d549541146dd6e74a" {
+			fmt.Println("a6667d96c5e4aca92612295d549541146dd6e74a", action, change)
+		}
 		var err error
 		switch action {
 		case merkletrie.Insert:
@@ -1005,18 +1008,17 @@ func (analyser *BurndownAnalysis) newFile(
 }
 
 func (analyser *BurndownAnalysis) handleInsertion(
-	change *object.Change, author int, cache map[plumbing.Hash]*object.Blob) error {
+	change *object.Change, author int, cache map[plumbing.Hash]*items.CachedBlob) error {
 	blob := cache[change.To.TreeEntry.Hash]
-	lines, err := items.CountLines(blob)
+	lines, err := blob.CountLines()
 	if err != nil {
-		if err.Error() == "binary" {
-			return nil
-		}
-		return err
+		// binary
+		return nil
 	}
 	name := change.To.Name
 	file, exists := analyser.files[name]
 	if exists {
+		println("\n", analyser, "error")
 		return fmt.Errorf("file %s already exists", name)
 	}
 	var hash plumbing.Hash
@@ -1032,18 +1034,18 @@ func (analyser *BurndownAnalysis) handleInsertion(
 }
 
 func (analyser *BurndownAnalysis) handleDeletion(
-	change *object.Change, author int, cache map[plumbing.Hash]*object.Blob) error {
+	change *object.Change, author int, cache map[plumbing.Hash]*items.CachedBlob) error {
 
+	name := change.From.Name
+	file, exists := analyser.files[name]
 	blob := cache[change.From.TreeEntry.Hash]
-	lines, err := items.CountLines(blob)
-	if err != nil {
-		if err.Error() == "binary" {
-			return nil
-		}
-		return err
+	lines, err := blob.CountLines()
+	if exists && err != nil {
+		return fmt.Errorf("file %s unexpectedly became binary", name)
+	}
+	if !exists {
+		return nil
 	}
-	name := change.From.Name
-	file := analyser.files[name]
 	file.Update(analyser.packPersonWithDay(author, analyser.day), 0, 0, lines)
 	delete(analyser.files, name)
 	delete(analyser.fileHistories, name)
@@ -1055,7 +1057,7 @@ func (analyser *BurndownAnalysis) handleDeletion(
 }
 
 func (analyser *BurndownAnalysis) handleModification(
-	change *object.Change, author int, cache map[plumbing.Hash]*object.Blob,
+	change *object.Change, author int, cache map[plumbing.Hash]*items.CachedBlob,
 	diffs map[string]items.FileDiffData) error {
 
 	if analyser.day == burndown.TreeMergeMark {
@@ -1075,6 +1077,23 @@ func (analyser *BurndownAnalysis) handleModification(
 		}
 	}
 
+	// Check for binary changes
+	blobFrom := cache[change.From.TreeEntry.Hash]
+	_, errFrom := blobFrom.CountLines()
+	blobTo := cache[change.To.TreeEntry.Hash]
+	_, errTo := blobTo.CountLines()
+	if errFrom != errTo {
+		if errFrom != nil {
+			// the file is no longer binary
+			return analyser.handleInsertion(change, author, cache)
+		}
+		// the file became binary
+		return analyser.handleDeletion(change, author, cache)
+	} else if errFrom != nil {
+		// what are we doing here?!
+		return nil
+	}
+
 	thisDiffs := diffs[change.To.Name]
 	if file.Len() != thisDiffs.OldLinesOfCode {
 		log.Printf("====TREE====\n%s", file.Dump())
@@ -1155,8 +1174,9 @@ func (analyser *BurndownAnalysis) handleModification(
 		pending.Text = ""
 	}
 	if file.Len() != thisDiffs.NewLinesOfCode {
-		return fmt.Errorf("%s: internal integrity error dst %d != %d",
-			change.To.Name, thisDiffs.NewLinesOfCode, file.Len())
+		return fmt.Errorf("%s: internal integrity error dst %d != %d %s -> %s",
+			change.To.Name, thisDiffs.NewLinesOfCode, file.Len(),
+			change.From.TreeEntry.Hash.String(), change.To.TreeEntry.Hash.String())
 	}
 	return nil
 }

+ 38 - 46
leaves/burndown_test.go

@@ -7,19 +7,29 @@ import (
 	"path"
 	"testing"
 
-	"gopkg.in/src-d/hercules.v4/internal/core"
-	"gopkg.in/src-d/hercules.v4/internal/test/fixtures"
+	"gopkg.in/src-d/hercules.v5/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal/test/fixtures"
 
 	"github.com/gogo/protobuf/proto"
 	"github.com/stretchr/testify/assert"
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
-	"gopkg.in/src-d/hercules.v4/internal/pb"
-	items "gopkg.in/src-d/hercules.v4/internal/plumbing"
-	"gopkg.in/src-d/hercules.v4/internal/plumbing/identity"
-	"gopkg.in/src-d/hercules.v4/internal/test"
+	"gopkg.in/src-d/hercules.v5/internal/pb"
+	items "gopkg.in/src-d/hercules.v5/internal/plumbing"
+	"gopkg.in/src-d/hercules.v5/internal/plumbing/identity"
+	"gopkg.in/src-d/hercules.v5/internal/test"
 )
 
+func AddHash(t *testing.T, cache map[plumbing.Hash]*items.CachedBlob, hash string) {
+	objhash := plumbing.NewHash(hash)
+	blob, err := test.Repository.BlobObject(objhash)
+	assert.Nil(t, err)
+	cb := &items.CachedBlob{Blob: *blob}
+	err = cb.Cache()
+	assert.Nil(t, err)
+	cache[objhash] = cb
+}
+
 func TestBurndownMeta(t *testing.T) {
 	burndown := BurndownAnalysis{}
 	assert.Equal(t, burndown.Name(), "Burndown")
@@ -121,15 +131,11 @@ func TestBurndownConsumeFinalize(t *testing.T) {
 	// stage 1
 	deps[identity.DependencyAuthor] = 0
 	deps[items.DependencyDay] = 0
-	cache := map[plumbing.Hash]*object.Blob{}
-	hash := plumbing.NewHash("291286b4ac41952cbd1389fda66420ec03c1a9fe")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("c29112dbd697ad9b401333b80c18a63951bc18d9")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("baa64828831d174f40140e4b3cfa77d1e917a2c1")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("dc248ba2b22048cc730c571a748e8ffcf7085ab9")
-	cache[hash], _ = test.Repository.BlobObject(hash)
+	cache := map[plumbing.Hash]*items.CachedBlob{}
+	AddHash(t, cache, "291286b4ac41952cbd1389fda66420ec03c1a9fe")
+	AddHash(t, cache, "c29112dbd697ad9b401333b80c18a63951bc18d9")
+	AddHash(t, cache, "baa64828831d174f40140e4b3cfa77d1e917a2c1")
+	AddHash(t, cache, "dc248ba2b22048cc730c571a748e8ffcf7085ab9")
 	deps[items.DependencyBlobCache] = cache
 	changes := make(object.Changes, 3)
 	treeFrom, _ := test.Repository.TreeObject(plumbing.NewHash(
@@ -220,17 +226,12 @@ func TestBurndownConsumeFinalize(t *testing.T) {
 	// 2b1ed978194a94edeabbca6de7ff3b5771d4d665
 	deps[core.DependencyIsMerge] = false
 	deps[items.DependencyDay] = 30
-	cache = map[plumbing.Hash]*object.Blob{}
-	hash = plumbing.NewHash("291286b4ac41952cbd1389fda66420ec03c1a9fe")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("baa64828831d174f40140e4b3cfa77d1e917a2c1")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("29c9fafd6a2fae8cd20298c3f60115bc31a4c0f2")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("c29112dbd697ad9b401333b80c18a63951bc18d9")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("f7d918ec500e2f925ecde79b51cc007bac27de72")
-	cache[hash], _ = test.Repository.BlobObject(hash)
+	cache = map[plumbing.Hash]*items.CachedBlob{}
+	AddHash(t, cache, "291286b4ac41952cbd1389fda66420ec03c1a9fe")
+	AddHash(t, cache, "baa64828831d174f40140e4b3cfa77d1e917a2c1")
+	AddHash(t, cache, "29c9fafd6a2fae8cd20298c3f60115bc31a4c0f2")
+	AddHash(t, cache, "c29112dbd697ad9b401333b80c18a63951bc18d9")
+	AddHash(t, cache, "f7d918ec500e2f925ecde79b51cc007bac27de72")
 	deps[items.DependencyBlobCache] = cache
 	changes = make(object.Changes, 3)
 	treeFrom, _ = test.Repository.TreeObject(plumbing.NewHash(
@@ -353,15 +354,11 @@ func TestBurndownSerialize(t *testing.T) {
 	// stage 1
 	deps[identity.DependencyAuthor] = 0
 	deps[items.DependencyDay] = 0
-	cache := map[plumbing.Hash]*object.Blob{}
-	hash := plumbing.NewHash("291286b4ac41952cbd1389fda66420ec03c1a9fe")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("c29112dbd697ad9b401333b80c18a63951bc18d9")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("baa64828831d174f40140e4b3cfa77d1e917a2c1")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("dc248ba2b22048cc730c571a748e8ffcf7085ab9")
-	cache[hash], _ = test.Repository.BlobObject(hash)
+	cache := map[plumbing.Hash]*items.CachedBlob{}
+	AddHash(t, cache, "291286b4ac41952cbd1389fda66420ec03c1a9fe")
+	AddHash(t, cache, "c29112dbd697ad9b401333b80c18a63951bc18d9")
+	AddHash(t, cache, "baa64828831d174f40140e4b3cfa77d1e917a2c1")
+	AddHash(t, cache, "dc248ba2b22048cc730c571a748e8ffcf7085ab9")
 	deps[items.DependencyBlobCache] = cache
 	changes := make(object.Changes, 3)
 	treeFrom, _ := test.Repository.TreeObject(plumbing.NewHash(
@@ -418,17 +415,12 @@ func TestBurndownSerialize(t *testing.T) {
 	// 2b1ed978194a94edeabbca6de7ff3b5771d4d665
 	deps[identity.DependencyAuthor] = 1
 	deps[items.DependencyDay] = 30
-	cache = map[plumbing.Hash]*object.Blob{}
-	hash = plumbing.NewHash("291286b4ac41952cbd1389fda66420ec03c1a9fe")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("baa64828831d174f40140e4b3cfa77d1e917a2c1")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("29c9fafd6a2fae8cd20298c3f60115bc31a4c0f2")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("c29112dbd697ad9b401333b80c18a63951bc18d9")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("f7d918ec500e2f925ecde79b51cc007bac27de72")
-	cache[hash], _ = test.Repository.BlobObject(hash)
+	cache = map[plumbing.Hash]*items.CachedBlob{}
+	AddHash(t, cache, "291286b4ac41952cbd1389fda66420ec03c1a9fe")
+	AddHash(t, cache, "baa64828831d174f40140e4b3cfa77d1e917a2c1")
+	AddHash(t, cache, "29c9fafd6a2fae8cd20298c3f60115bc31a4c0f2")
+	AddHash(t, cache, "c29112dbd697ad9b401333b80c18a63951bc18d9")
+	AddHash(t, cache, "f7d918ec500e2f925ecde79b51cc007bac27de72")
 	deps[items.DependencyBlobCache] = cache
 	changes = make(object.Changes, 3)
 	treeFrom, _ = test.Repository.TreeObject(plumbing.NewHash(

+ 4 - 4
leaves/comment_sentiment.go

@@ -16,10 +16,10 @@ import (
 	progress "gopkg.in/cheggaaa/pb.v1"
 	"gopkg.in/src-d/go-git.v4"
 	"gopkg.in/src-d/go-git.v4/plumbing"
-	"gopkg.in/src-d/hercules.v4/internal/core"
-	"gopkg.in/src-d/hercules.v4/internal/pb"
-	items "gopkg.in/src-d/hercules.v4/internal/plumbing"
-	uast_items "gopkg.in/src-d/hercules.v4/internal/plumbing/uast"
+	"gopkg.in/src-d/hercules.v5/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal/pb"
+	items "gopkg.in/src-d/hercules.v5/internal/plumbing"
+	uast_items "gopkg.in/src-d/hercules.v5/internal/plumbing/uast"
 	"gopkg.in/vmarkovtsev/BiDiSentiment.v1"
 )
 

+ 6 - 6
leaves/comment_sentiment_test.go

@@ -13,12 +13,12 @@ import (
 	"gopkg.in/bblfsh/client-go.v2"
 	"gopkg.in/bblfsh/client-go.v2/tools"
 	"gopkg.in/src-d/go-git.v4/plumbing"
-	"gopkg.in/src-d/hercules.v4/internal/core"
-	"gopkg.in/src-d/hercules.v4/internal/pb"
-	items "gopkg.in/src-d/hercules.v4/internal/plumbing"
-	uast_items "gopkg.in/src-d/hercules.v4/internal/plumbing/uast"
-	uast_test "gopkg.in/src-d/hercules.v4/internal/plumbing/uast/test"
-	"gopkg.in/src-d/hercules.v4/internal/test"
+	"gopkg.in/src-d/hercules.v5/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal/pb"
+	items "gopkg.in/src-d/hercules.v5/internal/plumbing"
+	uast_items "gopkg.in/src-d/hercules.v5/internal/plumbing/uast"
+	uast_test "gopkg.in/src-d/hercules.v5/internal/plumbing/uast/test"
+	"gopkg.in/src-d/hercules.v5/internal/test"
 )
 
 func fixtureCommentSentiment() *CommentSentimentAnalysis {

+ 5 - 5
leaves/couples.go

@@ -9,11 +9,11 @@ import (
 	"gopkg.in/src-d/go-git.v4"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
 	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
-	"gopkg.in/src-d/hercules.v4/internal/core"
-	"gopkg.in/src-d/hercules.v4/internal/pb"
-	items "gopkg.in/src-d/hercules.v4/internal/plumbing"
-	"gopkg.in/src-d/hercules.v4/internal/plumbing/identity"
-	"gopkg.in/src-d/hercules.v4/internal/yaml"
+	"gopkg.in/src-d/hercules.v5/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal/pb"
+	items "gopkg.in/src-d/hercules.v5/internal/plumbing"
+	"gopkg.in/src-d/hercules.v5/internal/plumbing/identity"
+	"gopkg.in/src-d/hercules.v5/internal/yaml"
 )
 
 // CouplesAnalysis calculates the number of common commits for files and authors.

+ 5 - 5
leaves/couples_test.go

@@ -11,11 +11,11 @@ import (
 	"github.com/stretchr/testify/assert"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
 	gitplumbing "gopkg.in/src-d/go-git.v4/plumbing"
-	"gopkg.in/src-d/hercules.v4/internal/core"
-	"gopkg.in/src-d/hercules.v4/internal/pb"
-	"gopkg.in/src-d/hercules.v4/internal/plumbing"
-	"gopkg.in/src-d/hercules.v4/internal/plumbing/identity"
-	"gopkg.in/src-d/hercules.v4/internal/test"
+	"gopkg.in/src-d/hercules.v5/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal/pb"
+	"gopkg.in/src-d/hercules.v5/internal/plumbing"
+	"gopkg.in/src-d/hercules.v5/internal/plumbing/identity"
+	"gopkg.in/src-d/hercules.v5/internal/test"
 	)
 
 func fixtureCouples() *CouplesAnalysis {

+ 3 - 3
leaves/file_history.go

@@ -11,9 +11,9 @@ import (
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
 	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
-	"gopkg.in/src-d/hercules.v4/internal/core"
-	"gopkg.in/src-d/hercules.v4/internal/pb"
-	items "gopkg.in/src-d/hercules.v4/internal/plumbing"
+	"gopkg.in/src-d/hercules.v5/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal/pb"
+	items "gopkg.in/src-d/hercules.v5/internal/plumbing"
 )
 
 // FileHistory contains the intermediate state which is mutated by Consume(). It should implement

+ 4 - 4
leaves/file_history_test.go

@@ -8,10 +8,10 @@ import (
 	"github.com/stretchr/testify/assert"
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
-	"gopkg.in/src-d/hercules.v4/internal/core"
-	"gopkg.in/src-d/hercules.v4/internal/pb"
-	items "gopkg.in/src-d/hercules.v4/internal/plumbing"
-	"gopkg.in/src-d/hercules.v4/internal/test"
+	"gopkg.in/src-d/hercules.v5/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal/pb"
+	items "gopkg.in/src-d/hercules.v5/internal/plumbing"
+	"gopkg.in/src-d/hercules.v5/internal/test"
 )
 
 func fixtureFileHistory() *FileHistory {

+ 4 - 4
leaves/shotness.go

@@ -13,10 +13,10 @@ import (
 	"gopkg.in/bblfsh/sdk.v1/uast"
 	"gopkg.in/src-d/go-git.v4"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
-	"gopkg.in/src-d/hercules.v4/internal/core"
-	"gopkg.in/src-d/hercules.v4/internal/pb"
-	items "gopkg.in/src-d/hercules.v4/internal/plumbing"
-	uast_items "gopkg.in/src-d/hercules.v4/internal/plumbing/uast"
+	"gopkg.in/src-d/hercules.v5/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal/pb"
+	items "gopkg.in/src-d/hercules.v5/internal/plumbing"
+	uast_items "gopkg.in/src-d/hercules.v5/internal/plumbing/uast"
 )
 
 // ShotnessAnalysis contains the intermediate state which is mutated by Consume(). It should implement

+ 5 - 5
leaves/shotness_test.go

@@ -12,11 +12,11 @@ import (
 	"github.com/stretchr/testify/assert"
 	"gopkg.in/bblfsh/sdk.v1/uast"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
-	"gopkg.in/src-d/hercules.v4/internal/core"
-	"gopkg.in/src-d/hercules.v4/internal/pb"
-	items "gopkg.in/src-d/hercules.v4/internal/plumbing"
-	uast_items "gopkg.in/src-d/hercules.v4/internal/plumbing/uast"
-	"gopkg.in/src-d/hercules.v4/internal/test"
+	"gopkg.in/src-d/hercules.v5/internal/core"
+	"gopkg.in/src-d/hercules.v5/internal/pb"
+	items "gopkg.in/src-d/hercules.v5/internal/plumbing"
+	uast_items "gopkg.in/src-d/hercules.v5/internal/plumbing/uast"
+	"gopkg.in/src-d/hercules.v5/internal/test"
 )
 
 func fixtureShotness() *ShotnessAnalysis {