
Refactor the engine to enable many analyses

Vadim Markovtsev, 7 years ago
parent
commit
2b1ed97819
12 files changed, 1611 insertions and 1045 deletions
  1. analyser.go: +0 -926
  2. blob_cache.go: +114 -0
  3. burndown.go: +543 -0
  4. cmd/hercules/main.go: +36 -119
  5. day.go: +51 -0
  6. dummies.go: +57 -0
  7. identity.go: +112 -0
  8. pipeline.go: +202 -0
  9. renames.go: +238 -0
  10. toposort/toposort.go: +103 -0
  11. toposort/toposort_test.go: +83 -0
  12. tree_diff.go: +72 -0

+ 0 - 926
analyser.go

@@ -1,926 +0,0 @@
-package hercules
-
-import (
-	"bufio"
-	"bytes"
-	"errors"
-	"fmt"
-	"io"
-	"os"
-	"sort"
-	"strings"
-	"time"
-	"unicode/utf8"
-
-	"github.com/sergi/go-diff/diffmatchpatch"
-	"gopkg.in/src-d/go-git.v4"
-	"gopkg.in/src-d/go-git.v4/config"
-	"gopkg.in/src-d/go-git.v4/plumbing"
-	"gopkg.in/src-d/go-git.v4/plumbing/object"
-	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
-)
-
-// Analyser allows to gather the line burndown statistics for a Git repository.
-type Analyser struct {
-	// Repository points to the analysed Git repository struct from go-git.
-	Repository *git.Repository
-	// Granularity sets the size of each band - the number of days it spans.
-	// Smaller values provide better resolution but require more work and eat more
-	// memory. 30 days is usually enough.
-	Granularity int
-	// Sampling sets how detailed is the statistic - the size of the interval in
-	// days between consecutive measurements. It is usually a good idea to set it
-	// <= Granularity. Try 15 or 30.
-	Sampling int
-	// SimilarityThreshold adjusts the heuristic to determine file renames.
-	// It has the same units as cgit's -X rename-threshold or -M. Better to
-	// set it to the default value of 90 (90%).
-	SimilarityThreshold int
-	// The number of developers for which to collect the burndown stats. 0 disables it.
-	PeopleNumber int
-	// Maps email || name  -> developer id.
-	PeopleDict map[string]int
-	// Debug activates the debugging mode. Analyse() runs slower in this mode
-	// but it accurately checks all the intermediate states for invariant
-	// violations.
-	Debug bool
-	// OnProgress is the callback which is invoked in Analyse() to output it's
-	// progress. The first argument is the number of processed commits and the
-	// second is the total number of commits.
-	OnProgress func(int, int)
-}
-
-type ProtoMatrix map[int]map[int]int64
-
-func checkClose(c io.Closer) {
-	if err := c.Close(); err != nil {
-		panic(err)
-	}
-}
-
-func loc(file *object.Blob) (int, error) {
-	reader, err := file.Reader()
-	if err != nil {
-		panic(err)
-	}
-	defer checkClose(reader)
-	var scanner *bufio.Scanner
-	buffer := make([]byte, bufio.MaxScanTokenSize)
-	counter := 0
-	for scanner == nil || scanner.Err() == bufio.ErrTooLong {
-		if scanner != nil && !utf8.Valid(scanner.Bytes()) {
-			return -1, errors.New("binary")
-		}
-		scanner = bufio.NewScanner(reader)
-		scanner.Buffer(buffer, 0)
-		for scanner.Scan() {
-			if !utf8.Valid(scanner.Bytes()) {
-				return -1, errors.New("binary")
-			}
-			counter++
-		}
-	}
-	return counter, nil
-}
-
-func str(file *object.Blob) string {
-	reader, err := file.Reader()
-	if err != nil {
-		panic(err)
-	}
-	defer checkClose(reader)
-	buf := new(bytes.Buffer)
-	buf.ReadFrom(reader)
-	return buf.String()
-}
-
-type dummyIO struct {
-}
-
-func (dummyIO) Read(p []byte) (int, error) {
-	return 0, io.EOF
-}
-
-func (dummyIO) Write(p []byte) (int, error) {
-	return len(p), nil
-}
-
-func (dummyIO) Close() error {
-	return nil
-}
-
-type dummyEncodedObject struct {
-	FakeHash plumbing.Hash
-}
-
-func (obj dummyEncodedObject) Hash() plumbing.Hash {
-	return obj.FakeHash
-}
-
-func (obj dummyEncodedObject) Type() plumbing.ObjectType {
-	return plumbing.BlobObject
-}
-
-func (obj dummyEncodedObject) SetType(plumbing.ObjectType) {
-}
-
-func (obj dummyEncodedObject) Size() int64 {
-	return 0
-}
-
-func (obj dummyEncodedObject) SetSize(int64) {
-}
-
-func (obj dummyEncodedObject) Reader() (io.ReadCloser, error) {
-	return dummyIO{}, nil
-}
-
-func (obj dummyEncodedObject) Writer() (io.WriteCloser, error) {
-	return dummyIO{}, nil
-}
-
-func createDummyBlob(hash *plumbing.Hash) (*object.Blob, error) {
-	return object.DecodeBlob(dummyEncodedObject{*hash})
-}
-
-const MISSING_AUTHOR = (1 << 18) - 1
-const SELF_AUTHOR = (1 << 18) - 2
-
-func (analyser *Analyser) packPersonWithDay(person int, day int) int {
-	if analyser.PeopleNumber == 0 {
-		return day
-	}
-	result := day
-	result |= person << 14
-	// This effectively means max 16384 days (>44 years) and (131072 - 2) devs
-	return result
-}
-
-func (analyser *Analyser) unpackPersonWithDay(value int) (int, int) {
-	if analyser.PeopleNumber == 0 {
-		return MISSING_AUTHOR, value
-	}
-	return value >> 14, value & 0x3FFF
-}
-
-func (analyser *Analyser) updateStatus(
-	status interface{}, _ int, previous_time_ int, delta int) {
-
-	_, previous_time := analyser.unpackPersonWithDay(previous_time_)
-	status.(map[int]int64)[previous_time] += int64(delta)
-}
-
-func (analyser *Analyser) updatePeople(people interface{}, _ int, previous_time_ int, delta int) {
-	old_author, previous_time := analyser.unpackPersonWithDay(previous_time_)
-	if old_author == MISSING_AUTHOR {
-		return
-	}
-	casted := people.([]map[int]int64)
-	stats := casted[old_author]
-	if stats == nil {
-		stats = map[int]int64{}
-		casted[old_author] = stats
-	}
-	stats[previous_time] += int64(delta)
-}
-
-func (analyser *Analyser) updateMatrix(
-	matrix_ interface{}, current_time int, previous_time int, delta int) {
-
-	matrix := matrix_.([]map[int]int64)
-	new_author, _ := analyser.unpackPersonWithDay(current_time)
-	old_author, _ := analyser.unpackPersonWithDay(previous_time)
-	if old_author == MISSING_AUTHOR {
-		return
-	}
-	if new_author == old_author && delta > 0 {
-		new_author = SELF_AUTHOR
-	}
-	row := matrix[old_author]
-	if row == nil {
-		row = map[int]int64{}
-		matrix[old_author] = row
-	}
-	cell, exists := row[new_author]
-	if !exists {
-		row[new_author] = 0
-		cell = 0
-	}
-	row[new_author] = cell + int64(delta)
-}
-
-func (analyser *Analyser) newFile(
-	author int, day int, size int, global map[int]int64, people []map[int]int64,
-	matrix []map[int]int64) *File {
-	if analyser.PeopleNumber == 0 {
-		return NewFile(day, size, NewStatus(global, analyser.updateStatus),
-			NewStatus(make(map[int]int64), analyser.updateStatus))
-	}
-	return NewFile(analyser.packPersonWithDay(author, day), size,
-		NewStatus(global, analyser.updateStatus),
-		NewStatus(make(map[int]int64), analyser.updateStatus),
-		NewStatus(people, analyser.updatePeople),
-		NewStatus(matrix, analyser.updateMatrix))
-}
-
-func (analyser *Analyser) getAuthorId(signature object.Signature) int {
-	id, exists := analyser.PeopleDict[strings.ToLower(signature.Email)]
-	if !exists {
-		id, exists = analyser.PeopleDict[strings.ToLower(signature.Name)]
-		if !exists {
-			id = MISSING_AUTHOR
-		}
-	}
-	return id
-}
-
-func (analyser *Analyser) handleInsertion(
-	change *object.Change, author int, day int, global_status map[int]int64,
-	files map[string]*File, people []map[int]int64, matrix []map[int]int64,
-	cache *map[plumbing.Hash]*object.Blob) {
-
-	blob := (*cache)[change.To.TreeEntry.Hash]
-	lines, err := loc(blob)
-	if err != nil {
-		return
-	}
-	name := change.To.Name
-	file, exists := files[name]
-	if exists {
-		panic(fmt.Sprintf("file %s already exists", name))
-	}
-	file = analyser.newFile(author, day, lines, global_status, people, matrix)
-	files[name] = file
-}
-
-func (analyser *Analyser) handleDeletion(
-	change *object.Change, author int, day int, status map[int]int64, files map[string]*File,
-	cache *map[plumbing.Hash]*object.Blob) {
-	blob := (*cache)[change.From.TreeEntry.Hash]
-	lines, err := loc(blob)
-	if err != nil {
-		return
-	}
-	name := change.From.Name
-	file := files[name]
-	file.Update(analyser.packPersonWithDay(author, day), 0, 0, lines)
-	delete(files, name)
-}
-
-func (analyser *Analyser) handleModification(
-	change *object.Change, author int, day int, status map[int]int64, files map[string]*File,
-	people []map[int]int64, matrix []map[int]int64,
-	cache *map[plumbing.Hash]*object.Blob) {
-
-	blob_from := (*cache)[change.From.TreeEntry.Hash]
-	blob_to := (*cache)[change.To.TreeEntry.Hash]
-	// we are not validating UTF-8 here because for example
-	// git/git 4f7770c87ce3c302e1639a7737a6d2531fe4b160 fetch-pack.c is invalid UTF-8
-	str_from := str(blob_from)
-	str_to := str(blob_to)
-	file, exists := files[change.From.Name]
-	if !exists {
-		analyser.handleInsertion(change, author, day, status, files, people, matrix, cache)
-		return
-	}
-	// possible rename
-	if change.To.Name != change.From.Name {
-		analyser.handleRename(change.From.Name, change.To.Name, files)
-	}
-	dmp := diffmatchpatch.New()
-	src, dst, _ := dmp.DiffLinesToRunes(str_from, str_to)
-	if file.Len() != len(src) {
-		fmt.Fprintf(os.Stderr, "====TREE====\n%s", file.Dump())
-		panic(fmt.Sprintf("%s: internal integrity error src %d != %d %s -> %s",
-			change.To.Name, len(src), file.Len(),
-			change.From.TreeEntry.Hash.String(), change.To.TreeEntry.Hash.String()))
-	}
-	diffs := dmp.DiffMainRunes(src, dst, false)
-	// we do not call RunesToDiffLines so the number of lines equals
-	// to the rune count
-	position := 0
-	pending := diffmatchpatch.Diff{Text: ""}
-
-	apply := func(edit diffmatchpatch.Diff) {
-		length := utf8.RuneCountInString(edit.Text)
-		if edit.Type == diffmatchpatch.DiffInsert {
-			file.Update(analyser.packPersonWithDay(author, day), position, length, 0)
-			position += length
-		} else {
-			file.Update(analyser.packPersonWithDay(author, day), position, 0, length)
-		}
-		if analyser.Debug {
-			file.Validate()
-		}
-	}
-
-	for _, edit := range diffs {
-		dump_before := ""
-		if analyser.Debug {
-			dump_before = file.Dump()
-		}
-		length := utf8.RuneCountInString(edit.Text)
-		func() {
-			defer func() {
-				r := recover()
-				if r != nil {
-					fmt.Fprintf(os.Stderr, "%s: internal diff error\n", change.To.Name)
-					fmt.Fprintf(os.Stderr, "Update(%d, %d, %d (0), %d (0))\n", day, position,
-						length, utf8.RuneCountInString(pending.Text))
-					if dump_before != "" {
-						fmt.Fprintf(os.Stderr, "====TREE BEFORE====\n%s====END====\n", dump_before)
-					}
-					fmt.Fprintf(os.Stderr, "====TREE AFTER====\n%s====END====\n", file.Dump())
-					panic(r)
-				}
-			}()
-			switch edit.Type {
-			case diffmatchpatch.DiffEqual:
-				if pending.Text != "" {
-					apply(pending)
-					pending.Text = ""
-				}
-				position += length
-			case diffmatchpatch.DiffInsert:
-				if pending.Text != "" {
-					if pending.Type == diffmatchpatch.DiffInsert {
-						panic("DiffInsert may not appear after DiffInsert")
-					}
-					file.Update(analyser.packPersonWithDay(author, day), position, length,
-						utf8.RuneCountInString(pending.Text))
-					if analyser.Debug {
-						file.Validate()
-					}
-					position += length
-					pending.Text = ""
-				} else {
-					pending = edit
-				}
-			case diffmatchpatch.DiffDelete:
-				if pending.Text != "" {
-					panic("DiffDelete may not appear after DiffInsert/DiffDelete")
-				}
-				pending = edit
-			default:
-				panic(fmt.Sprintf("diff operation is not supported: %d", edit.Type))
-			}
-		}()
-	}
-	if pending.Text != "" {
-		apply(pending)
-		pending.Text = ""
-	}
-	if file.Len() != len(dst) {
-		panic(fmt.Sprintf("%s: internal integrity error dst %d != %d",
-			change.To.Name, len(dst), file.Len()))
-	}
-}
-
-func (analyser *Analyser) handleRename(from, to string, files map[string]*File) {
-	file, exists := files[from]
-	if !exists {
-		panic(fmt.Sprintf("file %s does not exist", from))
-	}
-	files[to] = file
-	delete(files, from)
-}
-
-// Commits returns the critical path in the repository's history. It starts
-// from HEAD and traces commits backwards till the root. When it encounters
-// a merge (more than one parent), it always chooses the first parent.
-func (analyser *Analyser) Commits() []*object.Commit {
-	result := []*object.Commit{}
-	repository := analyser.Repository
-	head, err := repository.Head()
-	if err != nil {
-		panic(err)
-	}
-	commit, err := repository.CommitObject(head.Hash())
-	if err != nil {
-		panic(err)
-	}
-	result = append(result, commit)
-	for ; err != io.EOF; commit, err = commit.Parents().Next() {
-		if err != nil {
-			panic(err)
-		}
-		result = append(result, commit)
-	}
-	// reverse the order
-	for i, j := 0, len(result)-1; i < j; i, j = i+1, j-1 {
-		result[i], result[j] = result[j], result[i]
-	}
-	return result
-}
-
-func (analyser *Analyser) groupStatus(
-	status map[int]int64,
-	files map[string]*File,
-	people []map[int]int64,
-	day int) ([]int64, map[string][]int64, [][]int64) {
-	granularity := analyser.Granularity
-	if granularity == 0 {
-		granularity = 1
-	}
-	day++
-	adjust := 0
-	if day%granularity != 0 {
-		adjust = 1
-	}
-	global := make([]int64, day/granularity+adjust)
-	var group int64
-	for i := 0; i < day; i++ {
-		group += status[i]
-		if (i % granularity) == (granularity - 1) {
-			global[i/granularity] = group
-			group = 0
-		}
-	}
-	if day%granularity != 0 {
-		global[len(global)-1] = group
-	}
-	locals := make(map[string][]int64)
-	for key, file := range files {
-		status := make([]int64, day/granularity+adjust)
-		var group int64
-		for i := 0; i < day; i++ {
-			group += file.Status(1).(map[int]int64)[i]
-			if (i % granularity) == (granularity - 1) {
-				status[i/granularity] = group
-				group = 0
-			}
-		}
-		if day%granularity != 0 {
-			status[len(status)-1] = group
-		}
-		locals[key] = status
-	}
-	peoples := make([][]int64, len(people))
-	for key, person := range people {
-		status := make([]int64, day/granularity+adjust)
-		var group int64
-		for i := 0; i < day; i++ {
-			group += person[i]
-			if (i % granularity) == (granularity - 1) {
-				status[i/granularity] = group
-				group = 0
-			}
-		}
-		if day%granularity != 0 {
-			status[len(status)-1] = group
-		}
-		peoples[key] = status
-	}
-	return global, locals, peoples
-}
-
-func (analyser *Analyser) updateHistories(
-	global_history [][]int64, global_status []int64,
-	file_histories map[string][][]int64, file_statuses map[string][]int64,
-	people_histories [][][]int64, people_statuses [][]int64,
-	delta int) [][]int64 {
-	for i := 0; i < delta; i++ {
-		global_history = append(global_history, global_status)
-	}
-	to_delete := make([]string, 0)
-	for key, fh := range file_histories {
-		ls, exists := file_statuses[key]
-		if !exists {
-			to_delete = append(to_delete, key)
-		} else {
-			for i := 0; i < delta; i++ {
-				fh = append(fh, ls)
-			}
-			file_histories[key] = fh
-		}
-	}
-	for _, key := range to_delete {
-		delete(file_histories, key)
-	}
-	for key, ls := range file_statuses {
-		fh, exists := file_histories[key]
-		if exists {
-			continue
-		}
-		for i := 0; i < delta; i++ {
-			fh = append(fh, ls)
-		}
-		file_histories[key] = fh
-	}
-
-	for key, ph := range people_histories {
-		ls := people_statuses[key]
-		for i := 0; i < delta; i++ {
-			ph = append(ph, ls)
-		}
-		people_histories[key] = ph
-	}
-	return global_history
-}
-
-type sortableChange struct {
-	change *object.Change
-	hash   plumbing.Hash
-}
-
-type sortableChanges []sortableChange
-
-func (change *sortableChange) Less(other *sortableChange) bool {
-	for x := 0; x < 20; x++ {
-		if change.hash[x] < other.hash[x] {
-			return true
-		}
-	}
-	return false
-}
-
-func (slice sortableChanges) Len() int {
-	return len(slice)
-}
-
-func (slice sortableChanges) Less(i, j int) bool {
-	return slice[i].Less(&slice[j])
-}
-
-func (slice sortableChanges) Swap(i, j int) {
-	slice[i], slice[j] = slice[j], slice[i]
-}
-
-type sortableBlob struct {
-	change *object.Change
-	size   int64
-}
-
-type sortableBlobs []sortableBlob
-
-func (change *sortableBlob) Less(other *sortableBlob) bool {
-	return change.size < other.size
-}
-
-func (slice sortableBlobs) Len() int {
-	return len(slice)
-}
-
-func (slice sortableBlobs) Less(i, j int) bool {
-	return slice[i].Less(&slice[j])
-}
-
-func (slice sortableBlobs) Swap(i, j int) {
-	slice[i], slice[j] = slice[j], slice[i]
-}
-
-func (analyser *Analyser) sizesAreClose(size1 int64, size2 int64) bool {
-	return abs64(size1-size2)*100/max64(1, min64(size1, size2)) <=
-		int64(100-analyser.SimilarityThreshold)
-}
-
-func (analyser *Analyser) blobsAreClose(
-	blob1 *object.Blob, blob2 *object.Blob) bool {
-	str_from := str(blob1)
-	str_to := str(blob2)
-	dmp := diffmatchpatch.New()
-	src, dst, _ := dmp.DiffLinesToRunes(str_from, str_to)
-	diffs := dmp.DiffMainRunes(src, dst, false)
-	common := 0
-	for _, edit := range diffs {
-		if edit.Type == diffmatchpatch.DiffEqual {
-			common += utf8.RuneCountInString(edit.Text)
-		}
-	}
-	return common*100/max(1, min(len(src), len(dst))) >=
-		analyser.SimilarityThreshold
-}
-
-func (analyser *Analyser) getBlob(entry *object.ChangeEntry, commit *object.Commit) (
-	*object.Blob, error) {
-	blob, err := analyser.Repository.BlobObject(entry.TreeEntry.Hash)
-	if err != nil {
-		if err.Error() != plumbing.ErrObjectNotFound.Error() {
-			fmt.Fprintf(os.Stderr, "getBlob(%s)\n", entry.TreeEntry.Hash.String())
-			return nil, err
-		}
-		file, err_modules := commit.File(".gitmodules")
-		if err_modules != nil {
-			return nil, err
-		}
-		contents, err_modules := file.Contents()
-		if err_modules != nil {
-			return nil, err
-		}
-		modules := config.NewModules()
-		err_modules = modules.Unmarshal([]byte(contents))
-		if err_modules != nil {
-			return nil, err
-		}
-		_, exists := modules.Submodules[entry.Name]
-		if exists {
-			// we found that this is a submodule
-			return createDummyBlob(&entry.TreeEntry.Hash)
-		}
-		return nil, err
-	}
-	return blob, nil
-}
-
-func (analyser *Analyser) cacheBlobs(changes *object.Changes, commit *object.Commit) (
-	*map[plumbing.Hash]*object.Blob, error) {
-	cache := make(map[plumbing.Hash]*object.Blob)
-	for _, change := range *changes {
-		action, err := change.Action()
-		if err != nil {
-			return nil, err
-		}
-		switch action {
-		case merkletrie.Insert:
-			cache[change.To.TreeEntry.Hash], err = analyser.getBlob(&change.To, commit)
-			if err != nil {
-				fmt.Fprintf(os.Stderr, "file to %s\n", change.To.Name)
-			}
-		case merkletrie.Delete:
-			cache[change.From.TreeEntry.Hash], err = analyser.getBlob(&change.From, commit)
-			if err != nil {
-				if err.Error() != plumbing.ErrObjectNotFound.Error() {
-					fmt.Fprintf(os.Stderr, "file from %s\n", change.From.Name)
-				} else {
-					cache[change.From.TreeEntry.Hash], err = createDummyBlob(
-						&change.From.TreeEntry.Hash)
-				}
-			}
-		case merkletrie.Modify:
-			cache[change.To.TreeEntry.Hash], err = analyser.getBlob(&change.To, commit)
-			if err != nil {
-				fmt.Fprintf(os.Stderr, "file to %s\n", change.To.Name)
-			}
-			cache[change.From.TreeEntry.Hash], err = analyser.getBlob(&change.From, commit)
-			if err != nil {
-				fmt.Fprintf(os.Stderr, "file from %s\n", change.From.Name)
-			}
-		default:
-			panic(fmt.Sprintf("unsupported action: %d", change.Action))
-		}
-		if err != nil {
-			return nil, err
-		}
-	}
-	return &cache, nil
-}
-
-func (analyser *Analyser) detectRenames(
-	changes *object.Changes, cache *map[plumbing.Hash]*object.Blob) object.Changes {
-	reduced_changes := make(object.Changes, 0, changes.Len())
-
-	// Stage 1 - find renames by matching the hashes
-	// n log(n)
-	// We sort additions and deletions by hash and then do the single scan along
-	// both slices.
-	deleted := make(sortableChanges, 0, changes.Len())
-	added := make(sortableChanges, 0, changes.Len())
-	for _, change := range *changes {
-		action, err := change.Action()
-		if err != nil {
-			panic(err)
-		}
-		switch action {
-		case merkletrie.Insert:
-			added = append(added, sortableChange{change, change.To.TreeEntry.Hash})
-		case merkletrie.Delete:
-			deleted = append(deleted, sortableChange{change, change.From.TreeEntry.Hash})
-		case merkletrie.Modify:
-			reduced_changes = append(reduced_changes, change)
-		default:
-			panic(fmt.Sprintf("unsupported action: %d", change.Action))
-		}
-	}
-	sort.Sort(deleted)
-	sort.Sort(added)
-	a := 0
-	d := 0
-	still_deleted := make(object.Changes, 0, deleted.Len())
-	still_added := make(object.Changes, 0, added.Len())
-	for a < added.Len() && d < deleted.Len() {
-		if added[a].hash == deleted[d].hash {
-			reduced_changes = append(
-				reduced_changes,
-				&object.Change{From: deleted[d].change.From, To: added[a].change.To})
-			a++
-			d++
-		} else if added[a].Less(&deleted[d]) {
-			still_added = append(still_added, added[a].change)
-			a++
-		} else {
-			still_deleted = append(still_deleted, deleted[d].change)
-			d++
-		}
-	}
-	for ; a < added.Len(); a++ {
-		still_added = append(still_added, added[a].change)
-	}
-	for ; d < deleted.Len(); d++ {
-		still_deleted = append(still_deleted, deleted[d].change)
-	}
-
-	// Stage 2 - apply the similarity threshold
-	// n^2 but actually linear
-	// We sort the blobs by size and do the single linear scan.
-	added_blobs := make(sortableBlobs, 0, still_added.Len())
-	deleted_blobs := make(sortableBlobs, 0, still_deleted.Len())
-	for _, change := range still_added {
-		blob := (*cache)[change.To.TreeEntry.Hash]
-		added_blobs = append(
-			added_blobs, sortableBlob{change: change, size: blob.Size})
-	}
-	for _, change := range still_deleted {
-		blob := (*cache)[change.From.TreeEntry.Hash]
-		deleted_blobs = append(
-			deleted_blobs, sortableBlob{change: change, size: blob.Size})
-	}
-	sort.Sort(added_blobs)
-	sort.Sort(deleted_blobs)
-	d_start := 0
-	for a = 0; a < added_blobs.Len(); a++ {
-		my_blob := (*cache)[added_blobs[a].change.To.TreeEntry.Hash]
-		my_size := added_blobs[a].size
-		for d = d_start; d < deleted_blobs.Len() && !analyser.sizesAreClose(my_size, deleted_blobs[d].size); d++ {
-		}
-		d_start = d
-		found_match := false
-		for d = d_start; d < deleted_blobs.Len() && analyser.sizesAreClose(my_size, deleted_blobs[d].size); d++ {
-			if analyser.blobsAreClose(
-				my_blob, (*cache)[deleted_blobs[d].change.From.TreeEntry.Hash]) {
-				found_match = true
-				reduced_changes = append(
-					reduced_changes,
-					&object.Change{From: deleted_blobs[d].change.From,
-						To: added_blobs[a].change.To})
-				break
-			}
-		}
-		if found_match {
-			added_blobs = append(added_blobs[:a], added_blobs[a+1:]...)
-			a--
-			deleted_blobs = append(deleted_blobs[:d], deleted_blobs[d+1:]...)
-		}
-	}
-
-	// Stage 3 - we give up, everything left are independent additions and deletions
-	for _, blob := range added_blobs {
-		reduced_changes = append(reduced_changes, blob.change)
-	}
-	for _, blob := range deleted_blobs {
-		reduced_changes = append(reduced_changes, blob.change)
-	}
-	return reduced_changes
-}
-
-// Analyse calculates the line burndown statistics for the bound repository.
-//
-// commits is a slice with the sequential commit history. It shall start from
-// the root (ascending order).
-//
-// Returns the list of snapshots of the cumulative line edit times and the
-// similar lists for every file which is alive in HEAD.
-// The number of snapshots (the first dimension >[]<[]int64) depends on
-// Analyser.Sampling (the more Sampling, the less the value); the length of
-// each snapshot depends on Analyser.Granularity (the more Granularity,
-// the less the value).
-func (analyser *Analyser) Analyse(commits []*object.Commit) (
-	[][]int64, map[string][][]int64, [][][]int64, [][]int64) {
-	sampling := analyser.Sampling
-	if sampling == 0 {
-		sampling = 1
-	}
-	onProgress := analyser.OnProgress
-	if onProgress == nil {
-		onProgress = func(int, int) {}
-	}
-	if analyser.SimilarityThreshold < 0 || analyser.SimilarityThreshold > 100 {
-		panic("hercules.Analyser: an invalid SimilarityThreshold was specified")
-	}
-
-	// current daily alive number of lines; key is the number of days from the
-	// beginning of the history
-	global_status := map[int]int64{}
-	// weekly snapshots of status
-	global_history := [][]int64{}
-	// weekly snapshots of each file's status
-	file_histories := map[string][][]int64{}
-	// weekly snapshots of each person's status
-	people_histories := make([][][]int64, analyser.PeopleNumber)
-	// mapping <file path> -> hercules.File
-	files := map[string]*File{}
-	// Mutual deletions and self insertions
-	matrix := make([]map[int]int64, analyser.PeopleNumber)
-	// People's individual time stats
-	people := make([]map[int]int64, analyser.PeopleNumber)
-
-	var day0 time.Time // will be initialized in the first iteration
-	var prev_tree *object.Tree = nil
-	var day, prev_day int
-
-	for index, commit := range commits {
-		onProgress(index, len(commits))
-		tree, err := commit.Tree()
-		if err != nil {
-			panic(err)
-		}
-		author := analyser.getAuthorId(commit.Author)
-		if index == 0 {
-			// first iteration - initialize the file objects from the tree
-			day0 = commit.Author.When
-			func() {
-				file_iter := tree.Files()
-				defer file_iter.Close()
-				for {
-					file, err := file_iter.Next()
-					if err != nil {
-						if err == io.EOF {
-							break
-						}
-						panic(err)
-					}
-					lines, err := loc(&file.Blob)
-					if err == nil {
-						files[file.Name] = analyser.newFile(author, 0, lines, global_status, people, matrix)
-					}
-				}
-			}()
-		} else {
-			day = int(commit.Author.When.Sub(day0).Hours() / 24)
-			if day < prev_day {
-				// rebase makes miracles
-				day = prev_day
-			}
-			delta := (day / sampling) - (prev_day / sampling)
-			if delta > 0 {
-				prev_day = day
-				gs, fss, pss := analyser.groupStatus(global_status, files, people, day)
-				global_history = analyser.updateHistories(
-					global_history, gs, file_histories, fss, people_histories, pss, delta)
-			}
-			tree_diff, err := object.DiffTree(prev_tree, tree)
-			if err != nil {
-				fmt.Fprintf(os.Stderr, "commit #%d %s\n", index, commit.Hash.String())
-				panic(err)
-			}
-			cache, err := analyser.cacheBlobs(&tree_diff, commit)
-			if err != nil {
-				fmt.Fprintf(os.Stderr, "commit #%d %s\n", index, commit.Hash.String())
-				panic(err)
-			}
-			tree_diff = analyser.detectRenames(&tree_diff, cache)
-			for _, change := range tree_diff {
-				action, err := change.Action()
-				if err != nil {
-					fmt.Fprintf(os.Stderr, "commit #%d %s\n", index, commit.Hash.String())
-					panic(err)
-				}
-				switch action {
-				case merkletrie.Insert:
-					analyser.handleInsertion(change, author, day, global_status, files, people, matrix, cache)
-				case merkletrie.Delete:
-					analyser.handleDeletion(change, author, day, global_status, files, cache)
-				case merkletrie.Modify:
-					func() {
-						defer func() {
-							r := recover()
-							if r != nil {
-								fmt.Fprintf(os.Stderr, "#%d - %s: modification error\n",
-									index, commit.Hash.String())
-								panic(r)
-							}
-						}()
-						analyser.handleModification(change, author, day, global_status, files, people, matrix, cache)
-					}()
-				}
-			}
-		}
-		prev_tree = tree
-	}
-	gs, fss, pss := analyser.groupStatus(global_status, files, people, day)
-	global_history = analyser.updateHistories(
-		global_history, gs, file_histories, fss, people_histories, pss, 1)
-	for key, statuses := range file_histories {
-		if len(statuses) == len(global_history) {
-			continue
-		}
-		padding := make([][]int64, len(global_history)-len(statuses))
-		for i := range padding {
-			padding[i] = make([]int64, len(global_status))
-		}
-		file_histories[key] = append(padding, statuses...)
-	}
-	people_matrix := make([][]int64, analyser.PeopleNumber)
-	for i, row := range matrix {
-		mrow := make([]int64, analyser.PeopleNumber+2)
-		people_matrix[i] = mrow
-		for key, val := range row {
-			if key == MISSING_AUTHOR {
-				key = -1
-			} else if key == SELF_AUTHOR {
-				key = -2
-			}
-			mrow[key+2] = val
-		}
-	}
-	return global_history, file_histories, people_histories, people_matrix
-}
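
The pack/unpack pair above squeezes the author id and the day index into one int: the day occupies the low 14 bits and the author id sits above it, which is what the comment means by "max 16384 days (>44 years) and (131072 - 2) devs". The same scheme survives the refactoring in burndown.go below. A minimal standalone sketch of the encoding (the pack/unpack names and the demo values are ours, not part of the commit):

package main

import "fmt"

const (
	dayBits = 14                 // the day index occupies the low 14 bits
	dayMask = (1 << dayBits) - 1 // 0x3FFF, i.e. at most 16384 distinct days
)

// pack stores the author id in the bits above the day index,
// mirroring Analyser.packPersonWithDay when PeopleNumber > 0.
func pack(person, day int) int {
	return day | person<<dayBits
}

// unpack reverses pack: the high bits are the author, the low 14 bits the day.
func unpack(value int) (person, day int) {
	return value >> dayBits, value & dayMask
}

func main() {
	v := pack(3, 120)
	p, d := unpack(v)
	fmt.Println(v, p, d) // 49272 3 120
}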

+ 114 - 0
blob_cache.go

@@ -0,0 +1,114 @@
+package hercules
+
+import (
+	"fmt"
+	"os"
+
+	"gopkg.in/src-d/go-git.v4"
+	"gopkg.in/src-d/go-git.v4/config"
+	"gopkg.in/src-d/go-git.v4/plumbing"
+	"gopkg.in/src-d/go-git.v4/plumbing/object"
+	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
+)
+
+type BlobCache struct {
+	repository *git.Repository
+}
+
+func (cache *BlobCache) Name() string {
+	return "BlobCache"
+}
+
+func (cache *BlobCache) Provides() []string {
+	arr := [...]string{"blob_cache"}
+	return arr[:]
+}
+
+func (cache *BlobCache) Requires() []string {
+	arr := [...]string{"changes"}
+	return arr[:]
+}
+
+func (cache *BlobCache) Initialize(repository *git.Repository) {
+	cache.repository = repository
+}
+
+func (self *BlobCache) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
+	commit := deps["commit"].(*object.Commit)
+	changes := deps["changes"].(object.Changes)
+	cache := make(map[plumbing.Hash]*object.Blob)
+	for _, change := range changes {
+		action, err := change.Action()
+		if err != nil {
+			fmt.Fprintf(os.Stderr, "no action in %s\n", change.To.TreeEntry.Hash)
+			return nil, err
+		}
+		switch action {
+		case merkletrie.Insert:
+			cache[change.To.TreeEntry.Hash], err = self.getBlob(&change.To, commit)
+			if err != nil {
+				fmt.Fprintf(os.Stderr, "file to %s %s\n", change.To.Name, change.To.TreeEntry.Hash)
+			}
+		case merkletrie.Delete:
+			cache[change.From.TreeEntry.Hash], err = self.getBlob(&change.From, commit)
+			if err != nil {
+				if err.Error() != plumbing.ErrObjectNotFound.Error() {
+					fmt.Fprintf(os.Stderr, "file from %s %s\n", change.From.Name, change.From.TreeEntry.Hash)
+				} else {
+					cache[change.From.TreeEntry.Hash], err = createDummyBlob(
+						&change.From.TreeEntry.Hash)
+				}
+			}
+		case merkletrie.Modify:
+			cache[change.To.TreeEntry.Hash], err = self.getBlob(&change.To, commit)
+			if err != nil {
+				fmt.Fprintf(os.Stderr, "file to %s\n", change.To.Name)
+			}
+			cache[change.From.TreeEntry.Hash], err = self.getBlob(&change.From, commit)
+			if err != nil {
+				fmt.Fprintf(os.Stderr, "file from %s\n", change.From.Name)
+			}
+		default:
+			panic(fmt.Sprintf("unsupported action: %d", change.Action))
+		}
+		if err != nil {
+			return nil, err
+		}
+	}
+	return map[string]interface{}{"blob_cache": cache}, nil
+}
+
+func (cache *BlobCache) Finalize() interface{} {
+	return nil
+}
+
+func (cache *BlobCache) getBlob(entry *object.ChangeEntry, commit *object.Commit) (
+	*object.Blob, error) {
+	blob, err := cache.repository.BlobObject(entry.TreeEntry.Hash)
+	if err != nil {
+		if err.Error() != plumbing.ErrObjectNotFound.Error() {
+			fmt.Fprintf(os.Stderr, "getBlob(%s)\n", entry.TreeEntry.Hash.String())
+			return nil, err
+		}
+		file, err_modules := commit.File(".gitmodules")
+		if err_modules != nil {
+			return nil, err
+		}
+		contents, err_modules := file.Contents()
+		if err_modules != nil {
+			return nil, err
+		}
+		modules := config.NewModules()
+		err_modules = modules.Unmarshal([]byte(contents))
+		if err_modules != nil {
+			return nil, err
+		}
+		_, exists := modules.Submodules[entry.Name]
+		if exists {
+			// we found that this is a submodule
+			return createDummyBlob(&entry.TreeEntry.Hash)
+		}
+		return nil, err
+	}
+	return blob, nil
+}
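
BlobCache is the first of the new pipeline items: it declares what it produces ("blob_cache") and what it needs ("changes"), and the Pipeline orders the items from these declarations (see pipeline.go below). To illustrate the contract, here is a hypothetical extra item that consumes the cached blobs; CountBlobs is not part of this commit and exists only as a sketch of how a downstream analysis would plug in:

package hercules

import (
	"gopkg.in/src-d/go-git.v4"
	"gopkg.in/src-d/go-git.v4/plumbing"
	"gopkg.in/src-d/go-git.v4/plumbing/object"
)

// CountBlobs is a hypothetical analysis which only counts the cached blobs
// seen over all commits. It exists to show how an item plugs into the pipeline.
type CountBlobs struct {
	total int
}

func (c *CountBlobs) Name() string                { return "CountBlobs" }
func (c *CountBlobs) Provides() []string          { return []string{} }
func (c *CountBlobs) Requires() []string          { return []string{"blob_cache"} }
func (c *CountBlobs) Initialize(*git.Repository)  { c.total = 0 }

func (c *CountBlobs) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
	cache := deps["blob_cache"].(map[plumbing.Hash]*object.Blob)
	c.total += len(cache)
	return nil, nil
}

// Finalize returns the accumulated number of cached blobs.
func (c *CountBlobs) Finalize() interface{} { return c.total }

Such an item would be registered the same way as the real ones in cmd/hercules/main.go below, with pipeline.AddItem(&CountBlobs{}).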

+ 543 - 0
burndown.go

@@ -0,0 +1,543 @@
+package hercules
+
+import (
+	"bufio"
+	"bytes"
+	"errors"
+	"fmt"
+	"io"
+	"os"
+	"unicode/utf8"
+
+	"github.com/sergi/go-diff/diffmatchpatch"
+	"gopkg.in/src-d/go-git.v4"
+	"gopkg.in/src-d/go-git.v4/plumbing"
+	"gopkg.in/src-d/go-git.v4/plumbing/object"
+	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
+)
+
+// BurndownAnalysis allows gathering the line burndown statistics for a Git repository.
+type BurndownAnalysis struct {
+	// Granularity sets the size of each band - the number of days it spans.
+	// Smaller values provide better resolution but require more work and eat more
+	// memory. 30 days is usually enough.
+	Granularity int
+	// Sampling sets how detailed is the statistic - the size of the interval in
+	// days between consecutive measurements. It is usually a good idea to set it
+	// <= Granularity. Try 15 or 30.
+	Sampling int
+
+	// The number of developers for which to collect the burndown stats. 0 disables it.
+	PeopleNumber int
+
+	// Debug activates the debugging mode. Analyse() runs slower in this mode
+	// but it accurately checks all the intermediate states for invariant
+	// violations.
+	Debug bool
+
+	// Repository points to the analysed Git repository struct from go-git.
+	repository *git.Repository
+	// globalStatus is the current daily alive number of lines; key is the number
+	// of days from the beginning of the history.
+	globalStatus map[int]int64
+	// globalHistory is the weekly snapshots of globalStatus.
+	globalHistory [][]int64
+	// fileHistories is the weekly snapshots of each file's status.
+	fileHistories map[string][][]int64
+	// peopleHistories is the weekly snapshots of each person's status.
+	peopleHistories [][][]int64
+	// files is the mapping <file path> -> hercules.File.
+	files map[string]*File
+	// matrix is the mutual deletions and self insertions.
+	matrix []map[int]int64
+	// people is the people's individual time stats.
+	people []map[int]int64
+	// day is the most recent day index processed.
+	day int
+	// previousDay is the day from the previous sample period -
+	// different from DaysSinceStart.previousDay.
+	previousDay int
+}
+
+type BurndownResult struct {
+	GlobalHistory   [][]int64
+	FileHistories   map[string][][]int64
+	PeopleHistories [][][]int64
+	PeopleMatrix    [][]int64
+}
+
+func (analyser *BurndownAnalysis) Name() string {
+	return "BurndownAnalysis"
+}
+
+func (analyser *BurndownAnalysis) Provides() []string {
+	return []string{}
+}
+
+func (analyser *BurndownAnalysis) Requires() []string {
+	arr := [...]string{"renamed_changes", "blob_cache", "day", "author"}
+	return arr[:]
+}
+
+func (analyser *BurndownAnalysis) Initialize(repository *git.Repository) {
+	analyser.repository = repository
+	analyser.globalStatus = map[int]int64{}
+	analyser.globalHistory = [][]int64{}
+	analyser.fileHistories = map[string][][]int64{}
+	analyser.peopleHistories = make([][][]int64, analyser.PeopleNumber)
+	analyser.files = map[string]*File{}
+	analyser.matrix = make([]map[int]int64, analyser.PeopleNumber)
+	analyser.people = make([]map[int]int64, analyser.PeopleNumber)
+	analyser.day = 0
+	analyser.previousDay = 0
+}
+
+func (analyser *BurndownAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
+	sampling := analyser.Sampling
+	if sampling == 0 {
+		sampling = 1
+	}
+	author := deps["author"].(int)
+	analyser.day = deps["day"].(int)
+	delta := (analyser.day / sampling) - (analyser.previousDay / sampling)
+	if delta > 0 {
+		analyser.previousDay = analyser.day
+		gs, fss, pss := analyser.groupStatus()
+		analyser.updateHistories(gs, fss, pss, delta)
+	}
+	cache := deps["blob_cache"].(map[plumbing.Hash]*object.Blob)
+	tree_diff := deps["renamed_changes"].(object.Changes)
+	for _, change := range tree_diff {
+		action, err := change.Action()
+		if err != nil {
+			return nil, err
+		}
+		switch action {
+		case merkletrie.Insert:
+			err = analyser.handleInsertion(change, author, cache)
+		case merkletrie.Delete:
+			err = analyser.handleDeletion(change, author, cache)
+		case merkletrie.Modify:
+			err = analyser.handleModification(change, author, cache)
+		}
+		if err != nil {
+			return nil, err
+		}
+	}
+	return nil, nil
+}
+
+// Finalize() returns the list of snapshots of the cumulative line edit times
+// and the similar lists for every file which is alive in HEAD.
+// The number of snapshots (the first dimension >[]<[]int64) depends on
+// Analyser.Sampling (the more Sampling, the less the value); the length of
+// each snapshot depends on Analyser.Granularity (the more Granularity,
+// the less the value).
+func (analyser *BurndownAnalysis) Finalize() interface{} {
+	gs, fss, pss := analyser.groupStatus()
+	analyser.updateHistories(gs, fss, pss, 1)
+	for key, statuses := range analyser.fileHistories {
+		if len(statuses) == len(analyser.globalHistory) {
+			continue
+		}
+		padding := make([][]int64, len(analyser.globalHistory)-len(statuses))
+		for i := range padding {
+			padding[i] = make([]int64, len(analyser.globalStatus))
+		}
+		analyser.fileHistories[key] = append(padding, statuses...)
+	}
+	peopleMatrix := make([][]int64, analyser.PeopleNumber)
+	for i, row := range analyser.matrix {
+		mrow := make([]int64, analyser.PeopleNumber+2)
+		peopleMatrix[i] = mrow
+		for key, val := range row {
+			if key == MISSING_AUTHOR {
+				key = -1
+			} else if key == SELF_AUTHOR {
+				key = -2
+			}
+			mrow[key+2] = val
+		}
+	}
+	return BurndownResult{
+		GlobalHistory:   analyser.globalHistory,
+		FileHistories:   analyser.fileHistories,
+		PeopleHistories: analyser.peopleHistories,
+		PeopleMatrix:    peopleMatrix}
+}
+
+func checkClose(c io.Closer) {
+	if err := c.Close(); err != nil {
+		panic(err)
+	}
+}
+
+func countLines(file *object.Blob) (int, error) {
+	reader, err := file.Reader()
+	if err != nil {
+		return 0, err
+	}
+	defer checkClose(reader)
+	var scanner *bufio.Scanner
+	buffer := make([]byte, bufio.MaxScanTokenSize)
+	counter := 0
+	for scanner == nil || scanner.Err() == bufio.ErrTooLong {
+		if scanner != nil && !utf8.Valid(scanner.Bytes()) {
+			return -1, errors.New("binary")
+		}
+		scanner = bufio.NewScanner(reader)
+		scanner.Buffer(buffer, 0)
+		for scanner.Scan() {
+			if !utf8.Valid(scanner.Bytes()) {
+				return -1, errors.New("binary")
+			}
+			counter++
+		}
+	}
+	return counter, nil
+}
+
+func blobToString(file *object.Blob) (string, error) {
+	reader, err := file.Reader()
+	if err != nil {
+		return "", err
+	}
+	defer checkClose(reader)
+	buf := new(bytes.Buffer)
+	buf.ReadFrom(reader)
+	return buf.String(), nil
+}
+
+func (analyser *BurndownAnalysis) packPersonWithDay(person int, day int) int {
+	if analyser.PeopleNumber == 0 {
+		return day
+	}
+	result := day
+	result |= person << 14
+	// This effectively means max 16384 days (>44 years) and (131072 - 2) devs
+	return result
+}
+
+func (analyser *BurndownAnalysis) unpackPersonWithDay(value int) (int, int) {
+	if analyser.PeopleNumber == 0 {
+		return MISSING_AUTHOR, value
+	}
+	return value >> 14, value & 0x3FFF
+}
+
+func (analyser *BurndownAnalysis) updateStatus(
+	status interface{}, _ int, previous_time_ int, delta int) {
+
+	_, previous_time := analyser.unpackPersonWithDay(previous_time_)
+	status.(map[int]int64)[previous_time] += int64(delta)
+}
+
+func (analyser *BurndownAnalysis) updatePeople(people interface{}, _ int, previous_time_ int, delta int) {
+	old_author, previous_time := analyser.unpackPersonWithDay(previous_time_)
+	if old_author == MISSING_AUTHOR {
+		return
+	}
+	casted := people.([]map[int]int64)
+	stats := casted[old_author]
+	if stats == nil {
+		stats = map[int]int64{}
+		casted[old_author] = stats
+	}
+	stats[previous_time] += int64(delta)
+}
+
+func (analyser *BurndownAnalysis) updateMatrix(
+	matrix_ interface{}, current_time int, previous_time int, delta int) {
+
+	matrix := matrix_.([]map[int]int64)
+	new_author, _ := analyser.unpackPersonWithDay(current_time)
+	old_author, _ := analyser.unpackPersonWithDay(previous_time)
+	if old_author == MISSING_AUTHOR {
+		return
+	}
+	if new_author == old_author && delta > 0 {
+		new_author = SELF_AUTHOR
+	}
+	row := matrix[old_author]
+	if row == nil {
+		row = map[int]int64{}
+		matrix[old_author] = row
+	}
+	cell, exists := row[new_author]
+	if !exists {
+		row[new_author] = 0
+		cell = 0
+	}
+	row[new_author] = cell + int64(delta)
+}
+
+func (analyser *BurndownAnalysis) newFile(
+	author int, day int, size int, global map[int]int64, people []map[int]int64,
+	matrix []map[int]int64) *File {
+	if analyser.PeopleNumber == 0 {
+		return NewFile(day, size, NewStatus(global, analyser.updateStatus),
+			NewStatus(make(map[int]int64), analyser.updateStatus))
+	}
+	return NewFile(analyser.packPersonWithDay(author, day), size,
+		NewStatus(global, analyser.updateStatus),
+		NewStatus(make(map[int]int64), analyser.updateStatus),
+		NewStatus(people, analyser.updatePeople),
+		NewStatus(matrix, analyser.updateMatrix))
+}
+
+func (analyser *BurndownAnalysis) handleInsertion(
+	change *object.Change, author int, cache map[plumbing.Hash]*object.Blob) error {
+	blob := cache[change.To.TreeEntry.Hash]
+	lines, err := countLines(blob)
+	if err != nil {
+		if err.Error() == "binary" {
+			return nil
+		}
+		return err
+	}
+	name := change.To.Name
+	file, exists := analyser.files[name]
+	if exists {
+		return errors.New(fmt.Sprintf("file %s already exists", name))
+	}
+	file = analyser.newFile(
+		author, analyser.day, lines, analyser.globalStatus, analyser.people, analyser.matrix)
+	analyser.files[name] = file
+	return nil
+}
+
+func (analyser *BurndownAnalysis) handleDeletion(
+	change *object.Change, author int, cache map[plumbing.Hash]*object.Blob) error {
+
+	blob := cache[change.From.TreeEntry.Hash]
+	lines, err := countLines(blob)
+	if err != nil {
+		return err
+	}
+	name := change.From.Name
+	file := analyser.files[name]
+	file.Update(analyser.packPersonWithDay(author, analyser.day), 0, 0, lines)
+	delete(analyser.files, name)
+	return nil
+}
+
+func (analyser *BurndownAnalysis) handleModification(
+	change *object.Change, author int, cache map[plumbing.Hash]*object.Blob) error {
+
+	blob_from := cache[change.From.TreeEntry.Hash]
+	blob_to := cache[change.To.TreeEntry.Hash]
+	// we are not validating UTF-8 here because for example
+	// git/git 4f7770c87ce3c302e1639a7737a6d2531fe4b160 fetch-pack.c is invalid UTF-8
+	str_from, err := blobToString(blob_from)
+	if err != nil {
+		return err
+	}
+	str_to, err := blobToString(blob_to)
+	if err != nil {
+		return err
+	}
+	file, exists := analyser.files[change.From.Name]
+	if !exists {
+		return analyser.handleInsertion(change, author, cache)
+	}
+	// possible rename
+	if change.To.Name != change.From.Name {
+		err = analyser.handleRename(change.From.Name, change.To.Name)
+		if err != nil {
+			return err
+		}
+	}
+	dmp := diffmatchpatch.New()
+	src, dst, _ := dmp.DiffLinesToRunes(str_from, str_to)
+	if file.Len() != len(src) {
+		fmt.Fprintf(os.Stderr, "====TREE====\n%s", file.Dump())
+		return errors.New(fmt.Sprintf("%s: internal integrity error src %d != %d %s -> %s",
+			change.To.Name, len(src), file.Len(),
+			change.From.TreeEntry.Hash.String(), change.To.TreeEntry.Hash.String()))
+	}
+	diffs := dmp.DiffMainRunes(src, dst, false)
+	// we do not call RunesToDiffLines so the number of lines equals
+	// to the rune count
+	position := 0
+	pending := diffmatchpatch.Diff{Text: ""}
+
+	apply := func(edit diffmatchpatch.Diff) {
+		length := utf8.RuneCountInString(edit.Text)
+		if edit.Type == diffmatchpatch.DiffInsert {
+			file.Update(analyser.packPersonWithDay(author, analyser.day), position, length, 0)
+			position += length
+		} else {
+			file.Update(analyser.packPersonWithDay(author, analyser.day), position, 0, length)
+		}
+		if analyser.Debug {
+			file.Validate()
+		}
+	}
+
+	for _, edit := range diffs {
+		dump_before := ""
+		if analyser.Debug {
+			dump_before = file.Dump()
+		}
+		length := utf8.RuneCountInString(edit.Text)
+		debug_error := func() {
+			fmt.Fprintf(os.Stderr, "%s: internal diff error\n", change.To.Name)
+			fmt.Fprintf(os.Stderr, "Update(%d, %d, %d (0), %d (0))\n", analyser.day, position,
+				length, utf8.RuneCountInString(pending.Text))
+			if dump_before != "" {
+				fmt.Fprintf(os.Stderr, "====TREE BEFORE====\n%s====END====\n", dump_before)
+			}
+			fmt.Fprintf(os.Stderr, "====TREE AFTER====\n%s====END====\n", file.Dump())
+		}
+		switch edit.Type {
+		case diffmatchpatch.DiffEqual:
+			if pending.Text != "" {
+				apply(pending)
+				pending.Text = ""
+			}
+			position += length
+		case diffmatchpatch.DiffInsert:
+			if pending.Text != "" {
+				if pending.Type == diffmatchpatch.DiffInsert {
+					debug_error()
+					return errors.New("DiffInsert may not appear after DiffInsert")
+				}
+				file.Update(analyser.packPersonWithDay(author, analyser.day), position, length,
+					utf8.RuneCountInString(pending.Text))
+				if analyser.Debug {
+					file.Validate()
+				}
+				position += length
+				pending.Text = ""
+			} else {
+				pending = edit
+			}
+		case diffmatchpatch.DiffDelete:
+			if pending.Text != "" {
+				debug_error()
+				return errors.New("DiffDelete may not appear after DiffInsert/DiffDelete")
+			}
+			pending = edit
+		default:
+			debug_error()
+			return errors.New(fmt.Sprintf("diff operation is not supported: %d", edit.Type))
+		}
+	}
+	if pending.Text != "" {
+		apply(pending)
+		pending.Text = ""
+	}
+	if file.Len() != len(dst) {
+		return errors.New(fmt.Sprintf("%s: internal integrity error dst %d != %d",
+			change.To.Name, len(dst), file.Len()))
+	}
+	return nil
+}
+
+func (analyser *BurndownAnalysis) handleRename(from, to string) error {
+	file, exists := analyser.files[from]
+	if !exists {
+		return errors.New(fmt.Sprintf("file %s does not exist", from))
+	}
+	analyser.files[to] = file
+	delete(analyser.files, from)
+	return nil
+}
+
+func (analyser *BurndownAnalysis) groupStatus() ([]int64, map[string][]int64, [][]int64) {
+	granularity := analyser.Granularity
+	if granularity == 0 {
+		granularity = 1
+	}
+	day := analyser.day
+	day++
+	adjust := 0
+	if day%granularity != 0 {
+		adjust = 1
+	}
+	global := make([]int64, day/granularity+adjust)
+	var group int64
+	for i := 0; i < day; i++ {
+		group += analyser.globalStatus[i]
+		if (i % granularity) == (granularity - 1) {
+			global[i/granularity] = group
+			group = 0
+		}
+	}
+	if day%granularity != 0 {
+		global[len(global)-1] = group
+	}
+	locals := make(map[string][]int64)
+	for key, file := range analyser.files {
+		status := make([]int64, day/granularity+adjust)
+		var group int64
+		for i := 0; i < day; i++ {
+			group += file.Status(1).(map[int]int64)[i]
+			if (i % granularity) == (granularity - 1) {
+				status[i/granularity] = group
+				group = 0
+			}
+		}
+		if day%granularity != 0 {
+			status[len(status)-1] = group
+		}
+		locals[key] = status
+	}
+	peoples := make([][]int64, len(analyser.people))
+	for key, person := range analyser.people {
+		status := make([]int64, day/granularity+adjust)
+		var group int64
+		for i := 0; i < day; i++ {
+			group += person[i]
+			if (i % granularity) == (granularity - 1) {
+				status[i/granularity] = group
+				group = 0
+			}
+		}
+		if day%granularity != 0 {
+			status[len(status)-1] = group
+		}
+		peoples[key] = status
+	}
+	return global, locals, peoples
+}
+
+func (analyser *BurndownAnalysis) updateHistories(
+	globalStatus []int64, file_statuses map[string][]int64, people_statuses [][]int64, delta int) {
+	for i := 0; i < delta; i++ {
+		analyser.globalHistory = append(analyser.globalHistory, globalStatus)
+	}
+	to_delete := make([]string, 0)
+	for key, fh := range analyser.fileHistories {
+		ls, exists := file_statuses[key]
+		if !exists {
+			to_delete = append(to_delete, key)
+		} else {
+			for i := 0; i < delta; i++ {
+				fh = append(fh, ls)
+			}
+			analyser.fileHistories[key] = fh
+		}
+	}
+	for _, key := range to_delete {
+		delete(analyser.fileHistories, key)
+	}
+	for key, ls := range file_statuses {
+		fh, exists := analyser.fileHistories[key]
+		if exists {
+			continue
+		}
+		for i := 0; i < delta; i++ {
+			fh = append(fh, ls)
+		}
+		analyser.fileHistories[key] = fh
+	}
+
+	for key, ph := range analyser.peopleHistories {
+		ls := people_statuses[key]
+		for i := 0; i < delta; i++ {
+			ph = append(ph, ls)
+		}
+		analyser.peopleHistories[key] = ph
+	}
+}
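
groupStatus folds the per-day counters into bands of Granularity days, and the adjust term keeps the trailing, shorter band. A self-contained sketch of that folding with made-up numbers (the band helper mirrors the loops above and is not part of the commit):

package main

import "fmt"

// band folds per-day counters into granularity-sized groups, keeping the
// trailing partial group, the same way BurndownAnalysis.groupStatus does.
func band(status map[int]int64, day, granularity int) []int64 {
	day++
	adjust := 0
	if day%granularity != 0 {
		adjust = 1
	}
	out := make([]int64, day/granularity+adjust)
	var group int64
	for i := 0; i < day; i++ {
		group += status[i]
		if i%granularity == granularity-1 {
			out[i/granularity] = group
			group = 0
		}
	}
	if day%granularity != 0 {
		out[len(out)-1] = group
	}
	return out
}

func main() {
	status := map[int]int64{0: 10, 1: 5, 2: 3, 3: 7, 4: 2}
	fmt.Println(band(status, 4, 2)) // [15 10 2]
}

With granularity 2 and days 0..4 holding 10, 5, 3, 7 and 2 lines, the bands come out as 15, 10 and a final partial band of 2.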

+ 36 - 119
cmd/hercules/main.go

@@ -7,10 +7,8 @@ statistics from Git repositories. Usage:
 package main
 
 import (
-	"bufio"
 	"flag"
 	"fmt"
-	"io"
 	"net/http"
 	_ "net/http/pprof"
 	"os"
@@ -21,7 +19,6 @@ import (
 
 	"gopkg.in/src-d/go-billy.v3/osfs"
 	"gopkg.in/src-d/go-git.v4"
-	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
 	"gopkg.in/src-d/go-git.v4/storage"
 	"gopkg.in/src-d/go-git.v4/storage/filesystem"
@@ -29,96 +26,6 @@ import (
 	"gopkg.in/src-d/hercules.v1"
 )
 
-// Signature stores the author's identification. Only a single field is used to identify the
-// commit: first Email is checked, then Name.
-type Signature struct {
-	Name string
-	Email string
-}
-
-func loadPeopleDict(path string) (map[string]int, map[int]string, int) {
-	file, err := os.Open(path)
-	if err != nil {
-		panic(err)
-	}
-	defer file.Close()
-	scanner := bufio.NewScanner(file)
-	dict := make(map[string]int)
-	reverse_dict := make(map[int]string)
-	size := 0
-	for scanner.Scan() {
-		for _, id := range strings.Split(strings.ToLower(scanner.Text()), "|") {
-			dict[id] = size
-		}
-		reverse_dict[size] = scanner.Text()
-		size += 1
-	}
-	return dict, reverse_dict, size
-}
-
-func generatePeopleDict(commits []*object.Commit) (map[string]int, map[int]string, int) {
-	dict := make(map[string]int)
-	emails := make(map[int][]string)
-	names := make(map[int][]string)
-	size := 0
-	for _, commit := range commits {
-		email := strings.ToLower(commit.Author.Email)
-		name := strings.ToLower(commit.Author.Name)
-		id, exists := dict[email]
-		if exists {
-			_, exists := dict[name]
-			if !exists {
-				dict[name] = id
-			  names[id] = append(names[id], name)
-			}
-			continue
-		}
-		id, exists = dict[name]
-		if exists {
-			dict[email] = id
-			emails[id] = append(emails[id], email)
-			continue
-		}
-		dict[email] = size
-		dict[name] = size
-		emails[size] = append(emails[size], email)
-		names[size] = append(names[size], name)
-		size += 1
-	}
-	reverse_dict := make(map[int]string)
-	for _, val := range dict {
-		reverse_dict[val] = strings.Join(names[val], "|") + "|" + strings.Join(emails[val], "|")
-	}
-	return dict, reverse_dict, size
-}
-
-func loadCommitsFromFile(path string, repository *git.Repository) []*object.Commit {
-	var file io.Reader
-	if path != "-" {
-		file, err := os.Open(path)
-		if err != nil {
-			panic(err)
-		}
-		defer file.Close()
-	} else {
-		file = os.Stdin
-	}
-	scanner := bufio.NewScanner(file)
-	commits := []*object.Commit{}
-	for scanner.Scan() {
-		hash := plumbing.NewHash(scanner.Text())
-		if len(hash) != 20 {
-			panic("invalid commit hash " + scanner.Text())
-		}
-		commit, err := repository.CommitObject(hash)
-		if err != nil {
-			panic(err)
-		}
-		commits = append(commits, commit)
-	}
-	return commits
-}
-
 func printStatuses(statuses [][]int64, name string) {
 	// determine the maximum length of each value
 	var maxnum int64
@@ -225,62 +132,72 @@ func main() {
 	if err != nil {
 		panic(err)
 	}
+
 	// core logic
-	analyser := hercules.Analyser{
-		Repository: repository,
-		OnProgress: func(commit, length int) {
-			fmt.Fprintf(os.Stderr, "%d / %d\r", commit, length)
-		},
-		Granularity:         granularity,
-		Sampling:            sampling,
-		SimilarityThreshold: similarity_threshold,
-		Debug:               debug,
+	pipeline := hercules.NewPipeline(repository)
+	pipeline.OnProgress = func(commit, length int) {
+		fmt.Fprintf(os.Stderr, "%d / %d\r", commit, length)
 	}
 	// list of commits belonging to the default branch, from oldest to newest
 	// rev-list --first-parent
 	var commits []*object.Commit
 	if commitsFile == "" {
-		commits = analyser.Commits()
+		commits = pipeline.Commits()
 	} else {
-		commits = loadCommitsFromFile(commitsFile, repository)
+		commits = hercules.LoadCommitsFromFile(commitsFile, repository)
 	}
-	var people_ids map[int]string
+
+	pipeline.AddItem(&hercules.BlobCache{})
+	pipeline.AddItem(&hercules.DaysSinceStart{})
+	pipeline.AddItem(&hercules.RenameAnalysis{SimilarityThreshold: similarity_threshold})
+	pipeline.AddItem(&hercules.TreeDiff{})
+	id_matcher := &hercules.IdentityDetector{}
 	if with_people {
-		var people_dict map[string]int
-		var people_number int
 		if people_dict_path != "" {
-			people_dict, people_ids, people_number = loadPeopleDict(people_dict_path)
+			id_matcher.LoadPeopleDict(people_dict_path)
 		} else {
-			people_dict, people_ids, people_number = generatePeopleDict(commits)
+			id_matcher.GeneratePeopleDict(commits)
 		}
-		analyser.PeopleNumber = people_number
-		analyser.PeopleDict = people_dict
 	}
-	global_statuses, file_statuses, people_statuses, people_matrix := analyser.Analyse(commits)
+	pipeline.AddItem(id_matcher)
+	burndowner := &hercules.BurndownAnalysis{
+		Granularity:         granularity,
+		Sampling:            sampling,
+		Debug:               debug,
+		PeopleNumber:        len(id_matcher.ReversePeopleDict),
+	}
+	pipeline.AddItem(burndowner)
+
+	pipeline.Initialize()
+	result, err := pipeline.Run(commits)
+	if err != nil {
+		panic(err)
+	}
+	burndown_results := result[burndowner].(hercules.BurndownResult)
 	fmt.Fprint(os.Stderr, "                \r")
-	if len(global_statuses) == 0 {
+	if len(burndown_results.GlobalHistory) == 0 {
 		return
 	}
 	// print the start date, granularity, sampling
 	fmt.Println(commits[0].Author.When.Unix(),
 		commits[len(commits)-1].Author.When.Unix(),
 		granularity, sampling)
-	printStatuses(global_statuses, uri)
+	printStatuses(burndown_results.GlobalHistory, uri)
 	if with_files {
 		fmt.Print("files\n")
-		keys := sortedKeys(file_statuses)
+		keys := sortedKeys(burndown_results.FileHistories)
 		for _, key := range keys {
-			printStatuses(file_statuses[key], key)
+			printStatuses(burndown_results.FileHistories[key], key)
 		}
 	}
 	if with_people {
 		fmt.Print("people\n")
-		for key, val := range people_statuses {
+		for key, val := range burndown_results.PeopleHistories {
 			fmt.Printf("%d: ", key)
-			printStatuses(val, people_ids[key])
+			printStatuses(val, id_matcher.ReversePeopleDict[key])
 		}
 		fmt.Println()
-		for _, row := range(people_matrix) {
+		for _, row := range(burndown_results.PeopleMatrix) {
 			for _, cell := range(row) {
 				fmt.Print(cell, " ")
 			}

+ 51 - 0
day.go

@@ -0,0 +1,51 @@
+package hercules
+
+import (
+	"time"
+
+	"gopkg.in/src-d/go-git.v4"
+	"gopkg.in/src-d/go-git.v4/plumbing/object"
+)
+
+type DaysSinceStart struct {
+	day0        time.Time
+	previousDay int
+}
+
+func (days *DaysSinceStart) Name() string {
+	return "DaysSinceStart"
+}
+
+func (days *DaysSinceStart) Provides() []string {
+	arr := [...]string{"day"}
+	return arr[:]
+}
+
+func (days *DaysSinceStart) Requires() []string {
+	return []string{}
+}
+
+func (days *DaysSinceStart) Initialize(repository *git.Repository) {
+	days.day0 = time.Time{}
+	days.previousDay = 0
+}
+
+func (days *DaysSinceStart) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
+	commit := deps["commit"].(*object.Commit)
+	index := deps["index"].(int)
+	if index == 0 {
+		// first iteration - initialize the file objects from the tree
+		days.day0 = commit.Author.When
+	}
+	day := int(commit.Author.When.Sub(days.day0).Hours() / 24)
+	if day < days.previousDay {
+		// rebase works miracles, but we need the monotonic time
+		day = days.previousDay
+	}
+	days.previousDay = day
+	return map[string]interface{}{"day": day}, nil
+}
+
+func (days *DaysSinceStart) Finalize() interface{} {
+	return nil
+}

+ 57 - 0
dummies.go

@@ -0,0 +1,57 @@
+package hercules
+
+import (
+	"io"
+
+	"gopkg.in/src-d/go-git.v4/plumbing"
+	"gopkg.in/src-d/go-git.v4/plumbing/object"
+)
+
+type dummyIO struct {
+}
+
+func (dummyIO) Read(p []byte) (int, error) {
+	return 0, io.EOF
+}
+
+func (dummyIO) Write(p []byte) (int, error) {
+	return len(p), nil
+}
+
+func (dummyIO) Close() error {
+	return nil
+}
+
+type dummyEncodedObject struct {
+	FakeHash plumbing.Hash
+}
+
+func (obj dummyEncodedObject) Hash() plumbing.Hash {
+	return obj.FakeHash
+}
+
+func (obj dummyEncodedObject) Type() plumbing.ObjectType {
+	return plumbing.BlobObject
+}
+
+func (obj dummyEncodedObject) SetType(plumbing.ObjectType) {
+}
+
+func (obj dummyEncodedObject) Size() int64 {
+	return 0
+}
+
+func (obj dummyEncodedObject) SetSize(int64) {
+}
+
+func (obj dummyEncodedObject) Reader() (io.ReadCloser, error) {
+	return dummyIO{}, nil
+}
+
+func (obj dummyEncodedObject) Writer() (io.WriteCloser, error) {
+	return dummyIO{}, nil
+}
+
+func createDummyBlob(hash *plumbing.Hash) (*object.Blob, error) {
+	return object.DecodeBlob(dummyEncodedObject{*hash})
+}

+ 112 - 0
identity.go

@@ -0,0 +1,112 @@
+package hercules
+
+import (
+	"bufio"
+	"os"
+	"strings"
+
+	"gopkg.in/src-d/go-git.v4"
+	"gopkg.in/src-d/go-git.v4/plumbing/object"
+)
+
+type IdentityDetector struct {
+	// Maps email || name  -> developer id.
+	PeopleDict map[string]int
+	// Maps developer id -> description
+	ReversePeopleDict map[int]string
+}
+
+const MISSING_AUTHOR = (1 << 18) - 1
+const SELF_AUTHOR = (1 << 18) - 2
+
+func (id *IdentityDetector) Name() string {
+	return "IdentityDetector"
+}
+
+func (id *IdentityDetector) Provides() []string {
+	arr := [...]string{"author"}
+	return arr[:]
+}
+
+func (id *IdentityDetector) Requires() []string {
+	return []string{}
+}
+
+func (id *IdentityDetector) Initialize(repository *git.Repository) {
+}
+
+func (self *IdentityDetector) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
+	commit := deps["commit"].(*object.Commit)
+	signature := commit.Author
+	id, exists := self.PeopleDict[strings.ToLower(signature.Email)]
+	if !exists {
+		id, exists = self.PeopleDict[strings.ToLower(signature.Name)]
+		if !exists {
+			id = MISSING_AUTHOR
+		}
+	}
+	return map[string]interface{}{"author": id}, nil
+}
+
+func (id *IdentityDetector) Finalize() interface{} {
+	return nil
+}
+
+func (id *IdentityDetector) LoadPeopleDict(path string) error {
+	file, err := os.Open(path)
+	if err != nil {
+		return err
+	}
+	defer file.Close()
+	scanner := bufio.NewScanner(file)
+	dict := make(map[string]int)
+	reverse_dict := make(map[int]string)
+	size := 0
+	for scanner.Scan() {
+		for _, id := range strings.Split(strings.ToLower(scanner.Text()), "|") {
+			dict[id] = size
+		}
+		reverse_dict[size] = scanner.Text()
+		size += 1
+	}
+	id.PeopleDict = dict
+	id.ReversePeopleDict = reverse_dict
+	return nil
+}
+
+func (id *IdentityDetector) GeneratePeopleDict(commits []*object.Commit) {
+	dict := make(map[string]int)
+	emails := make(map[int][]string)
+	names := make(map[int][]string)
+	size := 0
+	for _, commit := range commits {
+		email := strings.ToLower(commit.Author.Email)
+		name := strings.ToLower(commit.Author.Name)
+		id, exists := dict[email]
+		if exists {
+			_, exists := dict[name]
+			if !exists {
+				dict[name] = id
+				names[id] = append(names[id], name)
+			}
+			continue
+		}
+		id, exists = dict[name]
+		if exists {
+			dict[email] = id
+			emails[id] = append(emails[id], email)
+			continue
+		}
+		dict[email] = size
+		dict[name] = size
+		emails[size] = append(emails[size], email)
+		names[size] = append(names[size], name)
+		size += 1
+	}
+	reverse_dict := make(map[int]string)
+	for _, val := range dict {
+		reverse_dict[val] = strings.Join(names[val], "|") + "|" + strings.Join(emails[val], "|")
+	}
+	id.PeopleDict = dict
+	id.ReversePeopleDict = reverse_dict
+}
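
LoadPeopleDict expects one developer per line, with that developer's names and e-mails joined by "|"; the matching in Consume() is case-insensitive. A usage sketch, assuming a made-up people.txt:

package main

import (
	"fmt"

	"gopkg.in/src-d/hercules.v1"
)

func main() {
	// people.txt is a made-up path with contents such as:
	//   Vadim|vadim@example.com|vmarkovtsev@example.com
	//   Jane Doe|jane@example.com
	var detector hercules.IdentityDetector
	if err := detector.LoadPeopleDict("people.txt"); err != nil {
		panic(err)
	}
	fmt.Println(detector.PeopleDict["jane@example.com"]) // 1
	fmt.Println(detector.ReversePeopleDict[0])           // Vadim|vadim@example.com|vmarkovtsev@example.com
}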

+ 202 - 0
pipeline.go

@@ -0,0 +1,202 @@
+package hercules
+
+import (
+	"bufio"
+	"fmt"
+	"io"
+	"os"
+
+	"gopkg.in/src-d/go-git.v4"
+	"gopkg.in/src-d/go-git.v4/plumbing"
+	"gopkg.in/src-d/go-git.v4/plumbing/object"
+	"gopkg.in/src-d/hercules.v1/toposort"
+)
+
+type PipelineItem interface {
+	// Name returns the name of the analysis.
+	Name() string
+	// Provides returns the list of keys of reusable calculated entities.
+	// Other items may depend on them.
+	Provides() []string
+	// Requires returns the list of keys of needed entities which must be supplied in Consume().
+	Requires() []string
+	// Initialize prepares and resets the item. Consume() requires Initialize()
+	// to be called at least once beforehand.
+	Initialize(*git.Repository)
+	// Consume processes the next commit.
+	// deps contains the required entities which match Requires(). Besides, it always includes
+	// "commit" and "index".
+	// Returns the calculated entities which match Provides().
+	Consume(deps map[string]interface{}) (map[string]interface{}, error)
+	// Finalize returns the result of the analysis.
+	Finalize() interface{}
+}
+
+type Pipeline struct {
+	// OnProgress is the callback which is invoked in Run() to report its
+	// progress. The first argument is the number of processed commits and the
+	// second is the total number of commits.
+	OnProgress func(int, int)
+
+	// repository points to the analysed Git repository struct from go-git.
+	repository *git.Repository
+
+	// items are the registered analysers in the pipeline.
+	items []PipelineItem
+
+	// plan is the resolved execution sequence.
+	plan []PipelineItem
+}
+
+func NewPipeline(repository *git.Repository) *Pipeline {
+	return &Pipeline{repository: repository, items: []PipelineItem{}, plan: []PipelineItem{}}
+}
+
+func (pipeline *Pipeline) AddItem(item PipelineItem) {
+	for _, reg := range pipeline.items {
+		if reg == item {
+			return
+		}
+	}
+	pipeline.items = append(pipeline.items, item)
+}
+
+func (pipeline *Pipeline) RemoveItem(item PipelineItem) {
+	for i, reg := range pipeline.items {
+		if reg == item {
+			pipeline.items = append(pipeline.items[:i], pipeline.items[i+1:]...)
+			return
+		}
+	}
+}
+
+// Commits returns the critical path in the repository's history. It starts
+// from HEAD and traces commits backwards till the root. When it encounters
+// a merge (more than one parent), it always chooses the first parent.
+func (pipeline *Pipeline) Commits() []*object.Commit {
+	result := []*object.Commit{}
+	repository := pipeline.repository
+	head, err := repository.Head()
+	if err != nil {
+		panic(err)
+	}
+	commit, err := repository.CommitObject(head.Hash())
+	if err != nil {
+		panic(err)
+	}
+	for ; err != io.EOF; commit, err = commit.Parents().Next() {
+		if err != nil {
+			panic(err)
+		}
+		result = append(result, commit)
+	}
+	// reverse the order
+	for i, j := 0, len(result)-1; i < j; i, j = i+1, j-1 {
+		result[i], result[j] = result[j], result[i]
+	}
+	return result
+}
+
+func (pipeline *Pipeline) Initialize() {
+	graph := toposort.NewGraph()
+	name2item := map[string]PipelineItem{}
+	for index, item := range pipeline.items {
+		name := fmt.Sprintf("%s_%d", item.Name(), index)
+		graph.AddNode(name)
+		name2item[name] = item
+		for _, key := range item.Provides() {
+			key += "_entity"
+			graph.AddNode(key)
+			graph.AddEdge(name, key)
+		}
+	}
+	for index, item := range pipeline.items {
+		name := fmt.Sprintf("%s_%d", item.Name(), index)
+		for _, key := range item.Requires() {
+			key += "_entity"
+			if !graph.AddEdge(key, name) {
+				panic(fmt.Sprintf("Unsatisfied dependency: %s -> %s", key, item.Name()))
+			}
+		}
+	}
+	strplan, ok := graph.Toposort()
+	if !ok {
+		panic("Failed to resolve pipeline dependencies.")
+	}
+	for _, key := range strplan {
+		item, ok := name2item[key]
+		if ok {
+			pipeline.plan = append(pipeline.plan, item)
+		}
+	}
+	if len(pipeline.plan) != len(pipeline.items) {
+		panic("Internal pipeline dependency resolution error.")
+	}
+	for _, item := range pipeline.items {
+		item.Initialize(pipeline.repository)
+	}
+}
+
+// Run executes the pipeline.
+//
+// commits is a slice with the sequential commit history. It shall start from
+// the root (ascending order).
+func (pipeline *Pipeline) Run(commits []*object.Commit) (map[PipelineItem]interface{}, error) {
+	onProgress := pipeline.OnProgress
+	if onProgress == nil {
+		onProgress = func(int, int) {}
+	}
+
+	for index, commit := range commits {
+		onProgress(index, len(commits))
+		state := map[string]interface{}{"commit": commit, "index": index}
+		for _, item := range pipeline.plan {
+			update, err := item.Consume(state)
+			if err != nil {
+				fmt.Fprintf(os.Stderr, "%s failed on commit #%d %s\n",
+					item.Name(), index, commit.Hash.String())
+				return nil, err
+			}
+			for _, key := range item.Provides() {
+				val, ok := update[key]
+				if !ok {
+					panic(fmt.Sprintf("%s: Consume() did not return %s", item.Name(), key))
+				}
+				state[key] = val
+			}
+		}
+	}
+	result := map[PipelineItem]interface{}{}
+	for _, item := range pipeline.items {
+		result[item] = item.Finalize()
+	}
+	return result, nil
+}
+
+func LoadCommitsFromFile(path string, repository *git.Repository) []*object.Commit {
+	var file io.Reader
+	if path != "-" {
+		opened, err := os.Open(path)
+		if err != nil {
+			panic(err)
+		}
+		defer opened.Close()
+		file = opened
+	} else {
+		file = os.Stdin
+	}
+	scanner := bufio.NewScanner(file)
+	commits := []*object.Commit{}
+	for scanner.Scan() {
+		if len(scanner.Text()) != 40 {
+			panic("invalid commit hash " + scanner.Text())
+		}
+		hash := plumbing.NewHash(scanner.Text())
+		commit, err := repository.CommitObject(hash)
+		if err != nil {
+			panic(err)
+		}
+		commits = append(commits, commit)
+	}
+	return commits
+}
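
A minimal sketch of driving the pipeline end to end, assuming a local clone at a placeholder path. The three items below have no unmet Requires(), so the topological sort resolves trivially:

package main

import (
	"fmt"
	"os"

	"gopkg.in/src-d/go-git.v4"
	"gopkg.in/src-d/hercules.v1"
)

func main() {
	// "/path/to/repo" is a placeholder for a local clone
	repository, err := git.PlainOpen("/path/to/repo")
	if err != nil {
		panic(err)
	}
	pipeline := hercules.NewPipeline(repository)
	pipeline.AddItem(&hercules.DaysSinceStart{})
	pipeline.AddItem(&hercules.IdentityDetector{})
	pipeline.AddItem(&hercules.TreeDiff{})
	pipeline.OnProgress = func(commit, total int) {
		fmt.Fprintf(os.Stderr, "%d / %d\r", commit, total)
	}
	pipeline.Initialize()
	results, err := pipeline.Run(pipeline.Commits())
	if err != nil {
		panic(err)
	}
	// one entry per registered item; all three Finalize() implementations return nil here
	fmt.Println(len(results))
}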

+ 238 - 0
renames.go

@@ -0,0 +1,238 @@
+package hercules
+
+import (
+	"fmt"
+	"sort"
+	"unicode/utf8"
+
+	"github.com/sergi/go-diff/diffmatchpatch"
+	"gopkg.in/src-d/go-git.v4"
+	"gopkg.in/src-d/go-git.v4/plumbing"
+	"gopkg.in/src-d/go-git.v4/plumbing/object"
+	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
+)
+
+type RenameAnalysis struct {
+	// SimilarityThreshold adjusts the heuristic to determine file renames.
+	// It has the same units as cgit's -X rename-threshold or -M. Better to
+	// set it to the default value of 90 (90%).
+	SimilarityThreshold int
+
+	repository *git.Repository
+}
+
+func (ra *RenameAnalysis) Name() string {
+	return "RenameAnalysis"
+}
+
+func (ra *RenameAnalysis) Provides() []string {
+	arr := [...]string{"renamed_changes"}
+	return arr[:]
+}
+
+func (ra *RenameAnalysis) Requires() []string {
+	arr := [...]string{"blob_cache", "changes"}
+	return arr[:]
+}
+
+func (ra *RenameAnalysis) Initialize(repository *git.Repository) {
+	if ra.SimilarityThreshold < 0 || ra.SimilarityThreshold > 100 {
+		panic("hercules.RenameAnalysis: an invalid SimilarityThreshold was specified")
+	}
+	ra.repository = repository
+}
+
+func (ra *RenameAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
+	changes := deps["changes"].(object.Changes)
+	cache := deps["blob_cache"].(map[plumbing.Hash]*object.Blob)
+
+	reduced_changes := make(object.Changes, 0, changes.Len())
+
+	// Stage 1 - find renames by matching the hashes
+	// n log(n)
+	// We sort additions and deletions by hash and then do a single scan along
+	// both slices.
+	deleted := make(sortableChanges, 0, changes.Len())
+	added := make(sortableChanges, 0, changes.Len())
+	for _, change := range changes {
+		action, err := change.Action()
+		if err != nil {
+			return nil, err
+		}
+		switch action {
+		case merkletrie.Insert:
+			added = append(added, sortableChange{change, change.To.TreeEntry.Hash})
+		case merkletrie.Delete:
+			deleted = append(deleted, sortableChange{change, change.From.TreeEntry.Hash})
+		case merkletrie.Modify:
+			reduced_changes = append(reduced_changes, change)
+		default:
+			panic(fmt.Sprintf("unsupported action: %d", action))
+		}
+	}
+	sort.Sort(deleted)
+	sort.Sort(added)
+	a := 0
+	d := 0
+	still_deleted := make(object.Changes, 0, deleted.Len())
+	still_added := make(object.Changes, 0, added.Len())
+	for a < added.Len() && d < deleted.Len() {
+		if added[a].hash == deleted[d].hash {
+			reduced_changes = append(
+				reduced_changes,
+				&object.Change{From: deleted[d].change.From, To: added[a].change.To})
+			a++
+			d++
+		} else if added[a].Less(&deleted[d]) {
+			still_added = append(still_added, added[a].change)
+			a++
+		} else {
+			still_deleted = append(still_deleted, deleted[d].change)
+			d++
+		}
+	}
+	for ; a < added.Len(); a++ {
+		still_added = append(still_added, added[a].change)
+	}
+	for ; d < deleted.Len(); d++ {
+		still_deleted = append(still_deleted, deleted[d].change)
+	}
+
+	// Stage 2 - apply the similarity threshold
+	// O(n^2) in the worst case, but close to linear in practice:
+	// we sort the blobs by size and do a single linear scan over blobs of similar size.
+	added_blobs := make(sortableBlobs, 0, still_added.Len())
+	deleted_blobs := make(sortableBlobs, 0, still_deleted.Len())
+	for _, change := range still_added {
+		blob := cache[change.To.TreeEntry.Hash]
+		added_blobs = append(
+			added_blobs, sortableBlob{change: change, size: blob.Size})
+	}
+	for _, change := range still_deleted {
+		blob := cache[change.From.TreeEntry.Hash]
+		deleted_blobs = append(
+			deleted_blobs, sortableBlob{change: change, size: blob.Size})
+	}
+	sort.Sort(added_blobs)
+	sort.Sort(deleted_blobs)
+	d_start := 0
+	for a = 0; a < added_blobs.Len(); a++ {
+		my_blob := cache[added_blobs[a].change.To.TreeEntry.Hash]
+		my_size := added_blobs[a].size
+		for d = d_start; d < deleted_blobs.Len() && !ra.sizesAreClose(my_size, deleted_blobs[d].size); d++ {
+		}
+		d_start = d
+		found_match := false
+		for d = d_start; d < deleted_blobs.Len() && ra.sizesAreClose(my_size, deleted_blobs[d].size); d++ {
+			blobsAreClose, err := ra.blobsAreClose(
+				my_blob, cache[deleted_blobs[d].change.From.TreeEntry.Hash])
+			if err != nil {
+				return nil, err
+			}
+			if blobsAreClose {
+				found_match = true
+				reduced_changes = append(
+					reduced_changes,
+					&object.Change{From: deleted_blobs[d].change.From,
+						To: added_blobs[a].change.To})
+				break
+			}
+		}
+		if found_match {
+			added_blobs = append(added_blobs[:a], added_blobs[a+1:]...)
+			a--
+			deleted_blobs = append(deleted_blobs[:d], deleted_blobs[d+1:]...)
+		}
+	}
+
+	// Stage 3 - we give up, everything left are independent additions and deletions
+	for _, blob := range added_blobs {
+		reduced_changes = append(reduced_changes, blob.change)
+	}
+	for _, blob := range deleted_blobs {
+		reduced_changes = append(reduced_changes, blob.change)
+	}
+	return map[string]interface{}{"renamed_changes": reduced_changes}, nil
+}
+
+func (ra *RenameAnalysis) Finalize() interface{} {
+	return nil
+}
+
+func (ra *RenameAnalysis) sizesAreClose(size1 int64, size2 int64) bool {
+	return abs64(size1-size2)*100/max64(1, min64(size1, size2)) <=
+		int64(100-ra.SimilarityThreshold)
+}
+
+func (ra *RenameAnalysis) blobsAreClose(
+	blob1 *object.Blob, blob2 *object.Blob) (bool, error) {
+	str_from, err := blobToString(blob1)
+	if err != nil {
+		return false, err
+	}
+	str_to, err := blobToString(blob2)
+	if err != nil {
+		return false, err
+	}
+	dmp := diffmatchpatch.New()
+	src, dst, _ := dmp.DiffLinesToRunes(str_from, str_to)
+	diffs := dmp.DiffMainRunes(src, dst, false)
+	common := 0
+	for _, edit := range diffs {
+		if edit.Type == diffmatchpatch.DiffEqual {
+			common += utf8.RuneCountInString(edit.Text)
+		}
+	}
+	return common*100/max(1, min(len(src), len(dst))) >= ra.SimilarityThreshold, nil
+}
+
+type sortableChange struct {
+	change *object.Change
+	hash   plumbing.Hash
+}
+
+type sortableChanges []sortableChange
+
+func (change *sortableChange) Less(other *sortableChange) bool {
+	for x := 0; x < 20; x++ {
+		if change.hash[x] < other.hash[x] {
+			return true
+		}
+		if change.hash[x] > other.hash[x] {
+			return false
+		}
+	}
+	return false
+}
+
+func (slice sortableChanges) Len() int {
+	return len(slice)
+}
+
+func (slice sortableChanges) Less(i, j int) bool {
+	return slice[i].Less(&slice[j])
+}
+
+func (slice sortableChanges) Swap(i, j int) {
+	slice[i], slice[j] = slice[j], slice[i]
+}
+
+type sortableBlob struct {
+	change *object.Change
+	size   int64
+}
+
+type sortableBlobs []sortableBlob
+
+func (change *sortableBlob) Less(other *sortableBlob) bool {
+	return change.size < other.size
+}
+
+func (slice sortableBlobs) Len() int {
+	return len(slice)
+}
+
+func (slice sortableBlobs) Less(i, j int) bool {
+	return slice[i].Less(&slice[j])
+}
+
+func (slice sortableBlobs) Swap(i, j int) {
+	slice[i], slice[j] = slice[j], slice[i]
+}
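
The size pre-filter in sizesAreClose() admits a pair when the size difference is at most (100 - SimilarityThreshold) percent of the smaller blob; with the recommended threshold of 90 that is a 10% tolerance. A standalone sketch of the same arithmetic (min64/max64/abs64 are reimplemented locally for the example):

package main

import "fmt"

func min64(a, b int64) int64 {
	if a < b {
		return a
	}
	return b
}

func max64(a, b int64) int64 {
	if a > b {
		return a
	}
	return b
}

func abs64(v int64) int64 {
	if v < 0 {
		return -v
	}
	return v
}

// the same check as RenameAnalysis.sizesAreClose with SimilarityThreshold = 90
func sizesAreClose(size1, size2 int64) bool {
	return abs64(size1-size2)*100/max64(1, min64(size1, size2)) <= int64(100-90)
}

func main() {
	fmt.Println(sizesAreClose(1000, 1099)) // true: 9% larger than the smaller blob
	fmt.Println(sizesAreClose(1000, 1200)) // false: 20% larger than the smaller blob
}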

+ 103 - 0
toposort/toposort.go

@@ -0,0 +1,103 @@
+package toposort
+
+// Copied from https://github.com/philopon/go-toposort
+
+type Graph struct {
+	nodes   []string
+	outputs map[string]map[string]int
+	inputs  map[string]int
+}
+
+func NewGraph() *Graph {
+	return &Graph{
+		nodes:   []string{},
+		inputs:  map[string]int{},
+		outputs: map[string]map[string]int{},
+	}
+}
+
+func (g *Graph) AddNode(name string) bool {
+	if _, ok := g.outputs[name]; ok {
+		// the node is already registered; do not duplicate it in g.nodes
+		return false
+	}
+	g.nodes = append(g.nodes, name)
+	g.outputs[name] = make(map[string]int)
+	g.inputs[name] = 0
+	return true
+}
+
+func (g *Graph) AddNodes(names ...string) bool {
+	for _, name := range names {
+		if ok := g.AddNode(name); !ok {
+			return false
+		}
+	}
+	return true
+}
+
+func (g *Graph) AddEdge(from, to string) bool {
+	m, ok := g.outputs[from]
+	if !ok {
+		return false
+	}
+
+	m[to] = len(m) + 1
+	g.inputs[to]++
+
+	return true
+}
+
+func (g *Graph) unsafeRemoveEdge(from, to string) {
+	delete(g.outputs[from], to)
+	g.inputs[to]--
+}
+
+func (g *Graph) RemoveEdge(from, to string) bool {
+	if _, ok := g.outputs[from]; !ok {
+		return false
+	}
+	g.unsafeRemoveEdge(from, to)
+	return true
+}
+
+func (g *Graph) Toposort() ([]string, bool) {
+	L := make([]string, 0, len(g.nodes))
+	S := make([]string, 0, len(g.nodes))
+
+	for _, n := range g.nodes {
+		if g.inputs[n] == 0 {
+			S = append(S, n)
+		}
+	}
+
+	for len(S) > 0 {
+		var n string
+		n, S = S[0], S[1:]
+		L = append(L, n)
+
+		ms := make([]string, len(g.outputs[n]))
+		for m, i := range g.outputs[n] {
+			ms[i-1] = m
+		}
+
+		for _, m := range ms {
+			g.unsafeRemoveEdge(n, m)
+
+			if g.inputs[m] == 0 {
+				S = append(S, m)
+			}
+		}
+	}
+
+	N := 0
+	for _, v := range g.inputs {
+		N += v
+	}
+
+	if N > 0 {
+		return L, false
+	}
+
+	return L, true
+}

+ 83 - 0
toposort/toposort_test.go

@@ -0,0 +1,83 @@
+package toposort
+
+import "testing"
+
+func index(s []string, v string) int {
+	for i, s := range s {
+		if s == v {
+			return i
+		}
+	}
+	return -1
+}
+
+type Edge struct {
+	From string
+	To   string
+}
+
+func TestDuplicatedNode(t *testing.T) {
+	graph := NewGraph()
+	graph.AddNode("a")
+	if graph.AddNode("a") {
+		t.Error("not raising duplicated node error")
+	}
+
+}
+
+func TestRemoveNotExistEdge(t *testing.T) {
+	graph := NewGraph()
+	if graph.RemoveEdge("a", "b") {
+		t.Error("not raising not exist edge error")
+	}
+}
+
+func TestWikipedia(t *testing.T) {
+	graph := NewGraph()
+	graph.AddNodes("2", "3", "5", "7", "8", "9", "10", "11")
+
+	edges := []Edge{
+		{"7", "8"},
+		{"7", "11"},
+
+		{"5", "11"},
+
+		{"3", "8"},
+		{"3", "10"},
+
+		{"11", "2"},
+		{"11", "9"},
+		{"11", "10"},
+
+		{"8", "9"},
+	}
+
+	for _, e := range edges {
+		graph.AddEdge(e.From, e.To)
+	}
+
+	result, ok := graph.Toposort()
+	if !ok {
+		t.Error("closed path detected in no closed pathed graph")
+	}
+
+	for _, e := range edges {
+		if i, j := index(result, e.From), index(result, e.To); i > j {
+			t.Errorf("dependency failed: not satisfy %v(%v) > %v(%v)", e.From, i, e.To, j)
+		}
+	}
+}
+
+func TestCycle(t *testing.T) {
+	graph := NewGraph()
+	graph.AddNodes("1", "2", "3")
+
+	graph.AddEdge("1", "2")
+	graph.AddEdge("2", "3")
+	graph.AddEdge("3", "1")
+
+	_, ok := graph.Toposort()
+	if ok {
+		t.Error("closed path not detected in closed pathed graph")
+	}
+}

+ 72 - 0
tree_diff.go

@@ -0,0 +1,72 @@
+package hercules
+
+import (
+	"io"
+
+	"gopkg.in/src-d/go-git.v4"
+	"gopkg.in/src-d/go-git.v4/plumbing/object"
+)
+
+type TreeDiff struct {
+	previousTree *object.Tree
+}
+
+func (treediff *TreeDiff) Name() string {
+	return "TreeDiff"
+}
+
+func (treediff *TreeDiff) Provides() []string {
+	arr := [...]string{"changes"}
+	return arr[:]
+}
+
+func (treediff *TreeDiff) Requires() []string {
+	return []string{}
+}
+
+func (treediff *TreeDiff) Initialize(repository *git.Repository) {
+	treediff.previousTree = nil
+}
+
+func (treediff *TreeDiff) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
+	commit := deps["commit"].(*object.Commit)
+	tree, err := commit.Tree()
+	if err != nil {
+		return nil, err
+	}
+	var diff object.Changes
+	if treediff.previousTree != nil {
+		diff, err = object.DiffTree(treediff.previousTree, tree)
+		if err != nil {
+			return nil, err
+		}
+	} else {
+		diff = []*object.Change{}
+		err = func() error {
+			file_iter := tree.Files()
+			defer file_iter.Close()
+			for {
+				file, err := file_iter.Next()
+				if err != nil {
+					if err == io.EOF {
+						break
+					}
+					return err
+				}
+				diff = append(diff, &object.Change{
+					To: object.ChangeEntry{Name: file.Name, Tree: tree, TreeEntry: object.TreeEntry{
+						Name: file.Name, Mode: file.Mode, Hash: file.Hash}}})
+			}
+			return nil
+		}()
+		if err != nil {
+			return nil, err
+		}
+	}
+	treediff.previousTree = tree
+	return map[string]interface{}{"changes": diff}, nil
+}
+
+func (treediff *TreeDiff) Finalize() interface{} {
+	return nil
+}
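
TreeDiff emits the whole tree of the first consumed commit as pseudo-insertions and diffs every later commit against the previous tree. A sketch of feeding it outside of a Pipeline, assuming a local clone at a placeholder path and commit hashes piped to stdin:

package main

import (
	"fmt"

	"gopkg.in/src-d/go-git.v4"
	"gopkg.in/src-d/go-git.v4/plumbing/object"
	"gopkg.in/src-d/hercules.v1"
)

func main() {
	// "/path/to/repo" is a placeholder for a local clone
	repository, err := git.PlainOpen("/path/to/repo")
	if err != nil {
		panic(err)
	}
	treeDiff := &hercules.TreeDiff{}
	treeDiff.Initialize(repository)
	// "-" reads the commit hashes from stdin, one per line
	commits := hercules.LoadCommitsFromFile("-", repository)
	for index, commit := range commits {
		deps := map[string]interface{}{"commit": commit, "index": index}
		result, err := treeDiff.Consume(deps)
		if err != nil {
			panic(err)
		}
		changes := result["changes"].(object.Changes)
		fmt.Println(commit.Hash, len(changes))
	}
}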