
Refactor the engine to enable many analyses

Vadim Markovtsev, 7 years ago
parent
commit
2b1ed97819
12 files changed, 1611 insertions and 1045 deletions
  1. analyser.go: +0 -926
  2. blob_cache.go: +114 -0
  3. burndown.go: +543 -0
  4. cmd/hercules/main.go: +36 -119
  5. day.go: +51 -0
  6. dummies.go: +57 -0
  7. identity.go: +112 -0
  8. pipeline.go: +202 -0
  9. renames.go: +238 -0
  10. toposort/toposort.go: +103 -0
  11. toposort/toposort_test.go: +83 -0
  12. tree_diff.go: +72 -0

+ 0 - 926
analyser.go

@@ -1,926 +0,0 @@
-package hercules
-
-import (
-	"bufio"
-	"bytes"
-	"errors"
-	"fmt"
-	"io"
-	"os"
-	"sort"
-	"strings"
-	"time"
-	"unicode/utf8"
-
-	"github.com/sergi/go-diff/diffmatchpatch"
-	"gopkg.in/src-d/go-git.v4"
-	"gopkg.in/src-d/go-git.v4/config"
-	"gopkg.in/src-d/go-git.v4/plumbing"
-	"gopkg.in/src-d/go-git.v4/plumbing/object"
-	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
-)
-
-// Analyser allows to gather the line burndown statistics for a Git repository.
-type Analyser struct {
-	// Repository points to the analysed Git repository struct from go-git.
-	Repository *git.Repository
-	// Granularity sets the size of each band - the number of days it spans.
-	// Smaller values provide better resolution but require more work and eat more
-	// memory. 30 days is usually enough.
-	Granularity int
-	// Sampling sets how detailed is the statistic - the size of the interval in
-	// days between consecutive measurements. It is usually a good idea to set it
-	// <= Granularity. Try 15 or 30.
-	Sampling int
-	// SimilarityThreshold adjusts the heuristic to determine file renames.
-	// It has the same units as cgit's -X rename-threshold or -M. Better to
-	// set it to the default value of 90 (90%).
-	SimilarityThreshold int
-	// The number of developers for which to collect the burndown stats. 0 disables it.
-	PeopleNumber int
-	// Maps email || name  -> developer id.
-	PeopleDict map[string]int
-	// Debug activates the debugging mode. Analyse() runs slower in this mode
-	// but it accurately checks all the intermediate states for invariant
-	// violations.
-	Debug bool
-	// OnProgress is the callback which is invoked in Analyse() to output it's
-	// progress. The first argument is the number of processed commits and the
-	// second is the total number of commits.
-	OnProgress func(int, int)
-}
-
-type ProtoMatrix map[int]map[int]int64
-
-func checkClose(c io.Closer) {
-	if err := c.Close(); err != nil {
-		panic(err)
-	}
-}
-
-func loc(file *object.Blob) (int, error) {
-	reader, err := file.Reader()
-	if err != nil {
-		panic(err)
-	}
-	defer checkClose(reader)
-	var scanner *bufio.Scanner
-	buffer := make([]byte, bufio.MaxScanTokenSize)
-	counter := 0
-	for scanner == nil || scanner.Err() == bufio.ErrTooLong {
-		if scanner != nil && !utf8.Valid(scanner.Bytes()) {
-			return -1, errors.New("binary")
-		}
-		scanner = bufio.NewScanner(reader)
-		scanner.Buffer(buffer, 0)
-		for scanner.Scan() {
-			if !utf8.Valid(scanner.Bytes()) {
-				return -1, errors.New("binary")
-			}
-			counter++
-		}
-	}
-	return counter, nil
-}
-
-func str(file *object.Blob) string {
-	reader, err := file.Reader()
-	if err != nil {
-		panic(err)
-	}
-	defer checkClose(reader)
-	buf := new(bytes.Buffer)
-	buf.ReadFrom(reader)
-	return buf.String()
-}
-
-type dummyIO struct {
-}
-
-func (dummyIO) Read(p []byte) (int, error) {
-	return 0, io.EOF
-}
-
-func (dummyIO) Write(p []byte) (int, error) {
-	return len(p), nil
-}
-
-func (dummyIO) Close() error {
-	return nil
-}
-
-type dummyEncodedObject struct {
-	FakeHash plumbing.Hash
-}
-
-func (obj dummyEncodedObject) Hash() plumbing.Hash {
-	return obj.FakeHash
-}
-
-func (obj dummyEncodedObject) Type() plumbing.ObjectType {
-	return plumbing.BlobObject
-}
-
-func (obj dummyEncodedObject) SetType(plumbing.ObjectType) {
-}
-
-func (obj dummyEncodedObject) Size() int64 {
-	return 0
-}
-
-func (obj dummyEncodedObject) SetSize(int64) {
-}
-
-func (obj dummyEncodedObject) Reader() (io.ReadCloser, error) {
-	return dummyIO{}, nil
-}
-
-func (obj dummyEncodedObject) Writer() (io.WriteCloser, error) {
-	return dummyIO{}, nil
-}
-
-func createDummyBlob(hash *plumbing.Hash) (*object.Blob, error) {
-	return object.DecodeBlob(dummyEncodedObject{*hash})
-}
-
-const MISSING_AUTHOR = (1 << 18) - 1
-const SELF_AUTHOR = (1 << 18) - 2
-
-func (analyser *Analyser) packPersonWithDay(person int, day int) int {
-	if analyser.PeopleNumber == 0 {
-		return day
-	}
-	result := day
-	result |= person << 14
-	// This effectively means max 16384 days (>44 years) and (131072 - 2) devs
-	return result
-}
-
-func (analyser *Analyser) unpackPersonWithDay(value int) (int, int) {
-	if analyser.PeopleNumber == 0 {
-		return MISSING_AUTHOR, value
-	}
-	return value >> 14, value & 0x3FFF
-}
-
-func (analyser *Analyser) updateStatus(
-	status interface{}, _ int, previous_time_ int, delta int) {
-
-	_, previous_time := analyser.unpackPersonWithDay(previous_time_)
-	status.(map[int]int64)[previous_time] += int64(delta)
-}
-
-func (analyser *Analyser) updatePeople(people interface{}, _ int, previous_time_ int, delta int) {
-	old_author, previous_time := analyser.unpackPersonWithDay(previous_time_)
-	if old_author == MISSING_AUTHOR {
-		return
-	}
-	casted := people.([]map[int]int64)
-	stats := casted[old_author]
-	if stats == nil {
-		stats = map[int]int64{}
-		casted[old_author] = stats
-	}
-	stats[previous_time] += int64(delta)
-}
-
-func (analyser *Analyser) updateMatrix(
-	matrix_ interface{}, current_time int, previous_time int, delta int) {
-
-	matrix := matrix_.([]map[int]int64)
-	new_author, _ := analyser.unpackPersonWithDay(current_time)
-	old_author, _ := analyser.unpackPersonWithDay(previous_time)
-	if old_author == MISSING_AUTHOR {
-		return
-	}
-	if new_author == old_author && delta > 0 {
-		new_author = SELF_AUTHOR
-	}
-	row := matrix[old_author]
-	if row == nil {
-		row = map[int]int64{}
-		matrix[old_author] = row
-	}
-	cell, exists := row[new_author]
-	if !exists {
-		row[new_author] = 0
-		cell = 0
-	}
-	row[new_author] = cell + int64(delta)
-}
-
-func (analyser *Analyser) newFile(
-	author int, day int, size int, global map[int]int64, people []map[int]int64,
-	matrix []map[int]int64) *File {
-	if analyser.PeopleNumber == 0 {
-		return NewFile(day, size, NewStatus(global, analyser.updateStatus),
-			NewStatus(make(map[int]int64), analyser.updateStatus))
-	}
-	return NewFile(analyser.packPersonWithDay(author, day), size,
-		NewStatus(global, analyser.updateStatus),
-		NewStatus(make(map[int]int64), analyser.updateStatus),
-		NewStatus(people, analyser.updatePeople),
-		NewStatus(matrix, analyser.updateMatrix))
-}
-
-func (analyser *Analyser) getAuthorId(signature object.Signature) int {
-	id, exists := analyser.PeopleDict[strings.ToLower(signature.Email)]
-	if !exists {
-		id, exists = analyser.PeopleDict[strings.ToLower(signature.Name)]
-		if !exists {
-			id = MISSING_AUTHOR
-		}
-	}
-	return id
-}
-
-func (analyser *Analyser) handleInsertion(
-	change *object.Change, author int, day int, global_status map[int]int64,
-	files map[string]*File, people []map[int]int64, matrix []map[int]int64,
-	cache *map[plumbing.Hash]*object.Blob) {
-
-	blob := (*cache)[change.To.TreeEntry.Hash]
-	lines, err := loc(blob)
-	if err != nil {
-		return
-	}
-	name := change.To.Name
-	file, exists := files[name]
-	if exists {
-		panic(fmt.Sprintf("file %s already exists", name))
-	}
-	file = analyser.newFile(author, day, lines, global_status, people, matrix)
-	files[name] = file
-}
-
-func (analyser *Analyser) handleDeletion(
-	change *object.Change, author int, day int, status map[int]int64, files map[string]*File,
-	cache *map[plumbing.Hash]*object.Blob) {
-	blob := (*cache)[change.From.TreeEntry.Hash]
-	lines, err := loc(blob)
-	if err != nil {
-		return
-	}
-	name := change.From.Name
-	file := files[name]
-	file.Update(analyser.packPersonWithDay(author, day), 0, 0, lines)
-	delete(files, name)
-}
-
-func (analyser *Analyser) handleModification(
-	change *object.Change, author int, day int, status map[int]int64, files map[string]*File,
-	people []map[int]int64, matrix []map[int]int64,
-	cache *map[plumbing.Hash]*object.Blob) {
-
-	blob_from := (*cache)[change.From.TreeEntry.Hash]
-	blob_to := (*cache)[change.To.TreeEntry.Hash]
-	// we are not validating UTF-8 here because for example
-	// git/git 4f7770c87ce3c302e1639a7737a6d2531fe4b160 fetch-pack.c is invalid UTF-8
-	str_from := str(blob_from)
-	str_to := str(blob_to)
-	file, exists := files[change.From.Name]
-	if !exists {
-		analyser.handleInsertion(change, author, day, status, files, people, matrix, cache)
-		return
-	}
-	// possible rename
-	if change.To.Name != change.From.Name {
-		analyser.handleRename(change.From.Name, change.To.Name, files)
-	}
-	dmp := diffmatchpatch.New()
-	src, dst, _ := dmp.DiffLinesToRunes(str_from, str_to)
-	if file.Len() != len(src) {
-		fmt.Fprintf(os.Stderr, "====TREE====\n%s", file.Dump())
-		panic(fmt.Sprintf("%s: internal integrity error src %d != %d %s -> %s",
-			change.To.Name, len(src), file.Len(),
-			change.From.TreeEntry.Hash.String(), change.To.TreeEntry.Hash.String()))
-	}
-	diffs := dmp.DiffMainRunes(src, dst, false)
-	// we do not call RunesToDiffLines so the number of lines equals
-	// to the rune count
-	position := 0
-	pending := diffmatchpatch.Diff{Text: ""}
-
-	apply := func(edit diffmatchpatch.Diff) {
-		length := utf8.RuneCountInString(edit.Text)
-		if edit.Type == diffmatchpatch.DiffInsert {
-			file.Update(analyser.packPersonWithDay(author, day), position, length, 0)
-			position += length
-		} else {
-			file.Update(analyser.packPersonWithDay(author, day), position, 0, length)
-		}
-		if analyser.Debug {
-			file.Validate()
-		}
-	}
-
-	for _, edit := range diffs {
-		dump_before := ""
-		if analyser.Debug {
-			dump_before = file.Dump()
-		}
-		length := utf8.RuneCountInString(edit.Text)
-		func() {
-			defer func() {
-				r := recover()
-				if r != nil {
-					fmt.Fprintf(os.Stderr, "%s: internal diff error\n", change.To.Name)
-					fmt.Fprintf(os.Stderr, "Update(%d, %d, %d (0), %d (0))\n", day, position,
-						length, utf8.RuneCountInString(pending.Text))
-					if dump_before != "" {
-						fmt.Fprintf(os.Stderr, "====TREE BEFORE====\n%s====END====\n", dump_before)
-					}
-					fmt.Fprintf(os.Stderr, "====TREE AFTER====\n%s====END====\n", file.Dump())
-					panic(r)
-				}
-			}()
-			switch edit.Type {
-			case diffmatchpatch.DiffEqual:
-				if pending.Text != "" {
-					apply(pending)
-					pending.Text = ""
-				}
-				position += length
-			case diffmatchpatch.DiffInsert:
-				if pending.Text != "" {
-					if pending.Type == diffmatchpatch.DiffInsert {
-						panic("DiffInsert may not appear after DiffInsert")
-					}
-					file.Update(analyser.packPersonWithDay(author, day), position, length,
-						utf8.RuneCountInString(pending.Text))
-					if analyser.Debug {
-						file.Validate()
-					}
-					position += length
-					pending.Text = ""
-				} else {
-					pending = edit
-				}
-			case diffmatchpatch.DiffDelete:
-				if pending.Text != "" {
-					panic("DiffDelete may not appear after DiffInsert/DiffDelete")
-				}
-				pending = edit
-			default:
-				panic(fmt.Sprintf("diff operation is not supported: %d", edit.Type))
-			}
-		}()
-	}
-	if pending.Text != "" {
-		apply(pending)
-		pending.Text = ""
-	}
-	if file.Len() != len(dst) {
-		panic(fmt.Sprintf("%s: internal integrity error dst %d != %d",
-			change.To.Name, len(dst), file.Len()))
-	}
-}
-
-func (analyser *Analyser) handleRename(from, to string, files map[string]*File) {
-	file, exists := files[from]
-	if !exists {
-		panic(fmt.Sprintf("file %s does not exist", from))
-	}
-	files[to] = file
-	delete(files, from)
-}
-
-// Commits returns the critical path in the repository's history. It starts
-// from HEAD and traces commits backwards till the root. When it encounters
-// a merge (more than one parent), it always chooses the first parent.
-func (analyser *Analyser) Commits() []*object.Commit {
-	result := []*object.Commit{}
-	repository := analyser.Repository
-	head, err := repository.Head()
-	if err != nil {
-		panic(err)
-	}
-	commit, err := repository.CommitObject(head.Hash())
-	if err != nil {
-		panic(err)
-	}
-	result = append(result, commit)
-	for ; err != io.EOF; commit, err = commit.Parents().Next() {
-		if err != nil {
-			panic(err)
-		}
-		result = append(result, commit)
-	}
-	// reverse the order
-	for i, j := 0, len(result)-1; i < j; i, j = i+1, j-1 {
-		result[i], result[j] = result[j], result[i]
-	}
-	return result
-}
-
-func (analyser *Analyser) groupStatus(
-	status map[int]int64,
-	files map[string]*File,
-	people []map[int]int64,
-	day int) ([]int64, map[string][]int64, [][]int64) {
-	granularity := analyser.Granularity
-	if granularity == 0 {
-		granularity = 1
-	}
-	day++
-	adjust := 0
-	if day%granularity != 0 {
-		adjust = 1
-	}
-	global := make([]int64, day/granularity+adjust)
-	var group int64
-	for i := 0; i < day; i++ {
-		group += status[i]
-		if (i % granularity) == (granularity - 1) {
-			global[i/granularity] = group
-			group = 0
-		}
-	}
-	if day%granularity != 0 {
-		global[len(global)-1] = group
-	}
-	locals := make(map[string][]int64)
-	for key, file := range files {
-		status := make([]int64, day/granularity+adjust)
-		var group int64
-		for i := 0; i < day; i++ {
-			group += file.Status(1).(map[int]int64)[i]
-			if (i % granularity) == (granularity - 1) {
-				status[i/granularity] = group
-				group = 0
-			}
-		}
-		if day%granularity != 0 {
-			status[len(status)-1] = group
-		}
-		locals[key] = status
-	}
-	peoples := make([][]int64, len(people))
-	for key, person := range people {
-		status := make([]int64, day/granularity+adjust)
-		var group int64
-		for i := 0; i < day; i++ {
-			group += person[i]
-			if (i % granularity) == (granularity - 1) {
-				status[i/granularity] = group
-				group = 0
-			}
-		}
-		if day%granularity != 0 {
-			status[len(status)-1] = group
-		}
-		peoples[key] = status
-	}
-	return global, locals, peoples
-}
-
-func (analyser *Analyser) updateHistories(
-	global_history [][]int64, global_status []int64,
-	file_histories map[string][][]int64, file_statuses map[string][]int64,
-	people_histories [][][]int64, people_statuses [][]int64,
-	delta int) [][]int64 {
-	for i := 0; i < delta; i++ {
-		global_history = append(global_history, global_status)
-	}
-	to_delete := make([]string, 0)
-	for key, fh := range file_histories {
-		ls, exists := file_statuses[key]
-		if !exists {
-			to_delete = append(to_delete, key)
-		} else {
-			for i := 0; i < delta; i++ {
-				fh = append(fh, ls)
-			}
-			file_histories[key] = fh
-		}
-	}
-	for _, key := range to_delete {
-		delete(file_histories, key)
-	}
-	for key, ls := range file_statuses {
-		fh, exists := file_histories[key]
-		if exists {
-			continue
-		}
-		for i := 0; i < delta; i++ {
-			fh = append(fh, ls)
-		}
-		file_histories[key] = fh
-	}
-
-	for key, ph := range people_histories {
-		ls := people_statuses[key]
-		for i := 0; i < delta; i++ {
-			ph = append(ph, ls)
-		}
-		people_histories[key] = ph
-	}
-	return global_history
-}
-
-type sortableChange struct {
-	change *object.Change
-	hash   plumbing.Hash
-}
-
-type sortableChanges []sortableChange
-
-func (change *sortableChange) Less(other *sortableChange) bool {
-	for x := 0; x < 20; x++ {
-		if change.hash[x] < other.hash[x] {
-			return true
-		}
-	}
-	return false
-}
-
-func (slice sortableChanges) Len() int {
-	return len(slice)
-}
-
-func (slice sortableChanges) Less(i, j int) bool {
-	return slice[i].Less(&slice[j])
-}
-
-func (slice sortableChanges) Swap(i, j int) {
-	slice[i], slice[j] = slice[j], slice[i]
-}
-
-type sortableBlob struct {
-	change *object.Change
-	size   int64
-}
-
-type sortableBlobs []sortableBlob
-
-func (change *sortableBlob) Less(other *sortableBlob) bool {
-	return change.size < other.size
-}
-
-func (slice sortableBlobs) Len() int {
-	return len(slice)
-}
-
-func (slice sortableBlobs) Less(i, j int) bool {
-	return slice[i].Less(&slice[j])
-}
-
-func (slice sortableBlobs) Swap(i, j int) {
-	slice[i], slice[j] = slice[j], slice[i]
-}
-
-func (analyser *Analyser) sizesAreClose(size1 int64, size2 int64) bool {
-	return abs64(size1-size2)*100/max64(1, min64(size1, size2)) <=
-		int64(100-analyser.SimilarityThreshold)
-}
-
-func (analyser *Analyser) blobsAreClose(
-	blob1 *object.Blob, blob2 *object.Blob) bool {
-	str_from := str(blob1)
-	str_to := str(blob2)
-	dmp := diffmatchpatch.New()
-	src, dst, _ := dmp.DiffLinesToRunes(str_from, str_to)
-	diffs := dmp.DiffMainRunes(src, dst, false)
-	common := 0
-	for _, edit := range diffs {
-		if edit.Type == diffmatchpatch.DiffEqual {
-			common += utf8.RuneCountInString(edit.Text)
-		}
-	}
-	return common*100/max(1, min(len(src), len(dst))) >=
-		analyser.SimilarityThreshold
-}
-
-func (analyser *Analyser) getBlob(entry *object.ChangeEntry, commit *object.Commit) (
-	*object.Blob, error) {
-	blob, err := analyser.Repository.BlobObject(entry.TreeEntry.Hash)
-	if err != nil {
-		if err.Error() != plumbing.ErrObjectNotFound.Error() {
-			fmt.Fprintf(os.Stderr, "getBlob(%s)\n", entry.TreeEntry.Hash.String())
-			return nil, err
-		}
-		file, err_modules := commit.File(".gitmodules")
-		if err_modules != nil {
-			return nil, err
-		}
-		contents, err_modules := file.Contents()
-		if err_modules != nil {
-			return nil, err
-		}
-		modules := config.NewModules()
-		err_modules = modules.Unmarshal([]byte(contents))
-		if err_modules != nil {
-			return nil, err
-		}
-		_, exists := modules.Submodules[entry.Name]
-		if exists {
-			// we found that this is a submodule
-			return createDummyBlob(&entry.TreeEntry.Hash)
-		}
-		return nil, err
-	}
-	return blob, nil
-}
-
-func (analyser *Analyser) cacheBlobs(changes *object.Changes, commit *object.Commit) (
-	*map[plumbing.Hash]*object.Blob, error) {
-	cache := make(map[plumbing.Hash]*object.Blob)
-	for _, change := range *changes {
-		action, err := change.Action()
-		if err != nil {
-			return nil, err
-		}
-		switch action {
-		case merkletrie.Insert:
-			cache[change.To.TreeEntry.Hash], err = analyser.getBlob(&change.To, commit)
-			if err != nil {
-				fmt.Fprintf(os.Stderr, "file to %s\n", change.To.Name)
-			}
-		case merkletrie.Delete:
-			cache[change.From.TreeEntry.Hash], err = analyser.getBlob(&change.From, commit)
-			if err != nil {
-				if err.Error() != plumbing.ErrObjectNotFound.Error() {
-					fmt.Fprintf(os.Stderr, "file from %s\n", change.From.Name)
-				} else {
-					cache[change.From.TreeEntry.Hash], err = createDummyBlob(
-						&change.From.TreeEntry.Hash)
-				}
-			}
-		case merkletrie.Modify:
-			cache[change.To.TreeEntry.Hash], err = analyser.getBlob(&change.To, commit)
-			if err != nil {
-				fmt.Fprintf(os.Stderr, "file to %s\n", change.To.Name)
-			}
-			cache[change.From.TreeEntry.Hash], err = analyser.getBlob(&change.From, commit)
-			if err != nil {
-				fmt.Fprintf(os.Stderr, "file from %s\n", change.From.Name)
-			}
-		default:
-			panic(fmt.Sprintf("unsupported action: %d", change.Action))
-		}
-		if err != nil {
-			return nil, err
-		}
-	}
-	return &cache, nil
-}
-
-func (analyser *Analyser) detectRenames(
-	changes *object.Changes, cache *map[plumbing.Hash]*object.Blob) object.Changes {
-	reduced_changes := make(object.Changes, 0, changes.Len())
-
-	// Stage 1 - find renames by matching the hashes
-	// n log(n)
-	// We sort additions and deletions by hash and then do the single scan along
-	// both slices.
-	deleted := make(sortableChanges, 0, changes.Len())
-	added := make(sortableChanges, 0, changes.Len())
-	for _, change := range *changes {
-		action, err := change.Action()
-		if err != nil {
-			panic(err)
-		}
-		switch action {
-		case merkletrie.Insert:
-			added = append(added, sortableChange{change, change.To.TreeEntry.Hash})
-		case merkletrie.Delete:
-			deleted = append(deleted, sortableChange{change, change.From.TreeEntry.Hash})
-		case merkletrie.Modify:
-			reduced_changes = append(reduced_changes, change)
-		default:
-			panic(fmt.Sprintf("unsupported action: %d", change.Action))
-		}
-	}
-	sort.Sort(deleted)
-	sort.Sort(added)
-	a := 0
-	d := 0
-	still_deleted := make(object.Changes, 0, deleted.Len())
-	still_added := make(object.Changes, 0, added.Len())
-	for a < added.Len() && d < deleted.Len() {
-		if added[a].hash == deleted[d].hash {
-			reduced_changes = append(
-				reduced_changes,
-				&object.Change{From: deleted[d].change.From, To: added[a].change.To})
-			a++
-			d++
-		} else if added[a].Less(&deleted[d]) {
-			still_added = append(still_added, added[a].change)
-			a++
-		} else {
-			still_deleted = append(still_deleted, deleted[d].change)
-			d++
-		}
-	}
-	for ; a < added.Len(); a++ {
-		still_added = append(still_added, added[a].change)
-	}
-	for ; d < deleted.Len(); d++ {
-		still_deleted = append(still_deleted, deleted[d].change)
-	}
-
-	// Stage 2 - apply the similarity threshold
-	// n^2 but actually linear
-	// We sort the blobs by size and do the single linear scan.
-	added_blobs := make(sortableBlobs, 0, still_added.Len())
-	deleted_blobs := make(sortableBlobs, 0, still_deleted.Len())
-	for _, change := range still_added {
-		blob := (*cache)[change.To.TreeEntry.Hash]
-		added_blobs = append(
-			added_blobs, sortableBlob{change: change, size: blob.Size})
-	}
-	for _, change := range still_deleted {
-		blob := (*cache)[change.From.TreeEntry.Hash]
-		deleted_blobs = append(
-			deleted_blobs, sortableBlob{change: change, size: blob.Size})
-	}
-	sort.Sort(added_blobs)
-	sort.Sort(deleted_blobs)
-	d_start := 0
-	for a = 0; a < added_blobs.Len(); a++ {
-		my_blob := (*cache)[added_blobs[a].change.To.TreeEntry.Hash]
-		my_size := added_blobs[a].size
-		for d = d_start; d < deleted_blobs.Len() && !analyser.sizesAreClose(my_size, deleted_blobs[d].size); d++ {
-		}
-		d_start = d
-		found_match := false
-		for d = d_start; d < deleted_blobs.Len() && analyser.sizesAreClose(my_size, deleted_blobs[d].size); d++ {
-			if analyser.blobsAreClose(
-				my_blob, (*cache)[deleted_blobs[d].change.From.TreeEntry.Hash]) {
-				found_match = true
-				reduced_changes = append(
-					reduced_changes,
-					&object.Change{From: deleted_blobs[d].change.From,
-						To: added_blobs[a].change.To})
-				break
-			}
-		}
-		if found_match {
-			added_blobs = append(added_blobs[:a], added_blobs[a+1:]...)
-			a--
-			deleted_blobs = append(deleted_blobs[:d], deleted_blobs[d+1:]...)
-		}
-	}
-
-	// Stage 3 - we give up, everything left are independent additions and deletions
-	for _, blob := range added_blobs {
-		reduced_changes = append(reduced_changes, blob.change)
-	}
-	for _, blob := range deleted_blobs {
-		reduced_changes = append(reduced_changes, blob.change)
-	}
-	return reduced_changes
-}
-
-// Analyse calculates the line burndown statistics for the bound repository.
-//
-// commits is a slice with the sequential commit history. It shall start from
-// the root (ascending order).
-//
-// Returns the list of snapshots of the cumulative line edit times and the
-// similar lists for every file which is alive in HEAD.
-// The number of snapshots (the first dimension >[]<[]int64) depends on
-// Analyser.Sampling (the more Sampling, the less the value); the length of
-// each snapshot depends on Analyser.Granularity (the more Granularity,
-// the less the value).
-func (analyser *Analyser) Analyse(commits []*object.Commit) (
-	[][]int64, map[string][][]int64, [][][]int64, [][]int64) {
-	sampling := analyser.Sampling
-	if sampling == 0 {
-		sampling = 1
-	}
-	onProgress := analyser.OnProgress
-	if onProgress == nil {
-		onProgress = func(int, int) {}
-	}
-	if analyser.SimilarityThreshold < 0 || analyser.SimilarityThreshold > 100 {
-		panic("hercules.Analyser: an invalid SimilarityThreshold was specified")
-	}
-
-	// current daily alive number of lines; key is the number of days from the
-	// beginning of the history
-	global_status := map[int]int64{}
-	// weekly snapshots of status
-	global_history := [][]int64{}
-	// weekly snapshots of each file's status
-	file_histories := map[string][][]int64{}
-	// weekly snapshots of each person's status
-	people_histories := make([][][]int64, analyser.PeopleNumber)
-	// mapping <file path> -> hercules.File
-	files := map[string]*File{}
-	// Mutual deletions and self insertions
-	matrix := make([]map[int]int64, analyser.PeopleNumber)
-	// People's individual time stats
-	people := make([]map[int]int64, analyser.PeopleNumber)
-
-	var day0 time.Time // will be initialized in the first iteration
-	var prev_tree *object.Tree = nil
-	var day, prev_day int
-
-	for index, commit := range commits {
-		onProgress(index, len(commits))
-		tree, err := commit.Tree()
-		if err != nil {
-			panic(err)
-		}
-		author := analyser.getAuthorId(commit.Author)
-		if index == 0 {
-			// first iteration - initialize the file objects from the tree
-			day0 = commit.Author.When
-			func() {
-				file_iter := tree.Files()
-				defer file_iter.Close()
-				for {
-					file, err := file_iter.Next()
-					if err != nil {
-						if err == io.EOF {
-							break
-						}
-						panic(err)
-					}
-					lines, err := loc(&file.Blob)
-					if err == nil {
-						files[file.Name] = analyser.newFile(author, 0, lines, global_status, people, matrix)
-					}
-				}
-			}()
-		} else {
-			day = int(commit.Author.When.Sub(day0).Hours() / 24)
-			if day < prev_day {
-				// rebase makes miracles
-				day = prev_day
-			}
-			delta := (day / sampling) - (prev_day / sampling)
-			if delta > 0 {
-				prev_day = day
-				gs, fss, pss := analyser.groupStatus(global_status, files, people, day)
-				global_history = analyser.updateHistories(
-					global_history, gs, file_histories, fss, people_histories, pss, delta)
-			}
-			tree_diff, err := object.DiffTree(prev_tree, tree)
-			if err != nil {
-				fmt.Fprintf(os.Stderr, "commit #%d %s\n", index, commit.Hash.String())
-				panic(err)
-			}
-			cache, err := analyser.cacheBlobs(&tree_diff, commit)
-			if err != nil {
-				fmt.Fprintf(os.Stderr, "commit #%d %s\n", index, commit.Hash.String())
-				panic(err)
-			}
-			tree_diff = analyser.detectRenames(&tree_diff, cache)
-			for _, change := range tree_diff {
-				action, err := change.Action()
-				if err != nil {
-					fmt.Fprintf(os.Stderr, "commit #%d %s\n", index, commit.Hash.String())
-					panic(err)
-				}
-				switch action {
-				case merkletrie.Insert:
-					analyser.handleInsertion(change, author, day, global_status, files, people, matrix, cache)
-				case merkletrie.Delete:
-					analyser.handleDeletion(change, author, day, global_status, files, cache)
-				case merkletrie.Modify:
-					func() {
-						defer func() {
-							r := recover()
-							if r != nil {
-								fmt.Fprintf(os.Stderr, "#%d - %s: modification error\n",
-									index, commit.Hash.String())
-								panic(r)
-							}
-						}()
-						analyser.handleModification(change, author, day, global_status, files, people, matrix, cache)
-					}()
-				}
-			}
-		}
-		prev_tree = tree
-	}
-	gs, fss, pss := analyser.groupStatus(global_status, files, people, day)
-	global_history = analyser.updateHistories(
-		global_history, gs, file_histories, fss, people_histories, pss, 1)
-	for key, statuses := range file_histories {
-		if len(statuses) == len(global_history) {
-			continue
-		}
-		padding := make([][]int64, len(global_history)-len(statuses))
-		for i := range padding {
-			padding[i] = make([]int64, len(global_status))
-		}
-		file_histories[key] = append(padding, statuses...)
-	}
-	people_matrix := make([][]int64, analyser.PeopleNumber)
-	for i, row := range matrix {
-		mrow := make([]int64, analyser.PeopleNumber+2)
-		people_matrix[i] = mrow
-		for key, val := range row {
-			if key == MISSING_AUTHOR {
-				key = -1
-			} else if key == SELF_AUTHOR {
-				key = -2
-			}
-			mrow[key+2] = val
-		}
-	}
-	return global_history, file_histories, people_histories, people_matrix
-}
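
The pack/unpack pair above squeezes the author id and the day index into one int: the day occupies the low 14 bits and the author id sits above it, which is what the comment means by "max 16384 days (>44 years) and (131072 - 2) devs". The same scheme survives the refactoring in burndown.go below. A minimal standalone sketch of the encoding (the pack/unpack names and the demo values are ours, not part of the commit):

package main

import "fmt"

const (
	dayBits = 14                 // the day index occupies the low 14 bits
	dayMask = (1 << dayBits) - 1 // 0x3FFF, i.e. at most 16384 distinct days
)

// pack stores the author id in the bits above the day index,
// mirroring Analyser.packPersonWithDay when PeopleNumber > 0.
func pack(person, day int) int {
	return day | person<<dayBits
}

// unpack reverses pack: the high bits are the author, the low 14 bits the day.
func unpack(value int) (person, day int) {
	return value >> dayBits, value & dayMask
}

func main() {
	v := pack(3, 120)
	p, d := unpack(v)
	fmt.Println(v, p, d) // 49272 3 120
}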

+ 114 - 0
blob_cache.go

@@ -0,0 +1,114 @@
+package hercules
+
+import (
+	"fmt"
+	"os"
+
+	"gopkg.in/src-d/go-git.v4"
+	"gopkg.in/src-d/go-git.v4/config"
+	"gopkg.in/src-d/go-git.v4/plumbing"
+	"gopkg.in/src-d/go-git.v4/plumbing/object"
+	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
+)
+
+type BlobCache struct {
+	repository *git.Repository
+}
+
+func (cache *BlobCache) Name() string {
+	return "BlobCache"
+}
+
+func (cache *BlobCache) Provides() []string {
+	arr := [...]string{"blob_cache"}
+	return arr[:]
+}
+
+func (cache *BlobCache) Requires() []string {
+	arr := [...]string{"changes"}
+	return arr[:]
+}
+
+func (cache *BlobCache) Initialize(repository *git.Repository) {
+	cache.repository = repository
+}
+
+func (self *BlobCache) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
+	commit := deps["commit"].(*object.Commit)
+	changes := deps["changes"].(object.Changes)
+	cache := make(map[plumbing.Hash]*object.Blob)
+	for _, change := range changes {
+		action, err := change.Action()
+		if err != nil {
+			fmt.Fprintf(os.Stderr, "no action in %s\n", change.To.TreeEntry.Hash)
+			return nil, err
+		}
+		switch action {
+		case merkletrie.Insert:
+			cache[change.To.TreeEntry.Hash], err = self.getBlob(&change.To, commit)
+			if err != nil {
+				fmt.Fprintf(os.Stderr, "file to %s %s\n", change.To.Name, change.To.TreeEntry.Hash)
+			}
+		case merkletrie.Delete:
+			cache[change.From.TreeEntry.Hash], err = self.getBlob(&change.From, commit)
+			if err != nil {
+				if err.Error() != plumbing.ErrObjectNotFound.Error() {
+					fmt.Fprintf(os.Stderr, "file from %s %s\n", change.From.Name, change.From.TreeEntry.Hash)
+				} else {
+					cache[change.From.TreeEntry.Hash], err = createDummyBlob(
+						&change.From.TreeEntry.Hash)
+				}
+			}
+		case merkletrie.Modify:
+			cache[change.To.TreeEntry.Hash], err = self.getBlob(&change.To, commit)
+			if err != nil {
+				fmt.Fprintf(os.Stderr, "file to %s\n", change.To.Name)
+			}
+			cache[change.From.TreeEntry.Hash], err = self.getBlob(&change.From, commit)
+			if err != nil {
+				fmt.Fprintf(os.Stderr, "file from %s\n", change.From.Name)
+			}
+		default:
+			panic(fmt.Sprintf("unsupported action: %d", change.Action))
+		}
+		if err != nil {
+			return nil, err
+		}
+	}
+	return map[string]interface{}{"blob_cache": cache}, nil
+}
+
+func (cache *BlobCache) Finalize() interface{} {
+	return nil
+}
+
+func (cache *BlobCache) getBlob(entry *object.ChangeEntry, commit *object.Commit) (
+	*object.Blob, error) {
+	blob, err := cache.repository.BlobObject(entry.TreeEntry.Hash)
+	if err != nil {
+		if err.Error() != plumbing.ErrObjectNotFound.Error() {
+			fmt.Fprintf(os.Stderr, "getBlob(%s)\n", entry.TreeEntry.Hash.String())
+			return nil, err
+		}
+		file, err_modules := commit.File(".gitmodules")
+		if err_modules != nil {
+			return nil, err
+		}
+		contents, err_modules := file.Contents()
+		if err_modules != nil {
+			return nil, err
+		}
+		modules := config.NewModules()
+		err_modules = modules.Unmarshal([]byte(contents))
+		if err_modules != nil {
+			return nil, err
+		}
+		_, exists := modules.Submodules[entry.Name]
+		if exists {
+			// we found that this is a submodule
+			return createDummyBlob(&entry.TreeEntry.Hash)
+		}
+		return nil, err
+	}
+	return blob, nil
+}
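
BlobCache is the first of the new pipeline items: it declares what it produces ("blob_cache") and what it needs ("changes"), and the Pipeline orders the items from these declarations (see pipeline.go below). To illustrate the contract, here is a hypothetical extra item that consumes the cached blobs; CountBlobs is not part of this commit and exists only as a sketch of how a downstream analysis would plug in:

package hercules

import (
	"gopkg.in/src-d/go-git.v4"
	"gopkg.in/src-d/go-git.v4/plumbing"
	"gopkg.in/src-d/go-git.v4/plumbing/object"
)

// CountBlobs is a hypothetical analysis which only counts the cached blobs
// seen over all commits. It exists to show how an item plugs into the pipeline.
type CountBlobs struct {
	total int
}

func (c *CountBlobs) Name() string                { return "CountBlobs" }
func (c *CountBlobs) Provides() []string          { return []string{} }
func (c *CountBlobs) Requires() []string          { return []string{"blob_cache"} }
func (c *CountBlobs) Initialize(*git.Repository)  { c.total = 0 }

func (c *CountBlobs) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
	cache := deps["blob_cache"].(map[plumbing.Hash]*object.Blob)
	c.total += len(cache)
	return nil, nil
}

// Finalize returns the accumulated number of cached blobs.
func (c *CountBlobs) Finalize() interface{} { return c.total }

Such an item would be registered the same way as the real ones in cmd/hercules/main.go below, with pipeline.AddItem(&CountBlobs{}).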

+ 543 - 0
burndown.go

@@ -0,0 +1,543 @@
+package hercules
+
+import (
+	"bufio"
+	"bytes"
+	"errors"
+	"fmt"
+	"io"
+	"os"
+	"unicode/utf8"
+
+	"github.com/sergi/go-diff/diffmatchpatch"
+	"gopkg.in/src-d/go-git.v4"
+	"gopkg.in/src-d/go-git.v4/plumbing"
+	"gopkg.in/src-d/go-git.v4/plumbing/object"
+	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
+)
+
+// BurndownAnalysis allows gathering the line burndown statistics for a Git repository.
+type BurndownAnalysis struct {
+	// Granularity sets the size of each band - the number of days it spans.
+	// Smaller values provide better resolution but require more work and eat more
+	// memory. 30 days is usually enough.
+	Granularity int
+	// Sampling sets how detailed is the statistic - the size of the interval in
+	// days between consecutive measurements. It is usually a good idea to set it
+	// <= Granularity. Try 15 or 30.
+	Sampling int
+
+	// The number of developers for which to collect the burndown stats. 0 disables it.
+	PeopleNumber int
+
+	// Debug activates the debugging mode. Analyse() runs slower in this mode
+	// but it accurately checks all the intermediate states for invariant
+	// violations.
+	Debug bool
+
+	// Repository points to the analysed Git repository struct from go-git.
+	repository *git.Repository
+	// globalStatus is the current daily alive number of lines; key is the number
+	// of days from the beginning of the history.
+	globalStatus map[int]int64
+	// globalHistory is the weekly snapshots of globalStatus.
+	globalHistory [][]int64
+	// fileHistories is the weekly snapshots of each file's status.
+	fileHistories map[string][][]int64
+	// peopleHistories is the weekly snapshots of each person's status.
+	peopleHistories [][][]int64
+	// files is the mapping <file path> -> hercules.File.
+	files map[string]*File
+	// matrix is the mutual deletions and self insertions.
+	matrix []map[int]int64
+	// people is the people's individual time stats.
+	people []map[int]int64
+	// day is the most recent day index processed.
+	day int
+	// previousDay is the day from the previous sample period -
+	// different from DaysSinceStart.previousDay.
+	previousDay int
+}
+
+type BurndownResult struct {
+	GlobalHistory   [][]int64
+	FileHistories   map[string][][]int64
+	PeopleHistories [][][]int64
+	PeopleMatrix    [][]int64
+}
+
+func (analyser *BurndownAnalysis) Name() string {
+	return "BurndownAnalysis"
+}
+
+func (analyser *BurndownAnalysis) Provides() []string {
+	return []string{}
+}
+
+func (analyser *BurndownAnalysis) Requires() []string {
+	arr := [...]string{"renamed_changes", "blob_cache", "day", "author"}
+	return arr[:]
+}
+
+func (analyser *BurndownAnalysis) Initialize(repository *git.Repository) {
+	analyser.repository = repository
+	analyser.globalStatus = map[int]int64{}
+	analyser.globalHistory = [][]int64{}
+	analyser.fileHistories = map[string][][]int64{}
+	analyser.peopleHistories = make([][][]int64, analyser.PeopleNumber)
+	analyser.files = map[string]*File{}
+	analyser.matrix = make([]map[int]int64, analyser.PeopleNumber)
+	analyser.people = make([]map[int]int64, analyser.PeopleNumber)
+	analyser.day = 0
+	analyser.previousDay = 0
+}
+
+func (analyser *BurndownAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
+	sampling := analyser.Sampling
+	if sampling == 0 {
+		sampling = 1
+	}
+	author := deps["author"].(int)
+	analyser.day = deps["day"].(int)
+	delta := (analyser.day / sampling) - (analyser.previousDay / sampling)
+	if delta > 0 {
+		analyser.previousDay = analyser.day
+		gs, fss, pss := analyser.groupStatus()
+		analyser.updateHistories(gs, fss, pss, delta)
+	}
+	cache := deps["blob_cache"].(map[plumbing.Hash]*object.Blob)
+	tree_diff := deps["renamed_changes"].(object.Changes)
+	for _, change := range tree_diff {
+		action, err := change.Action()
+		if err != nil {
+			return nil, err
+		}
+		switch action {
+		case merkletrie.Insert:
+			err = analyser.handleInsertion(change, author, cache)
+		case merkletrie.Delete:
+			err = analyser.handleDeletion(change, author, cache)
+		case merkletrie.Modify:
+			err = analyser.handleModification(change, author, cache)
+		}
+		if err != nil {
+			return nil, err
+		}
+	}
+	return nil, nil
+}
+
+// Finalize() returns the list of snapshots of the cumulative line edit times
+// and the similar lists for every file which is alive in HEAD.
+// The number of snapshots (the first dimension >[]<[]int64) depends on
+// Analyser.Sampling (the more Sampling, the less the value); the length of
+// each snapshot depends on Analyser.Granularity (the more Granularity,
+// the less the value).
+func (analyser *BurndownAnalysis) Finalize() interface{} {
+	gs, fss, pss := analyser.groupStatus()
+	analyser.updateHistories(gs, fss, pss, 1)
+	for key, statuses := range analyser.fileHistories {
+		if len(statuses) == len(analyser.globalHistory) {
+			continue
+		}
+		padding := make([][]int64, len(analyser.globalHistory)-len(statuses))
+		for i := range padding {
+			padding[i] = make([]int64, len(analyser.globalStatus))
+		}
+		analyser.fileHistories[key] = append(padding, statuses...)
+	}
+	peopleMatrix := make([][]int64, analyser.PeopleNumber)
+	for i, row := range analyser.matrix {
+		mrow := make([]int64, analyser.PeopleNumber+2)
+		peopleMatrix[i] = mrow
+		for key, val := range row {
+			if key == MISSING_AUTHOR {
+				key = -1
+			} else if key == SELF_AUTHOR {
+				key = -2
+			}
+			mrow[key+2] = val
+		}
+	}
+	return BurndownResult{
+		GlobalHistory:   analyser.globalHistory,
+		FileHistories:   analyser.fileHistories,
+		PeopleHistories: analyser.peopleHistories,
+		PeopleMatrix:    peopleMatrix}
+}
+
+func checkClose(c io.Closer) {
+	if err := c.Close(); err != nil {
+		panic(err)
+	}
+}
+
+func countLines(file *object.Blob) (int, error) {
+	reader, err := file.Reader()
+	if err != nil {
+		return 0, err
+	}
+	defer checkClose(reader)
+	var scanner *bufio.Scanner
+	buffer := make([]byte, bufio.MaxScanTokenSize)
+	counter := 0
+	for scanner == nil || scanner.Err() == bufio.ErrTooLong {
+		if scanner != nil && !utf8.Valid(scanner.Bytes()) {
+			return -1, errors.New("binary")
+		}
+		scanner = bufio.NewScanner(reader)
+		scanner.Buffer(buffer, 0)
+		for scanner.Scan() {
+			if !utf8.Valid(scanner.Bytes()) {
+				return -1, errors.New("binary")
+			}
+			counter++
+		}
+	}
+	return counter, nil
+}
+
+func blobToString(file *object.Blob) (string, error) {
+	reader, err := file.Reader()
+	if err != nil {
+		return "", err
+	}
+	defer checkClose(reader)
+	buf := new(bytes.Buffer)
+	buf.ReadFrom(reader)
+	return buf.String(), nil
+}
+
+func (analyser *BurndownAnalysis) packPersonWithDay(person int, day int) int {
+	if analyser.PeopleNumber == 0 {
+		return day
+	}
+	result := day
+	result |= person << 14
+	// This effectively means max 16384 days (>44 years) and (131072 - 2) devs
+	return result
+}
+
+func (analyser *BurndownAnalysis) unpackPersonWithDay(value int) (int, int) {
+	if analyser.PeopleNumber == 0 {
+		return MISSING_AUTHOR, value
+	}
+	return value >> 14, value & 0x3FFF
+}
+
+func (analyser *BurndownAnalysis) updateStatus(
+	status interface{}, _ int, previous_time_ int, delta int) {
+
+	_, previous_time := analyser.unpackPersonWithDay(previous_time_)
+	status.(map[int]int64)[previous_time] += int64(delta)
+}
+
+func (analyser *BurndownAnalysis) updatePeople(people interface{}, _ int, previous_time_ int, delta int) {
+	old_author, previous_time := analyser.unpackPersonWithDay(previous_time_)
+	if old_author == MISSING_AUTHOR {
+		return
+	}
+	casted := people.([]map[int]int64)
+	stats := casted[old_author]
+	if stats == nil {
+		stats = map[int]int64{}
+		casted[old_author] = stats
+	}
+	stats[previous_time] += int64(delta)
+}
+
+func (analyser *BurndownAnalysis) updateMatrix(
+	matrix_ interface{}, current_time int, previous_time int, delta int) {
+
+	matrix := matrix_.([]map[int]int64)
+	new_author, _ := analyser.unpackPersonWithDay(current_time)
+	old_author, _ := analyser.unpackPersonWithDay(previous_time)
+	if old_author == MISSING_AUTHOR {
+		return
+	}
+	if new_author == old_author && delta > 0 {
+		new_author = SELF_AUTHOR
+	}
+	row := matrix[old_author]
+	if row == nil {
+		row = map[int]int64{}
+		matrix[old_author] = row
+	}
+	cell, exists := row[new_author]
+	if !exists {
+		row[new_author] = 0
+		cell = 0
+	}
+	row[new_author] = cell + int64(delta)
+}
+
+func (analyser *BurndownAnalysis) newFile(
+	author int, day int, size int, global map[int]int64, people []map[int]int64,
+	matrix []map[int]int64) *File {
+	if analyser.PeopleNumber == 0 {
+		return NewFile(day, size, NewStatus(global, analyser.updateStatus),
+			NewStatus(make(map[int]int64), analyser.updateStatus))
+	}
+	return NewFile(analyser.packPersonWithDay(author, day), size,
+		NewStatus(global, analyser.updateStatus),
+		NewStatus(make(map[int]int64), analyser.updateStatus),
+		NewStatus(people, analyser.updatePeople),
+		NewStatus(matrix, analyser.updateMatrix))
+}
+
+func (analyser *BurndownAnalysis) handleInsertion(
+	change *object.Change, author int, cache map[plumbing.Hash]*object.Blob) error {
+	blob := cache[change.To.TreeEntry.Hash]
+	lines, err := countLines(blob)
+	if err != nil {
+		if err.Error() == "binary" {
+			return nil
+		}
+		return err
+	}
+	name := change.To.Name
+	file, exists := analyser.files[name]
+	if exists {
+		return errors.New(fmt.Sprintf("file %s already exists", name))
+	}
+	file = analyser.newFile(
+		author, analyser.day, lines, analyser.globalStatus, analyser.people, analyser.matrix)
+	analyser.files[name] = file
+	return nil
+}
+
+func (analyser *BurndownAnalysis) handleDeletion(
+	change *object.Change, author int, cache map[plumbing.Hash]*object.Blob) error {
+
+	blob := cache[change.From.TreeEntry.Hash]
+	lines, err := countLines(blob)
+	if err != nil {
+		return err
+	}
+	name := change.From.Name
+	file := analyser.files[name]
+	file.Update(analyser.packPersonWithDay(author, analyser.day), 0, 0, lines)
+	delete(analyser.files, name)
+	return nil
+}
+
+func (analyser *BurndownAnalysis) handleModification(
+	change *object.Change, author int, cache map[plumbing.Hash]*object.Blob) error {
+
+	blob_from := cache[change.From.TreeEntry.Hash]
+	blob_to := cache[change.To.TreeEntry.Hash]
+	// we are not validating UTF-8 here because for example
+	// git/git 4f7770c87ce3c302e1639a7737a6d2531fe4b160 fetch-pack.c is invalid UTF-8
+	str_from, err := blobToString(blob_from)
+	if err != nil {
+		return err
+	}
+	str_to, err := blobToString(blob_to)
+	if err != nil {
+		return err
+	}
+	file, exists := analyser.files[change.From.Name]
+	if !exists {
+		return analyser.handleInsertion(change, author, cache)
+	}
+	// possible rename
+	if change.To.Name != change.From.Name {
+		err = analyser.handleRename(change.From.Name, change.To.Name)
+		if err != nil {
+			return err
+		}
+	}
+	dmp := diffmatchpatch.New()
+	src, dst, _ := dmp.DiffLinesToRunes(str_from, str_to)
+	if file.Len() != len(src) {
+		fmt.Fprintf(os.Stderr, "====TREE====\n%s", file.Dump())
+		return errors.New(fmt.Sprintf("%s: internal integrity error src %d != %d %s -> %s",
+			change.To.Name, len(src), file.Len(),
+			change.From.TreeEntry.Hash.String(), change.To.TreeEntry.Hash.String()))
+	}
+	diffs := dmp.DiffMainRunes(src, dst, false)
+	// we do not call RunesToDiffLines so the number of lines equals
+	// to the rune count
+	position := 0
+	pending := diffmatchpatch.Diff{Text: ""}
+
+	apply := func(edit diffmatchpatch.Diff) {
+		length := utf8.RuneCountInString(edit.Text)
+		if edit.Type == diffmatchpatch.DiffInsert {
+			file.Update(analyser.packPersonWithDay(author, analyser.day), position, length, 0)
+			position += length
+		} else {
+			file.Update(analyser.packPersonWithDay(author, analyser.day), position, 0, length)
+		}
+		if analyser.Debug {
+			file.Validate()
+		}
+	}
+
+	for _, edit := range diffs {
+		dump_before := ""
+		if analyser.Debug {
+			dump_before = file.Dump()
+		}
+		length := utf8.RuneCountInString(edit.Text)
+		debug_error := func() {
+			fmt.Fprintf(os.Stderr, "%s: internal diff error\n", change.To.Name)
+			fmt.Fprintf(os.Stderr, "Update(%d, %d, %d (0), %d (0))\n", analyser.day, position,
+				length, utf8.RuneCountInString(pending.Text))
+			if dump_before != "" {
+				fmt.Fprintf(os.Stderr, "====TREE BEFORE====\n%s====END====\n", dump_before)
+			}
+			fmt.Fprintf(os.Stderr, "====TREE AFTER====\n%s====END====\n", file.Dump())
+		}
+		switch edit.Type {
+		case diffmatchpatch.DiffEqual:
+			if pending.Text != "" {
+				apply(pending)
+				pending.Text = ""
+			}
+			position += length
+		case diffmatchpatch.DiffInsert:
+			if pending.Text != "" {
+				if pending.Type == diffmatchpatch.DiffInsert {
+					debug_error()
+					return errors.New("DiffInsert may not appear after DiffInsert")
+				}
+				file.Update(analyser.packPersonWithDay(author, analyser.day), position, length,
+					utf8.RuneCountInString(pending.Text))
+				if analyser.Debug {
+					file.Validate()
+				}
+				position += length
+				pending.Text = ""
+			} else {
+				pending = edit
+			}
+		case diffmatchpatch.DiffDelete:
+			if pending.Text != "" {
+				debug_error()
+				return errors.New("DiffDelete may not appear after DiffInsert/DiffDelete")
+			}
+			pending = edit
+		default:
+			debug_error()
+			return errors.New(fmt.Sprintf("diff operation is not supported: %d", edit.Type))
+		}
+	}
+	if pending.Text != "" {
+		apply(pending)
+		pending.Text = ""
+	}
+	if file.Len() != len(dst) {
+		return errors.New(fmt.Sprintf("%s: internal integrity error dst %d != %d",
+			change.To.Name, len(dst), file.Len()))
+	}
+	return nil
+}
+
+func (analyser *BurndownAnalysis) handleRename(from, to string) error {
+	file, exists := analyser.files[from]
+	if !exists {
+		return errors.New(fmt.Sprintf("file %s does not exist", from))
+	}
+	analyser.files[to] = file
+	delete(analyser.files, from)
+	return nil
+}
+
+func (analyser *BurndownAnalysis) groupStatus() ([]int64, map[string][]int64, [][]int64) {
+	granularity := analyser.Granularity
+	if granularity == 0 {
+		granularity = 1
+	}
+	day := analyser.day
+	day++
+	adjust := 0
+	if day%granularity != 0 {
+		adjust = 1
+	}
+	global := make([]int64, day/granularity+adjust)
+	var group int64
+	for i := 0; i < day; i++ {
+		group += analyser.globalStatus[i]
+		if (i % granularity) == (granularity - 1) {
+			global[i/granularity] = group
+			group = 0
+		}
+	}
+	if day%granularity != 0 {
+		global[len(global)-1] = group
+	}
+	locals := make(map[string][]int64)
+	for key, file := range analyser.files {
+		status := make([]int64, day/granularity+adjust)
+		var group int64
+		for i := 0; i < day; i++ {
+			group += file.Status(1).(map[int]int64)[i]
+			if (i % granularity) == (granularity - 1) {
+				status[i/granularity] = group
+				group = 0
+			}
+		}
+		if day%granularity != 0 {
+			status[len(status)-1] = group
+		}
+		locals[key] = status
+	}
+	peoples := make([][]int64, len(analyser.people))
+	for key, person := range analyser.people {
+		status := make([]int64, day/granularity+adjust)
+		var group int64
+		for i := 0; i < day; i++ {
+			group += person[i]
+			if (i % granularity) == (granularity - 1) {
+				status[i/granularity] = group
+				group = 0
+			}
+		}
+		if day%granularity != 0 {
+			status[len(status)-1] = group
+		}
+		peoples[key] = status
+	}
+	return global, locals, peoples
+}
+
+func (analyser *BurndownAnalysis) updateHistories(
+	globalStatus []int64, file_statuses map[string][]int64, people_statuses [][]int64, delta int) {
+	for i := 0; i < delta; i++ {
+		analyser.globalHistory = append(analyser.globalHistory, globalStatus)
+	}
+	to_delete := make([]string, 0)
+	for key, fh := range analyser.fileHistories {
+		ls, exists := file_statuses[key]
+		if !exists {
+			to_delete = append(to_delete, key)
+		} else {
+			for i := 0; i < delta; i++ {
+				fh = append(fh, ls)
+			}
+			analyser.fileHistories[key] = fh
+		}
+	}
+	for _, key := range to_delete {
+		delete(analyser.fileHistories, key)
+	}
+	for key, ls := range file_statuses {
+		fh, exists := analyser.fileHistories[key]
+		if exists {
+			continue
+		}
+		for i := 0; i < delta; i++ {
+			fh = append(fh, ls)
+		}
+		analyser.fileHistories[key] = fh
+	}
+
+	for key, ph := range analyser.peopleHistories {
+		ls := people_statuses[key]
+		for i := 0; i < delta; i++ {
+			ph = append(ph, ls)
+		}
+		analyser.peopleHistories[key] = ph
+	}
+}
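
groupStatus folds the per-day counters into bands of Granularity days, and the adjust term keeps the trailing, shorter band. A self-contained sketch of that folding with made-up numbers (the band helper mirrors the loops above and is not part of the commit):

package main

import "fmt"

// band folds per-day counters into granularity-sized groups, keeping the
// trailing partial group, the same way BurndownAnalysis.groupStatus does.
func band(status map[int]int64, day, granularity int) []int64 {
	day++
	adjust := 0
	if day%granularity != 0 {
		adjust = 1
	}
	out := make([]int64, day/granularity+adjust)
	var group int64
	for i := 0; i < day; i++ {
		group += status[i]
		if i%granularity == granularity-1 {
			out[i/granularity] = group
			group = 0
		}
	}
	if day%granularity != 0 {
		out[len(out)-1] = group
	}
	return out
}

func main() {
	status := map[int]int64{0: 10, 1: 5, 2: 3, 3: 7, 4: 2}
	fmt.Println(band(status, 4, 2)) // [15 10 2]
}

With granularity 2 and days 0..4 holding 10, 5, 3, 7 and 2 lines, the bands come out as 15, 10 and a final partial band of 2.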

+ 36 - 119
cmd/hercules/main.go

@@ -7,10 +7,8 @@ statistics from Git repositories. Usage:
 package main
 
 import (
-	"bufio"
 	"flag"
 	"fmt"
-	"io"
 	"net/http"
 	_ "net/http/pprof"
 	"os"
@@ -21,7 +19,6 @@ import (
 
 	"gopkg.in/src-d/go-billy.v3/osfs"
 	"gopkg.in/src-d/go-git.v4"
-	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
 	"gopkg.in/src-d/go-git.v4/storage"
 	"gopkg.in/src-d/go-git.v4/storage/filesystem"
@@ -29,96 +26,6 @@ import (
 	"gopkg.in/src-d/hercules.v1"
 )
 
-// Signature stores the author's identification. Only a single field is used to identify the
-// commit: first Email is checked, then Name.
-type Signature struct {
-	Name string
-	Email string
-}
-
-func loadPeopleDict(path string) (map[string]int, map[int]string, int) {
-	file, err := os.Open(path)
-	if err != nil {
-		panic(err)
-	}
-	defer file.Close()
-	scanner := bufio.NewScanner(file)
-	dict := make(map[string]int)
-	reverse_dict := make(map[int]string)
-	size := 0
-	for scanner.Scan() {
-		for _, id := range strings.Split(strings.ToLower(scanner.Text()), "|") {
-			dict[id] = size
-		}
-		reverse_dict[size] = scanner.Text()
-		size += 1
-	}
-	return dict, reverse_dict, size
-}
-
-func generatePeopleDict(commits []*object.Commit) (map[string]int, map[int]string, int) {
-	dict := make(map[string]int)
-	emails := make(map[int][]string)
-	names := make(map[int][]string)
-	size := 0
-	for _, commit := range commits {
-		email := strings.ToLower(commit.Author.Email)
-		name := strings.ToLower(commit.Author.Name)
-		id, exists := dict[email]
-		if exists {
-			_, exists := dict[name]
-			if !exists {
-				dict[name] = id
-			  names[id] = append(names[id], name)
-			}
-			continue
-		}
-		id, exists = dict[name]
-		if exists {
-			dict[email] = id
-			emails[id] = append(emails[id], email)
-			continue
-		}
-		dict[email] = size
-		dict[name] = size
-		emails[size] = append(emails[size], email)
-		names[size] = append(names[size], name)
-		size += 1
-	}
-	reverse_dict := make(map[int]string)
-	for _, val := range dict {
-		reverse_dict[val] = strings.Join(names[val], "|") + "|" + strings.Join(emails[val], "|")
-	}
-	return dict, reverse_dict, size
-}
-
-func loadCommitsFromFile(path string, repository *git.Repository) []*object.Commit {
-	var file io.Reader
-	if path != "-" {
-		file, err := os.Open(path)
-		if err != nil {
-			panic(err)
-		}
-		defer file.Close()
-	} else {
-		file = os.Stdin
-	}
-	scanner := bufio.NewScanner(file)
-	commits := []*object.Commit{}
-	for scanner.Scan() {
-		hash := plumbing.NewHash(scanner.Text())
-		if len(hash) != 20 {
-			panic("invalid commit hash " + scanner.Text())
-		}
-		commit, err := repository.CommitObject(hash)
-		if err != nil {
-			panic(err)
-		}
-		commits = append(commits, commit)
-	}
-	return commits
-}
-
 func printStatuses(statuses [][]int64, name string) {
 	// determine the maximum length of each value
 	var maxnum int64
@@ -225,62 +132,72 @@ func main() {
 	if err != nil {
 		panic(err)
 	}
+
 	// core logic
-	analyser := hercules.Analyser{
-		Repository: repository,
-		OnProgress: func(commit, length int) {
-			fmt.Fprintf(os.Stderr, "%d / %d\r", commit, length)
-		},
-		Granularity:         granularity,
-		Sampling:            sampling,
-		SimilarityThreshold: similarity_threshold,
-		Debug:               debug,
+	pipeline := hercules.NewPipeline(repository)
+	pipeline.OnProgress = func(commit, length int) {
+		fmt.Fprintf(os.Stderr, "%d / %d\r", commit, length)
 	}
 	// list of commits belonging to the default branch, from oldest to newest
 	// rev-list --first-parent
 	var commits []*object.Commit
 	if commitsFile == "" {
-		commits = analyser.Commits()
+		commits = pipeline.Commits()
 	} else {
-		commits = loadCommitsFromFile(commitsFile, repository)
+		commits = hercules.LoadCommitsFromFile(commitsFile, repository)
 	}
-	var people_ids map[int]string
+
+	pipeline.AddItem(&hercules.BlobCache{})
+	pipeline.AddItem(&hercules.DaysSinceStart{})
+	pipeline.AddItem(&hercules.RenameAnalysis{SimilarityThreshold: similarity_threshold})
+	pipeline.AddItem(&hercules.TreeDiff{})
+	id_matcher := &hercules.IdentityDetector{}
 	if with_people {
-		var people_dict map[string]int
-		var people_number int
 		if people_dict_path != "" {
-			people_dict, people_ids, people_number = loadPeopleDict(people_dict_path)
+			id_matcher.LoadPeopleDict(people_dict_path)
 		} else {
-			people_dict, people_ids, people_number = generatePeopleDict(commits)
+			id_matcher.GeneratePeopleDict(commits)
 		}
-		analyser.PeopleNumber = people_number
-		analyser.PeopleDict = people_dict
 	}
-	global_statuses, file_statuses, people_statuses, people_matrix := analyser.Analyse(commits)
+	pipeline.AddItem(id_matcher)
+	burndowner := &hercules.BurndownAnalysis{
+		Granularity:         granularity,
+		Sampling:            sampling,
+		Debug:               debug,
+		PeopleNumber:        len(id_matcher.ReversePeopleDict),
+	}
+	pipeline.AddItem(burndowner)
+
+	pipeline.Initialize()
+	result, err := pipeline.Run(commits)
+	if err != nil {
+		panic(err)
+	}
+	burndown_results := result[burndowner].(hercules.BurndownResult)
 	fmt.Fprint(os.Stderr, "                \r")
-	if len(global_statuses) == 0 {
+	if len(burndown_results.GlobalHistory) == 0 {
 		return
 	}
 	// print the start date, granularity, sampling
 	fmt.Println(commits[0].Author.When.Unix(),
 		commits[len(commits)-1].Author.When.Unix(),
 		granularity, sampling)
-	printStatuses(global_statuses, uri)
+	printStatuses(burndown_results.GlobalHistory, uri)
 	if with_files {
 		fmt.Print("files\n")
-		keys := sortedKeys(file_statuses)
+		keys := sortedKeys(burndown_results.FileHistories)
 		for _, key := range keys {
-			printStatuses(file_statuses[key], key)
+			printStatuses(burndown_results.FileHistories[key], key)
 		}
 	}
 	if with_people {
 		fmt.Print("people\n")
-		for key, val := range people_statuses {
+		for key, val := range burndown_results.PeopleHistories {
 			fmt.Printf("%d: ", key)
-			printStatuses(val, people_ids[key])
+			printStatuses(val, id_matcher.ReversePeopleDict[key])
 		}
 		fmt.Println()
-		for _, row := range(people_matrix) {
+		for _, row := range(burndown_results.PeopleMatrix) {
 			for _, cell := range(row) {
 				fmt.Print(cell, " ")
 			}

+ 51 - 0
day.go

@@ -0,0 +1,51 @@
+package hercules
+
+import (
+	"time"
+
+	"gopkg.in/src-d/go-git.v4"
+	"gopkg.in/src-d/go-git.v4/plumbing/object"
+)
+
+type DaysSinceStart struct {
+	day0        time.Time
+	previousDay int
+}
+
+func (days *DaysSinceStart) Name() string {
+	return "DaysSinceStart"
+}
+
+func (days *DaysSinceStart) Provides() []string {
+	arr := [...]string{"day"}
+	return arr[:]
+}
+
+func (days *DaysSinceStart) Requires() []string {
+	return []string{}
+}
+
+func (days *DaysSinceStart) Initialize(repository *git.Repository) {
+	days.day0 = time.Time{}
+	days.previousDay = 0
+}
+
+func (days *DaysSinceStart) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
+	commit := deps["commit"].(*object.Commit)
+	index := deps["index"].(int)
+	if index == 0 {
+		// first iteration - initialize the file objects from the tree
+		days.day0 = commit.Author.When
+	}
+	day := int(commit.Author.When.Sub(days.day0).Hours() / 24)
+	if day < days.previousDay {
+		// rebase works miracles, but we need the monotonic time
+		day = days.previousDay
+	}
+	days.previousDay = day
+	return map[string]interface{}{"day": day}, nil
+}
+
+func (days *DaysSinceStart) Finalize() interface{} {
+	return nil
+}

+ 57 - 0
dummies.go

@@ -0,0 +1,57 @@
+package hercules
+
+import (
+	"io"
+
+	"gopkg.in/src-d/go-git.v4/plumbing"
+	"gopkg.in/src-d/go-git.v4/plumbing/object"
+)
+
+type dummyIO struct {
+}
+
+func (dummyIO) Read(p []byte) (int, error) {
+	return 0, io.EOF
+}
+
+func (dummyIO) Write(p []byte) (int, error) {
+	return len(p), nil
+}
+
+func (dummyIO) Close() error {
+	return nil
+}
+
+type dummyEncodedObject struct {
+	FakeHash plumbing.Hash
+}
+
+func (obj dummyEncodedObject) Hash() plumbing.Hash {
+	return obj.FakeHash
+}
+
+func (obj dummyEncodedObject) Type() plumbing.ObjectType {
+	return plumbing.BlobObject
+}
+
+func (obj dummyEncodedObject) SetType(plumbing.ObjectType) {
+}
+
+func (obj dummyEncodedObject) Size() int64 {
+	return 0
+}
+
+func (obj dummyEncodedObject) SetSize(int64) {
+}
+
+func (obj dummyEncodedObject) Reader() (io.ReadCloser, error) {
+	return dummyIO{}, nil
+}
+
+func (obj dummyEncodedObject) Writer() (io.WriteCloser, error) {
+	return dummyIO{}, nil
+}
+
+func createDummyBlob(hash *plumbing.Hash) (*object.Blob, error) {
+	return object.DecodeBlob(dummyEncodedObject{*hash})
+}

+ 112 - 0
identity.go

@@ -0,0 +1,112 @@
+package hercules
+
+import (
+	"bufio"
+	"os"
+	"strings"
+
+	"gopkg.in/src-d/go-git.v4"
+	"gopkg.in/src-d/go-git.v4/plumbing/object"
+)
+
+type IdentityDetector struct {
+	// Maps email || name  -> developer id.
+	PeopleDict map[string]int
+	// Maps developer id -> description
+	ReversePeopleDict map[int]string
+}
+
+const MISSING_AUTHOR = (1 << 18) - 1
+const SELF_AUTHOR = (1 << 18) - 2
+
+func (id *IdentityDetector) Name() string {
+	return "IdentityDetector"
+}
+
+func (id *IdentityDetector) Provides() []string {
+	arr := [...]string{"author"}
+	return arr[:]
+}
+
+func (id *IdentityDetector) Requires() []string {
+	return []string{}
+}
+
+func (id *IdentityDetector) Initialize(repository *git.Repository) {
+}
+
+func (self *IdentityDetector) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
+	commit := deps["commit"].(*object.Commit)
+	signature := commit.Author
+	id, exists := self.PeopleDict[strings.ToLower(signature.Email)]
+	if !exists {
+		id, exists = self.PeopleDict[strings.ToLower(signature.Name)]
+		if !exists {
+			id = MISSING_AUTHOR
+		}
+	}
+	return map[string]interface{}{"author": id}, nil
+}
+
+func (id *IdentityDetector) Finalize() interface{} {
+	return nil
+}
+
+func (id *IdentityDetector) LoadPeopleDict(path string) error {
+	file, err := os.Open(path)
+	if err != nil {
+		return err
+	}
+	defer file.Close()
+	scanner := bufio.NewScanner(file)
+	dict := make(map[string]int)
+	reverse_dict := make(map[int]string)
+	size := 0
+	for scanner.Scan() {
+		for _, id := range strings.Split(strings.ToLower(scanner.Text()), "|") {
+			dict[id] = size
+		}
+		reverse_dict[size] = scanner.Text()
+		size += 1
+	}
+	id.PeopleDict = dict
+	id.ReversePeopleDict = reverse_dict
+	return nil
+}
+
+func (id *IdentityDetector) GeneratePeopleDict(commits []*object.Commit) {
+	dict := make(map[string]int)
+	emails := make(map[int][]string)
+	names := make(map[int][]string)
+	size := 0
+	for _, commit := range commits {
+		email := strings.ToLower(commit.Author.Email)
+		name := strings.ToLower(commit.Author.Name)
+		id, exists := dict[email]
+		if exists {
+			_, exists := dict[name]
+			if !exists {
+				dict[name] = id
+				names[id] = append(names[id], name)
+			}
+			continue
+		}
+		id, exists = dict[name]
+		if exists {
+			dict[email] = id
+			emails[id] = append(emails[id], email)
+			continue
+		}
+		dict[email] = size
+		dict[name] = size
+		emails[size] = append(emails[size], email)
+		names[size] = append(names[size], name)
+		size += 1
+	}
+	reverse_dict := make(map[int]string)
+	for _, val := range dict {
+		reverse_dict[val] = strings.Join(names[val], "|") + "|" + strings.Join(emails[val], "|")
+	}
+	id.PeopleDict = dict
+	id.ReversePeopleDict = reverse_dict
+}
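
LoadPeopleDict expects one developer per line, with that developer's names and e-mails joined by "|"; the matching in Consume() is case-insensitive. A usage sketch, assuming a made-up people.txt:

package main

import (
	"fmt"

	"gopkg.in/src-d/hercules.v1"
)

func main() {
	// people.txt is a made-up path with contents such as:
	//   Vadim|vadim@example.com|vmarkovtsev@example.com
	//   Jane Doe|jane@example.com
	var detector hercules.IdentityDetector
	if err := detector.LoadPeopleDict("people.txt"); err != nil {
		panic(err)
	}
	fmt.Println(detector.PeopleDict["jane@example.com"]) // 1
	fmt.Println(detector.ReversePeopleDict[0])           // Vadim|vadim@example.com|vmarkovtsev@example.com
}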

+ 202 - 0
pipeline.go

@@ -0,0 +1,202 @@
+package hercules
+
+import (
+	"bufio"
+	"fmt"
+	"io"
+	"os"
+
+	"gopkg.in/src-d/go-git.v4"
+	"gopkg.in/src-d/go-git.v4/plumbing"
+	"gopkg.in/src-d/go-git.v4/plumbing/object"
+	"gopkg.in/src-d/hercules.v1/toposort"
+)
+
+type PipelineItem interface {
+	// Name returns the name of the analysis.
+	Name() string
+	// Provides returns the list of keys of reusable calculated entities.
+	// Other items may depend on them.
+	Provides() []string
+	// Requires returns the list of keys of needed entities which must be supplied in Consume().
+	Requires() []string
+	// Initialize prepares and resets the item. Consume() requires Initialize()
+	// to be called at least once beforehand.
+	Initialize(*git.Repository)
+	// Consume processes the next commit.
+	// deps contains the required entities which match Requires(). Besides, it always includes
+	// "commit" and "index".
+	// Returns the calculated entities which match Provides().
+	Consume(deps map[string]interface{}) (map[string]interface{}, error)
+	// Finalize returns the result of the analysis.
+	Finalize() interface{}
+}
+
+type Pipeline struct {
+	// OnProgress is the callback which is invoked in Run() to report its
+	// progress. The first argument is the number of processed commits and the
+	// second is the total number of commits.
+	OnProgress func(int, int)
+
+	// repository points to the analysed Git repository struct from go-git.
+	repository *git.Repository
+
+	// items are the registered analysers in the pipeline.
+	items []PipelineItem
+
+	// plan is the resolved execution sequence.
+	plan []PipelineItem
+}
+
+func NewPipeline(repository *git.Repository) *Pipeline {
+	return &Pipeline{repository: repository, items: []PipelineItem{}, plan: []PipelineItem{}}
+}
+
+func (pipeline *Pipeline) AddItem(item PipelineItem) {
+	for _, reg := range pipeline.items {
+		if reg == item {
+			return
+		}
+	}
+	pipeline.items = append(pipeline.items, item)
+}
+
+func (pipeline *Pipeline) RemoveItem(item PipelineItem) {
+	for i, reg := range pipeline.items {
+		if reg == item {
+			pipeline.items = append(pipeline.items[:i], pipeline.items[i+1:]...)
+			return
+		}
+	}
+}
+
+// Commits returns the critical path in the repository's history. It starts
+// from HEAD and traces commits backwards till the root. When it encounters
+// a merge (more than one parent), it always chooses the first parent.
+func (pipeline *Pipeline) Commits() []*object.Commit {
+	result := []*object.Commit{}
+	repository := pipeline.repository
+	head, err := repository.Head()
+	if err != nil {
+		panic(err)
+	}
+	commit, err := repository.CommitObject(head.Hash())
+	if err != nil {
+		panic(err)
+	}
+	for ; err != io.EOF; commit, err = commit.Parents().Next() {
+		if err != nil {
+			panic(err)
+		}
+		result = append(result, commit)
+	}
+	// reverse the order
+	for i, j := 0, len(result)-1; i < j; i, j = i+1, j-1 {
+		result[i], result[j] = result[j], result[i]
+	}
+	return result
+}
+
+func (pipeline *Pipeline) Initialize() {
+	graph := toposort.NewGraph()
+	name2item := map[string]PipelineItem{}
+	for index, item := range pipeline.items {
+		name := fmt.Sprintf("%s_%d", item.Name(), index)
+		graph.AddNode(name)
+		name2item[name] = item
+		for _, key := range item.Provides() {
+			key += "_entity"
+			graph.AddNode(key)
+			graph.AddEdge(name, key)
+		}
+	}
+	for index, item := range pipeline.items {
+		name := fmt.Sprintf("%s_%d", item.Name(), index)
+		for _, key := range item.Requires() {
+			key += "_entity"
+			if !graph.AddEdge(key, name) {
+				panic(fmt.Sprintf("Unsatisfied dependency: %s -> %s", key, item.Name()))
+			}
+		}
+	}
+	strplan, ok := graph.Toposort()
+	if !ok {
+		panic("Failed to resolve pipeline dependencies.")
+	}
+	for _, key := range strplan {
+		item, ok := name2item[key]
+		if ok {
+			pipeline.plan = append(pipeline.plan, item)
+		}
+	}
+	if len(pipeline.plan) != len(pipeline.items) {
+		panic("Internal pipeline dependency resolution error.")
+	}
+	for _, item := range pipeline.items {
+		item.Initialize(pipeline.repository)
+	}
+}
+
+// Run executes the pipeline.
+//
+// commits is a slice with the sequential commit history. It shall start from
+// the root (ascending order).
+func (pipeline *Pipeline) Run(commits []*object.Commit) (map[PipelineItem]interface{}, error) {
+	onProgress := pipeline.OnProgress
+	if onProgress == nil {
+		onProgress = func(int, int) {}
+	}
+
+	for index, commit := range commits {
+		onProgress(index, len(commits))
+		state := map[string]interface{}{"commit": commit, "index": index}
+		for _, item := range pipeline.plan {
+			update, err := item.Consume(state)
+			if err != nil {
+				fmt.Fprintf(os.Stderr, "%s failed on commit #%d %s\n",
+					item.Name(), index, commit.Hash.String())
+				return nil, err
+			}
+			for _, key := range item.Provides() {
+				val, ok := update[key]
+				if !ok {
+					panic(fmt.Sprintf("%s: Consume() did not return %s", item.Name(), key))
+				}
+				state[key] = val
+			}
+		}
+	}
+	result := map[PipelineItem]interface{}{}
+	for _, item := range pipeline.items {
+		result[item] = item.Finalize()
+	}
+	return result, nil
+}
+
+func LoadCommitsFromFile(path string, repository *git.Repository) []*object.Commit {
+	var file io.Reader
+	if path != "-" {
+		opened, err := os.Open(path)
+		if err != nil {
+			panic(err)
+		}
+		defer opened.Close()
+		file = opened
+	} else {
+		file = os.Stdin
+	}
+	scanner := bufio.NewScanner(file)
+	commits := []*object.Commit{}
+	for scanner.Scan() {
+		if len(scanner.Text()) != 40 {
+			panic("invalid commit hash " + scanner.Text())
+		}
+		hash := plumbing.NewHash(scanner.Text())
+		commit, err := repository.CommitObject(hash)
+		if err != nil {
+			panic(err)
+		}
+		commits = append(commits, commit)
+	}
+	return commits
+}
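
A minimal sketch of driving the pipeline end to end, assuming a local clone at a placeholder path. The three items below have no unmet Requires(), so the topological sort resolves trivially:

package main

import (
	"fmt"
	"os"

	"gopkg.in/src-d/go-git.v4"
	"gopkg.in/src-d/hercules.v1"
)

func main() {
	// "/path/to/repo" is a placeholder for a local clone
	repository, err := git.PlainOpen("/path/to/repo")
	if err != nil {
		panic(err)
	}
	pipeline := hercules.NewPipeline(repository)
	pipeline.AddItem(&hercules.DaysSinceStart{})
	pipeline.AddItem(&hercules.IdentityDetector{})
	pipeline.AddItem(&hercules.TreeDiff{})
	pipeline.OnProgress = func(commit, total int) {
		fmt.Fprintf(os.Stderr, "%d / %d\r", commit, total)
	}
	pipeline.Initialize()
	results, err := pipeline.Run(pipeline.Commits())
	if err != nil {
		panic(err)
	}
	// one entry per registered item; all three Finalize() implementations return nil here
	fmt.Println(len(results))
}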

+ 238 - 0
renames.go

@@ -0,0 +1,238 @@
+package hercules
+
+import (
+	"fmt"
+	"sort"
+	"unicode/utf8"
+
+	"github.com/sergi/go-diff/diffmatchpatch"
+	"gopkg.in/src-d/go-git.v4"
+	"gopkg.in/src-d/go-git.v4/plumbing"
+	"gopkg.in/src-d/go-git.v4/plumbing/object"
+	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
+)
+
+type RenameAnalysis struct {
+	// SimilarityThreshold adjusts the heuristic to determine file renames.
+	// It has the same units as cgit's -X rename-threshold or -M. Better to
+	// set it to the default value of 90 (90%).
+	SimilarityThreshold int
+
+	repository *git.Repository
+}
+
+func (ra *RenameAnalysis) Name() string {
+	return "RenameAnalysis"
+}
+
+func (ra *RenameAnalysis) Provides() []string {
+	arr := [...]string{"renamed_changes"}
+	return arr[:]
+}
+
+func (ra *RenameAnalysis) Requires() []string {
+	arr := [...]string{"blob_cache", "changes"}
+	return arr[:]
+}
+
+func (ra *RenameAnalysis) Initialize(repository *git.Repository) {
+	if ra.SimilarityThreshold < 0 || ra.SimilarityThreshold > 100 {
+		panic("hercules.RenameAnalysis: an invalid SimilarityThreshold was specified")
+	}
+	ra.repository = repository
+}
+
+func (ra *RenameAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
+	changes := deps["changes"].(object.Changes)
+	cache := deps["blob_cache"].(map[plumbing.Hash]*object.Blob)
+
+	reduced_changes := make(object.Changes, 0, changes.Len())
+
+	// Stage 1 - find renames by matching the hashes
+	// n log(n)
+	// We sort additions and deletions by hash and then do a single scan along
+	// both slices.
+	deleted := make(sortableChanges, 0, changes.Len())
+	added := make(sortableChanges, 0, changes.Len())
+	for _, change := range changes {
+		action, err := change.Action()
+		if err != nil {
+			return nil, err
+		}
+		switch action {
+		case merkletrie.Insert:
+			added = append(added, sortableChange{change, change.To.TreeEntry.Hash})
+		case merkletrie.Delete:
+			deleted = append(deleted, sortableChange{change, change.From.TreeEntry.Hash})
+		case merkletrie.Modify:
+			reduced_changes = append(reduced_changes, change)
+		default:
+			panic(fmt.Sprintf("unsupported action: %d", action))
+		}
+	}
+	sort.Sort(deleted)
+	sort.Sort(added)
+	a := 0
+	d := 0
+	still_deleted := make(object.Changes, 0, deleted.Len())
+	still_added := make(object.Changes, 0, added.Len())
+	for a < added.Len() && d < deleted.Len() {
+		if added[a].hash == deleted[d].hash {
+			reduced_changes = append(
+				reduced_changes,
+				&object.Change{From: deleted[d].change.From, To: added[a].change.To})
+			a++
+			d++
+		} else if added[a].Less(&deleted[d]) {
+			still_added = append(still_added, added[a].change)
+			a++
+		} else {
+			still_deleted = append(still_deleted, deleted[d].change)
+			d++
+		}
+	}
+	for ; a < added.Len(); a++ {
+		still_added = append(still_added, added[a].change)
+	}
+	for ; d < deleted.Len(); d++ {
+		still_deleted = append(still_deleted, deleted[d].change)
+	}
+
+	// Stage 2 - apply the similarity threshold
+	// O(n^2) in the worst case, but close to linear in practice:
+	// we sort the blobs by size and do a single linear scan over blobs of similar size.
+	added_blobs := make(sortableBlobs, 0, still_added.Len())
+	deleted_blobs := make(sortableBlobs, 0, still_deleted.Len())
+	for _, change := range still_added {
+		blob := cache[change.To.TreeEntry.Hash]
+		added_blobs = append(
+			added_blobs, sortableBlob{change: change, size: blob.Size})
+	}
+	for _, change := range still_deleted {
+		blob := cache[change.From.TreeEntry.Hash]
+		deleted_blobs = append(
+			deleted_blobs, sortableBlob{change: change, size: blob.Size})
+	}
+	sort.Sort(added_blobs)
+	sort.Sort(deleted_blobs)
+	d_start := 0
+	for a = 0; a < added_blobs.Len(); a++ {
+		my_blob := cache[added_blobs[a].change.To.TreeEntry.Hash]
+		my_size := added_blobs[a].size
+		for d = d_start; d < deleted_blobs.Len() && !ra.sizesAreClose(my_size, deleted_blobs[d].size); d++ {
+		}
+		d_start = d
+		found_match := false
+		for d = d_start; d < deleted_blobs.Len() && ra.sizesAreClose(my_size, deleted_blobs[d].size); d++ {
+			blobsAreClose, err := ra.blobsAreClose(
+				my_blob, cache[deleted_blobs[d].change.From.TreeEntry.Hash])
+			if err != nil {
+				return nil, err
+			}
+			if blobsAreClose {
+				found_match = true
+				reduced_changes = append(
+					reduced_changes,
+					&object.Change{From: deleted_blobs[d].change.From,
+						To: added_blobs[a].change.To})
+				break
+			}
+		}
+		if found_match {
+			added_blobs = append(added_blobs[:a], added_blobs[a+1:]...)
+			a--
+			deleted_blobs = append(deleted_blobs[:d], deleted_blobs[d+1:]...)
+		}
+	}
+
+	// Stage 3 - we give up, everything left are independent additions and deletions
+	for _, blob := range added_blobs {
+		reduced_changes = append(reduced_changes, blob.change)
+	}
+	for _, blob := range deleted_blobs {
+		reduced_changes = append(reduced_changes, blob.change)
+	}
+	return map[string]interface{}{"renamed_changes": reduced_changes}, nil
+}
+
+func (ra *RenameAnalysis) Finalize() interface{} {
+	return nil
+}
+
+func (ra *RenameAnalysis) sizesAreClose(size1 int64, size2 int64) bool {
+	return abs64(size1-size2)*100/max64(1, min64(size1, size2)) <=
+		int64(100-ra.SimilarityThreshold)
+}
+
+func (ra *RenameAnalysis) blobsAreClose(
+	blob1 *object.Blob, blob2 *object.Blob) (bool, error) {
+	str_from, err := blobToString(blob1)
+	if err != nil {
+		return false, err
+	}
+	str_to, err := blobToString(blob2)
+	if err != nil {
+		return false, err
+	}
+	dmp := diffmatchpatch.New()
+	src, dst, _ := dmp.DiffLinesToRunes(str_from, str_to)
+	diffs := dmp.DiffMainRunes(src, dst, false)
+	common := 0
+	for _, edit := range diffs {
+		if edit.Type == diffmatchpatch.DiffEqual {
+			common += utf8.RuneCountInString(edit.Text)
+		}
+	}
+	return common*100/max(1, min(len(src), len(dst))) >= ra.SimilarityThreshold, nil
+}
+
+type sortableChange struct {
+	change *object.Change
+	hash   plumbing.Hash
+}
+
+type sortableChanges []sortableChange
+
+func (change *sortableChange) Less(other *sortableChange) bool {
+	for x := 0; x < 20; x++ {
+		if change.hash[x] < other.hash[x] {
+			return true
+		}
+		if change.hash[x] > other.hash[x] {
+			return false
+		}
+	}
+	return false
+}
+
+func (slice sortableChanges) Len() int {
+	return len(slice)
+}
+
+func (slice sortableChanges) Less(i, j int) bool {
+	return slice[i].Less(&slice[j])
+}
+
+func (slice sortableChanges) Swap(i, j int) {
+	slice[i], slice[j] = slice[j], slice[i]
+}
+
+type sortableBlob struct {
+	change *object.Change
+	size   int64
+}
+
+type sortableBlobs []sortableBlob
+
+func (change *sortableBlob) Less(other *sortableBlob) bool {
+	return change.size < other.size
+}
+
+func (slice sortableBlobs) Len() int {
+	return len(slice)
+}
+
+func (slice sortableBlobs) Less(i, j int) bool {
+	return slice[i].Less(&slice[j])
+}
+
+func (slice sortableBlobs) Swap(i, j int) {
+	slice[i], slice[j] = slice[j], slice[i]
+}
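
The size pre-filter in sizesAreClose() admits a pair when the size difference is at most (100 - SimilarityThreshold) percent of the smaller blob; with the recommended threshold of 90 that is a 10% tolerance. A standalone sketch of the same arithmetic (min64/max64/abs64 are reimplemented locally for the example):

package main

import "fmt"

func min64(a, b int64) int64 {
	if a < b {
		return a
	}
	return b
}

func max64(a, b int64) int64 {
	if a > b {
		return a
	}
	return b
}

func abs64(v int64) int64 {
	if v < 0 {
		return -v
	}
	return v
}

// the same check as RenameAnalysis.sizesAreClose with SimilarityThreshold = 90
func sizesAreClose(size1, size2 int64) bool {
	return abs64(size1-size2)*100/max64(1, min64(size1, size2)) <= int64(100-90)
}

func main() {
	fmt.Println(sizesAreClose(1000, 1099)) // true: 9% larger than the smaller blob
	fmt.Println(sizesAreClose(1000, 1200)) // false: 20% larger than the smaller blob
}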

+ 103 - 0
toposort/toposort.go

@@ -0,0 +1,103 @@
+package toposort
+
+// Copied from https://github.com/philopon/go-toposort
+
+type Graph struct {
+	nodes   []string
+	outputs map[string]map[string]int
+	inputs  map[string]int
+}
+
+func NewGraph() *Graph {
+	return &Graph{
+		nodes:   []string{},
+		inputs:  map[string]int{},
+		outputs: map[string]map[string]int{},
+	}
+}
+
+func (g *Graph) AddNode(name string) bool {
+	if _, ok := g.outputs[name]; ok {
+		// the node is already registered; do not duplicate it in g.nodes
+		return false
+	}
+	g.nodes = append(g.nodes, name)
+	g.outputs[name] = make(map[string]int)
+	g.inputs[name] = 0
+	return true
+}
+
+func (g *Graph) AddNodes(names ...string) bool {
+	for _, name := range names {
+		if ok := g.AddNode(name); !ok {
+			return false
+		}
+	}
+	return true
+}
+
+func (g *Graph) AddEdge(from, to string) bool {
+	m, ok := g.outputs[from]
+	if !ok {
+		return false
+	}
+
+	m[to] = len(m) + 1
+	g.inputs[to]++
+
+	return true
+}
+
+func (g *Graph) unsafeRemoveEdge(from, to string) {
+	delete(g.outputs[from], to)
+	g.inputs[to]--
+}
+
+func (g *Graph) RemoveEdge(from, to string) bool {
+	if _, ok := g.outputs[from]; !ok {
+		return false
+	}
+	g.unsafeRemoveEdge(from, to)
+	return true
+}
+
+func (g *Graph) Toposort() ([]string, bool) {
+	L := make([]string, 0, len(g.nodes))
+	S := make([]string, 0, len(g.nodes))
+
+	for _, n := range g.nodes {
+		if g.inputs[n] == 0 {
+			S = append(S, n)
+		}
+	}
+
+	for len(S) > 0 {
+		var n string
+		n, S = S[0], S[1:]
+		L = append(L, n)
+
+		ms := make([]string, len(g.outputs[n]))
+		for m, i := range g.outputs[n] {
+			ms[i-1] = m
+		}
+
+		for _, m := range ms {
+			g.unsafeRemoveEdge(n, m)
+
+			if g.inputs[m] == 0 {
+				S = append(S, m)
+			}
+		}
+	}
+
+	N := 0
+	for _, v := range g.inputs {
+		N += v
+	}
+
+	if N > 0 {
+		return L, false
+	}
+
+	return L, true
+}

+ 83 - 0
toposort/toposort_test.go

@@ -0,0 +1,83 @@
+package toposort
+
+import "testing"
+
+func index(s []string, v string) int {
+	for i, s := range s {
+		if s == v {
+			return i
+		}
+	}
+	return -1
+}
+
+type Edge struct {
+	From string
+	To   string
+}
+
+func TestDuplicatedNode(t *testing.T) {
+	graph := NewGraph()
+	graph.AddNode("a")
+	if graph.AddNode("a") {
+		t.Error("not raising duplicated node error")
+	}
+
+}
+
+func TestRemoveNotExistEdge(t *testing.T) {
+	graph := NewGraph()
+	if graph.RemoveEdge("a", "b") {
+		t.Error("not raising not exist edge error")
+	}
+}
+
+func TestWikipedia(t *testing.T) {
+	graph := NewGraph()
+	graph.AddNodes("2", "3", "5", "7", "8", "9", "10", "11")
+
+	edges := []Edge{
+		{"7", "8"},
+		{"7", "11"},
+
+		{"5", "11"},
+
+		{"3", "8"},
+		{"3", "10"},
+
+		{"11", "2"},
+		{"11", "9"},
+		{"11", "10"},
+
+		{"8", "9"},
+	}
+
+	for _, e := range edges {
+		graph.AddEdge(e.From, e.To)
+	}
+
+	result, ok := graph.Toposort()
+	if !ok {
+		t.Error("closed path detected in no closed pathed graph")
+	}
+
+	for _, e := range edges {
+		if i, j := index(result, e.From), index(result, e.To); i > j {
+			t.Errorf("dependency failed: not satisfy %v(%v) > %v(%v)", e.From, i, e.To, j)
+		}
+	}
+}
+
+func TestCycle(t *testing.T) {
+	graph := NewGraph()
+	graph.AddNodes("1", "2", "3")
+
+	graph.AddEdge("1", "2")
+	graph.AddEdge("2", "3")
+	graph.AddEdge("3", "1")
+
+	_, ok := graph.Toposort()
+	if ok {
+		t.Error("closed path not detected in closed pathed graph")
+	}
+}

+ 72 - 0
tree_diff.go

@@ -0,0 +1,72 @@
+package hercules
+
+import (
+	"io"
+
+	"gopkg.in/src-d/go-git.v4"
+	"gopkg.in/src-d/go-git.v4/plumbing/object"
+)
+
+type TreeDiff struct {
+	previousTree *object.Tree
+}
+
+func (treediff *TreeDiff) Name() string {
+	return "TreeDiff"
+}
+
+func (treediff *TreeDiff) Provides() []string {
+	arr := [...]string{"changes"}
+	return arr[:]
+}
+
+func (treediff *TreeDiff) Requires() []string {
+	return []string{}
+}
+
+func (treediff *TreeDiff) Initialize(repository *git.Repository) {
+	treediff.previousTree = nil
+}
+
+func (treediff *TreeDiff) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
+	commit := deps["commit"].(*object.Commit)
+	tree, err := commit.Tree()
+	if err != nil {
+		return nil, err
+	}
+	var diff object.Changes
+	if treediff.previousTree != nil {
+		diff, err = object.DiffTree(treediff.previousTree, tree)
+		if err != nil {
+			return nil, err
+		}
+	} else {
+		diff = []*object.Change{}
+		err = func() error {
+			file_iter := tree.Files()
+			defer file_iter.Close()
+			for {
+				file, err := file_iter.Next()
+				if err != nil {
+					if err == io.EOF {
+						break
+					}
+					return err
+				}
+				diff = append(diff, &object.Change{
+					To: object.ChangeEntry{Name: file.Name, Tree: tree, TreeEntry: object.TreeEntry{
+						Name: file.Name, Mode: file.Mode, Hash: file.Hash}}})
+			}
+			return nil
+		}()
+		if err != nil {
+			return nil, err
+		}
+	}
+	treediff.previousTree = tree
+	return map[string]interface{}{"changes": diff}, nil
+}
+
+func (treediff *TreeDiff) Finalize() interface{} {
+	return nil
+}
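
TreeDiff emits the whole tree of the first consumed commit as pseudo-insertions and diffs every later commit against the previous tree. A sketch of feeding it outside of a Pipeline, assuming a local clone at a placeholder path and commit hashes piped to stdin:

package main

import (
	"fmt"

	"gopkg.in/src-d/go-git.v4"
	"gopkg.in/src-d/go-git.v4/plumbing/object"
	"gopkg.in/src-d/hercules.v1"
)

func main() {
	// "/path/to/repo" is a placeholder for a local clone
	repository, err := git.PlainOpen("/path/to/repo")
	if err != nil {
		panic(err)
	}
	treeDiff := &hercules.TreeDiff{}
	treeDiff.Initialize(repository)
	// "-" reads the commit hashes from stdin, one per line
	commits := hercules.LoadCommitsFromFile("-", repository)
	for index, commit := range commits {
		deps := map[string]interface{}{"commit": commit, "index": index}
		result, err := treeDiff.Consume(deps)
		if err != nil {
			panic(err)
		}
		changes := result["changes"].(object.Changes)
		fmt.Println(commit.Hash, len(changes))
	}
}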