@@ -7,20 +7,26 @@ import (
 	"fmt"
 	"io"
 	"os"
+	"sort"
 	"time"
 	"unicode/utf8"
 
 	"github.com/sergi/go-diff/diffmatchpatch"
 	"gopkg.in/src-d/go-git.v4"
+	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
 	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
 )
 
 type Analyser struct {
-	Repository  *git.Repository
-	Granularity int
-	Sampling    int
-	OnProgress  func(int, int)
+	Repository          *git.Repository
+	Granularity         int
+	Sampling            int
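+	// SimilarityThreshold is the minimum percentage of common lines (0..100)
+	// required to consider a deleted and an added file a rename pair,
+	// in the spirit of git's --find-renames option.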
+	SimilarityThreshold int
+	OnProgress          func(int, int)
 }
 
 func checkClose(c io.Closer) {
@@ -58,11 +64,9 @@ func str(file *object.Blob) string {
 }
 
 func (analyser *Analyser) handleInsertion(
-	change *object.Change, day int, status map[int]int64, files map[string]*File) {
-	blob, err := analyser.Repository.BlobObject(change.To.TreeEntry.Hash)
-	if err != nil {
-		panic(err)
-	}
+	change *object.Change, day int, status map[int]int64, files map[string]*File,
+	cache map[plumbing.Hash]*object.Blob) {
+	blob := cache[change.To.TreeEntry.Hash]
 	lines, err := loc(blob)
 	if err != nil {
 		return
@@ -77,11 +81,9 @@ func (analyser *Analyser) handleInsertion(
 }
 
 func (analyser *Analyser) handleDeletion(
-	change *object.Change, day int, status map[int]int64, files map[string]*File) {
-	blob, err := analyser.Repository.BlobObject(change.From.TreeEntry.Hash)
-	if err != nil {
-		panic(err)
-	}
+	change *object.Change, day int, status map[int]int64, files map[string]*File,
+	cache map[plumbing.Hash]*object.Blob) {
+	blob := cache[change.From.TreeEntry.Hash]
 	lines, err := loc(blob)
 	if err != nil {
 		return
@@ -93,22 +95,17 @@ func (analyser *Analyser) handleDeletion(
 }
 
 func (analyser *Analyser) handleModification(
-	change *object.Change, day int, status map[int]int64, files map[string]*File) {
-	blob_from, err := analyser.Repository.BlobObject(change.From.TreeEntry.Hash)
-	if err != nil {
-		panic(err)
-	}
-	blob_to, err := analyser.Repository.BlobObject(change.To.TreeEntry.Hash)
-	if err != nil {
-		panic(err)
-	}
+	change *object.Change, day int, status map[int]int64, files map[string]*File,
+	cache map[plumbing.Hash]*object.Blob) {
+	blob_from := cache[change.From.TreeEntry.Hash]
+	blob_to := cache[change.To.TreeEntry.Hash]
 	// we are not validating UTF-8 here because for example
 	// git/git 4f7770c87ce3c302e1639a7737a6d2531fe4b160 fetch-pack.c is invalid UTF-8
 	str_from := str(blob_from)
 	str_to := str(blob_to)
 	file, exists := files[change.From.Name]
 	if !exists {
-		analyser.handleInsertion(change, day, status, files)
+		analyser.handleInsertion(change, day, status, files, cache)
 		return
 	}
 	// possible rename
@@ -249,6 +246,243 @@ func (analyser *Analyser) groupStatus(status map[int]int64, day int) []int64 {
 	return result
 }
 
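+// sortableChange pairs a change with the blob hash it references, so that
+// insertions and deletions can be matched by exact hash.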
+type sortableChange struct {
+	change *object.Change
+	hash   plumbing.Hash
+}
+
+type sortableChanges []sortableChange
+
+func (change *sortableChange) Less(other *sortableChange) bool {
+	for x := 0; x < 20; x++ {
+		if change.hash[x] < other.hash[x] {
+			return true
+		}
+		if change.hash[x] > other.hash[x] {
+			return false
+		}
+	}
+	return false
+}
+
+func (slice sortableChanges) Len() int {
+	return len(slice)
+}
+
+func (slice sortableChanges) Less(i, j int) bool {
+	return slice[i].Less(&slice[j])
+}
+
+func (slice sortableChanges) Swap(i, j int) {
+	slice[i], slice[j] = slice[j], slice[i]
+}
+
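+// sortableBlob pairs a change with its blob size, so that the remaining
+// candidates can be ordered by size for the similarity scan.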
+type sortableBlob struct {
+	change *object.Change
+	size   int64
+}
+
+type sortableBlobs []sortableBlob
+
+func (change *sortableBlob) Less(other *sortableBlob) bool {
+	return change.size < other.size
+}
+
+func (slice sortableBlobs) Len() int {
+	return len(slice)
+}
+
+func (slice sortableBlobs) Less(i, j int) bool {
+	return slice[i].Less(&slice[j])
+}
+
+func (slice sortableBlobs) Swap(i, j int) {
+	slice[i], slice[j] = slice[j], slice[i]
+}
+
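+// sizesAreClose reports whether two byte sizes differ by no more than
+// (100 - SimilarityThreshold) percent of the smaller one.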
+func (analyser *Analyser) sizesAreClose(size1 int64, size2 int64) bool {
+	// guard against division by zero: only an empty blob matches an empty blob
+	if min64(size1, size2) == 0 {
+		return size1 == size2
+	}
+	return abs64(size1-size2)*100/min64(size1, size2) <=
+		int64(100-analyser.SimilarityThreshold)
+}
+
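+// blobsAreClose diffs two blobs line by line and reports whether the share
+// of unchanged lines reaches SimilarityThreshold percent of the shorter one.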
+func (analyser *Analyser) blobsAreClose(
+	blob1 *object.Blob, blob2 *object.Blob) bool {
+	str_from := str(blob1)
+	str_to := str(blob2)
+	dmp := diffmatchpatch.New()
+	src, dst, _ := dmp.DiffLinesToRunes(str_from, str_to)
+	diffs := dmp.DiffMainRunes(src, dst, false)
+	common := 0
+	for _, edit := range diffs {
+		if edit.Type == diffmatchpatch.DiffEqual {
+			common += utf8.RuneCountInString(edit.Text)
+		}
+	}
+	if len(src) == 0 || len(dst) == 0 {
+		// empty blobs cannot be compared; only an empty blob matches an empty one
+		return len(src) == len(dst)
+	}
+	return common*100/min(len(src), len(dst)) >=
+		analyser.SimilarityThreshold
+}
+
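+// getBlob reads a blob object from the repository, panicking on failure.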
+func (analyser *Analyser) getBlob(hash plumbing.Hash) *object.Blob {
+	blob, err := analyser.Repository.BlobObject(hash)
+	if err != nil {
+		panic(err)
+	}
+	return blob
+}
+
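+// cacheBlobs fetches every blob referenced by the change set in advance,
+// so the change handlers can look them up by hash instead of re-reading them.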
+func (analyser *Analyser) cacheBlobs(changes object.Changes) map[plumbing.Hash]*object.Blob {
+	cache := make(map[plumbing.Hash]*object.Blob)
+	for _, change := range changes {
+		action, err := change.Action()
+		if err != nil {
+			panic(err)
+		}
+		switch action {
+		case merkletrie.Insert:
+			cache[change.To.TreeEntry.Hash] = analyser.getBlob(change.To.TreeEntry.Hash)
+		case merkletrie.Delete:
+			cache[change.From.TreeEntry.Hash] = analyser.getBlob(change.From.TreeEntry.Hash)
+		case merkletrie.Modify:
+			cache[change.To.TreeEntry.Hash] = analyser.getBlob(change.To.TreeEntry.Hash)
+			cache[change.From.TreeEntry.Hash] = analyser.getBlob(change.From.TreeEntry.Hash)
+		default:
+			panic(fmt.Sprintf("unsupported action: %d", action))
+		}
+	}
+	return cache
+}
+
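+// detectRenames folds deletion/insertion pairs which look like the same file
+// into single rename changes: first by exact blob hash, then by similarity.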
+func (analyser *Analyser) detectRenames(
+	changes object.Changes, cache map[plumbing.Hash]*object.Blob) object.Changes {
+	reduced_changes := make(object.Changes, 0, changes.Len())
+
+	// Stage 1 - find renames by matching the hashes
+	// n log(n)
+	// We sort additions and deletions by hash and then do the single scan along
+	// both slices.
+	deleted := make(sortableChanges, 0, changes.Len())
+	added := make(sortableChanges, 0, changes.Len())
+	for _, change := range changes {
+		action, err := change.Action()
+		if err != nil {
+			panic(err)
+		}
+		switch action {
+		case merkletrie.Insert:
+			added = append(added, sortableChange{change, change.To.TreeEntry.Hash})
+		case merkletrie.Delete:
+			deleted = append(deleted, sortableChange{change, change.From.TreeEntry.Hash})
+		case merkletrie.Modify:
+			reduced_changes = append(reduced_changes, change)
+		default:
+			panic(fmt.Sprintf("unsupported action: %d", action))
+		}
+	}
+	sort.Sort(deleted)
+	sort.Sort(added)
+	a := 0
+	d := 0
+	still_deleted := make(object.Changes, 0, deleted.Len())
+	still_added := make(object.Changes, 0, added.Len())
+	for a < added.Len() && d < deleted.Len() {
+		if added[a].hash == deleted[d].hash {
+			reduced_changes = append(
+				reduced_changes,
+				&object.Change{From: deleted[d].change.From, To: added[a].change.To})
+			a++
+			d++
+		} else if added[a].Less(&deleted[d]) {
+			still_added = append(still_added, added[a].change)
+			a++
+		} else {
+			still_deleted = append(still_deleted, deleted[d].change)
+			d++
+		}
+	}
+	for ; a < added.Len(); a++ {
+		still_added = append(still_added, added[a].change)
+	}
+	for ; d < deleted.Len(); d++ {
+		still_deleted = append(still_deleted, deleted[d].change)
+	}
+
+	// Stage 2 - apply the similarity threshold
+	// n^2 in the worst case, but nearly linear in practice:
+	// we sort the blobs by size and do a single scan with a sliding window.
+	added_blobs := make(sortableBlobs, 0, still_added.Len())
+	deleted_blobs := make(sortableBlobs, 0, still_deleted.Len())
+	for _, change := range still_added {
+		blob := cache[change.To.TreeEntry.Hash]
+		added_blobs = append(
+			added_blobs, sortableBlob{change: change, size: blob.Size})
+	}
+	for _, change := range still_deleted {
+		blob := cache[change.From.TreeEntry.Hash]
+		deleted_blobs = append(
+			deleted_blobs, sortableBlob{change: change, size: blob.Size})
+	}
+	sort.Sort(added_blobs)
+	sort.Sort(deleted_blobs)
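+	// Both slices are sorted by size, so the window of size-compatible
+	// deleted blobs only slides forward as we walk the added blobs.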
+	d_start := 0
+	for a = 0; a < added_blobs.Len(); a++ {
+		my_blob := cache[added_blobs[a].change.To.TreeEntry.Hash]
+		my_size := added_blobs[a].size
+		for d = d_start; d < deleted_blobs.Len() && !analyser.sizesAreClose(my_size, deleted_blobs[d].size); d++ {
+		}
+		d_start = d
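+		// try every deleted blob inside the size window for content similarity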
+		found_match := false
+		for d = d_start; d < deleted_blobs.Len() && analyser.sizesAreClose(my_size, deleted_blobs[d].size); d++ {
+			if analyser.blobsAreClose(
+				my_blob, cache[deleted_blobs[d].change.From.TreeEntry.Hash]) {
+				found_match = true
+				reduced_changes = append(
+					reduced_changes,
+					&object.Change{From: deleted_blobs[d].change.From,
+						To: added_blobs[a].change.To})
+				break
+			}
+		}
+		if found_match {
+			added_blobs = append(added_blobs[:a], added_blobs[a+1:]...)
+			a--
+			deleted_blobs = append(deleted_blobs[:d], deleted_blobs[d+1:]...)
+		}
+	}
+
+	// Stage 3 - we give up, everything left are independent additions and deletions
+	for _, blob := range added_blobs {
+		reduced_changes = append(reduced_changes, blob.change)
+	}
+	for _, blob := range deleted_blobs {
+		reduced_changes = append(reduced_changes, blob.change)
+	}
+	return reduced_changes
+}
+
 func (analyser *Analyser) Analyse(commits []*object.Commit) [][]int64 {
 	sampling := analyser.Sampling
 	if sampling == 0 {
@@ -258,6 +492,9 @@ func (analyser *Analyser) Analyse(commits []*object.Commit) [][]int64 {
 	if onProgress == nil {
 		onProgress = func(int, int) {}
 	}
+	if analyser.SimilarityThreshold < 0 || analyser.SimilarityThreshold > 100 {
+		panic("hercules.Analyser: an invalid SimilarityThreshold was specified")
+	}
 
 	// current daily alive number of lines; key is the number of days from the
 	// beginning of the history
@@ -311,6 +548,8 @@ func (analyser *Analyser) Analyse(commits []*object.Commit) [][]int64 {
 		if err != nil {
 			panic(err)
 		}
+		cache := analyser.cacheBlobs(tree_diff)
+		tree_diff = analyser.detectRenames(tree_diff, cache)
 		for _, change := range tree_diff {
 			action, err := change.Action()
 			if err != nil {
@@ -318,9 +557,9 @@ func (analyser *Analyser) Analyse(commits []*object.Commit) [][]int64 {
 			}
 			switch action {
 			case merkletrie.Insert:
-				analyser.handleInsertion(change, day, status, files)
+				analyser.handleInsertion(change, day, status, files, cache)
 			case merkletrie.Delete:
-				analyser.handleDeletion(change, day, status, files)
+				analyser.handleDeletion(change, day, status, files, cache)
 			case merkletrie.Modify:
 				func() {
 					defer func() {
@@ -330,10 +569,8 @@ func (analyser *Analyser) Analyse(commits []*object.Commit) [][]int64 {
 							panic(r)
 						}
 					}()
-					analyser.handleModification(change, day, status, files)
+					analyser.handleModification(change, day, status, files, cache)
 				}()
-			default:
-				panic(fmt.Sprintf("unsupported action: %d", change.Action))
 			}
 		}
 	}