
Add renames detection

Vadim Markovtsev, 8 years ago
Commit 53d4a048b0
3 changed files with 397 additions and 170 deletions
  1. analyser.go (+236 −29)
  2. cmd/hercules/main.go (+144 −141)
  3. file.go (+17 −0)

analyser.go (+236 −29)

@@ -7,20 +7,23 @@ import (
 	"fmt"
 	"io"
 	"os"
+	"sort"
 	"time"
 	"unicode/utf8"
 
 	"github.com/sergi/go-diff/diffmatchpatch"
 	"gopkg.in/src-d/go-git.v4"
+	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
 	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
 )
 
 type Analyser struct {
-	Repository  *git.Repository
-	Granularity int
-	Sampling    int
-	OnProgress  func(int, int)
+	Repository          *git.Repository
+	Granularity         int
+	Sampling            int
+	SimilarityThreshold int
+	OnProgress          func(int, int)
 }
 
 func checkClose(c io.Closer) {
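
The new SimilarityThreshold field is a percentage from 0 to 100; Analyse panics on any other value (see the check added further down). A minimal construction sketch, mirroring what cmd/hercules/main.go does after this commit (repo is assumed to be an already-opened *git.Repository):

    analyser := hercules.Analyser{
        Repository:          repo,
        Granularity:         30,
        Sampling:            30,
        SimilarityThreshold: 90, // like git's -M90%: blobs must be at least 90% similar
    }
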
@@ -58,11 +61,9 @@ func str(file *object.Blob) string {
 }
 
 func (analyser *Analyser) handleInsertion(
-	change *object.Change, day int, status map[int]int64, files map[string]*File) {
-	blob, err := analyser.Repository.BlobObject(change.To.TreeEntry.Hash)
-	if err != nil {
-		panic(err)
-	}
+	change *object.Change, day int, status map[int]int64, files map[string]*File,
+	cache *map[plumbing.Hash]*object.Blob) {
+	blob := (*cache)[change.To.TreeEntry.Hash]
 	lines, err := loc(blob)
 	if err != nil {
 		return
@@ -77,11 +78,9 @@ func (analyser *Analyser) handleInsertion(
 }
 
 func (analyser *Analyser) handleDeletion(
-	change *object.Change, day int, status map[int]int64, files map[string]*File) {
-	blob, err := analyser.Repository.BlobObject(change.From.TreeEntry.Hash)
-	if err != nil {
-		panic(err)
-	}
+	change *object.Change, day int, status map[int]int64, files map[string]*File,
+	cache *map[plumbing.Hash]*object.Blob) {
+	blob := (*cache)[change.From.TreeEntry.Hash]
 	lines, err := loc(blob)
 	if err != nil {
 		return
@@ -93,22 +92,17 @@ func (analyser *Analyser) handleDeletion(
 }
 
 func (analyser *Analyser) handleModification(
-	change *object.Change, day int, status map[int]int64, files map[string]*File) {
-	blob_from, err := analyser.Repository.BlobObject(change.From.TreeEntry.Hash)
-	if err != nil {
-		panic(err)
-	}
-	blob_to, err := analyser.Repository.BlobObject(change.To.TreeEntry.Hash)
-	if err != nil {
-		panic(err)
-	}
+	change *object.Change, day int, status map[int]int64, files map[string]*File,
+	cache *map[plumbing.Hash]*object.Blob) {
+	blob_from := (*cache)[change.From.TreeEntry.Hash]
+	blob_to := (*cache)[change.To.TreeEntry.Hash]
 	// we are not validating UTF-8 here because for example
 	// git/git 4f7770c87ce3c302e1639a7737a6d2531fe4b160 fetch-pack.c is invalid UTF-8
 	str_from := str(blob_from)
 	str_to := str(blob_to)
 	file, exists := files[change.From.Name]
 	if !exists {
-		analyser.handleInsertion(change, day, status, files)
+		analyser.handleInsertion(change, day, status, files, cache)
 		return
 	}
 	// possible rename
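
All three handlers now read blobs from a cache built once per commit instead of each calling Repository.BlobObject and panicking on failure. One aside: Go maps are reference types, so passing map[plumbing.Hash]*object.Blob by value already shares the underlying storage; the *map[...] indirection is not strictly needed. A tiny self-contained illustration of that property:

    package main

    import "fmt"

    // fill mutates the map it receives; the caller observes the change
    // because a map value is a small header pointing at shared storage.
    func fill(m map[string]int) { m["a"] = 1 }

    func main() {
        m := map[string]int{}
        fill(m)
        fmt.Println(m["a"]) // prints 1
    }
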
@@ -249,6 +243,216 @@ func (analyser *Analyser) groupStatus(status map[int]int64, day int) []int64 {
 	return result
 }
 
+type sortableChange struct {
+	change *object.Change
+	hash   plumbing.Hash
+}
+
+type sortableChanges []sortableChange
+
+func (change *sortableChange) Less(other *sortableChange) bool {
+	// lexicographic comparison over the 20 hash bytes
+	for x := 0; x < 20; x++ {
+		if change.hash[x] < other.hash[x] {
+			return true
+		}
+		if change.hash[x] > other.hash[x] {
+			return false
+		}
+	}
+	return false
+}
+
+func (slice sortableChanges) Len() int {
+	return len(slice)
+}
+
+func (slice sortableChanges) Less(i, j int) bool {
+	return slice[i].Less(&slice[j])
+}
+
+func (slice sortableChanges) Swap(i, j int) {
+	slice[i], slice[j] = slice[j], slice[i]
+}
+
+type sortableBlob struct {
+	change *object.Change
+	size   int64
+}
+
+type sortableBlobs []sortableBlob
+
+func (change *sortableBlob) Less(other *sortableBlob) bool {
+	return change.size < other.size
+}
+
+func (slice sortableBlobs) Len() int {
+	return len(slice)
+}
+
+func (slice sortableBlobs) Less(i, j int) bool {
+	return slice[i].Less(&slice[j])
+}
+
+func (slice sortableBlobs) Swap(i, j int) {
+	slice[i], slice[j] = slice[j], slice[i]
+}
+
+func (analyser *Analyser) sizesAreClose(size1 int64, size2 int64) bool {
+	return abs64(size1-size2)*100/min64(size1, size2) <=
+		int64(100-analyser.SimilarityThreshold)
+}
+
+func (analyser *Analyser) blobsAreClose(
+	blob1 *object.Blob, blob2 *object.Blob) bool {
+	str_from := str(blob1)
+	str_to := str(blob2)
+	dmp := diffmatchpatch.New()
+	src, dst, _ := dmp.DiffLinesToRunes(str_from, str_to)
+	diffs := dmp.DiffMainRunes(src, dst, false)
+	common := 0
+	for _, edit := range diffs {
+		if edit.Type == diffmatchpatch.DiffEqual {
+			common += utf8.RuneCountInString(edit.Text)
+		}
+	}
+	return common*100/min(len(src), len(dst)) >=
+		analyser.SimilarityThreshold
+}
+
+func (analyser *Analyser) getBlob(hash plumbing.Hash) *object.Blob {
+	blob, err := analyser.Repository.BlobObject(hash)
+	if err != nil {
+		panic(err)
+	}
+	return blob
+}
+
+func (analyser *Analyser) cacheBlobs(changes object.Changes) *map[plumbing.Hash]*object.Blob {
+	cache := make(map[plumbing.Hash]*object.Blob)
+	for _, change := range changes {
+		action, err := change.Action()
+		if err != nil {
+			panic(err)
+		}
+		switch action {
+		case merkletrie.Insert:
+			cache[change.To.TreeEntry.Hash] = analyser.getBlob(change.To.TreeEntry.Hash)
+		case merkletrie.Delete:
+			cache[change.From.TreeEntry.Hash] = analyser.getBlob(change.From.TreeEntry.Hash)
+		case merkletrie.Modify:
+			cache[change.To.TreeEntry.Hash] = analyser.getBlob(change.To.TreeEntry.Hash)
+			cache[change.From.TreeEntry.Hash] = analyser.getBlob(change.From.TreeEntry.Hash)
+		default:
+			panic(fmt.Sprintf("unsupported action: %d", action))
+		}
+	}
+	return &cache
+}
+
+func (analyser *Analyser) detectRenames(
+	changes object.Changes, cache *map[plumbing.Hash]*object.Blob) object.Changes {
+	reduced_changes := make(object.Changes, 0, changes.Len())
+
+	// Stage 1 - find renames by matching the hashes
+	// n log(n)
+	// We sort additions and deletions by hash and then do the single scan along
+	// both slices.
+	deleted := make(sortableChanges, 0, changes.Len())
+	added := make(sortableChanges, 0, changes.Len())
+	for _, change := range changes {
+		action, err := change.Action()
+		if err != nil {
+			panic(err)
+		}
+		switch action {
+		case merkletrie.Insert:
+			added = append(added, sortableChange{change, change.To.TreeEntry.Hash})
+		case merkletrie.Delete:
+			deleted = append(deleted, sortableChange{change, change.From.TreeEntry.Hash})
+		case merkletrie.Modify:
+			reduced_changes = append(reduced_changes, change)
+		default:
+			panic(fmt.Sprintf("unsupported action: %d", action))
+		}
+	}
+	sort.Sort(deleted)
+	sort.Sort(added)
+	a := 0
+	d := 0
+	still_deleted := make(object.Changes, 0, deleted.Len())
+	still_added := make(object.Changes, 0, added.Len())
+	for a < added.Len() && d < deleted.Len() {
+		if added[a].hash == deleted[d].hash {
+			reduced_changes = append(
+				reduced_changes,
+				&object.Change{From: deleted[d].change.From, To: added[a].change.To})
+			a++
+			d++
+		} else if added[a].Less(&deleted[d]) {
+			still_added = append(still_added, added[a].change)
+			a++
+		} else {
+			still_deleted = append(still_deleted, deleted[d].change)
+			d++
+		}
+	}
+	for ; a < added.Len(); a++ {
+		still_added = append(still_added, added[a].change)
+	}
+	for ; d < deleted.Len(); d++ {
+		still_deleted = append(still_deleted, deleted[d].change)
+	}
+
+	// Stage 2 - apply the similarity threshold
+	// n^2 but actually linear
+	// We sort the blobs by size and do the single linear scan.
+	added_blobs := make(sortableBlobs, 0, still_added.Len())
+	deleted_blobs := make(sortableBlobs, 0, still_deleted.Len())
+	for _, change := range still_added {
+		blob := (*cache)[change.To.TreeEntry.Hash]
+		added_blobs = append(
+			added_blobs, sortableBlob{change: change, size: blob.Size})
+	}
+	for _, change := range still_deleted {
+		blob := (*cache)[change.From.TreeEntry.Hash]
+		deleted_blobs = append(
+			deleted_blobs, sortableBlob{change: change, size: blob.Size})
+	}
+	sort.Sort(added_blobs)
+	sort.Sort(deleted_blobs)
+	d_start := 0
+	for a = 0; a < added_blobs.Len(); a++ {
+		my_blob := (*cache)[added_blobs[a].change.To.TreeEntry.Hash]
+		my_size := added_blobs[a].size
+		for d = d_start; d < deleted_blobs.Len() && !analyser.sizesAreClose(my_size, deleted_blobs[d].size); d++ {
+		}
+		d_start = d
+		found_match := false
+		for d = d_start; d < deleted_blobs.Len() && analyser.sizesAreClose(my_size, deleted_blobs[d].size); d++ {
+			if analyser.blobsAreClose(
+				my_blob, (*cache)[deleted_blobs[d].change.From.TreeEntry.Hash]) {
+				found_match = true
+				reduced_changes = append(
+					reduced_changes,
+					&object.Change{From: deleted_blobs[d].change.From,
+						To: added_blobs[a].change.To})
+				break
+			}
+		}
+		if found_match {
+			added_blobs = append(added_blobs[:a], added_blobs[a+1:]...)
+			a--
+			deleted_blobs = append(deleted_blobs[:d], deleted_blobs[d+1:]...)
+		}
+	}
+
+	// Stage 3 - we give up; everything left is an independent addition or deletion
+	for _, blob := range added_blobs {
+		reduced_changes = append(reduced_changes, blob.change)
+	}
+	for _, blob := range deleted_blobs {
+		reduced_changes = append(reduced_changes, blob.change)
+	}
+	return reduced_changes
+}
+
 func (analyser *Analyser) Analyse(commits []*object.Commit) [][]int64 {
 	sampling := analyser.Sampling
 	if sampling == 0 {
@@ -258,6 +462,9 @@ func (analyser *Analyser) Analyse(commits []*object.Commit) [][]int64 {
 	if onProgress == nil {
 		onProgress = func(int, int) {}
 	}
+	if analyser.SimilarityThreshold < 0 || analyser.SimilarityThreshold > 100 {
+		panic("hercules.Analyser: an invalid SimilarityThreshold was specified")
+	}
 
 	// current daily alive number of lines; key is the number of days from the
 	// beginning of the history
@@ -311,6 +518,8 @@ func (analyser *Analyser) Analyse(commits []*object.Commit) [][]int64 {
 			if err != nil {
 				panic(err)
 			}
+			cache := analyser.cacheBlobs(tree_diff)
+			tree_diff = analyser.detectRenames(tree_diff, cache)
 			for _, change := range tree_diff {
 				action, err := change.Action()
 				if err != nil {
@@ -318,9 +527,9 @@ func (analyser *Analyser) Analyse(commits []*object.Commit) [][]int64 {
 				}
 				switch action {
 				case merkletrie.Insert:
-					analyser.handleInsertion(change, day, status, files)
+					analyser.handleInsertion(change, day, status, files, cache)
 				case merkletrie.Delete:
-					analyser.handleDeletion(change, day, status, files)
+					analyser.handleDeletion(change, day, status, files, cache)
 				case merkletrie.Modify:
 					func() {
 						defer func() {
@@ -330,10 +539,8 @@ func (analyser *Analyser) Analyse(commits []*object.Commit) [][]int64 {
 								panic(r)
 							}
 						}()
-						analyser.handleModification(change, day, status, files)
+						analyser.handleModification(change, day, status, files, cache)
 					}()
-				default:
-					panic(fmt.Sprintf("unsupported action: %d", change.Action))
 				}
 			}
 		}

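detectRenames pairs additions with deletions in three stages: exact matches by blob hash (both slices sorted, then one merge-style scan), approximate matches by size and then by content, and finally whatever is left stays an independent addition or deletion. sizesAreClose accepts a pair when the size difference, as a percentage of the smaller blob, is within 100 - SimilarityThreshold; note that an empty blob makes the divisor min64(size1, size2) zero, which this commit does not guard against. A standalone sketch of the same arithmetic (not the committed code):

    // With threshold 90, sizes may differ by at most 10% of the smaller blob.
    func sizesAreClose(size1, size2, threshold int64) bool {
        diff := size1 - size2
        if diff < 0 {
            diff = -diff
        }
        smaller := size1
        if size2 < smaller {
            smaller = size2
        }
        return diff*100/smaller <= 100-threshold
    }

    // sizesAreClose(1000, 1099, 90) == true  (9% apart)
    // sizesAreClose(1000, 1250, 90) == false (25% apart)

blobsAreClose applies the same threshold to content: DiffLinesToRunes maps each distinct line to a rune, DiffMainRunes diffs those rune strings, and the pair is accepted when the common lines make up at least SimilarityThreshold percent of the smaller side.
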
cmd/hercules/main.go (+144 −141)

@@ -1,152 +1,155 @@
 package main
 
 import (
-    "bufio"
-    "flag"
-    "fmt"
-    "io"
-    "os"
-    "runtime/pprof"
-    "strconv"
-    "strings"
+	"bufio"
+	"flag"
+	"fmt"
+	"io"
+	"os"
+	"runtime/pprof"
+	"strconv"
+	"strings"
 
-    "gopkg.in/src-d/go-billy.v2/osfs"
-    "gopkg.in/src-d/go-git.v4"
-    "gopkg.in/src-d/go-git.v4/storage"
-    "gopkg.in/src-d/go-git.v4/storage/memory"
-    "gopkg.in/src-d/go-git.v4/storage/filesystem"
-    "gopkg.in/src-d/go-git.v4/plumbing"
-    "gopkg.in/src-d/go-git.v4/plumbing/object"
-    "gopkg.in/src-d/hercules.v1"
+	"gopkg.in/src-d/go-billy.v2/osfs"
+	"gopkg.in/src-d/go-git.v4"
+	"gopkg.in/src-d/go-git.v4/plumbing"
+	"gopkg.in/src-d/go-git.v4/plumbing/object"
+	"gopkg.in/src-d/go-git.v4/storage"
+	"gopkg.in/src-d/go-git.v4/storage/filesystem"
+	"gopkg.in/src-d/go-git.v4/storage/memory"
+	"gopkg.in/src-d/hercules.v1"
 )
 
 func loadCommitsFromFile(path string, repository *git.Repository) []*object.Commit {
-    var file io.Reader
-    if path != "-" {
-        file, err := os.Open(path)
-        if err != nil {
-            panic(err)
-        }
-        defer file.Close()
-    } else {
-        file = os.Stdin
-    }
-    scanner := bufio.NewScanner(file)
-    commits := []*object.Commit{}
-    for scanner.Scan() {
-        hash := plumbing.NewHash(scanner.Text())
-        if len(hash) != 20 {
-            panic("invalid commit hash " + scanner.Text())
-        }
-        commit, err := repository.CommitObject(hash)
-        if err != nil {
-            panic(err)
-        }
-        commits = append(commits, commit)
-    }
-    return commits
+	var file io.Reader
+	if path != "-" {
+		opened, err := os.Open(path)
+		if err != nil {
+			panic(err)
+		}
+		defer opened.Close()
+		file = opened // assign to the outer file; := here would shadow it and leave it nil
+	} else {
+		file = os.Stdin
+	}
+	scanner := bufio.NewScanner(file)
+	commits := []*object.Commit{}
+	for scanner.Scan() {
+		hash := plumbing.NewHash(scanner.Text())
+		if len(hash) != 20 {
+			panic("invalid commit hash " + scanner.Text())
+		}
+		commit, err := repository.CommitObject(hash)
+		if err != nil {
+			panic(err)
+		}
+		commits = append(commits, commit)
+	}
+	return commits
 }
 
 func main() {
-    var profile bool
-    var granularity, sampling int
-    var commitsFile string
-    flag.BoolVar(&profile, "profile", false, "Collect the profile to hercules.pprof.")
-    flag.IntVar(&granularity, "granularity", 30, "Report granularity in days.")
-    flag.IntVar(&sampling, "sampling", 30, "Report sampling in days.")
-    flag.StringVar(&commitsFile, "commits", "", "Path to the text file with the " +
-        "commit history to follow instead of the default rev-list " +
-            "--first-parent. The format is the list of hashes, each hash on a " +
-            "separate line. The first hash is the root.")
-    flag.Parse()
-    if (granularity <= 0) {
-        fmt.Fprint(os.Stderr, "Warning: adjusted the granularity to 1 day\n")
-        granularity = 1
-    }
-    if profile {
-        prof, _ := os.Create("hercules.pprof")
-        pprof.StartCPUProfile(prof)
-        defer pprof.StopCPUProfile()
-    }
-    if len(flag.Args()) == 0 || len(flag.Args()) > 3 {
-        fmt.Fprint(os.Stderr,
-                     "Usage: hercules <path to repo or URL> [<disk cache path>]\n")
-        os.Exit(1)
-    }
-  uri := flag.Arg(0)
-    var repository *git.Repository
-    var storage storage.Storer
-    var err error
-    if strings.Contains(uri, "://") {
-        if len(flag.Args()) == 2 {
-            storage, err = filesystem.NewStorage(osfs.New(flag.Arg(1)))
-            if err != nil {
-                panic(err)
-            }
-        } else {
-            storage = memory.NewStorage()
-        }
-        fmt.Fprint(os.Stderr, "cloning...\r")
-        repository, err = git.Clone(storage, nil, &git.CloneOptions{
-            URL: uri,
-        })
-        fmt.Fprint(os.Stderr, "          \r")
-    } else {
-        if uri[len(uri) - 1] == os.PathSeparator {
-            uri = uri[:len(uri) - 1]
-        }
-        if !strings.HasSuffix(uri, ".git") {
-            uri += string(os.PathSeparator) + ".git"
-        }
-        repository, err = git.PlainOpen(uri)
-    }
-    if err != nil {
-        panic(err)
-    }
-    // core logic
-    analyser := hercules.Analyser{
-        Repository: repository,
-        OnProgress: func(commit, length int) {
-          fmt.Fprintf(os.Stderr, "%d / %d\r", commit, length)
-      },
-        Granularity: granularity,
-        Sampling: sampling,
-    }
-    // list of commits belonging to the default branch, from oldest to newest
-    // rev-list --first-parent
-    var commits []*object.Commit
-    if commitsFile == "" {
-        commits = analyser.Commits()
-    } else {
-        commits = loadCommitsFromFile(commitsFile, repository)
-    }
-    statuses := analyser.Analyse(commits)
-    fmt.Fprint(os.Stderr, "                \r")
-    if len(statuses) == 0 {
-        return
-    }
-    // determine the maximum length of each value
-    var maxnum int64
-    for _, status := range statuses {
-        for _, val := range status {
-            if val > maxnum {
-                maxnum = val
-            }
-        }
-    }
-    width := len(strconv.FormatInt(maxnum, 10))
-    last := len(statuses[len(statuses) - 1])
-    // print the start date, granularity, sampling
-    fmt.Println(commits[0].Author.When.Unix(), granularity, sampling)
-    // print the resulting triangle matrix
-    for _, status := range statuses {
-        for i := 0; i < last; i++ {
-            var val int64
-            if i < len(status) {
-                val = status[i]
-            }
-            fmt.Printf("%[1]*[2]d ", width, val)
-        }
-        fmt.Println()
-    }
+	var profile bool
+	var granularity, sampling, similarity_threshold int
+	var commitsFile string
+	flag.BoolVar(&profile, "profile", false, "Collect the profile to hercules.pprof.")
+	flag.IntVar(&granularity, "granularity", 30, "Report granularity in days.")
+	flag.IntVar(&sampling, "sampling", 30, "Report sampling in days.")
+	flag.IntVar(&similarity_threshold, "M", 90,
+		"A threshold on the similarity index used to detect renames.")
+	flag.StringVar(&commitsFile, "commits", "", "Path to the text file with the "+
+		"commit history to follow instead of the default rev-list "+
+		"--first-parent. The format is the list of hashes, each hash on a "+
+		"separate line. The first hash is the root.")
+	flag.Parse()
+	if granularity <= 0 {
+		fmt.Fprint(os.Stderr, "Warning: adjusted the granularity to 1 day\n")
+		granularity = 1
+	}
+	if profile {
+		prof, _ := os.Create("hercules.pprof")
+		pprof.StartCPUProfile(prof)
+		defer pprof.StopCPUProfile()
+	}
+	if len(flag.Args()) == 0 || len(flag.Args()) > 3 {
+		fmt.Fprint(os.Stderr,
+			"Usage: hercules <path to repo or URL> [<disk cache path>]\n")
+		os.Exit(1)
+	}
+	uri := flag.Arg(0)
+	var repository *git.Repository
+	var storage storage.Storer
+	var err error
+	if strings.Contains(uri, "://") {
+		if len(flag.Args()) == 2 {
+			storage, err = filesystem.NewStorage(osfs.New(flag.Arg(1)))
+			if err != nil {
+				panic(err)
+			}
+		} else {
+			storage = memory.NewStorage()
+		}
+		fmt.Fprint(os.Stderr, "cloning...\r")
+		repository, err = git.Clone(storage, nil, &git.CloneOptions{
+			URL: uri,
+		})
+		fmt.Fprint(os.Stderr, "          \r")
+	} else {
+		if uri[len(uri)-1] == os.PathSeparator {
+			uri = uri[:len(uri)-1]
+		}
+		if !strings.HasSuffix(uri, ".git") {
+			uri += string(os.PathSeparator) + ".git"
+		}
+		repository, err = git.PlainOpen(uri)
+	}
+	if err != nil {
+		panic(err)
+	}
+	// core logic
+	analyser := hercules.Analyser{
+		Repository: repository,
+		OnProgress: func(commit, length int) {
+			fmt.Fprintf(os.Stderr, "%d / %d\r", commit, length)
+		},
+		Granularity:         granularity,
+		Sampling:            sampling,
+		SimilarityThreshold: similarity_threshold,
+	}
+	// list of commits belonging to the default branch, from oldest to newest
+	// rev-list --first-parent
+	var commits []*object.Commit
+	if commitsFile == "" {
+		commits = analyser.Commits()
+	} else {
+		commits = loadCommitsFromFile(commitsFile, repository)
+	}
+	statuses := analyser.Analyse(commits)
+	fmt.Fprint(os.Stderr, "                \r")
+	if len(statuses) == 0 {
+		return
+	}
+	// determine the maximum length of each value
+	var maxnum int64
+	for _, status := range statuses {
+		for _, val := range status {
+			if val > maxnum {
+				maxnum = val
+			}
+		}
+	}
+	width := len(strconv.FormatInt(maxnum, 10))
+	last := len(statuses[len(statuses)-1])
+	// print the start date, granularity, sampling
+	fmt.Println(commits[0].Author.When.Unix(), granularity, sampling)
+	// print the resulting triangle matrix
+	for _, status := range statuses {
+		for i := 0; i < last; i++ {
+			var val int64
+			if i < len(status) {
+				val = status[i]
+			}
+			fmt.Printf("%[1]*[2]d ", width, val)
+		}
+		fmt.Println()
+	}
 }

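The new -M flag exposes the threshold on the command line with a default of 90, in the spirit of git's own -M/--find-renames option. For example (hypothetical repository path):

    hercules -M 80 /path/to/repo

Apart from the new flag and the SimilarityThreshold field in the Analyser literal, this file is a whitespace change: the body was reindented from spaces to tabs and the imports sorted, which is why nearly every line appears in the diff.
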
file.go (+17 −0)

@@ -9,6 +9,9 @@ type File struct {
 
 const TreeEnd int = -1
 
+// An ugly side of Go.
+// template <typename T> please!
+
 func min(a int, b int) int {
 	if a < b {
 		return a
@@ -16,6 +19,13 @@ func min(a int, b int) int {
 	return b
 }
 
+func min64(a int64, b int64) int64 {
+	if a < b {
+		return a
+	}
+	return b
+}
+
 func max(a int, b int) int {
 	if a < b {
 		return b
@@ -23,6 +33,13 @@ func max(a int, b int) int {
 	return a
 }
 
+func abs64(v int64) int64 {
+	if v <= 0 {
+		return -v
+	}
+	return v
+}
+
 func NewFile(time int, length int, status map[int]int64) *File {
 	file := new(File)
 	file.status = status
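
min64 and abs64 exist only because this code predates Go generics, hence the "template <typename T> please!" lament above them. For reference, with Go 1.18+ type parameters the int and int64 variants could collapse into one function (a modernization sketch, not part of this commit):

    // minOrdered works for any type in its constraint set.
    func minOrdered[T int | int64](a, b T) T {
        if a < b {
            return a
        }
        return b
    }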