Browse Source

Add couples collection

Vadim Markovtsev 7 years ago
parent
commit
37183d8d8b
4 changed files with 218 additions and 17 deletions
  1. 72 11
      cmd/hercules/main.go
  2. 139 0
      couples.go
  3. 6 5
      identity.go
  4. 1 1
      labours.py

+ 72 - 11
cmd/hercules/main.go

@@ -28,7 +28,7 @@ import (
 
 func printMatrix(matrix [][]int64, name string, fixNegative bool) {
 	// determine the maximum length of each value
-	var maxnum int64 = - (1 << 32)
+	var maxnum int64 = -(1 << 32)
 	var minnum int64 = 1 << 32
 	for _, status := range matrix {
 		for _, val := range status {
@@ -47,12 +47,12 @@ func printMatrix(matrix [][]int64, name string, fixNegative bool) {
 	last := len(matrix[len(matrix)-1])
 	indent := 2
 	if name != "" {
-		fmt.Printf("  %s: |-\n", name)
+		fmt.Printf("  \"%s\": |-\n", name)
 		indent += 2
 	}
 	// print the resulting triangular matrix
 	for _, status := range matrix {
-		fmt.Print(strings.Repeat(" ", indent - 1))
+		fmt.Print(strings.Repeat(" ", indent-1))
 		for i := 0; i < last; i++ {
 			var val int64
 			if i < len(status) {
@@ -69,6 +69,51 @@ func printMatrix(matrix [][]int64, name string, fixNegative bool) {
 	}
 }
 
+// printCouples writes the co-occurrence statistics to stdout as two
+// top-level sections, "files_coocc" and "people_coocc". Each section has an
+// "index" list of double-quoted names followed by a "matrix" list with one
+// brace-enclosed "{column: value, ...}" row per entry, columns in ascending
+// order.
+//
+// result supplies the file names and both sparse matrices; peopleDict
+// supplies the names printed in the people index.
+func printCouples(result *hercules.CouplesResult, peopleDict []string) {
+	fmt.Println("files_coocc:")
+	fmt.Println("  index:")
+	for _, name := range result.Files {
+		fmt.Printf("    - \"%s\"\n", name)
+	}
+	fmt.Println("  matrix:")
+	for _, row := range result.FilesMatrix {
+		// Map iteration order is random; sort the columns for stable output.
+		columns := make([]int, 0, len(row))
+		for column := range row {
+			columns = append(columns, column)
+		}
+		sort.Ints(columns)
+		fmt.Print("    - {")
+		for pos, column := range columns {
+			if pos > 0 {
+				fmt.Print(", ")
+			}
+			fmt.Printf("%d: %d", column, row[column])
+		}
+		fmt.Println("}")
+	}
+
+	fmt.Println("people_coocc:")
+	fmt.Println("  index:")
+	for _, name := range peopleDict {
+		fmt.Printf("    - \"%s\"\n", name)
+	}
+	fmt.Println("  matrix:")
+	for _, row := range result.PeopleMatrix {
+		columns := make([]int, 0, len(row))
+		for column := range row {
+			columns = append(columns, column)
+		}
+		sort.Ints(columns)
+		fmt.Print("    - {")
+		for pos, column := range columns {
+			if pos > 0 {
+				fmt.Print(", ")
+			}
+			fmt.Printf("%d: %d", column, row[column])
+		}
+		fmt.Println("}")
+	}
+}
+
 func sortedKeys(m map[string][][]int64) []string {
 	keys := make([]string, 0, len(m))
 	for k := range m {
@@ -81,6 +126,7 @@ func sortedKeys(m map[string][][]int64) []string {
 func main() {
 	var with_files bool
 	var with_people bool
+	var with_couples bool
 	var people_dict_path string
 	var profile bool
 	var granularity, sampling, similarity_threshold int
@@ -88,6 +134,8 @@ func main() {
 	var debug bool
 	flag.BoolVar(&with_files, "files", false, "Output detailed statistics per each file.")
 	flag.BoolVar(&with_people, "people", false, "Output detailed statistics per each developer.")
+	flag.BoolVar(&with_couples, "couples", false, "Gather the co-occurrence matrix "+
+		"for files and people.")
 	flag.StringVar(&people_dict_path, "people-dict", "", "Path to the developers' email associations.")
 	flag.BoolVar(&profile, "profile", false, "Collect the profile to hercules.pprof.")
 	flag.IntVar(&granularity, "granularity", 30, "How many days there are in a single band.")
@@ -162,7 +210,7 @@ func main() {
 	pipeline.AddItem(&hercules.RenameAnalysis{SimilarityThreshold: similarity_threshold})
 	pipeline.AddItem(&hercules.TreeDiff{})
 	id_matcher := &hercules.IdentityDetector{}
-	if with_people {
+	if with_people || with_couples {
 		if people_dict_path != "" {
 			id_matcher.LoadPeopleDict(people_dict_path)
 		} else {
@@ -171,19 +219,29 @@ func main() {
 	}
 	pipeline.AddItem(id_matcher)
 	burndowner := &hercules.BurndownAnalysis{
-		Granularity:         granularity,
-		Sampling:            sampling,
-		Debug:               debug,
-		PeopleNumber:        len(id_matcher.ReversePeopleDict),
+		Granularity:  granularity,
+		Sampling:     sampling,
+		Debug:        debug,
+		PeopleNumber: len(id_matcher.ReversePeopleDict),
 	}
 	pipeline.AddItem(burndowner)
+	var coupler *hercules.Couples
+	if with_couples {
+		coupler = &hercules.Couples{PeopleNumber: len(id_matcher.ReversePeopleDict)}
+		pipeline.AddItem(coupler)
+	}
 
 	pipeline.Initialize()
 	result, err := pipeline.Run(commits)
 	if err != nil {
 		panic(err)
 	}
+	fmt.Fprint(os.Stderr, "writing...    \r")
 	burndown_results := result[burndowner].(hercules.BurndownResult)
+	var couples_result hercules.CouplesResult
+	if with_couples {
+		couples_result = result[coupler].(hercules.CouplesResult)
+	}
 	fmt.Fprint(os.Stderr, "                \r")
 	if len(burndown_results.GlobalHistory) == 0 {
 		return
@@ -191,11 +249,11 @@ func main() {
 	// print the start date, granularity, sampling
 	fmt.Println("burndown:")
 	fmt.Println("  version: 1")
-  fmt.Println("  begin:", commits[0].Author.When.Unix())
+	fmt.Println("  begin:", commits[0].Author.When.Unix())
 	fmt.Println("  end:", commits[len(commits)-1].Author.When.Unix())
 	fmt.Println("  granularity:", granularity)
 	fmt.Println("  sampling:", sampling)
-  fmt.Println("project:")
+	fmt.Println("project:")
 	printMatrix(burndown_results.GlobalHistory, uri, true)
 	if with_files {
 		fmt.Println("files:")
@@ -213,7 +271,10 @@ func main() {
 		for key, val := range burndown_results.PeopleHistories {
 			printMatrix(val, id_matcher.ReversePeopleDict[key], true)
 		}
-		fmt.Println("interaction: |-")
+		fmt.Println("people_interaction: |-")
 		printMatrix(burndown_results.PeopleMatrix, "", false)
 	}
+	if with_couples {
+    printCouples(&couples_result, id_matcher.ReversePeopleDict)
+	}
 }

+ 139 - 0
couples.go

@@ -0,0 +1,139 @@
+package hercules
+
+import (
+	"gopkg.in/src-d/go-git.v4"
+	"gopkg.in/src-d/go-git.v4/plumbing/object"
+	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
+	"sort"
+)
+
+// Couples accumulates coupling statistics commit by commit: how often each
+// developer touched each file, and how often files changed together.
+type Couples struct {
+	// PeopleNumber is the number of developers for which to build the matrix.
+	// 0 disables this analysis.
+	// NOTE(review): Consume indexes people by the author id, so confirm that
+	// a zero value is actually safe before relying on it.
+	PeopleNumber int
+
+	// people stores how many times every developer committed to every file.
+	people []map[string]int
+	// files stores how many times every file occurred in the same commit
+	// with every other file.
+	files map[string]map[string]int
+}
+
+// CouplesResult is the value produced by Couples.Finalize.
+type CouplesResult struct {
+	// PeopleMatrix[i][j] (i != j) is the sum, over all files, of the smaller
+	// of the two developers' commit counts for that file. Pairs whose sum is
+	// zero are absent from the map.
+	PeopleMatrix []map[int]int64
+	// FilesMatrix[i][j] is how many times Files[i] and Files[j] appeared in
+	// the same commit.
+	FilesMatrix  []map[int]int
+	// Files holds the tracked file names in sorted order; its positions are
+	// the row and column indices of FilesMatrix.
+	Files        []string
+}
+
+// Name returns the constant identifier of this pipeline item, "Couples".
+func (couples *Couples) Name() string {
+	return "Couples"
+}
+
+// Provides returns an empty, non-nil list: Consume never hands any entries
+// back to the pipeline.
+func (couples *Couples) Provides() []string {
+	return make([]string, 0)
+}
+
+// Requires returns the keys which Consume reads from its deps argument:
+// "author" and "renamed_changes".
+func (couples *Couples) Requires() []string {
+	return []string{"author", "renamed_changes"}
+}
+
+// Initialize resets the accumulated state. It allocates PeopleNumber empty
+// file -> commit count maps, one per developer, and an empty file
+// co-occurrence table. The repository argument is not used.
+func (couples *Couples) Initialize(repository *git.Repository) {
+	couples.files = make(map[string]map[string]int)
+	people := make([]map[string]int, 0, couples.PeopleNumber)
+	for len(people) < couples.PeopleNumber {
+		people = append(people, make(map[string]int))
+	}
+	couples.people = people
+}
+
+// Consume processes the changes of a single commit. It reads the developer
+// index from deps["author"] and the tree changes from
+// deps["renamed_changes"], then updates two tables:
+//   - people: how many times each developer touched each file;
+//   - files: how many times each pair of files changed in the same commit.
+//
+// Inserted and modified files form the commit's context. Deleted files are
+// purged from the files table, and a rename moves the accumulated file
+// counters to the new name. The returned map is always nil; an error is
+// returned only when the action of a change cannot be determined.
+func (couples *Couples) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
+	author := deps["author"].(int)
+	treeDiff := deps["renamed_changes"].(object.Changes)
+	context := make([]string, 0, len(treeDiff))
+	// An author id outside of the people table (presumably a sentinel for
+	// unknown developers - verify against the identity detector) is skipped
+	// in the per-developer statistics instead of panicking with an index out
+	// of range. The file co-occurrences are still recorded.
+	countForAuthor := func(name string) {
+		if author >= 0 && author < len(couples.people) {
+			couples.people[author][name]++
+		}
+	}
+	deleteFile := func(name string) {
+		// we do not remove the file from people - the context does not expire
+		delete(couples.files, name)
+		for _, otherFiles := range couples.files {
+			delete(otherFiles, name)
+		}
+	}
+	for _, change := range treeDiff {
+		action, err := change.Action()
+		if err != nil {
+			return nil, err
+		}
+		switch action {
+		case merkletrie.Insert:
+			context = append(context, change.To.Name)
+			countForAuthor(change.To.Name)
+		case merkletrie.Delete:
+			deleteFile(change.From.Name)
+		case merkletrie.Modify:
+			toName := change.To.Name
+			fromName := change.From.Name
+			if fromName != toName {
+				// renamed: carry the counters over to the new name. Only move
+				// the lane when the old name has one; storing a missing
+				// (nil) lane would make the update below write into a nil map.
+				if lane, exists := couples.files[fromName]; exists {
+					couples.files[toName] = lane
+				}
+				for _, otherFiles := range couples.files {
+					if val, exists := otherFiles[fromName]; exists {
+						otherFiles[toName] = val
+					}
+				}
+				deleteFile(fromName)
+			}
+			context = append(context, toName)
+			countForAuthor(toName)
+		}
+	}
+	// Every file in the context co-occurred with every file in the context,
+	// itself included.
+	for _, file := range context {
+		lane := couples.files[file]
+		if lane == nil {
+			lane = make(map[string]int, len(context))
+			couples.files[file] = lane
+		}
+		for _, otherFile := range context {
+			lane[otherFile]++
+		}
+	}
+	return nil, nil
+}
+
+// Finalize converts the accumulated counters into a CouplesResult.
+//
+// The people matrix couples two distinct developers through every file:
+// each file contributes the smaller of their two commit counts, and only
+// positive contributions are stored. The files matrix is the co-occurrence
+// table re-keyed by each file's position in the sorted list of file names.
+func (couples *Couples) Finalize() interface{} {
+	peopleMatrix := make([]map[int]int64, couples.PeopleNumber)
+	for dev := range peopleMatrix {
+		row := map[int]int64{}
+		peopleMatrix[dev] = row
+		for file, commits := range couples.people[dev] {
+			for other, otherFiles := range couples.people {
+				if other == dev {
+					continue
+				}
+				// The coupling through a file is the smaller commit count.
+				shared := otherFiles[file]
+				if commits < shared {
+					shared = commits
+				}
+				if shared > 0 {
+					row[other] += int64(shared)
+				}
+			}
+		}
+	}
+
+	// Sort the names so that the indices are stable between runs.
+	filesSequence := make([]string, 0, len(couples.files))
+	for file := range couples.files {
+		filesSequence = append(filesSequence, file)
+	}
+	sort.Strings(filesSequence)
+	filesIndex := make(map[string]int, len(filesSequence))
+	for pos, file := range filesSequence {
+		filesIndex[file] = pos
+	}
+
+	filesMatrix := make([]map[int]int, len(filesSequence))
+	for pos, file := range filesSequence {
+		lane := couples.files[file]
+		row := make(map[int]int, len(lane))
+		for otherFile, cooccs := range lane {
+			row[filesIndex[otherFile]] = cooccs
+		}
+		filesMatrix[pos] = row
+	}
+	return CouplesResult{
+		PeopleMatrix: peopleMatrix,
+		Files:        filesSequence,
+		FilesMatrix:  filesMatrix,
+	}
+}

+ 6 - 5
identity.go

@@ -13,7 +13,7 @@ type IdentityDetector struct {
 	// Maps email || name  -> developer id.
 	PeopleDict map[string]int
 	// Maps developer id -> description
-	ReversePeopleDict map[int]string
+	ReversePeopleDict []string
 }
 
 const MISSING_AUTHOR = (1 << 18) - 1
@@ -60,13 +60,14 @@ func (id *IdentityDetector) LoadPeopleDict(path string) error {
 	defer file.Close()
 	scanner := bufio.NewScanner(file)
 	dict := make(map[string]int)
-	reverse_dict := make(map[int]string)
+	reverse_dict := []string{}
 	size := 0
 	for scanner.Scan() {
-		for _, id := range strings.Split(strings.ToLower(scanner.Text()), "|") {
+		ids := strings.Split(strings.ToLower(scanner.Text()), "|")
+		for _, id := range ids {
 			dict[id] = size
 		}
-		reverse_dict[size] = scanner.Text()
+		reverse_dict = append(reverse_dict, ids[0])
 		size += 1
 	}
 	id.PeopleDict = dict
@@ -103,7 +104,7 @@ func (id *IdentityDetector) GeneratePeopleDict(commits []*object.Commit) {
 		names[size] = append(names[size], name)
 		size += 1
 	}
-	reverse_dict := make(map[int]string)
+	reverse_dict := make([]string, size)
 	for _, val := range dict {
 		reverse_dict[val] = strings.Join(names[val], "|") + "|" + strings.Join(emails[val], "|")
 	}

+ 1 - 1
labours.py

@@ -52,7 +52,7 @@ def read_input(args):
     else:
         data = yaml.load(sys.stdin)
     return data["burndown"], data["project"], data.get("files"), data.get("people_sequence"), \
-           data.get("people"), data.get("interaction")
+           data.get("people"), data.get("people_interaction")
 
 
 def calculate_average_lifetime(matrix):