
Merge pull request #32 from vmarkovtsev/master

Implement Couples merge
Vadim Markovtsev 7 years ago
parent
commit
d93b85bbe8
11 changed files with 297 additions and 106 deletions
  1. README.md  +12 -1
  2. burndown.go  +6 -4
  3. burndown_test.go  +1 -1
  4. couples.go  +125 -9
  5. couples_test.go  +107 -10
  6. identity.go  +1 -1
  7. identity_test.go  +1 -1
  8. labours.py  +3 -1
  9. pb/pb.proto  +3 -7
  10. pb/pb_pb2.py  +38 -71
  11. test_data/couples.pb  BIN

+ 12 - 1
README.md

@@ -7,7 +7,8 @@ Powered by [go-git](https://github.com/src-d/go-git) and [Babelfish](https://doc
 There are two tools: `hercules` and `labours.py`. The first is the program
 written in Go which takes a Git repository and runs a Directed Acyclic Graph (DAG) of [analysis tasks](doc/PIPELINE_ITEMS.md).
 The second is the Python script which draws some predefined plots. These two tools are normally used together through
-a pipe. It is possible to write custom analyses using the plugin system.
+a pipe. It is possible to write custom analyses using the plugin system. It is also possible
+to merge several analysis results together.
 
 ![Hercules DAG of Burndown analysis](doc/dag.png)
 <p align="center">The DAG of burndown and couples analyses with UAST diff refining. Generated with <code>hercules -burndown -burndown-people -couples -feature=uast -dry-run -dump-dag doc/dag.dot https://github.com/src-d/hercules</code></p>
@@ -190,6 +191,16 @@ python3 labours.py -m all
 
 Hercules has a plugin system and allows to run custom analyses. See [PLUGINS.md](PLUGINS.md).
 
+### Merging
+
+`hercules-combine` is the tool which joins several analysis results in Protocol Buffers format together. 
+
+```
+hercules -burndown -pb https://github.com/src-d/go-git > go-git.pb
+hercules -burndown -pb https://github.com/src-d/hercules > hercules.pb
+hercules-combine go-git.pb hercules.pb | python3 labours.py -f pb -m project --resample M
+```
+
 ### Bad unicode errors
 
 YAML does not support the whole range of Unicode characters and the parser on `labours.py` side

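The new "Merging" section relies on analyses being able to read back and combine their own Protocol Buffers output. Judging by the methods added to couples.go below, a leaf analysis is assumed to expose roughly the following pair of methods; the interface name is illustrative, only the two signatures appear in this diff.

```go
// Sketch of the merge-related methods a leaf analysis is assumed to provide,
// inferred from the Couples changes in this pull request. The name
// MergeableLeaf is hypothetical; Deserialize and MergeResults match the
// signatures added to couples.go. CommonAnalysisResult is defined elsewhere
// in hercules.
type MergeableLeaf interface {
	// Deserialize restores a previously serialized result from its
	// Protocol Buffers representation.
	Deserialize(pbmessage []byte) (interface{}, error)
	// MergeResults combines two results of the same analysis, together with
	// the common metadata of each run, into a single result.
	MergeResults(r1, r2 interface{}, c1, c2 *CommonAnalysisResult) interface{}
}
```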
+ 6 - 4
burndown.go

@@ -260,6 +260,8 @@ func (analyser *BurndownAnalysis) Finalize() interface{} {
 		PeopleHistories:    analyser.peopleHistories,
 		PeopleMatrix:       peopleMatrix,
 		reversedPeopleDict: analyser.reversedPeopleDict,
+		sampling:           analyser.Sampling,
+		granularity:        analyser.Granularity,
 	}
 }
 
@@ -651,8 +653,8 @@ func addBurndownMatrix(matrix [][]int64, granularity, sampling int, daily [][]fl
 }
 
 func (analyser *BurndownAnalysis) serializeText(result *BurndownResult, writer io.Writer) {
-	fmt.Fprintln(writer, "  granularity:", analyser.Granularity)
-	fmt.Fprintln(writer, "  sampling:", analyser.Sampling)
+	fmt.Fprintln(writer, "  granularity:", result.granularity)
+	fmt.Fprintln(writer, "  sampling:", result.sampling)
 	yaml.PrintMatrix(writer, result.GlobalHistory, 2, "project", true)
 	if len(result.FileHistories) > 0 {
 		fmt.Fprintln(writer, "  files:")
@@ -678,8 +680,8 @@ func (analyser *BurndownAnalysis) serializeText(result *BurndownResult, writer i
 
 func (analyser *BurndownAnalysis) serializeBinary(result *BurndownResult, writer io.Writer) error {
 	message := pb.BurndownAnalysisResults{
-		Granularity: int32(analyser.Granularity),
-		Sampling:    int32(analyser.Sampling),
+		Granularity: int32(result.granularity),
+		Sampling:    int32(result.sampling),
 	}
 	if len(result.GlobalHistory) > 0 {
 		message.Project = pb.ToBurndownSparseMatrix(result.GlobalHistory, "project")

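The burndown.go change copies Sampling and Granularity into the result at Finalize() time, so serialization reads them from the result rather than from the live analyser; this matters once results are deserialized and merged with no analyser around. A sketch of the assumed result struct: the exported fields come from the Finalize() literal above, the two unexported ones from this diff, and the field types are guesses based on how they are used elsewhere.

```go
// Assumed shape of BurndownResult after this change; field types are inferred
// from their usage in serializeText/serializeBinary and may not be exact.
type BurndownResult struct {
	GlobalHistory      [][]int64            // project-wide burndown matrix
	FileHistories      map[string][][]int64 // per-file burndown matrices
	PeopleHistories    [][][]int64          // per-developer burndown matrices
	PeopleMatrix       [][]int64            // developer interaction matrix
	reversedPeopleDict []string             // references IdentityDetector.ReversedPeopleDict
	sampling           int                  // copied from BurndownAnalysis.Sampling
	granularity        int                  // copied from BurndownAnalysis.Granularity
}
```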
+ 1 - 1
burndown_test.go

@@ -315,7 +315,7 @@ func TestBurndownConsumeFinalize(t *testing.T) {
 	}
 }
 
-func TestBurndownAnalysisSerialize(t *testing.T) {
+func TestBurndownSerialize(t *testing.T) {
 	burndown := BurndownAnalysis{
 		Granularity:  30,
 		Sampling:     30,

+ 125 - 9
couples.go

@@ -32,6 +32,9 @@ type CouplesResult struct {
 	PeopleFiles  [][]int
 	FilesMatrix  []map[int]int64
 	Files        []string
+
+	// references IdentityDetector.ReversedPeopleDict
+	reversedPeopleDict []string
 }
 
 func (couples *CouplesAnalysis) Name() string {
@@ -180,8 +183,12 @@ func (couples *CouplesAnalysis) Finalize() interface{} {
 		}
 	}
 	return CouplesResult{
-		PeopleMatrix: peopleMatrix, PeopleFiles: peopleFiles,
-		Files: filesSequence, FilesMatrix: filesMatrix}
+		PeopleMatrix:       peopleMatrix,
+		PeopleFiles:        peopleFiles,
+		Files:              filesSequence,
+		FilesMatrix:        filesMatrix,
+		reversedPeopleDict: couples.reversedPeopleDict,
+	}
 }
 
 func (couples *CouplesAnalysis) Serialize(result interface{}, binary bool, writer io.Writer) error {
@@ -193,6 +200,117 @@ func (couples *CouplesAnalysis) Serialize(result interface{}, binary bool, write
 	return nil
 }
 
+func (couples *CouplesAnalysis) Deserialize(pbmessage []byte) (interface{}, error) {
+	message := pb.CouplesAnalysisResults{}
+	err := proto.Unmarshal(pbmessage, &message)
+	if err != nil {
+		return nil, err
+	}
+	result := CouplesResult{
+		Files:              message.FileCouples.Index,
+		FilesMatrix:        make([]map[int]int64, message.FileCouples.Matrix.NumberOfRows),
+		PeopleFiles:        make([][]int, len(message.PeopleCouples.Index)),
+		PeopleMatrix:       make([]map[int]int64, message.PeopleCouples.Matrix.NumberOfRows),
+		reversedPeopleDict: message.PeopleCouples.Index,
+	}
+	for i, files := range message.PeopleFiles {
+		result.PeopleFiles[i] = make([]int, len(files.Files))
+		for j, val := range files.Files {
+			result.PeopleFiles[i][j] = int(val)
+		}
+	}
+	convertCSR := func(dest []map[int]int64, src *pb.CompressedSparseRowMatrix) {
+		for indptr := range src.Indptr {
+			if indptr == 0 {
+				continue
+			}
+			dest[indptr-1] = map[int]int64{}
+			for j := src.Indptr[indptr-1]; j < src.Indptr[indptr]; j++ {
+				dest[indptr-1][int(src.Indices[j])] = src.Data[j]
+			}
+		}
+	}
+	convertCSR(result.FilesMatrix, message.FileCouples.Matrix)
+	convertCSR(result.PeopleMatrix, message.PeopleCouples.Matrix)
+	return result, nil
+}
+
+func (couples *CouplesAnalysis) MergeResults(r1, r2 interface{}, c1, c2 *CommonAnalysisResult) interface{} {
+	cr1 := r1.(CouplesResult)
+	cr2 := r2.(CouplesResult)
+	merged := CouplesResult{}
+	var people, files map[string][3]int
+	people, merged.reversedPeopleDict = IdentityDetector{}.MergeReversedDicts(
+		cr1.reversedPeopleDict, cr2.reversedPeopleDict)
+	files, merged.Files = IdentityDetector{}.MergeReversedDicts(cr1.Files, cr2.Files)
+	merged.PeopleFiles = make([][]int, len(merged.reversedPeopleDict))
+	peopleFilesDicts := make([]map[int]bool, len(merged.reversedPeopleDict))
+	addPeopleFiles := func(peopleFiles [][]int, reversedPeopleDict []string,
+		reversedFilesDict []string) {
+		for pi, fs := range peopleFiles {
+			idx := people[reversedPeopleDict[pi]][0]
+			m := peopleFilesDicts[idx]
+			if m == nil {
+				m = map[int]bool{}
+				peopleFilesDicts[idx] = m
+			}
+			for _, f := range fs {
+				m[files[reversedFilesDict[f]][0]] = true
+			}
+		}
+	}
+	addPeopleFiles(cr1.PeopleFiles, cr1.reversedPeopleDict, cr1.Files)
+	addPeopleFiles(cr2.PeopleFiles, cr2.reversedPeopleDict, cr2.Files)
+	for i, m := range peopleFilesDicts {
+		merged.PeopleFiles[i] = make([]int, len(m))
+		j := 0
+		for f := range m {
+			merged.PeopleFiles[i][j] = f
+			j++
+		}
+		sort.Ints(merged.PeopleFiles[i])
+	}
+	merged.PeopleMatrix = make([]map[int]int64, len(merged.reversedPeopleDict)+1)
+	addPeople := func(peopleMatrix []map[int]int64, reversedPeopleDict []string,
+		reversedFilesDict []string) {
+		for pi, pc := range peopleMatrix {
+			var idx int
+			if pi < len(reversedPeopleDict) {
+				idx = people[reversedPeopleDict[pi]][0]
+			} else {
+				idx = len(merged.reversedPeopleDict)
+			}
+			m := merged.PeopleMatrix[idx]
+			if m == nil {
+				m = map[int]int64{}
+				merged.PeopleMatrix[idx] = m
+			}
+			for file, val := range pc {
+				m[files[reversedFilesDict[file]][0]] += val
+			}
+		}
+	}
+	addPeople(cr1.PeopleMatrix, cr1.reversedPeopleDict, cr1.Files)
+	addPeople(cr2.PeopleMatrix, cr2.reversedPeopleDict, cr2.Files)
+	merged.FilesMatrix = make([]map[int]int64, len(merged.Files))
+	addFiles := func(filesMatrix []map[int]int64, reversedFilesDict []string) {
+		for fi, fc := range filesMatrix {
+			idx := files[reversedFilesDict[fi]][0]
+			m := merged.FilesMatrix[idx]
+			if m == nil {
+				m = map[int]int64{}
+				merged.FilesMatrix[idx] = m
+			}
+			for file, val := range fc {
+				m[files[reversedFilesDict[file]][0]] += val
+			}
+		}
+	}
+	addFiles(cr1.FilesMatrix, cr1.Files)
+	addFiles(cr2.FilesMatrix, cr2.Files)
+	return merged
+}
+
 func (couples *CouplesAnalysis) serializeText(result *CouplesResult, writer io.Writer) {
 	fmt.Fprintln(writer, "  files_coocc:")
 	fmt.Fprintln(writer, "    index:")
@@ -291,20 +409,18 @@ func (couples *CouplesAnalysis) serializeBinary(result *CouplesResult, writer io
 		Index:  result.Files,
 		Matrix: pb.MapToCompressedSparseRowMatrix(result.FilesMatrix),
 	}
-	message.DeveloperCouples = &pb.Couples{
-		Index:  couples.reversedPeopleDict,
+	message.PeopleCouples = &pb.Couples{
+		Index:  result.reversedPeopleDict,
 		Matrix: pb.MapToCompressedSparseRowMatrix(result.PeopleMatrix),
 	}
-	message.TouchedFiles = &pb.DeveloperTouchedFiles{
-		Developers: make([]*pb.TouchedFiles, len(couples.reversedPeopleDict)),
-	}
-	for key := range couples.reversedPeopleDict {
+	message.PeopleFiles = make([]*pb.TouchedFiles, len(result.reversedPeopleDict))
+	for key := range result.reversedPeopleDict {
 		files := result.PeopleFiles[key]
 		int32Files := make([]int32, len(files))
 		for i, f := range files {
 			int32Files[i] = int32(f)
 		}
-		message.TouchedFiles.Developers[key] = &pb.TouchedFiles{
+		message.PeopleFiles[key] = &pb.TouchedFiles{
 			Files: int32Files,
 		}
 	}

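The convertCSR closure in Deserialize walks the standard compressed sparse row triplet (Indptr, Indices, Data) and rebuilds one map per row. Below is a standalone sketch of the same loop, run on the people matrix that TestCouplesSerialize asserts on further down; the triplet values are copied from those assertions.

```go
package main

import "fmt"

func main() {
	// CSR triplet taken from the assertions in TestCouplesSerialize:
	// 4 rows (3 people plus one extra row) holding 7 stored values.
	indptr := []int64{0, 3, 5, 7, 7}
	indices := []int32{0, 1, 2, 0, 1, 0, 2}
	data := []int64{7, 3, 1, 3, 3, 1, 1}

	rows := make([]map[int]int64, len(indptr)-1)
	for i := 1; i < len(indptr); i++ {
		rows[i-1] = map[int]int64{}
		// Row i-1 owns the stored values in data[indptr[i-1]:indptr[i]].
		for j := indptr[i-1]; j < indptr[i]; j++ {
			rows[i-1][int(indices[j])] = data[j]
		}
	}
	fmt.Println(rows)
	// Output: [map[0:7 1:3 2:1] map[0:3 1:3] map[0:1 2:1] map[]]
}
```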
+ 107 - 10
couples_test.go

@@ -2,6 +2,8 @@ package hercules
 
 import (
 	"bytes"
+	"io/ioutil"
+	"path"
 	"strings"
 	"testing"
 
@@ -216,22 +218,22 @@ func TestCouplesSerialize(t *testing.T) {
 	c.Serialize(result, true, buffer)
 	msg := pb.CouplesAnalysisResults{}
 	proto.Unmarshal(buffer.Bytes(), &msg)
-	assert.Len(t, msg.TouchedFiles.Developers, 3)
+	assert.Len(t, msg.PeopleFiles, 3)
 	tmp1 := [...]int32{0, 1, 2}
-	assert.Equal(t, msg.TouchedFiles.Developers[0].Files, tmp1[:])
+	assert.Equal(t, msg.PeopleFiles[0].Files, tmp1[:])
 	tmp2 := [...]int32{1, 2}
-	assert.Equal(t, msg.TouchedFiles.Developers[1].Files, tmp2[:])
+	assert.Equal(t, msg.PeopleFiles[1].Files, tmp2[:])
 	tmp3 := [...]int32{0}
-	assert.Equal(t, msg.TouchedFiles.Developers[2].Files, tmp3[:])
-	assert.Equal(t, msg.DeveloperCouples.Index, people[:])
-	assert.Equal(t, msg.DeveloperCouples.Matrix.NumberOfRows, int32(4))
-	assert.Equal(t, msg.DeveloperCouples.Matrix.NumberOfColumns, int32(4))
+	assert.Equal(t, msg.PeopleFiles[2].Files, tmp3[:])
+	assert.Equal(t, msg.PeopleCouples.Index, people[:])
+	assert.Equal(t, msg.PeopleCouples.Matrix.NumberOfRows, int32(4))
+	assert.Equal(t, msg.PeopleCouples.Matrix.NumberOfColumns, int32(4))
 	data := [...]int64{7, 3, 1, 3, 3, 1, 1}
-	assert.Equal(t, msg.DeveloperCouples.Matrix.Data, data[:])
+	assert.Equal(t, msg.PeopleCouples.Matrix.Data, data[:])
 	indices := [...]int32{0, 1, 2, 0, 1, 0, 2}
-	assert.Equal(t, msg.DeveloperCouples.Matrix.Indices, indices[:])
+	assert.Equal(t, msg.PeopleCouples.Matrix.Indices, indices[:])
 	indptr := [...]int64{0, 3, 5, 7, 7}
-	assert.Equal(t, msg.DeveloperCouples.Matrix.Indptr, indptr[:])
+	assert.Equal(t, msg.PeopleCouples.Matrix.Indptr, indptr[:])
 	files := [...]string{"five", "one", "three"}
 	assert.Equal(t, msg.FileCouples.Index, files[:])
 	assert.Equal(t, msg.FileCouples.Matrix.NumberOfRows, int32(3))
@@ -243,3 +245,98 @@ func TestCouplesSerialize(t *testing.T) {
 	indptr2 := [...]int64{0, 3, 6, 9}
 	assert.Equal(t, msg.FileCouples.Matrix.Indptr, indptr2[:])
 }
+
+func TestCouplesDeserialize(t *testing.T) {
+	allBuffer, err := ioutil.ReadFile(path.Join("test_data", "couples.pb"))
+	assert.Nil(t, err)
+	message := pb.AnalysisResults{}
+	err = proto.Unmarshal(allBuffer, &message)
+	assert.Nil(t, err)
+	couples := CouplesAnalysis{}
+	iresult, err := couples.Deserialize(message.Contents[couples.Name()])
+	assert.Nil(t, err)
+	result := iresult.(CouplesResult)
+	assert.Len(t, result.reversedPeopleDict, 2)
+	assert.Len(t, result.PeopleFiles, 2)
+	assert.Len(t, result.PeopleMatrix, 3)
+	assert.Len(t, result.Files, 74)
+	assert.Len(t, result.FilesMatrix, 74)
+}
+
+func TestCouplesMerge(t *testing.T) {
+	r1, r2 := CouplesResult{}, CouplesResult{}
+	people1 := [...]string{"one", "two"}
+	people2 := [...]string{"two", "three"}
+	r1.reversedPeopleDict = people1[:]
+	r2.reversedPeopleDict = people2[:]
+	r1.Files = people1[:]
+	r2.Files = people2[:]
+	r1.PeopleFiles = make([][]int, 2)
+	r1.PeopleFiles[0] = make([]int, 2)
+	r1.PeopleFiles[0][0] = 0
+	r1.PeopleFiles[0][1] = 1
+	r1.PeopleFiles[1] = make([]int, 1)
+	r1.PeopleFiles[1][0] = 0
+	r2.PeopleFiles = make([][]int, 2)
+	r2.PeopleFiles[0] = make([]int, 1)
+	r2.PeopleFiles[0][0] = 1
+	r2.PeopleFiles[1] = make([]int, 2)
+	r2.PeopleFiles[1][0] = 0
+	r2.PeopleFiles[1][1] = 1
+	r1.FilesMatrix = make([]map[int]int64, 2)
+	r1.FilesMatrix[0] = map[int]int64{}
+	r1.FilesMatrix[1] = map[int]int64{}
+	r1.FilesMatrix[0][1] = 100
+	r1.FilesMatrix[1][0] = 100
+	r2.FilesMatrix = make([]map[int]int64, 2)
+	r2.FilesMatrix[0] = map[int]int64{}
+	r2.FilesMatrix[1] = map[int]int64{}
+	r2.FilesMatrix[0][1] = 200
+	r2.FilesMatrix[1][0] = 200
+	r1.PeopleMatrix = make([]map[int]int64, 3)
+	r1.PeopleMatrix[0] = map[int]int64{}
+	r1.PeopleMatrix[1] = map[int]int64{}
+	r1.PeopleMatrix[2] = map[int]int64{}
+	r1.PeopleMatrix[0][1] = 100
+	r1.PeopleMatrix[1][0] = 100
+	r1.PeopleMatrix[2][0] = 300
+	r1.PeopleMatrix[2][1] = 400
+	r2.PeopleMatrix = make([]map[int]int64, 3)
+	r2.PeopleMatrix[0] = map[int]int64{}
+	r2.PeopleMatrix[1] = map[int]int64{}
+	r2.PeopleMatrix[2] = map[int]int64{}
+	r2.PeopleMatrix[0][1] = 10
+	r2.PeopleMatrix[1][0] = 10
+	r2.PeopleMatrix[2][0] = 30
+	r2.PeopleMatrix[2][1] = 40
+	couples := CouplesAnalysis{}
+	merged := couples.MergeResults(r1, r2, nil, nil).(CouplesResult)
+	mergedPeople := [...]string{"one", "two", "three"}
+	assert.Equal(t, merged.reversedPeopleDict, mergedPeople[:])
+	assert.Equal(t, merged.Files, mergedPeople[:])
+	assert.Len(t, merged.PeopleFiles, 3)
+	assert.Equal(t, merged.PeopleFiles[0], getSlice(0, 1))
+	assert.Equal(t, merged.PeopleFiles[1], getSlice(0, 2))
+	assert.Equal(t, merged.PeopleFiles[2], getSlice(1, 2))
+	assert.Len(t, merged.PeopleMatrix, 4)
+	assert.Equal(t, merged.PeopleMatrix[0], getCouplesMap(1, 100))
+	assert.Equal(t, merged.PeopleMatrix[1], getCouplesMap(0, 100, 2, 10))
+	assert.Equal(t, merged.PeopleMatrix[2], getCouplesMap(1, 10))
+	assert.Equal(t, merged.PeopleMatrix[3], getCouplesMap(0, 300, 1, 430, 2, 40))
+	assert.Len(t, merged.FilesMatrix, 3)
+	assert.Equal(t, merged.FilesMatrix[0], getCouplesMap(1, 100))
+	assert.Equal(t, merged.FilesMatrix[1], getCouplesMap(0, 100, 2, 200))
+	assert.Equal(t, merged.FilesMatrix[2], getCouplesMap(1, 200))
+}
+
+func getSlice(vals ...int) []int {
+	return vals
+}
+
+func getCouplesMap(vals ...int) map[int]int64 {
+	res := map[int]int64{}
+	for i := 0; i < len(vals); i += 2 {
+		res[vals[i]] = int64(vals[i+1])
+	}
+	return res
+}

+ 1 - 1
identity.go

@@ -236,7 +236,7 @@ func (_ IdentityDetector) MergeReversedDicts(rd1, rd2 []string) (map[string][3]i
 	for name, ptrs := range people {
 		mrd[ptrs[0]] = name
 	}
-  return people, mrd
+	return people, mrd
 }
 
 func init() {

+ 1 - 1
identity_test.go

@@ -390,4 +390,4 @@ func TestIdentityDetectorMergeReversedDicts(t *testing.T) {
 	assert.Equal(t, people["three"], [3]int{2, -1, 1})
 	vm = [...]string{"two", "one", "three"}
 	assert.Equal(t, merged, vm[:])
-}
+}
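
The couples merge leans on IdentityDetector.MergeReversedDicts to align the two name indexes: people[name][0] is the index of that name in the merged dictionary. From the assertions visible in the test above, each value appears to be a triple [index in the merged dict, index in the first dict or -1, index in the second dict or -1]. A hypothetical call consistent with those assertions; the inputs are guessed, only the outputs appear in the diff.

```go
// Hypothetical inputs; only the assertions on the outputs are visible in the
// test diff above.
people, merged := IdentityDetector{}.MergeReversedDicts(
	[]string{"two", "one"},   // rd1
	[]string{"two", "three"}, // rd2
)
// merged == []string{"two", "one", "three"}
// people["two"]   == [3]int{0, 0, 0}   // present in both dicts
// people["one"]   == [3]int{1, 1, -1}  // only in rd1
// people["three"] == [3]int{2, -1, 1}  // only in rd2, at index 1 there
```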

+ 3 - 1
labours.py

@@ -235,7 +235,7 @@ class ProtobufReader(Reader):
         return list(node.index), self._parse_sparse_matrix(node.matrix)
 
     def get_people_coocc(self):
-        node = self.contents["Couples"].developer_couples
+        node = self.contents["Couples"].people_couples
         return list(node.index), self._parse_sparse_matrix(node.matrix)
 
     def _parse_burndown_matrix(self, matrix):
@@ -281,6 +281,8 @@ def load_burndown(header, name, matrix, resample):
     import pandas
 
     start, last, sampling, granularity = header
+    assert sampling > 0
+    assert granularity >= sampling
     start = datetime.fromtimestamp(start)
     last = datetime.fromtimestamp(last)
     print(name, "lifetime index:", calculate_average_lifetime(matrix))

+ 3 - 7
pb/pb.proto

@@ -66,15 +66,11 @@ message TouchedFiles {
     repeated int32 files = 1;  // values correspond to `file_couples::index`
 }
 
-message DeveloperTouchedFiles {
-    // order corresponds to `developer_couples::index`
-    repeated TouchedFiles developers = 1;
-}
-
 message CouplesAnalysisResults {
     Couples file_couples = 6;
-    Couples developer_couples = 7;
-    DeveloperTouchedFiles touched_files = 8;
+    Couples people_couples = 7;
+    // order corresponds to `people_couples::index`
+    repeated TouchedFiles people_files = 8;
 }
 
 message UASTChange {

The file diffs are limited because too many files changed.
+ 38 - 71
pb/pb_pb2.py


BIN
test_data/couples.pb