Browse Source

Implement Couples merge

Vadim Markovtsev 7 years ago
parent
commit
30f11f4891
6 changed files with 237 additions and 28 deletions
  1. 125 9
      couples.go
  2. 107 10
      couples_test.go
  3. 1 1
      identity.go
  4. 1 1
      identity_test.go
  5. 3 7
      pb/pb.proto
  6. BIN
      test_data/couples.pb

+ 125 - 9
couples.go

@@ -32,6 +32,9 @@ type CouplesResult struct {
 	PeopleFiles  [][]int
 	FilesMatrix  []map[int]int64
 	Files        []string
+
+	// references IdentityDetector.ReversedPeopleDict
+	reversedPeopleDict []string
 }
 
 func (couples *CouplesAnalysis) Name() string {
@@ -180,8 +183,12 @@ func (couples *CouplesAnalysis) Finalize() interface{} {
 		}
 	}
 	return CouplesResult{
-		PeopleMatrix: peopleMatrix, PeopleFiles: peopleFiles,
-		Files: filesSequence, FilesMatrix: filesMatrix}
+		PeopleMatrix:       peopleMatrix,
+		PeopleFiles:        peopleFiles,
+		Files:              filesSequence,
+		FilesMatrix:        filesMatrix,
+		reversedPeopleDict: couples.reversedPeopleDict,
+	}
 }
 
 func (couples *CouplesAnalysis) Serialize(result interface{}, binary bool, writer io.Writer) error {
@@ -193,6 +200,117 @@ func (couples *CouplesAnalysis) Serialize(result interface{}, binary bool, write
 	return nil
 }
 
+// Deserialize decodes a binary pb.CouplesAnalysisResults message into a
+// CouplesResult.
+//
+// The people index of the message becomes CouplesResult.reversedPeopleDict and
+// both CSR matrices are expanded into one map per row.
+//
+// An error is returned, instead of a run-time panic, when the payload cannot
+// be unmarshalled or is structurally inconsistent: missing couples sections,
+// a negative row count, more people files entries than people in the index,
+// or CSR offsets which point outside of the matrix buffers.
+func (couples *CouplesAnalysis) Deserialize(pbmessage []byte) (interface{}, error) {
+	message := pb.CouplesAnalysisResults{}
+	if err := proto.Unmarshal(pbmessage, &message); err != nil {
+		return nil, err
+	}
+	fileCouples, peopleCouples := message.FileCouples, message.PeopleCouples
+	// The nested messages are pointers; they stay nil if the payload lacks them.
+	if fileCouples == nil || fileCouples.Matrix == nil {
+		return nil, fmt.Errorf("couples: file couples are missing from the message")
+	}
+	if peopleCouples == nil || peopleCouples.Matrix == nil {
+		return nil, fmt.Errorf("couples: people couples are missing from the message")
+	}
+	if fileCouples.Matrix.NumberOfRows < 0 || peopleCouples.Matrix.NumberOfRows < 0 {
+		return nil, fmt.Errorf("couples: negative number of matrix rows")
+	}
+	// PeopleFiles is sized by the people index, so extra entries cannot be stored.
+	if len(message.PeopleFiles) > len(peopleCouples.Index) {
+		return nil, fmt.Errorf("couples: %d people files entries for %d people",
+			len(message.PeopleFiles), len(peopleCouples.Index))
+	}
+	result := CouplesResult{
+		Files:              fileCouples.Index,
+		FilesMatrix:        make([]map[int]int64, fileCouples.Matrix.NumberOfRows),
+		PeopleFiles:        make([][]int, len(peopleCouples.Index)),
+		PeopleMatrix:       make([]map[int]int64, peopleCouples.Matrix.NumberOfRows),
+		reversedPeopleDict: peopleCouples.Index,
+	}
+	for i, files := range message.PeopleFiles {
+		result.PeopleFiles[i] = make([]int, len(files.Files))
+		for j, val := range files.Files {
+			result.PeopleFiles[i][j] = int(val)
+		}
+	}
+	// convertCSR expands a compressed sparse row matrix into dest, one map per
+	// row. Row r owns the entries [Indptr[r], Indptr[r+1]) of Indices and Data.
+	convertCSR := func(dest []map[int]int64, src *pb.CompressedSparseRowMatrix) error {
+		if len(src.Indptr) > len(dest)+1 {
+			return fmt.Errorf("couples: indptr describes %d rows, the matrix declares %d",
+				len(src.Indptr)-1, len(dest))
+		}
+		for row := 1; row < len(src.Indptr); row++ {
+			begin, end := src.Indptr[row-1], src.Indptr[row]
+			dest[row-1] = map[int]int64{}
+			if begin >= end {
+				// Empty row: nothing to read, hence nothing to validate.
+				continue
+			}
+			if begin < 0 || end > int64(len(src.Indices)) || end > int64(len(src.Data)) {
+				return fmt.Errorf("couples: row %d spans [%d, %d) outside of the matrix data",
+					row-1, begin, end)
+			}
+			for j := begin; j < end; j++ {
+				dest[row-1][int(src.Indices[j])] = src.Data[j]
+			}
+		}
+		return nil
+	}
+	if err := convertCSR(result.FilesMatrix, fileCouples.Matrix); err != nil {
+		return nil, err
+	}
+	if err := convertCSR(result.PeopleMatrix, peopleCouples.Matrix); err != nil {
+		return nil, err
+	}
+	return result, nil
+}
+
+// MergeResults combines two CouplesResult values (r1 and r2) into a new one.
+//
+// The people and the file indexes of both inputs are unified with
+// IdentityDetector.MergeReversedDicts. Every index stored in the inputs is then
+// translated into the merged index space: touched files are united per person,
+// and the co-occurrence counters of both matrices are summed.
+// c1 and c2 are not used. The returned value is a CouplesResult.
+func (couples *CouplesAnalysis) MergeResults(r1, r2 interface{}, c1, c2 *CommonAnalysisResult) interface{} {
+	cr1 := r1.(CouplesResult)
+	cr2 := r2.(CouplesResult)
+	merged := CouplesResult{}
+	// people and files map a name to its indexes; element [0] is the position
+	// of the name in the merged index.
+	var people, files map[string][3]int
+	people, merged.reversedPeopleDict = IdentityDetector{}.MergeReversedDicts(
+		cr1.reversedPeopleDict, cr2.reversedPeopleDict)
+	files, merged.Files = IdentityDetector{}.MergeReversedDicts(cr1.Files, cr2.Files)
+
+	// Union of the files touched by each person, expressed in merged indexes.
+	merged.PeopleFiles = make([][]int, len(merged.reversedPeopleDict))
+	peopleFilesDicts := make([]map[int]bool, len(merged.reversedPeopleDict))
+	addPeopleFiles := func(peopleFiles [][]int, reversedPeopleDict []string,
+		reversedFilesDict []string) {
+		for pi, fs := range peopleFiles {
+			idx := people[reversedPeopleDict[pi]][0]
+			m := peopleFilesDicts[idx]
+			if m == nil {
+				m = map[int]bool{}
+				peopleFilesDicts[idx] = m
+			}
+			for _, f := range fs {
+				m[files[reversedFilesDict[f]][0]] = true
+			}
+		}
+	}
+	addPeopleFiles(cr1.PeopleFiles, cr1.reversedPeopleDict, cr1.Files)
+	addPeopleFiles(cr2.PeopleFiles, cr2.reversedPeopleDict, cr2.Files)
+	for i, m := range peopleFilesDicts {
+		merged.PeopleFiles[i] = make([]int, len(m))
+		j := 0
+		for f := range m {
+			merged.PeopleFiles[i][j] = f
+			j++
+		}
+		// Map iteration order is random; sort to make the output deterministic.
+		sort.Ints(merged.PeopleFiles[i])
+	}
+
+	// The people matrix carries one extra trailing slot besides the named
+	// people; it always maps to the trailing slot of the merged matrix.
+	merged.PeopleMatrix = make([]map[int]int64, len(merged.reversedPeopleDict)+1)
+	mergedPersonIndex := func(local int, reversedPeopleDict []string) int {
+		if local < len(reversedPeopleDict) {
+			return people[reversedPeopleDict[local]][0]
+		}
+		return len(merged.reversedPeopleDict)
+	}
+	addPeople := func(peopleMatrix []map[int]int64, reversedPeopleDict []string) {
+		for pi, pc := range peopleMatrix {
+			idx := mergedPersonIndex(pi, reversedPeopleDict)
+			m := merged.PeopleMatrix[idx]
+			if m == nil {
+				m = map[int]int64{}
+				merged.PeopleMatrix[idx] = m
+			}
+			// NOTE(review): the people matrix is treated as square
+			// (people x people), so columns are translated through the people
+			// index and not through the files index - confirm against Finalize.
+			for other, val := range pc {
+				m[mergedPersonIndex(other, reversedPeopleDict)] += val
+			}
+		}
+	}
+	addPeople(cr1.PeopleMatrix, cr1.reversedPeopleDict)
+	addPeople(cr2.PeopleMatrix, cr2.reversedPeopleDict)
+
+	merged.FilesMatrix = make([]map[int]int64, len(merged.Files))
+	addFiles := func(filesMatrix []map[int]int64, reversedFilesDict []string) {
+		for fi, fc := range filesMatrix {
+			// Rows of the files matrix are files: look them up in the files
+			// index (the people index knows nothing about file names).
+			idx := files[reversedFilesDict[fi]][0]
+			m := merged.FilesMatrix[idx]
+			if m == nil {
+				m = map[int]int64{}
+				merged.FilesMatrix[idx] = m
+			}
+			for file, val := range fc {
+				m[files[reversedFilesDict[file]][0]] += val
+			}
+		}
+	}
+	addFiles(cr1.FilesMatrix, cr1.Files)
+	addFiles(cr2.FilesMatrix, cr2.Files)
+	return merged
+}
+
 func (couples *CouplesAnalysis) serializeText(result *CouplesResult, writer io.Writer) {
 	fmt.Fprintln(writer, "  files_coocc:")
 	fmt.Fprintln(writer, "    index:")
@@ -291,20 +409,18 @@ func (couples *CouplesAnalysis) serializeBinary(result *CouplesResult, writer io
 		Index:  result.Files,
 		Matrix: pb.MapToCompressedSparseRowMatrix(result.FilesMatrix),
 	}
-	message.DeveloperCouples = &pb.Couples{
-		Index:  couples.reversedPeopleDict,
+	message.PeopleCouples = &pb.Couples{
+		Index:  result.reversedPeopleDict,
 		Matrix: pb.MapToCompressedSparseRowMatrix(result.PeopleMatrix),
 	}
-	message.TouchedFiles = &pb.DeveloperTouchedFiles{
-		Developers: make([]*pb.TouchedFiles, len(couples.reversedPeopleDict)),
-	}
-	for key := range couples.reversedPeopleDict {
+	message.PeopleFiles = make([]*pb.TouchedFiles, len(result.reversedPeopleDict))
+	for key := range result.reversedPeopleDict {
 		files := result.PeopleFiles[key]
 		int32Files := make([]int32, len(files))
 		for i, f := range files {
 			int32Files[i] = int32(f)
 		}
-		message.TouchedFiles.Developers[key] = &pb.TouchedFiles{
+		message.PeopleFiles[key] = &pb.TouchedFiles{
 			Files: int32Files,
 		}
 	}

+ 107 - 10
couples_test.go

@@ -2,6 +2,8 @@ package hercules
 
 import (
 	"bytes"
+	"io/ioutil"
+	"path"
 	"strings"
 	"testing"
 
@@ -216,22 +218,22 @@ func TestCouplesSerialize(t *testing.T) {
 	c.Serialize(result, true, buffer)
 	msg := pb.CouplesAnalysisResults{}
 	proto.Unmarshal(buffer.Bytes(), &msg)
-	assert.Len(t, msg.TouchedFiles.Developers, 3)
+	assert.Len(t, msg.PeopleFiles, 3)
 	tmp1 := [...]int32{0, 1, 2}
-	assert.Equal(t, msg.TouchedFiles.Developers[0].Files, tmp1[:])
+	assert.Equal(t, msg.PeopleFiles[0].Files, tmp1[:])
 	tmp2 := [...]int32{1, 2}
-	assert.Equal(t, msg.TouchedFiles.Developers[1].Files, tmp2[:])
+	assert.Equal(t, msg.PeopleFiles[1].Files, tmp2[:])
 	tmp3 := [...]int32{0}
-	assert.Equal(t, msg.TouchedFiles.Developers[2].Files, tmp3[:])
-	assert.Equal(t, msg.DeveloperCouples.Index, people[:])
-	assert.Equal(t, msg.DeveloperCouples.Matrix.NumberOfRows, int32(4))
-	assert.Equal(t, msg.DeveloperCouples.Matrix.NumberOfColumns, int32(4))
+	assert.Equal(t, msg.PeopleFiles[2].Files, tmp3[:])
+	assert.Equal(t, msg.PeopleCouples.Index, people[:])
+	assert.Equal(t, msg.PeopleCouples.Matrix.NumberOfRows, int32(4))
+	assert.Equal(t, msg.PeopleCouples.Matrix.NumberOfColumns, int32(4))
 	data := [...]int64{7, 3, 1, 3, 3, 1, 1}
-	assert.Equal(t, msg.DeveloperCouples.Matrix.Data, data[:])
+	assert.Equal(t, msg.PeopleCouples.Matrix.Data, data[:])
 	indices := [...]int32{0, 1, 2, 0, 1, 0, 2}
-	assert.Equal(t, msg.DeveloperCouples.Matrix.Indices, indices[:])
+	assert.Equal(t, msg.PeopleCouples.Matrix.Indices, indices[:])
 	indptr := [...]int64{0, 3, 5, 7, 7}
-	assert.Equal(t, msg.DeveloperCouples.Matrix.Indptr, indptr[:])
+	assert.Equal(t, msg.PeopleCouples.Matrix.Indptr, indptr[:])
 	files := [...]string{"five", "one", "three"}
 	assert.Equal(t, msg.FileCouples.Index, files[:])
 	assert.Equal(t, msg.FileCouples.Matrix.NumberOfRows, int32(3))
@@ -243,3 +245,98 @@ func TestCouplesSerialize(t *testing.T) {
 	indptr2 := [...]int64{0, 3, 6, 9}
 	assert.Equal(t, msg.FileCouples.Matrix.Indptr, indptr2[:])
 }
+
+func TestCouplesDeserialize(t *testing.T) {
+	allBuffer, err := ioutil.ReadFile(path.Join("test_data", "couples.pb"))
+	assert.Nil(t, err)
+	message := pb.AnalysisResults{}
+	err = proto.Unmarshal(allBuffer, &message)
+	assert.Nil(t, err)
+	couples := CouplesAnalysis{}
+	iresult, err := couples.Deserialize(message.Contents[couples.Name()])
+	assert.Nil(t, err)
+	result := iresult.(CouplesResult)
+	assert.Len(t, result.reversedPeopleDict, 2)
+	assert.Len(t, result.PeopleFiles, 2)
+	assert.Len(t, result.PeopleMatrix, 3)
+	assert.Len(t, result.Files, 74)
+	assert.Len(t, result.FilesMatrix, 74)
+}
+
+// TestCouplesMerge merges two hand-built results which share the name "two"
+// and verifies the merged indexes, the per-person file lists and both
+// matrices. The same names are used for the people and for the files.
+func TestCouplesMerge(t *testing.T) {
+	names1 := []string{"one", "two"}
+	names2 := []string{"two", "three"}
+	left := CouplesResult{
+		reversedPeopleDict: names1,
+		Files:              names1,
+		PeopleFiles:        [][]int{{0, 1}, {0}},
+		FilesMatrix:        []map[int]int64{{1: 100}, {0: 100}},
+		PeopleMatrix:       []map[int]int64{{1: 100}, {0: 100}, {0: 300, 1: 400}},
+	}
+	right := CouplesResult{
+		reversedPeopleDict: names2,
+		Files:              names2,
+		PeopleFiles:        [][]int{{1}, {0, 1}},
+		FilesMatrix:        []map[int]int64{{1: 200}, {0: 200}},
+		PeopleMatrix:       []map[int]int64{{1: 10}, {0: 10}, {0: 30, 1: 40}},
+	}
+	analysis := CouplesAnalysis{}
+	merged := analysis.MergeResults(left, right, nil, nil).(CouplesResult)
+
+	mergedNames := []string{"one", "two", "three"}
+	assert.Equal(t, merged.reversedPeopleDict, mergedNames)
+	assert.Equal(t, merged.Files, mergedNames)
+
+	wantPeopleFiles := [][]int{getSlice(0, 1), getSlice(0, 2), getSlice(1, 2)}
+	assert.Len(t, merged.PeopleFiles, 3)
+	for i, want := range wantPeopleFiles {
+		assert.Equal(t, merged.PeopleFiles[i], want)
+	}
+
+	// The last row accumulates the trailing rows of both inputs.
+	wantPeopleMatrix := []map[int]int64{
+		getCouplesMap(1, 100),
+		getCouplesMap(0, 100, 2, 10),
+		getCouplesMap(1, 10),
+		getCouplesMap(0, 300, 1, 430, 2, 40),
+	}
+	assert.Len(t, merged.PeopleMatrix, 4)
+	for i, want := range wantPeopleMatrix {
+		assert.Equal(t, merged.PeopleMatrix[i], want)
+	}
+
+	wantFilesMatrix := []map[int]int64{
+		getCouplesMap(1, 100),
+		getCouplesMap(0, 100, 2, 200),
+		getCouplesMap(1, 200),
+	}
+	assert.Len(t, merged.FilesMatrix, 3)
+	for i, want := range wantFilesMatrix {
+		assert.Equal(t, merged.FilesMatrix[i], want)
+	}
+}
+
+// getSlice returns its variadic arguments as an []int; it is a shorthand for
+// spelling out expected slices in the assertions.
+func getSlice(values ...int) []int {
+	return values
+}
+
+// getCouplesMap builds a matrix row from a flat list of alternating
+// (index, value) arguments: getCouplesMap(0, 100, 2, 10) yields {0: 100, 2: 10}.
+// The number of arguments must be even.
+func getCouplesMap(vals ...int) map[int]int64 {
+	row := make(map[int]int64)
+	for k, v := 0, 1; k < len(vals); k, v = k+2, v+2 {
+		row[vals[k]] = int64(vals[v])
+	}
+	return row
+}

+ 1 - 1
identity.go

@@ -236,7 +236,7 @@ func (_ IdentityDetector) MergeReversedDicts(rd1, rd2 []string) (map[string][3]i
 	for name, ptrs := range people {
 		mrd[ptrs[0]] = name
 	}
-  return people, mrd
+	return people, mrd
 }
 
 func init() {

+ 1 - 1
identity_test.go

@@ -390,4 +390,4 @@ func TestIdentityDetectorMergeReversedDicts(t *testing.T) {
 	assert.Equal(t, people["three"], [3]int{2, -1, 1})
 	vm = [...]string{"two", "one", "three"}
 	assert.Equal(t, merged, vm[:])
-}
+}

+ 3 - 7
pb/pb.proto

@@ -66,15 +66,11 @@ message TouchedFiles {
     repeated int32 files = 1;  // values correspond to `file_couples::index`
 }
 
-message DeveloperTouchedFiles {
-    // order corresponds to `developer_couples::index`
-    repeated TouchedFiles developers = 1;
-}
-
 message CouplesAnalysisResults {
     Couples file_couples = 6;
-    Couples developer_couples = 7;
-    DeveloperTouchedFiles touched_files = 8;
+    Couples people_couples = 7;
+    // order corresponds to `people_couples::index`
+    repeated TouchedFiles people_files = 8;
 }
 
 message UASTChange {

BIN
test_data/couples.pb