فهرست منبع

Merge pull request #42 from vmarkovtsev/master

Shotness in labours
Vadim Markovtsev 7 سال پیش
والد
کامیت
5af556fda3
3 فایل تغییر یافته به همراه 125 افزوده شده و 80 حذف شده
  1. 75 75
      file_history.go
  2. 4 5
      file_history_test.go
  3. 46 0
      labours.py

+ 75 - 75
file_history.go

@@ -1,49 +1,49 @@
 package hercules
 
 import (
-  "fmt"
-  "io"
-  "sort"
-  "strings"
-
-  "github.com/gogo/protobuf/proto"
-  "gopkg.in/src-d/go-git.v4"
-  "gopkg.in/src-d/go-git.v4/plumbing"
-  "gopkg.in/src-d/go-git.v4/plumbing/object"
-  "gopkg.in/src-d/go-git.v4/utils/merkletrie"
-  "gopkg.in/src-d/hercules.v3/pb"
+	"fmt"
+	"io"
+	"sort"
+	"strings"
+
+	"github.com/gogo/protobuf/proto"
+	"gopkg.in/src-d/go-git.v4"
+	"gopkg.in/src-d/go-git.v4/plumbing"
+	"gopkg.in/src-d/go-git.v4/plumbing/object"
+	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
+	"gopkg.in/src-d/hercules.v3/pb"
 )
 
 // FileHistory contains the intermediate state which is mutated by Consume(). It should implement
 // LeafPipelineItem.
 type FileHistory struct {
-  files map[string][]plumbing.Hash
+	files map[string][]plumbing.Hash
 }
 
 // FileHistoryResult is returned by Finalize() and represents the analysis result.
 type FileHistoryResult struct {
-  Files map[string][]plumbing.Hash
+	Files map[string][]plumbing.Hash
 }
 
 func (history *FileHistory) Name() string {
-  return "FileHistory"
+	return "FileHistory"
 }
 
 func (history *FileHistory) Provides() []string {
-  return []string{}
+	return []string{}
 }
 
 func (history *FileHistory) Requires() []string {
-  arr := [...]string{DependencyTreeChanges}
-  return arr[:]
+	arr := [...]string{DependencyTreeChanges}
+	return arr[:]
 }
 
 func (history *FileHistory) ListConfigurationOptions() []ConfigurationOption {
-  return []ConfigurationOption{}
+	return []ConfigurationOption{}
 }
 
 func (history *FileHistory) Flag() string {
-  return "file-history"
+	return "file-history"
 }
 
 func (history *FileHistory) Configure(facts map[string]interface{}) {
@@ -51,87 +51,87 @@ func (history *FileHistory) Configure(facts map[string]interface{}) {
 
 // Initialize resets the internal temporary data structures and prepares the object for Consume().
 func (history *FileHistory) Initialize(repository *git.Repository) {
-  history.files = map[string][]plumbing.Hash{}
+	history.files = map[string][]plumbing.Hash{}
 }
 
 // Consume is called for every commit in the sequence.
 func (history *FileHistory) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
-  commit := deps["commit"].(*object.Commit).Hash
-  changes := deps[DependencyTreeChanges].(object.Changes)
-  for _, change := range changes {
-    action, _ := change.Action()
-    switch action {
+	commit := deps["commit"].(*object.Commit).Hash
+	changes := deps[DependencyTreeChanges].(object.Changes)
+	for _, change := range changes {
+		action, _ := change.Action()
+		switch action {
 		case merkletrie.Insert:
 			hashes := make([]plumbing.Hash, 1)
-      hashes[0] = commit
-      history.files[change.To.Name] = hashes
+			hashes[0] = commit
+			history.files[change.To.Name] = hashes
 		case merkletrie.Delete:
 			delete(history.files, change.From.Name)
 		case merkletrie.Modify:
 			hashes := history.files[change.From.Name]
-      if change.From.Name != change.To.Name {
-        delete(history.files, change.From.Name)
-      }
-      hashes = append(hashes, commit)
-      history.files[change.To.Name] = hashes
+			if change.From.Name != change.To.Name {
+				delete(history.files, change.From.Name)
+			}
+			hashes = append(hashes, commit)
+			history.files[change.To.Name] = hashes
 		}
-  }
-  return nil, nil
+	}
+	return nil, nil
 }
 
 func (history *FileHistory) Finalize() interface{} {
-  return FileHistoryResult{Files: history.files}
+	return FileHistoryResult{Files: history.files}
 }
 
 // Serialize converts the result from Finalize() to either Protocol Buffers or YAML.
 func (history *FileHistory) Serialize(result interface{}, binary bool, writer io.Writer) error {
-  historyResult := result.(FileHistoryResult)
-  if binary {
-    return history.serializeBinary(&historyResult, writer)
-  }
-  history.serializeText(&historyResult, writer)
-  return nil
+	historyResult := result.(FileHistoryResult)
+	if binary {
+		return history.serializeBinary(&historyResult, writer)
+	}
+	history.serializeText(&historyResult, writer)
+	return nil
 }
 
 func (history *FileHistory) serializeText(result *FileHistoryResult, writer io.Writer) {
-  keys := make([]string, len(result.Files))
-  i := 0
-  for key := range result.Files {
-    keys[i] = key
-    i++
-  }
-  sort.Strings(keys)
-  for _, key := range keys {
-    hashes := result.Files[key]
-    strhashes := make([]string, len(hashes))
-    for i, hash := range hashes {
-      strhashes[i] = "\"" + hash.String() + "\""
-    }
-    fmt.Fprintf(writer, "  - %s: [%s]\n", key, strings.Join(strhashes, ","))
-  }
+	keys := make([]string, len(result.Files))
+	i := 0
+	for key := range result.Files {
+		keys[i] = key
+		i++
+	}
+	sort.Strings(keys)
+	for _, key := range keys {
+		hashes := result.Files[key]
+		strhashes := make([]string, len(hashes))
+		for i, hash := range hashes {
+			strhashes[i] = "\"" + hash.String() + "\""
+		}
+		fmt.Fprintf(writer, "  - %s: [%s]\n", key, strings.Join(strhashes, ","))
+	}
 }
 
 func (history *FileHistory) serializeBinary(result *FileHistoryResult, writer io.Writer) error {
-  message := pb.FileHistoryResultMessage{
-    Files: map[string]*pb.FileHistory{},
-  }
-  for key, vals := range result.Files {
-    hashes := &pb.FileHistory{
-      Commits: make([]string, len(vals)),
-    }
-    for i, hash := range vals {
-      hashes.Commits[i] = hash.String()
-    }
-    message.Files[key] = hashes
-  }
-  serialized, err := proto.Marshal(&message)
-  if err != nil {
-    return err
-  }
-  writer.Write(serialized)
-  return nil
+	message := pb.FileHistoryResultMessage{
+		Files: map[string]*pb.FileHistory{},
+	}
+	for key, vals := range result.Files {
+		hashes := &pb.FileHistory{
+			Commits: make([]string, len(vals)),
+		}
+		for i, hash := range vals {
+			hashes.Commits[i] = hash.String()
+		}
+		message.Files[key] = hashes
+	}
+	serialized, err := proto.Marshal(&message)
+	if err != nil {
+		return err
+	}
+	writer.Write(serialized)
+	return nil
 }
 
 func init() {
-  Registry.Register(&FileHistory{})
+	Registry.Register(&FileHistory{})
 }

+ 4 - 5
file_history_test.go

@@ -4,8 +4,8 @@ import (
 	"bytes"
 	"testing"
 
-	"github.com/stretchr/testify/assert"
 	"github.com/gogo/protobuf/proto"
+	"github.com/stretchr/testify/assert"
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
 	"gopkg.in/src-d/hercules.v3/pb"
@@ -37,7 +37,7 @@ func TestFileHistoryRegistration(t *testing.T) {
 }
 
 func TestFileHistoryConsume(t *testing.T) {
-  fh := fixtureFileHistory()
+	fh := fixtureFileHistory()
 	deps := map[string]interface{}{}
 	changes := make(object.Changes, 3)
 	treeFrom, _ := testRepository.TreeObject(plumbing.NewHash(
@@ -105,7 +105,7 @@ func TestFileHistoryConsume(t *testing.T) {
 }
 
 func TestFileHistorySerializeText(t *testing.T) {
-  fh := fixtureFileHistory()
+	fh := fixtureFileHistory()
 	deps := map[string]interface{}{}
 	changes := make(object.Changes, 1)
 	treeTo, _ := testRepository.TreeObject(plumbing.NewHash(
@@ -132,7 +132,7 @@ func TestFileHistorySerializeText(t *testing.T) {
 }
 
 func TestFileHistorySerializeBinary(t *testing.T) {
-  fh := fixtureFileHistory()
+	fh := fixtureFileHistory()
 	deps := map[string]interface{}{}
 	changes := make(object.Changes, 1)
 	treeTo, _ := testRepository.TreeObject(plumbing.NewHash(
@@ -161,4 +161,3 @@ func TestFileHistorySerializeBinary(t *testing.T) {
 	assert.Len(t, msg.Files[".travis.yml"].Commits, 1)
 	assert.Equal(t, msg.Files[".travis.yml"].Commits[0], "2b1ed978194a94edeabbca6de7ff3b5771d4d665")
 }
-

+ 46 - 0
labours.py

@@ -104,6 +104,9 @@ class Reader(object):
     def get_people_coocc(self):
         raise NotImplementedError
 
+    def get_shotness_coocc(self):
+        raise NotImplementedError
+
     def get_shotness(self):
         raise NotImplementedError
 
@@ -168,6 +171,24 @@ class YamlReader(Reader):
         coocc = self.data["Couples"]["people_coocc"]
         return coocc["index"], self._parse_coocc_matrix(coocc["matrix"])
 
+    def get_shotness_coocc(self):
+        shotness = self.data["Shotness"]
+        index = ["%s:%s" % (i["file"], i["name"]) for i in shotness]
+        indptr = numpy.zeros(len(shotness) + 1, dtype=numpy.int64)
+        indices = []
+        data = []
+        for i, record in enumerate(shotness):
+            pairs = [(int(k), v) for k, v in record["counters"].items()]
+            pairs.sort()
+            indptr[i + 1] = indptr[i] + len(pairs)
+            for k, v in pairs:
+                indices.append(k)
+                data.append(v)
+        indices = numpy.array(indices, dtype=numpy.int32)
+        data = numpy.array(data, dtype=numpy.int32)
+        from scipy.sparse import csr_matrix
+        return index, csr_matrix((data, indices, indptr), shape=(len(shotness),) * 2)
+
     def get_shotness(self):
         from munch import munchify
         obj = munchify(self.data["Shotness"])
@@ -254,6 +275,25 @@ class ProtobufReader(Reader):
         node = self.contents["Couples"].people_couples
         return list(node.index), self._parse_sparse_matrix(node.matrix)
 
+    def get_shotness_coocc(self):
+        shotness = self.get_shotness()
+        index = ["%s:%s" % (i.file, i.name) for i in shotness]
+        indptr = numpy.zeros(len(shotness) + 1, dtype=numpy.int32)
+        indices = []
+        data = []
+        for i, record in enumerate(shotness):
+            pairs = list(record.counters.items())
+            pairs.sort()
+            indptr[i + 1] = indptr[i] + len(pairs)
+            for k, v in pairs:
+                indices.append(k)
+                data.append(v)
+        indices = numpy.array(indices, dtype=numpy.int32)
+        data = numpy.array(data, dtype=numpy.int32)
+        from scipy.sparse import csr_matrix
+        return index, csr_matrix((data, indices, indptr), shape=(len(shotness),) * 2)
+
+
     def get_shotness(self):
         return self.contents["Shotness"].records
 
@@ -1064,6 +1104,12 @@ def main():
                                                tmpdir=args.couples_tmp_dir))
         except KeyError:
             print(couples_warning)
+        try:
+            write_embeddings("shotness", args.output, not args.disable_projector,
+                             *train_embeddings(*reader.get_shotness_coocc(),
+                                               tmpdir=args.couples_tmp_dir))
+        except KeyError:
+            print(shotness_warning)
 
     def shotness():
         try: