
Add the necessary edits for the hackathon

Vadim Markovtsev, 7 years ago
commit d7a77943e5
3 changed files with 195 additions and 27 deletions
  1. cmd/hercules/main.go (+64 -13)
  2. pipeline.go (+2 -2)
  3. uast.go (+129 -12)

cmd/hercules/main.go (+64 -13)

@@ -29,9 +29,11 @@ package main
 
 import (
 	"bytes"
+	"context"
 	"flag"
 	"fmt"
 	"io"
+	"io/ioutil"
 	"net/http"
 	_ "net/http/pprof"
 	"os"
@@ -49,6 +51,7 @@ import (
 	"gopkg.in/src-d/hercules.v2/stdout"
 	"gopkg.in/src-d/hercules.v2/pb"
 	"github.com/gogo/protobuf/proto"
+	"time"
 )
 
 func sortedKeys(m map[string][][]int64) []string {
@@ -82,9 +85,13 @@ func main() {
 	var withFiles bool
 	var withPeople bool
 	var withCouples bool
+	var withUasts bool
 	var people_dict_path string
 	var profile bool
-	var granularity, sampling, similarity_threshold int
+	var granularity, sampling, similarityThreshold int
+	var bblfshEndpoint string
+	var bblfshTimeout int
+	var uastPoolSize int
 	var commitsFile string
 	var ignoreMissingSubmodules bool
 	var debug bool
@@ -92,11 +99,13 @@ func main() {
 	flag.BoolVar(&withPeople, "people", false, "Output detailed statistics per each developer.")
 	flag.BoolVar(&withCouples, "couples", false, "Gather the co-occurrence matrix "+
 		"for files and people.")
+	flag.BoolVar(&withUasts, "uasts", false, "Output pairs of Universal Abstract Syntax Trees for "+
+		"every changed file in each commit.")
 	flag.StringVar(&people_dict_path, "people-dict", "", "Path to the developers' email associations.")
 	flag.BoolVar(&profile, "profile", false, "Collect the profile to hercules.pprof.")
 	flag.IntVar(&granularity, "granularity", 30, "How many days there are in a single band.")
 	flag.IntVar(&sampling, "sampling", 30, "How frequently to record the state in days.")
-	flag.IntVar(&similarity_threshold, "M", 90,
+	flag.IntVar(&similarityThreshold, "M", 90,
 		"A threshold on the similarity index used to detect renames.")
 	flag.BoolVar(&debug, "debug", false, "Validate the trees on each step.")
 	flag.StringVar(&commitsFile, "commits", "", "Path to the text file with the "+
@@ -106,6 +115,9 @@ func main() {
 	flag.BoolVar(&ignoreMissingSubmodules, "ignore-missing-submodules", false,
 		"Do not panic on submodules which are not registered..")
 	flag.BoolVar(&protobuf, "pb", false, "The output format will be Protocol Buffers instead of YAML.")
+	flag.IntVar(&uastPoolSize, "uast-pool-size", 1, "Number of goroutines to extract UASTs.")
+	flag.StringVar(&bblfshEndpoint, "bblfsh", "0.0.0.0:9432", "Babelfish server's endpoint.")
+	flag.IntVar(&bblfshTimeout, "bblfsh-timeout", 20, "Babelfish server timeout, in seconds.")
 	flag.Parse()
 	if granularity <= 0 {
 		fmt.Fprint(os.Stderr, "Warning: adjusted the granularity to 1 day\n")
@@ -180,22 +192,37 @@ func main() {
 	pipeline.AddItem(&hercules.BlobCache{
 		IgnoreMissingSubmodules: ignoreMissingSubmodules,
 	})
+	var uastSaver *hercules.UASTChangesSaver
+	if withUasts {
+		pipeline.AddItem(&hercules.UASTExtractor{
+			Endpoint: bblfshEndpoint,
+			Context: func() context.Context {
+				ctx, _ := context.WithTimeout(context.Background(),
+					time.Duration(bblfshTimeout) * time.Second)
+				return ctx
+			},
+			PoolSize: uastPoolSize,
+			Extensions: map[string]bool{"py": true, "java": true}})
+		pipeline.AddItem(&hercules.UASTChanges{})
+		uastSaver = &hercules.UASTChangesSaver{}
+		pipeline.AddItem(uastSaver)
+	}
 	pipeline.AddItem(&hercules.DaysSinceStart{})
-	pipeline.AddItem(&hercules.RenameAnalysis{SimilarityThreshold: similarity_threshold})
+	pipeline.AddItem(&hercules.RenameAnalysis{SimilarityThreshold: similarityThreshold})
 	pipeline.AddItem(&hercules.TreeDiff{})
 	pipeline.AddItem(&hercules.FileDiff{})
-	id_matcher := &hercules.IdentityDetector{}
+	idMatcher := &hercules.IdentityDetector{}
 	var peopleCount int
 	if withPeople || withCouples {
 		if people_dict_path != "" {
-			id_matcher.LoadPeopleDict(people_dict_path)
-			peopleCount = len(id_matcher.ReversePeopleDict) - 1
+			idMatcher.LoadPeopleDict(people_dict_path)
+			peopleCount = len(idMatcher.ReversePeopleDict) - 1
 		} else {
-			id_matcher.GeneratePeopleDict(commits)
-			peopleCount = len(id_matcher.ReversePeopleDict)
+			idMatcher.GeneratePeopleDict(commits)
+			peopleCount = len(idMatcher.ReversePeopleDict)
 		}
 	}
-	pipeline.AddItem(id_matcher)
+	pipeline.AddItem(idMatcher)
 	burndowner := &hercules.BurndownAnalysis{
 		Granularity:  granularity,
 		Sampling:     sampling,
@@ -217,23 +244,47 @@ func main() {
 	}
 	fmt.Fprint(os.Stderr, "writing...    \r")
 	burndownResults := result[burndowner].(hercules.BurndownResult)
+	if len(burndownResults.GlobalHistory) == 0 {
+		return
+	}
 	var couplesResult hercules.CouplesResult
 	if withCouples {
 		couplesResult = result[coupler].(hercules.CouplesResult)
 	}
-	if len(burndownResults.GlobalHistory) == 0 {
-		return
+	if withUasts {
+		changedUasts := result[uastSaver].([][]hercules.UASTChange)
+		for i, changes := range changedUasts {
+			for j, change := range changes {
+				if change.Before == nil || change.After == nil {
+					continue
+				}
+				bs, _ := change.Before.Marshal()
+				ioutil.WriteFile(fmt.Sprintf(
+					"%d_%d_before_%s.pb", i, j, change.Change.From.TreeEntry.Hash.String()), bs, 0666)
+				blob, _ := repository.BlobObject(change.Change.From.TreeEntry.Hash)
+				s, _ := (&object.File{Blob: *blob}).Contents()
+				ioutil.WriteFile(fmt.Sprintf(
+					"%d_%d_before_%s.src", i, j, change.Change.From.TreeEntry.Hash.String()), []byte(s), 0666)
+				bs, _ = change.After.Marshal()
+				ioutil.WriteFile(fmt.Sprintf(
+					"%d_%d_after_%s.pb", i, j, change.Change.To.TreeEntry.Hash.String()), bs, 0666)
+				blob, _ = repository.BlobObject(change.Change.To.TreeEntry.Hash)
+				s, _ = (&object.File{Blob: *blob}).Contents()
+				ioutil.WriteFile(fmt.Sprintf(
+					"%d_%d_after_%s.src", i, j, change.Change.To.TreeEntry.Hash.String()), []byte(s), 0666)
+			}
+		}
 	}
 	begin := commits[0].Author.When.Unix()
 	end := commits[len(commits)-1].Author.When.Unix()
 	if !protobuf {
 		printResults(uri, begin, end, granularity, sampling,
 			withFiles, withPeople, withCouples,
-			burndownResults, couplesResult, id_matcher.ReversePeopleDict)
+			burndownResults, couplesResult, idMatcher.ReversePeopleDict)
 	} else {
 		serializeResults(uri, begin, end, granularity, sampling,
 			withFiles, withPeople, withCouples,
-			burndownResults, couplesResult, id_matcher.ReversePeopleDict)
+			burndownResults, couplesResult, idMatcher.ReversePeopleDict)
 	}
 }
 

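Note on the Context closure registered above: context.WithTimeout also returns a CancelFunc, which the closure discards, so each request's timer is released only when the deadline expires. Below is a minimal sketch of a factory that keeps the cancel function; the helper name makeBblfshContext is hypothetical and not part of this commit, and UASTExtractor.Context would need a matching signature to use it.

package main

import (
	"context"
	"fmt"
	"time"
)

// makeBblfshContext returns a factory that produces a fresh context per
// Babelfish request together with its CancelFunc, so the timeout timer can be
// released by the caller instead of being dropped.
func makeBblfshContext(timeoutSec int) func() (context.Context, context.CancelFunc) {
	return func() (context.Context, context.CancelFunc) {
		return context.WithTimeout(context.Background(),
			time.Duration(timeoutSec)*time.Second)
	}
}

func main() {
	factory := makeBblfshContext(20)
	ctx, cancel := factory()
	defer cancel()
	fmt.Println(ctx.Err()) // <nil> until the 20s deadline expires
}
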
pipeline.go (+2 -2)

@@ -107,7 +107,7 @@ func (pipeline *Pipeline) Initialize() {
 		graph.AddNode(name)
 		name2item[name] = item
 		for _, key := range item.Provides() {
-			key += "_entity"
+			key = "[" + key + "]"
 			graph.AddNode(key)
 			graph.AddEdge(name, key)
 		}
@@ -115,7 +115,7 @@ func (pipeline *Pipeline) Initialize() {
 	for index, item := range pipeline.items {
 		name := fmt.Sprintf("%s_%d", item.Name(), index)
 		for _, key := range item.Requires() {
-			key += "_entity"
+			key = "[" + key + "]"
 			if !graph.AddEdge(key, name) {
 				panic(fmt.Sprintf("Unsatisfied dependency: %s -> %s", key, item.Name()))
 			}

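The pipeline.go change only affects how dependency keys are rendered as nodes in the pipeline's dependency graph: a key such as "blob_cache" used to become the node "blob_cache_entity" and now becomes "[blob_cache]", presumably to keep resource nodes visually distinct from the item nodes named "<Name>_<index>". A self-contained sketch of the mapping, using only key names that appear in this commit:

package main

import "fmt"

func main() {
	// Resource keys exchanged between pipeline items in this commit.
	keys := []string{"renamed_changes", "blob_cache", "uasts", "changed_uasts"}
	for _, key := range keys {
		oldNode := key + "_entity" // graph node name before this commit
		newNode := "[" + key + "]" // graph node name after this commit
		fmt.Printf("%-16s %-24s -> %s\n", key, oldNode, newNode)
	}
}
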
uast.go (+129 -12)

@@ -8,19 +8,23 @@ import (
 	"sync"
 
 	"github.com/jeffail/tunny"
-	"gopkg.in/bblfsh/client-go.v0"
-	"gopkg.in/bblfsh/sdk.v0/protocol"
-	"gopkg.in/bblfsh/sdk.v0/uast"
+	"gopkg.in/bblfsh/client-go.v1"
+	"gopkg.in/bblfsh/sdk.v1/protocol"
+	"gopkg.in/bblfsh/sdk.v1/uast"
 	"gopkg.in/src-d/go-git.v4"
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
 	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
+	"fmt"
+	"os"
 )
 
 type UASTExtractor struct {
 	Endpoint string
 	Context  func() context.Context
 	PoolSize int
+	Extensions map[string]bool
+	FailOnErrors bool
 
 	clients []*bblfsh.BblfshClient
 	pool   *tunny.WorkPool
@@ -29,8 +33,7 @@ type UASTExtractor struct {
 type uastTask struct {
 	Client *bblfsh.BblfshClient
 	Lock   *sync.RWMutex
-	Dest   map[string]*uast.Node
-	Name   string
+	Dest   map[plumbing.Hash]*uast.Node
 	File   *object.File
 	Errors *[]error
 	Status chan int
@@ -61,7 +64,7 @@ func (exr *UASTExtractor) Provides() []string {
 }
 
 func (exr *UASTExtractor) Requires() []string {
-	arr := [...]string{"changes", "blob_cache"}
+	arr := [...]string{"renamed_changes", "blob_cache"}
 	return arr[:]
 }
 
@@ -97,16 +100,28 @@ func (exr *UASTExtractor) Initialize(repository *git.Repository) {
 
 func (exr *UASTExtractor) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
 	cache := deps["blob_cache"].(map[plumbing.Hash]*object.Blob)
-	treeDiffs := deps["changes"].(object.Changes)
-	uasts := map[string]*uast.Node{}
+	treeDiffs := deps["renamed_changes"].(object.Changes)
+	uasts := map[plumbing.Hash]*uast.Node{}
 	lock := sync.RWMutex{}
 	errs := make([]error, 0)
 	status := make(chan int)
 	pending := 0
 	submit := func(change *object.Change) {
+		var ext string
+		dotpos := strings.LastIndex(change.To.Name, ".")
+		if dotpos >= 0 {
+			ext = change.To.Name[dotpos + 1:]
+		} else {
+			ext = change.To.Name
+		}
+		_, exists := exr.Extensions[ext]
+		if !exists {
+			return
+		}
 		pending++
 		exr.pool.SendWorkAsync(uastTask{
-			Lock: &lock, Dest: uasts, Name: change.To.Name,
+			Lock:   &lock,
+			Dest:   uasts,
 			File:   &object.File{Name: change.To.Name, Blob: *cache[change.To.TreeEntry.Hash]},
 			Errors: &errs, Status: status}, nil)
 	}
@@ -132,7 +147,12 @@ func (exr *UASTExtractor) Consume(deps map[string]interface{}) (map[string]inter
 		for i, err := range errs {
 			msgs[i] = err.Error()
 		}
-		return nil, errors.New(strings.Join(msgs, "\n"))
+		joined := strings.Join(msgs, "\n")
+		if exr.FailOnErrors {
+			return nil, errors.New(joined)
+		} else {
+			fmt.Fprintln(os.Stderr, joined)
+		}
 	}
 	return map[string]interface{}{"uasts": uasts}, nil
 }
@@ -152,6 +172,9 @@ func (exr *UASTExtractor) extractUAST(
 	request.Filename(file.Name)
 	response, err := request.DoWithContext(exr.Context())
 	if err != nil {
+		if strings.Contains(err.Error(), "missing driver") {
+			return nil, nil
+		}
 		return nil, err
 	}
 	if response.Status != protocol.Ok {
@@ -170,9 +193,103 @@ func (exr *UASTExtractor) extractTask(data interface{}) interface{} {
 	task.Lock.Lock()
 	defer task.Lock.Unlock()
 	if err != nil {
-		*task.Errors = append(*task.Errors, errors.New(task.Name+": "+err.Error()))
+		*task.Errors = append(*task.Errors, errors.New(task.File.Name+": "+err.Error()))
 		return nil
 	}
-	task.Dest[task.Name] = node
+	task.Dest[task.File.Hash] = node
 	return nil
 }
+
+type UASTChange struct {
+	Before *uast.Node
+	After *uast.Node
+	Change *object.Change
+}
+
+type UASTChanges struct {
+	cache map[plumbing.Hash]*uast.Node
+}
+
+func (uc *UASTChanges) Name() string {
+	return "UASTChanges"
+}
+
+func (uc *UASTChanges) Provides() []string {
+	arr := [...]string{"changed_uasts"}
+	return arr[:]
+}
+
+func (uc *UASTChanges) Requires() []string {
+	arr := [...]string{"uasts", "renamed_changes"}
+	return arr[:]
+}
+
+func (uc *UASTChanges) Initialize(repository *git.Repository) {
+	uc.cache = map[plumbing.Hash]*uast.Node{}
+}
+
+func (uc *UASTChanges) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
+	uasts := deps["uasts"].(map[plumbing.Hash]*uast.Node)
+	treeDiffs := deps["renamed_changes"].(object.Changes)
+	commit := make([]UASTChange, 0, len(treeDiffs))
+	for _, change := range treeDiffs {
+		action, err := change.Action()
+		if err != nil {
+			return nil, err
+		}
+		switch action {
+		case merkletrie.Insert:
+			hashTo := change.To.TreeEntry.Hash
+			uastTo := uasts[hashTo]
+			commit = append(commit, UASTChange{Before: nil, After: uastTo, Change: change})
+			uc.cache[hashTo] = uastTo
+		case merkletrie.Delete:
+			hashFrom := change.From.TreeEntry.Hash
+			commit = append(commit, UASTChange{Before: uc.cache[hashFrom], After: nil, Change: change})
+			delete(uc.cache, hashFrom)
+		case merkletrie.Modify:
+			hashFrom := change.From.TreeEntry.Hash
+			hashTo := change.To.TreeEntry.Hash
+			uastTo := uasts[hashTo]
+			commit = append(commit, UASTChange{Before: uc.cache[hashFrom], After: uastTo, Change: change})
+			delete(uc.cache, hashFrom)
+			uc.cache[hashTo] = uastTo
+		}
+	}
+	return map[string]interface{}{"changed_uasts": commit}, nil
+}
+
+func (uc *UASTChanges) Finalize() interface{} {
+	return nil
+}
+
+type UASTChangesSaver struct {
+	result [][]UASTChange
+}
+
+func (saver *UASTChangesSaver) Name() string {
+	return "UASTChangesSaver"
+}
+
+func (saver *UASTChangesSaver) Provides() []string {
+	return []string{}
+}
+
+func (saver *UASTChangesSaver) Requires() []string {
+	arr := [...]string{"changed_uasts"}
+	return arr[:]
+}
+
+func (saver *UASTChangesSaver) Initialize(repository *git.Repository) {
+	saver.result = [][]UASTChange{}
+}
+
+func (saver *UASTChangesSaver) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
+	changes := deps["changed_uasts"].([]UASTChange)
+	saver.result = append(saver.result, changes)
+	return nil, nil
+}
+
+func (saver *UASTChangesSaver) Finalize() interface{} {
+	return saver.result
+}
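
Both UASTChanges and UASTChangesSaver follow the same implicit item shape as UASTExtractor: Name, Provides, Requires, Initialize, Consume and Finalize. Below is a minimal skeleton of a further consumer of "changed_uasts", assuming it lives in the same package as uast.go with the imports already present there; the type UASTChangeCounter is hypothetical and not part of this commit.

// Hypothetical example item: counts how many before/after UAST pairs were
// produced over the whole history. Assumes the hercules package and the
// imports already present in uast.go (git, UASTChange).
type UASTChangeCounter struct {
	total int
}

func (counter *UASTChangeCounter) Name() string {
	return "UASTChangeCounter"
}

func (counter *UASTChangeCounter) Provides() []string {
	return []string{}
}

func (counter *UASTChangeCounter) Requires() []string {
	arr := [...]string{"changed_uasts"}
	return arr[:]
}

func (counter *UASTChangeCounter) Initialize(repository *git.Repository) {
	counter.total = 0
}

func (counter *UASTChangeCounter) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
	// Each Consume call corresponds to one commit; count its UAST pairs.
	counter.total += len(deps["changed_uasts"].([]UASTChange))
	return nil, nil
}

func (counter *UASTChangeCounter) Finalize() interface{} {
	return counter.total
}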