浏览代码

Merge pull request #105 from vmarkovtsev/master

Add filtering changes by language
Vadim Markovtsev 6 年之前
父节点
当前提交
9b8478df11
共有 4 个文件被更改,包括 155 次插入65 次删除
  1. 84 13
      internal/plumbing/tree_diff.go
  2. 43 1
      internal/plumbing/tree_diff_test.go
  3. 1 28
      internal/plumbing/uast/uast.go
  4. 27 23
      internal/plumbing/uast/uast_test.go

+ 84 - 13
internal/plumbing/tree_diff.go

@@ -1,6 +1,8 @@
 package plumbing
 
 import (
+	"fmt"
+	"gopkg.in/src-d/enry.v1"
 	"io"
 	"log"
 	"strings"
@@ -18,8 +20,11 @@ import (
 type TreeDiff struct {
 	core.NoopMerger
 	SkipDirs     []string
+	Languages    map[string]bool
+
 	previousTree *object.Tree
 	previousCommit plumbing.Hash
+	repository *git.Repository
 }
 
 const (
@@ -31,6 +36,13 @@ const (
 	// ConfigTreeDiffBlacklistedDirs is the name of the configuration option
 	// (TreeDiff.Configure()) which allows to set blacklisted directories.
 	ConfigTreeDiffBlacklistedDirs = "TreeDiff.BlacklistedDirs"
+	// ConfigTreeDiffLanguages is the name of the configuration option (TreeDiff.Configure())
+	// which sets the list of programming languages to analyze. Language names are at
+	// https://doc.bblf.sh/languages.html Names are joined with a comma ",".
+	// "all" is the special name which disables this filter.
+	ConfigTreeDiffLanguages = "TreeDiff.Languages"
+	// allLanguages denotes passing all files in.
+	allLanguages = "all"
 )
 
 var defaultBlacklistedDirs = []string{"vendor/", "vendors/", "node_modules/"}
@@ -67,7 +79,15 @@ func (treediff *TreeDiff) ListConfigurationOptions() []core.ConfigurationOption
 		Description: "List of blacklisted directories. Separated by comma \",\".",
 		Flag:        "blacklisted-dirs",
 		Type:        core.StringsConfigurationOption,
-		Default:     defaultBlacklistedDirs},
+		Default:     defaultBlacklistedDirs}, {
+		Name:        ConfigTreeDiffLanguages,
+		Description: fmt.Sprintf(
+			"List of programming languages to analyze. Separated by comma \",\". " +
+			"Names are at https://doc.bblf.sh/languages.html \"%s\" is the special name " +
+			"which disables this filter and lets all the files through.", allLanguages),
+		Flag:        "languages",
+		Type:        core.StringsConfigurationOption,
+		Default:     []string{allLanguages}},
 	}
 	return options[:]
 }
@@ -77,12 +97,26 @@ func (treediff *TreeDiff) Configure(facts map[string]interface{}) {
 	if val, exist := facts[ConfigTreeDiffEnableBlacklist]; exist && val.(bool) {
 		treediff.SkipDirs = facts[ConfigTreeDiffBlacklistedDirs].([]string)
 	}
+	if val, exists := facts[ConfigTreeDiffLanguages].(string); exists {
+		treediff.Languages = map[string]bool{}
+		for _, lang := range strings.Split(val, ",") {
+			treediff.Languages[strings.TrimSpace(lang)] = true
+		}
+	} else if treediff.Languages == nil {
+		treediff.Languages = map[string]bool{}
+		treediff.Languages[allLanguages] = true
+	}
 }
 
 // Initialize resets the temporary caches and prepares this PipelineItem for a series of Consume()
 // calls. The repository which is going to be analysed is supplied as an argument.
 func (treediff *TreeDiff) Initialize(repository *git.Repository) {
 	treediff.previousTree = nil
+	treediff.repository = repository
+	if treediff.Languages == nil {
+		treediff.Languages = map[string]bool{}
+		treediff.Languages[allLanguages] = true
+	}
 }
 
 // Consume runs this PipelineItem on the next commit data.
@@ -124,6 +158,13 @@ func (treediff *TreeDiff) Consume(deps map[string]interface{}) (map[string]inter
 					}
 					return err
 				}
+				pass, err := treediff.checkLanguage(file.Name, file.Hash)
+				if err != nil {
+					return err
+				}
+				if !pass {
+					continue
+				}
 				diff = append(diff, &object.Change{
 					To: object.ChangeEntry{Name: file.Name, Tree: tree, TreeEntry: object.TreeEntry{
 						Name: file.Name, Mode: file.Mode, Hash: file.Hash}}})
@@ -137,21 +178,29 @@ func (treediff *TreeDiff) Consume(deps map[string]interface{}) (map[string]inter
 	treediff.previousTree = tree
 	treediff.previousCommit = commit.Hash
 
-	if len(treediff.SkipDirs) > 0 {
-		// filter without allocation
-		filteredDiff := make([]*object.Change, 0, len(diff))
-	OUTER:
-		for _, change := range diff {
-			for _, dir := range treediff.SkipDirs {
-				if strings.HasPrefix(change.To.Name, dir) || strings.HasPrefix(change.From.Name, dir) {
-					continue OUTER
-				}
+	// filter without allocation
+	filteredDiff := make([]*object.Change, 0, len(diff))
+OUTER:
+	for _, change := range diff {
+		for _, dir := range treediff.SkipDirs {
+			if strings.HasPrefix(change.To.Name, dir) || strings.HasPrefix(change.From.Name, dir) {
+				continue OUTER
 			}
-			filteredDiff = append(filteredDiff, change)
 		}
-
-		diff = filteredDiff
+		var changeEntry object.ChangeEntry
+		if change.To.Tree == nil {
+			changeEntry = change.From
+		} else {
+			changeEntry = change.To
+		}
+		pass, _ := treediff.checkLanguage(changeEntry.Name, changeEntry.TreeEntry.Hash)
+		if !pass {
+			continue
+		}
+		filteredDiff = append(filteredDiff, change)
 	}
+
+	diff = filteredDiff
 	return map[string]interface{}{DependencyTreeChanges: diff}, nil
 }
 
@@ -160,6 +209,28 @@ func (treediff *TreeDiff) Fork(n int) []core.PipelineItem {
 	return core.ForkCopyPipelineItem(treediff, n)
 }
 
+// checkLanguage reports whether the blob belongs to one of the languages enabled in
+// treediff.Languages. The special allLanguages entry lets every file through without
+// touching the repository. Otherwise up to 1024 leading bytes of the blob are read and
+// passed, together with the file name, to enry.GetLanguage; the detected name is then
+// looked up in treediff.Languages.
+//
+// An error is returned when the blob cannot be loaded or read. An empty or short blob is
+// not an error: detection then runs on whatever bytes are available.
+func (treediff *TreeDiff) checkLanguage(name string, blobHash plumbing.Hash) (bool, error) {
+	if treediff.Languages[allLanguages] {
+		return true, nil
+	}
+	blob, err := treediff.repository.BlobObject(blobHash)
+	if err != nil {
+		return false, err
+	}
+	reader, err := blob.Reader()
+	if err != nil {
+		return false, err
+	}
+	// The reader is only read from, so a Close error carries no useful information here.
+	defer reader.Close()
+	buffer := make([]byte, 1024)
+	// io.ReadFull keeps reading until the buffer is full or the blob ends; a bare Read may
+	// legally return fewer bytes than are available.
+	n, err := io.ReadFull(reader, buffer)
+	if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
+		return false, err
+	}
+	// Only the bytes actually read are handed over; trailing zero padding is not file content.
+	lang := enry.GetLanguage(name, buffer[:n])
+	return treediff.Languages[lang], nil
+}
+
 func init() {
 	core.Registry.Register(&TreeDiff{})
 }

+ 43 - 1
internal/plumbing/tree_diff_test.go

@@ -25,7 +25,7 @@ func TestTreeDiffMeta(t *testing.T) {
 	assert.Equal(t, len(td.Provides()), 1)
 	assert.Equal(t, td.Provides()[0], DependencyTreeChanges)
 	opts := td.ListConfigurationOptions()
-	assert.Len(t, opts, 2)
+	assert.Len(t, opts, 3)
 }
 
 func TestTreeDiffRegistration(t *testing.T) {
@@ -115,6 +115,7 @@ func TestTreeDiffBadCommit(t *testing.T) {
 func TestTreeDiffConsumeSkip(t *testing.T) {
 	// consume without skipping
 	td := fixtureTreeDiff()
+	assert.Contains(t, td.Languages, allLanguages)
 	commit, _ := test.Repository.CommitObject(plumbing.NewHash(
 		"aefdedf7cafa6ee110bae9a3910bf5088fdeb5a9"))
 	deps := map[string]interface{}{}
@@ -142,6 +143,47 @@ func TestTreeDiffConsumeSkip(t *testing.T) {
 	assert.Equal(t, 31, len(changes))
 }
 
+func TestTreeDiffConsumeLanguageFilterFirst(t *testing.T) {
+	td := fixtureTreeDiff()
+	td.Configure(map[string]interface{}{ConfigTreeDiffLanguages: "Go"})
+	commit, _ := test.Repository.CommitObject(plumbing.NewHash(
+		"fbe766ffdc3f87f6affddc051c6f8b419beea6a2"))
+	deps := map[string]interface{}{}
+	deps[core.DependencyCommit] = commit
+	res, err := td.Consume(deps)
+	assert.Nil(t, err)
+	assert.Equal(t, len(res), 1)
+	changes := res[DependencyTreeChanges].(object.Changes)
+	assert.Equal(t, len(changes), 6)
+	assert.Equal(t, changes[0].To.Name, "analyser.go")
+	assert.Equal(t, changes[1].To.Name, "cmd/hercules/main.go")
+	assert.Equal(t, changes[2].To.Name, "doc.go")
+	assert.Equal(t, changes[3].To.Name, "file.go")
+	assert.Equal(t, changes[4].To.Name, "file_test.go")
+	assert.Equal(t, changes[5].To.Name, "rbtree.go")
+}
+
+func TestTreeDiffConsumeLanguageFilter(t *testing.T) {
+	td := fixtureTreeDiff()
+	td.Configure(map[string]interface{}{ConfigTreeDiffLanguages: "Python"})
+	commit, _ := test.Repository.CommitObject(plumbing.NewHash(
+		"e89c1d10fb31e32668ad905eb59dc44d7a4a021e"))
+	deps := map[string]interface{}{}
+	deps[core.DependencyCommit] = commit
+	res, err := td.Consume(deps)
+	assert.Nil(t, err)
+	assert.Equal(t, len(res), 1)
+	commit, _ = test.Repository.CommitObject(plumbing.NewHash(
+		"fbe766ffdc3f87f6affddc051c6f8b419beea6a2"))
+	deps[core.DependencyCommit] = commit
+	res, err = td.Consume(deps)
+	assert.Nil(t, err)
+	assert.Equal(t, len(res), 1)
+	changes := res[DependencyTreeChanges].(object.Changes)
+	assert.Equal(t, len(changes), 1)
+	assert.Equal(t, changes[0].To.Name, "labours.py")
+}
+
 func TestTreeDiffFork(t *testing.T) {
 	td1 := fixtureTreeDiff()
 	td1.SkipDirs = append(td1.SkipDirs, "skip")

+ 1 - 28
internal/plumbing/uast/uast.go

@@ -19,7 +19,6 @@ import (
 	"gopkg.in/bblfsh/client-go.v2"
 	"gopkg.in/bblfsh/sdk.v1/protocol"
 	"gopkg.in/bblfsh/sdk.v1/uast"
-	"gopkg.in/src-d/enry.v1"
 	"gopkg.in/src-d/go-git.v4"
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
@@ -37,7 +36,6 @@ type Extractor struct {
 	Endpoint       string
 	Context        func() (context.Context, context.CancelFunc)
 	PoolSize       int
-	Languages      map[string]bool
 	FailOnErrors   bool
 	ProcessedFiles map[string]int
 
@@ -60,11 +58,6 @@ const (
 	// ConfigUASTFailOnErrors is the name of the configuration option (Extractor.Configure())
 	// which enables early exit in case of any Babelfish UAST parsing errors.
 	ConfigUASTFailOnErrors = "ConfigUASTFailOnErrors"
-	// ConfigUASTLanguages is the name of the configuration option (Extractor.Configure())
-	// which sets the list of languages to parse. Language names are at
-	// https://doc.bblf.sh/languages.html Names are joined with a comma ",".
-	ConfigUASTLanguages = "ConfigUASTLanguages"
-
 	// FeatureUast is the name of the Pipeline feature which activates all the items related to UAST.
 	FeatureUast = "uast"
 	// DependencyUasts is the name of the dependency provided by Extractor.
@@ -140,12 +133,7 @@ func (exr *Extractor) ListConfigurationOptions() []core.ConfigurationOption {
 		Description: "Panic if there is a UAST extraction error.",
 		Flag:        "bblfsh-fail-on-error",
 		Type:        core.BoolConfigurationOption,
-		Default:     false}, {
-		Name:        ConfigUASTLanguages,
-		Description: "Programming languages from which to extract UASTs. Separated by comma \",\".",
-		Flag:        "languages",
-		Type:        core.StringConfigurationOption,
-		Default:     "Python,Java,Go,JavaScript,Ruby,PHP"},
+		Default:     false},
 	}
 	return options[:]
 }
@@ -164,12 +152,6 @@ func (exr *Extractor) Configure(facts map[string]interface{}) {
 	if val, exists := facts[ConfigUASTPoolSize].(int); exists {
 		exr.PoolSize = val
 	}
-	if val, exists := facts[ConfigUASTLanguages].(string); exists {
-		exr.Languages = map[string]bool{}
-		for _, lang := range strings.Split(val, ",") {
-			exr.Languages[strings.TrimSpace(lang)] = true
-		}
-	}
 	if val, exists := facts[ConfigUASTFailOnErrors].(bool); exists {
 		exr.FailOnErrors = val
 	}
@@ -210,9 +192,6 @@ func (exr *Extractor) Initialize(repository *git.Repository) {
 		panic("UAST goroutine pool was not created")
 	}
 	exr.ProcessedFiles = map[string]int{}
-	if exr.Languages == nil {
-		exr.Languages = map[string]bool{}
-	}
 }
 
 // Consume runs this PipelineItem on the next commit data.
@@ -235,17 +214,11 @@ func (exr *Extractor) Consume(deps map[string]interface{}) (map[string]interface
 				return
 			}
 			defer ioutil.CheckClose(reader, &err)
-
 			buf := new(bytes.Buffer)
 			if _, err := buf.ReadFrom(reader); err != nil {
 				errs = append(errs, err)
 				return
 			}
-			lang := enry.GetLanguage(change.To.Name, buf.Bytes())
-			if _, exists := exr.Languages[lang]; !exists {
-				exr.ProcessedFiles[change.To.Name] = uastExtractionSkipped
-				return
-			}
 			exr.ProcessedFiles[change.To.Name]++
 		}
 		wg.Add(1)

+ 27 - 23
internal/plumbing/uast/uast_test.go

@@ -25,7 +25,6 @@ import (
 func fixtureUASTExtractor() *Extractor {
 	exr := Extractor{Endpoint: "0.0.0.0:9432"}
 	exr.Initialize(test.Repository)
-	exr.Languages["Python"] = true
 	return &exr
 }
 
@@ -38,12 +37,11 @@ func TestUASTExtractorMeta(t *testing.T) {
 	assert.Equal(t, exr.Requires()[0], items.DependencyTreeChanges)
 	assert.Equal(t, exr.Requires()[1], items.DependencyBlobCache)
 	opts := exr.ListConfigurationOptions()
-	assert.Len(t, opts, 5)
+	assert.Len(t, opts, 4)
 	assert.Equal(t, opts[0].Name, ConfigUASTEndpoint)
 	assert.Equal(t, opts[1].Name, ConfigUASTTimeout)
 	assert.Equal(t, opts[2].Name, ConfigUASTPoolSize)
 	assert.Equal(t, opts[3].Name, ConfigUASTFailOnErrors)
-	assert.Equal(t, opts[4].Name, ConfigUASTLanguages)
 	feats := exr.Features()
 	assert.Len(t, feats, 1)
 	assert.Equal(t, feats[0], FeatureUast)
@@ -56,15 +54,11 @@ func TestUASTExtractorConfiguration(t *testing.T) {
 	facts[ConfigUASTEndpoint] = "localhost:9432"
 	facts[ConfigUASTTimeout] = 15
 	facts[ConfigUASTPoolSize] = 7
-	facts[ConfigUASTLanguages] = "C, Go"
 	facts[ConfigUASTFailOnErrors] = true
 	exr.Configure(facts)
 	assert.Equal(t, exr.Endpoint, facts[ConfigUASTEndpoint])
 	assert.NotNil(t, exr.Context)
 	assert.Equal(t, exr.PoolSize, facts[ConfigUASTPoolSize])
-	assert.True(t, exr.Languages["C"])
-	assert.True(t, exr.Languages["Go"])
-	assert.False(t, exr.Languages["Python"])
 	assert.Equal(t, exr.FailOnErrors, true)
 }
 
@@ -79,7 +73,7 @@ func TestUASTExtractorRegistration(t *testing.T) {
 
 func TestUASTExtractorConsume(t *testing.T) {
 	exr := fixtureUASTExtractor()
-	changes := make(object.Changes, 2)
+	changes := make(object.Changes, 3)
 	// 2b1ed978194a94edeabbca6de7ff3b5771d4d665
 	treeFrom, _ := test.Repository.TreeObject(plumbing.NewHash(
 		"96c6ece9b2f3c7c51b83516400d278dea5605100"))
@@ -113,31 +107,39 @@ func TestUASTExtractorConsume(t *testing.T) {
 		},
 	},
 	}
+	changes[2] = &object.Change{From: object.ChangeEntry{}, To: object.ChangeEntry{
+		Name: "linux.png",
+		Tree: treeTo,
+		TreeEntry: object.TreeEntry{
+			Name: "linux.png",
+			Mode: 0100644,
+			Hash: plumbing.NewHash("81f2b6d1fa5357f90e9dead150cd515720897545"),
+		},
+	},
+	}
 	cache := map[plumbing.Hash]*object.Blob{}
-	hash := plumbing.NewHash("baa64828831d174f40140e4b3cfa77d1e917a2c1")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("5d78f57d732aed825764347ec6f3ab74d50d0619")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("c29112dbd697ad9b401333b80c18a63951bc18d9")
-	cache[hash], _ = test.Repository.BlobObject(hash)
-	hash = plumbing.NewHash("f7d918ec500e2f925ecde79b51cc007bac27de72")
-	cache[hash], _ = test.Repository.BlobObject(hash)
+	for _, hash := range []string{
+		"baa64828831d174f40140e4b3cfa77d1e917a2c1",
+		"5d78f57d732aed825764347ec6f3ab74d50d0619",
+		"c29112dbd697ad9b401333b80c18a63951bc18d9",
+		"f7d918ec500e2f925ecde79b51cc007bac27de72",
+		"81f2b6d1fa5357f90e9dead150cd515720897545",
+	} {
+		cache[plumbing.NewHash(hash)], _ = test.Repository.BlobObject(plumbing.NewHash(hash))
+	}
 	deps := map[string]interface{}{}
 	deps[items.DependencyBlobCache] = cache
 	deps[items.DependencyTreeChanges] = changes
 	deps[core.DependencyCommit], _ = test.Repository.CommitObject(
 		plumbing.NewHash("2b1ed978194a94edeabbca6de7ff3b5771d4d665"))
 	res, err := exr.Consume(deps)
-	// Language not enabled
-	assert.Len(t, res[DependencyUasts], 0)
+	assert.Len(t, res[DependencyUasts], 1)
 	assert.Nil(t, err)
-	exr.Languages["Go3000"] = true
 	res, err = exr.Consume(deps)
-	// No Go driver
-	assert.Len(t, res[DependencyUasts], 0)
+	assert.Len(t, res[DependencyUasts], 1)
 	assert.Nil(t, err)
 
-	hash = plumbing.NewHash("5d78f57d732aed825764347ec6f3ab74d50d0619")
+	hash := plumbing.NewHash("5d78f57d732aed825764347ec6f3ab74d50d0619")
 	changes[1] = &object.Change{From: object.ChangeEntry{}, To: object.ChangeEntry{
 		Name: "labours.py",
 		Tree: treeTo,
@@ -148,6 +150,7 @@ func TestUASTExtractorConsume(t *testing.T) {
 		},
 	},
 	}
+	deps[items.DependencyTreeChanges] = changes[:2]
 
 	res, err = exr.Consume(deps)
 	assert.Nil(t, err)
@@ -201,7 +204,7 @@ func TestUASTChangesRegistration(t *testing.T) {
 }
 
 func TestUASTChangesConsume(t *testing.T) {
-	uastsArray := []*uast.Node{}
+	var uastsArray []*uast.Node
 	uasts := map[plumbing.Hash]*uast.Node{}
 	hash := plumbing.NewHash("291286b4ac41952cbd1389fda66420ec03c1a9fe")
 	uasts[hash] = &uast.Node{}
@@ -304,6 +307,7 @@ func fixtureUASTChangesSaver() *ChangesSaver {
 func TestUASTChangesSaverMeta(t *testing.T) {
 	chs := fixtureUASTChangesSaver()
 	assert.Equal(t, chs.Name(), "UASTChangesSaver")
+	assert.True(t, len(chs.Description()) > 0)
 	assert.Equal(t, len(chs.Provides()), 0)
 	assert.Equal(t, len(chs.Requires()), 1)
 	assert.Equal(t, chs.Requires()[0], DependencyUastChanges)