Browse Source

Fix the problem with missing positions in --typos-dataset

Signed-off-by: Vadim Markovtsev <vadim@sourced.tech>
Vadim Markovtsev 6 years ago
parent
commit
92dc2f9fe2
4 changed files with 102 additions and 32 deletions
  1. 8 0
      internal/core/pipeline.go
  2. 1 3
      internal/plumbing/ticks.go
  3. 37 23
      leaves/research/typos.go
  4. 56 6
      leaves/research/typos_test.go

+ 8 - 0
internal/core/pipeline.go

@@ -894,3 +894,11 @@ func LoadCommitsFromFile(path string, repository *git.Repository) ([]*object.Com
 	}
 	return commits, nil
 }
+
+// GetSensibleRemote extracts a remote URL of the repository to identify it.
+func GetSensibleRemote(repository *git.Repository) string {
+	if r, err := repository.Remotes(); err == nil && len(r) > 0 {
+		return r[0].Config().URLs[0]
+	}
+	return "<no remote>"
+}

+ 1 - 3
internal/plumbing/ticks.go

@@ -103,9 +103,7 @@ func (ticks *TicksSinceStart) Initialize(repository *git.Repository) error {
 			delete(ticks.commits, key)
 		}
 	}
-	if r, err := repository.Remotes(); err == nil && len(r) > 0 {
-		ticks.remote = r[0].Config().URLs[0]
-	}
+	ticks.remote = core.GetSensibleRemote(repository)
 	return nil
 }
 

+ 37 - 23
leaves/research/typos.go

@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"fmt"
 	"io"
+	"log"
 	"unicode/utf8"
 
 	"github.com/gogo/protobuf/proto"
@@ -34,6 +35,8 @@ type TyposDatasetBuilder struct {
 	lcontext *levenshtein.Context
 	// xpather filters identifiers.
 	xpather uast_items.ChangesXPather
+	// remote carries the repository remote URL (for debugging)
+	remote string
 }
 
 // TyposResult is returned by TyposDatasetBuilder.Finalize() and carries the found typo-fix
@@ -121,6 +124,7 @@ func (tdb *TyposDatasetBuilder) Initialize(repository *git.Repository) error {
 	}
 	tdb.lcontext = &levenshtein.Context{}
 	tdb.xpather.XPath = "//uast:Identifier"
+	tdb.remote = core.GetSensibleRemote(repository)
 	return nil
 }
 
@@ -150,33 +154,35 @@ func (tdb *TyposDatasetBuilder) Consume(deps map[string]interface{}) (map[string
 		linesAfter := bytes.Split(cache[change.Change.To.TreeEntry.Hash].Data, []byte{'\n'})
 		diff := diffs[change.Change.To.Name]
 		var lineNumBefore, lineNumAfter int
-		clear := false
 		var candidates []candidate
 		focusedLinesBefore := map[int]bool{}
 		focusedLinesAfter := map[int]bool{}
+		removedSize := 0
 		for _, edit := range diff.Diffs {
 			size := utf8.RuneCountInString(edit.Text)
 			switch edit.Type {
 			case diffmatchpatch.DiffDelete:
 				lineNumBefore += size
-				clear = size == 1
+				removedSize = size
 			case diffmatchpatch.DiffInsert:
-				if size == 1 && clear {
-					dist := tdb.lcontext.Distance(
-						string(linesBefore[lineNumBefore-1]),
-						string(linesAfter[lineNumAfter]))
-					if dist <= tdb.MaximumAllowedDistance {
-						candidates = append(candidates, candidate{lineNumBefore - 1, lineNumAfter})
-						focusedLinesBefore[lineNumBefore-1] = true
-						focusedLinesAfter[lineNumAfter] = true
+				if size == removedSize {
+					for i := 0; i < size; i++ {
+						lb := lineNumBefore - size + i
+						la := lineNumAfter + i
+						dist := tdb.lcontext.Distance(string(linesBefore[lb]), string(linesAfter[la]))
+						if dist <= tdb.MaximumAllowedDistance {
+							candidates = append(candidates, candidate{lb, la})
+							focusedLinesBefore[lb] = true
+							focusedLinesAfter[la] = true
+						}
 					}
 				}
 				lineNumAfter += size
-				clear = false
+				removedSize = 0
 			case diffmatchpatch.DiffEqual:
 				lineNumBefore += size
 				lineNumAfter += size
-				clear = false
+				removedSize = 0
 			}
 		}
 		if len(candidates) == 0 {
@@ -190,25 +196,33 @@ func (tdb *TyposDatasetBuilder) Consume(deps map[string]interface{}) (map[string
 		removedIdentifiers := map[int][]nodes.Node{}
 		for _, n := range nodesAdded {
 			pos := uast.PositionsOf(n.(nodes.Object))
-			if pos.Start() != nil {
-				line := int(pos.Start().Line) - 1
-				if focusedLinesAfter[line] {
-					addedIdentifiers[line] = append(addedIdentifiers[line], n)
-				}
+			if pos.Start() == nil {
+				log.Printf("repo %s commit %s file %s adds identifier %s with no position",
+					tdb.remote, commit.String(), change.Change.To.Name,
+					n.(nodes.Object)["Name"].(nodes.String))
+				continue
+			}
+			line := int(pos.Start().Line) - 1
+			if focusedLinesAfter[line] {
+				addedIdentifiers[line] = append(addedIdentifiers[line], n)
 			}
 		}
 		for _, n := range nodesRemoved {
 			pos := uast.PositionsOf(n.(nodes.Object))
+			if pos.Start() == nil {
+				log.Printf("repo %s commit %s file %s removes identifier %s with no position",
+					tdb.remote, commit.String(), change.Change.To.Name,
+					n.(nodes.Object)["Name"].(nodes.String))
+				continue
+			}
 			line := int(pos.Start().Line) - 1
-			if pos.Start() != nil {
-				if focusedLinesBefore[line] {
-					removedIdentifiers[line] = append(removedIdentifiers[line], n)
-				}
+			if focusedLinesBefore[line] {
+				removedIdentifiers[line] = append(removedIdentifiers[line], n)
 			}
 		}
 		for _, c := range candidates {
-			nodesBefore := addedIdentifiers[c.Before]
-			nodesAfter := removedIdentifiers[c.After]
+			nodesBefore := removedIdentifiers[c.Before]
+			nodesAfter := addedIdentifiers[c.After]
 			if len(nodesBefore) == 1 && len(nodesAfter) == 1 {
 				idBefore := string(nodesBefore[0].(nodes.Object)["Name"].(nodes.String))
 				idAfter := string(nodesAfter[0].(nodes.Object)["Name"].(nodes.String))

+ 56 - 6
leaves/research/typos_test.go

@@ -8,6 +8,9 @@ import (
 
 	"github.com/gogo/protobuf/proto"
 	"github.com/stretchr/testify/assert"
+	"gopkg.in/bblfsh/client-go.v3/tools"
+	"gopkg.in/bblfsh/sdk.v2/uast"
+	"gopkg.in/bblfsh/sdk.v2/uast/nodes"
 	"gopkg.in/src-d/go-git.v4/plumbing"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
 	"gopkg.in/src-d/hercules.v10/internal/core"
@@ -69,7 +72,7 @@ func AddHash(t *testing.T, cache map[plumbing.Hash]*items.CachedBlob, hash strin
 	cache[objhash] = cb
 }
 
-func TestTyposDatasetConsume(t *testing.T) {
+func bakeTyposDeps(t *testing.T) map[string]interface{} {
 	deps := map[string]interface{}{}
 	cache := map[plumbing.Hash]*items.CachedBlob{}
 	AddHash(t, cache, "b9a12fd144274c99c7c9a0a32a0268f8b36d2f2c")
@@ -133,25 +136,72 @@ func TestTyposDatasetConsume(t *testing.T) {
 	diffResult, err := fd.Consume(deps)
 	assert.Nil(t, err)
 	deps[items.DependencyFileDiff] = diffResult[items.DependencyFileDiff]
+	return deps
+}
 
+func TestTyposDatasetConsume(t *testing.T) {
+	deps := bakeTyposDeps(t)
 	tbd := &TyposDatasetBuilder{}
 	assert.Nil(t, tbd.Initialize(test.Repository))
 	res, err := tbd.Consume(deps)
 	assert.Nil(t, res)
 	assert.Nil(t, err)
-	assert.Len(t, tbd.typos, 4)
-	assert.Equal(t, tbd.typos[0].Wrong, "TestZeroInitializeFile")
-	assert.Equal(t, tbd.typos[0].Correct, "TestZeroInitialize")
+	assert.Len(t, tbd.typos, 26)
+	assert.Equal(t, tbd.typos[0].Wrong, "TestInitialize")
+	assert.Equal(t, tbd.typos[0].Correct, "TestInitializeFile")
 	assert.Equal(t, tbd.typos[0].Commit, plumbing.NewHash(
 		"84165d3b02647fae12cc026c7a580045246e8c98"))
 	assert.Equal(t, tbd.typos[0].File, "file_test.go")
-	assert.Equal(t, tbd.typos[0].Line, 74)
+	assert.Equal(t, tbd.typos[0].Line, 19)
 
 	deps[core.DependencyIsMerge] = true
 	res, err = tbd.Consume(deps)
 	assert.Nil(t, res)
 	assert.Nil(t, err)
-	assert.Len(t, tbd.typos, 4)
+	assert.Len(t, tbd.typos, 26)
+}
+
+func dropPositions(root nodes.Node, target string) {
+	for element := range tools.Iterate(tools.NewIterator(root, tools.PreOrder)) {
+		obj, isobj := element.(nodes.Object)
+		if !isobj {
+			continue
+		}
+		nameval, exists := obj["Name"]
+		if !exists {
+			continue
+		}
+		if name, isstr := nameval.(nodes.String); !isstr || string(name) != target {
+			continue
+		}
+		posval, exists := obj[uast.KeyPos]
+		if !exists {
+			continue
+		}
+		posobj, isobj := posval.(nodes.Object)
+		if !isobj {
+			continue
+		}
+		for k, v := range posobj {
+			po, _ := v.(nodes.Object)
+			if uast.AsPosition(po) != nil {
+				posobj[k] = nil
+			}
+		}
+	}
+}
+
+func TestTyposDatasetConsumeMissingPosition(t *testing.T) {
+	deps := bakeTyposDeps(t)
+	uastChanges := deps[uast_items.DependencyUastChanges].([]uast_items.Change)
+	dropPositions(uastChanges[0].Before, "TestZeroInitialize")
+	dropPositions(uastChanges[0].After, "TestZeroInitializeFile")
+	tbd := &TyposDatasetBuilder{}
+	assert.Nil(t, tbd.Initialize(test.Repository))
+	res, err := tbd.Consume(deps)
+	assert.Nil(t, res)
+	assert.Nil(t, err)
+	assert.Len(t, tbd.typos, 25)
 }
 
 func fixtureTyposDataset() *TyposDatasetBuilder {