Pārlūkot izejas kodu

Add CommentSentimentAnalysis

Signed-off-by: Vadim Markovtsev <vadim@sourced.tech>
Vadim Markovtsev 7 gadi atpakaļ
vecāks
revīzija
f8a7048a30
7 mainītis faili ar 425 papildinājumiem un 4 dzēšanām
  1. 8 3
      .travis.yml
  2. 4 1
      Makefile
  3. 1 0
      README.md
  4. 1 0
      appveyor.yml
  5. 346 0
      comment_sentiment.go
  6. 55 0
      comment_sentiment_test.go
  7. 10 0
      pb/pb.proto

+ 8 - 3
.travis.yml

@@ -27,7 +27,7 @@ stages:
   - deploy
 
 env:
-  - PROTOC_VERSION=3.5.1
+  - PROTOC_VERSION=3.5.1 TENSORFLOW_VERSION=1.6.0
 
 before_install:
   - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-6 90
@@ -43,13 +43,15 @@ before_install:
   - pip3 install --user -r requirements.txt tensorflow
   - docker run -d --privileged -p 9432:9432 --name bblfshd bblfsh/bblfshd
   - docker exec -it bblfshd bblfshctl driver install --all
+  - curl -L "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-cpu-$(go env GOOS)-x86_64-$TENSORFLOW_VERSION.tar.gz" | sudo tar -C /usr/local -xz
+  - sudo ldconfig
 install:
   - make
 script:
   - set -e
   - go vet ./...
   - golint -set_exit_status ./...
-  - go test -v -cpu=1,2 -coverprofile=coverage.txt -covermode=count gopkg.in/src-d/hercules.v3
+  - go test -tags tensorflow -v -cpu=1,2 -coverprofile=coverage.txt -covermode=count gopkg.in/src-d/hercules.v3
   - $GOPATH/bin/hercules version
   - $GOPATH/bin/hercules --burndown --couples --quiet --pb https://github.com/src-d/hercules > 1.pb
   - cp 1.pb 2.pb
@@ -58,6 +60,7 @@ script:
   - (cd contrib/_plugin_example && make)
   - $GOPATH/bin/hercules --burndown --burndown-files --burndown-people --couples --quiet https://github.com/src-d/hercules | python3 labours.py -m all -o out --backend Agg --disable-projector
   - $GOPATH/bin/hercules --burndown --burndown-files --burndown-people --couples --quiet --pb https://github.com/src-d/hercules | python3 labours.py -f pb -m all -o out --backend Agg --disable-projector
+  - $GOPATH/bin/hercules --sentiment --quiet --languages Python https://github.com/src-d/hercules
   - set +e
 after_success:
   - bash <(curl -s https://codecov.io/bash)
@@ -74,7 +77,7 @@ jobs:
         - gzip -S .darwin_amd64.gz $GOPATH/bin/hercules
       script: skip
       install:
-        - make
+        - DISABLE_TENSORFLOW=1 make
       deploy:
         provider: releases
         api_key:
@@ -91,6 +94,8 @@ jobs:
         - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-6 90
         - wget -O protoc.zip https://github.com/google/protobuf/releases/download/v$PROTOC_VERSION/protoc-$PROTOC_VERSION-linux-x86_64.zip
         - unzip -d ~/.local protoc.zip && rm protoc.zip
+        - curl -L "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-cpu-$(go env GOOS)-x86_64-$TENSORFLOW_VERSION.tar.gz" | sudo tar -C /usr/local -xz
+        - sudo ldconfig
       script: skip
       install:
         - make

+ 4 - 1
Makefile

@@ -5,6 +5,9 @@ else
 EXE = .exe
 endif
 PKG = $(shell go env GOOS)_$(shell go env GOARCH)
+ifneq (${DISABLE_TENSORFLOW},1)
+TAGS ?= tensorflow
+endif
 
 all: ${GOPATH}/bin/hercules${EXE}
 
@@ -37,4 +40,4 @@ ${GOPATH}/pkg/$(PKG)/gopkg.in/bblfsh/client-go.v2: ${GOPATH}/src/gopkg.in/bblfsh
 	make dependencies
 
 ${GOPATH}/bin/hercules${EXE}: *.go cmd/hercules/*.go rbtree/*.go yaml/*.go toposort/*.go pb/*.go ${GOPATH}/pkg/$(PKG)/gopkg.in/bblfsh/client-go.v2 pb/pb.pb.go pb/pb_pb2.py cmd/hercules/plugin_template_source.go
-	go get -ldflags "-X gopkg.in/src-d/hercules.v3.BinaryGitHash=$(shell git rev-parse HEAD)" gopkg.in/src-d/hercules.v3/cmd/hercules
+	go get -tags "$(TAGS)" -ldflags "-X gopkg.in/src-d/hercules.v3.BinaryGitHash=$(shell git rev-parse HEAD)" gopkg.in/src-d/hercules.v3/cmd/hercules

+ 1 - 0
README.md

@@ -25,6 +25,7 @@ pip3 install -r requirements.txt
 ```
 
 Numpy and Scipy can be installed on Windows using http://www.lfd.uci.edu/~gohlke/pythonlibs/
+Linux releases require [`libtensorflow`](https://www.tensorflow.org/install/install_go).
 
 ### Build from source
 You are going to need Go (>= v1.8) and Python 2 or 3.

+ 1 - 0
appveyor.yml

@@ -18,6 +18,7 @@ build_script:
   - set PATH=%PATH:C:\Program Files\Git\usr\bin;=%
   - set PATH=C:\msys64\mingw64\bin;%PATH%
   - cd %GOPATH%\src\gopkg.in\src-d\hercules.v3
+  - set DISABLE_TENSORFLOW=1
   - make
   - 7z a c:\gopath\src\gopkg.in\src-d\hercules.v3\hercules.win64.zip %GOPATH%\bin\hercules.exe
 

+ 346 - 0
comment_sentiment.go

@@ -0,0 +1,346 @@
+// +build tensorflow
+
+package hercules
+
+import (
+	"fmt"
+	"io"
+	"log"
+	"os"
+	"regexp"
+	"sort"
+	"strings"
+
+	"github.com/gogo/protobuf/proto"
+	progress "gopkg.in/cheggaaa/pb.v1"
+	"gopkg.in/bblfsh/sdk.v1/uast"
+	"gopkg.in/src-d/go-git.v4"
+	"gopkg.in/src-d/go-git.v4/plumbing"
+	"gopkg.in/src-d/hercules.v3/pb"
+	"gopkg.in/vmarkovtsev/BiDiSentiment.v1"
+)
+
+// CommentSentimentAnalysis measures comment sentiment through time.
+//
+// Comments are gathered per day by Consume() and evaluated in bulk by Finalize().
+// MinCommentLength is the minimum length of a cleaned comment; shorter ones are
+// discarded by mergeComments(). Gap is the width of the neutral band around 0.5:
+// Finalize() ignores comments whose sentiment lies within [0.5*(1-Gap), 0.5*(1+Gap)].
+// commentsByDay accumulates the filtered comments keyed by day, commitsByDay is read
+// from the FactCommitsByDay fact in Configure(), and xpather selects the comment nodes
+// from the UAST changes.
+type CommentSentimentAnalysis struct {
+	MinCommentLength int
+	Gap              float32
+
+	commentsByDay map[int][]string
+	commitsByDay  map[int][]plumbing.Hash
+	xpather       *ChangesXPather
+}
+
+// CommentSentimentResult contains the sentiment values per day, where 1 means very negative
+// and 0 means very positive.
+//
+// EmotionsByDay maps the day to the mean sentiment of the comments which were kept by
+// Finalize(), CommentsByDay maps the day to those comments, and commitsByDay is copied
+// from the analysis so that the serializers can list the commits of each day.
+type CommentSentimentResult struct {
+	EmotionsByDay map[int]float32
+	CommentsByDay map[int][]string
+	commitsByDay  map[int][]plumbing.Hash
+}
+
+// Configuration of CommentSentimentAnalysis: the names of the facts read by Configure()
+// and the defaults which are applied when the supplied values are rejected.
+const (
+	// ConfigCommentSentimentMinLength and ConfigCommentSentimentGap are the names of the
+	// configuration options which set MinCommentLength and Gap respectively.
+	ConfigCommentSentimentMinLength = "CommentSentiment.MinLength"
+	ConfigCommentSentimentGap       = "CommentSentiment.Gap"
+
+	// DefaultCommentSentimentCommentMinLength and DefaultCommentSentimentGap are the
+	// default values of the corresponding configuration options.
+	DefaultCommentSentimentCommentMinLength = 20
+	DefaultCommentSentimentGap              = float32(0.5)
+
+	// CommentLettersRatio is the threshold to filter impure comments which contain code.
+	CommentLettersRatio = 0.6
+)
+
+// Regular expressions used by mergeComments() to clean and filter the comments:
+//   - filteredFirstCharRE matches a first character which is not alphanumeric;
+//   - filteredCharsRE matches the runs of characters which are removed from a comment;
+//   - charsRE matches the runs of ASCII letters, used to measure the share of letters;
+//   - functionNameRE matches "name()" mentions together with the leading whitespace;
+//   - whitespaceRE matches the whitespace runs which are collapsed into a single space;
+//   - licenseRE matches license and copyright notices, case-insensitively.
+//
+// licenseRE used to begin with a stray "[", which turned "[li[cs]" into a single
+// character class and made the pattern match unrelated words such as "sense" or "absence".
+var (
+	filteredFirstCharRE = regexp.MustCompile("[^a-zA-Z0-9]")
+	filteredCharsRE     = regexp.MustCompile("[^-a-zA-Z0-9_:;,./?!#&%+*=\\n \\t()]+")
+	charsRE             = regexp.MustCompile("[a-zA-Z]+")
+	functionNameRE      = regexp.MustCompile("\\s*[a-zA-Z_][a-zA-Z_0-9]*\\(\\)")
+	whitespaceRE        = regexp.MustCompile("\\s+")
+	licenseRE           = regexp.MustCompile("(?i)li[cs]en[cs][ei]|copyright|©")
+)
+
+// Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
+func (sent *CommentSentimentAnalysis) Name() string {
+	return "Sentiment"
+}
+
+// Provides lists the names of the entities produced by this PipelineItem. Those entities
+// are inserted into `deps` of the dependent Consume()-s, and hercules.Registry uses the
+// list to build the global map of providers. This analysis produces nothing, so the
+// list is empty.
+func (sent *CommentSentimentAnalysis) Provides() []string {
+	return make([]string, 0)
+}
+
+// Requires lists the names of the entities which this PipelineItem needs. Each of them
+// is inserted into `deps` of Consume() and is provided by an upstream item.
+func (sent *CommentSentimentAnalysis) Requires() []string {
+	return []string{DependencyUastChanges, DependencyDay}
+}
+
+// Features lists the features which have to be enabled so that this PipelineItem is
+// automatically inserted into the DAG.
+func (sent *CommentSentimentAnalysis) Features() []string {
+	return []string{FeatureUast}
+}
+
+// ListConfigurationOptions describes the public properties of this PipelineItem which
+// can be changed: the minimum comment length and the neutral sentiment gap.
+func (sent *CommentSentimentAnalysis) ListConfigurationOptions() []ConfigurationOption {
+	minLength := ConfigurationOption{
+		Name:        ConfigCommentSentimentMinLength,
+		Description: "Minimum length of the comment to be analyzed.",
+		Flag:        "min-comment-len",
+		Type:        IntConfigurationOption,
+		Default:     DefaultCommentSentimentCommentMinLength,
+	}
+	gap := ConfigurationOption{
+		Name: ConfigCommentSentimentGap,
+		Description: "Sentiment value threshold, values between 0.5 - X/2 and 0.5 + x/2 will not be " +
+			"considered. Must be >= 0 and < 1. The purpose is to exclude neutral comments.",
+		Flag:    "sentiment-gap",
+		Type:    FloatConfigurationOption,
+		Default: DefaultCommentSentimentGap,
+	}
+	return []ConfigurationOption{minLength, gap}
+}
+
+// Flag returns the command line switch which activates the analysis.
+func (sent *CommentSentimentAnalysis) Flag() string {
+	return "sentiment"
+}
+
+// Configure sets the properties previously published by ListConfigurationOptions().
+//
+// A gap outside of [0, 1) and a minimum comment length below 10 are reported through
+// the logger and replaced with the defaults. The commits-by-day mapping is taken from
+// the FactCommitsByDay fact when it is present and has the expected type; otherwise the
+// previous value is kept instead of panicking on a failed type assertion.
+func (sent *CommentSentimentAnalysis) Configure(facts map[string]interface{}) {
+	if val, exists := facts[ConfigCommentSentimentGap]; exists {
+		sent.Gap = val.(float32)
+		if sent.Gap < 0 || sent.Gap >= 1 {
+			// the value may be negative as well as too big, so report the valid range
+			log.Printf("Sentiment gap is out of range [0, 1): %f => reset to the default %f",
+				sent.Gap, DefaultCommentSentimentGap)
+			sent.Gap = DefaultCommentSentimentGap
+		}
+	}
+	if val, exists := facts[ConfigCommentSentimentMinLength]; exists {
+		sent.MinCommentLength = val.(int)
+		if sent.MinCommentLength < 10 {
+			log.Printf("Comment minimum length is too small: %d => reset to the default %d",
+				sent.MinCommentLength, DefaultCommentSentimentCommentMinLength)
+			sent.MinCommentLength = DefaultCommentSentimentCommentMinLength
+		}
+	}
+	if commits, exists := facts[FactCommitsByDay].(map[int][]plumbing.Hash); exists {
+		sent.commitsByDay = commits
+	}
+}
+
+// Initialize prepares this PipelineItem for a series of Consume() calls: it empties the
+// per-day comment cache and creates the XPath extractor of the comment nodes.
+// The repository which is going to be analysed is supplied as an argument.
+func (sent *CommentSentimentAnalysis) Initialize(repository *git.Repository) {
+	sent.xpather = &ChangesXPather{XPath: "//*[@roleComment]"}
+	sent.commentsByDay = make(map[int][]string)
+}
+
+// Consume processes the next commit. `deps` carries the results of the upstream
+// PipelineItem-s listed in Requires(); "commit" is always present there as well and
+// represents the analysed *object.Commit.
+// The comments found in the changed UASTs are merged, filtered and appended to the
+// comments of the commit's day. Nothing is provided downstream, so the returned mapping
+// is always nil, and so is the error.
+func (sent *CommentSentimentAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
+	uastChanges := deps[DependencyUastChanges].([]UASTChange)
+	day := deps[DependencyDay].(int)
+	extracted := sent.mergeComments(sent.xpather.Extract(uastChanges))
+	collected := sent.commentsByDay[day]
+	if collected == nil {
+		collected = []string{}
+	}
+	sent.commentsByDay[day] = append(collected, extracted...)
+	return nil, nil
+}
+
+// Finalize returns the result of the analysis. Further Consume() calls are not expected.
+//
+// All the collected comments are evaluated in a single batch. The comments whose
+// sentiment lies inside the neutral band [0.5*(1-Gap), 0.5*(1+Gap)] are ignored; the
+// remaining ones are averaged per day. Days without any remaining comment are absent
+// from the result. The function panics if the sentiment session cannot be opened or
+// if the evaluation fails.
+func (sent *CommentSentimentAnalysis) Finalize() interface{} {
+	result := CommentSentimentResult{
+		EmotionsByDay: map[int]float32{},
+		CommentsByDay: map[int][]string{},
+		commitsByDay:  sent.commitsByDay,
+	}
+	// flatten the comments in the ascending day order; the same order is used below
+	// to map the evaluated weights back to the days
+	texts := []string{}
+	days := make([]int, 0, len(sent.commentsByDay))
+	for day := range sent.commentsByDay {
+		days = append(days, day)
+	}
+	sort.Ints(days)
+	for _, key := range days {
+		for _, val := range sent.commentsByDay[key] {
+			texts = append(texts, val)
+		}
+	}
+	session, err := sentiment.OpenSession()
+	if err != nil {
+		panic(err)
+	}
+	defer session.Close()
+	// the progress bar is created lazily on the first callback, when the total is known
+	var bar *progress.ProgressBar
+	callback := func(pos int, total int) {
+		if bar == nil {
+			bar = progress.New(total)
+			// redraw the bar in place on stderr
+			bar.Callback = func(msg string) {
+				os.Stderr.WriteString("\r" + msg)
+			}
+			bar.NotPrint = true
+			bar.ShowPercent = false
+			bar.ShowSpeed = false
+			bar.SetMaxWidth(80)
+			bar.Start()
+		}
+		bar.Set(pos)
+	}
+	// we run the bulk evaluation in the end for efficiency
+	weights, err := sentiment.EvaluateWithProgress(texts, session, callback)
+	if bar != nil {
+		bar.Finish()
+	}
+	if err != nil {
+		panic(err)
+	}
+	// pos indexes weights; it advances for every comment, kept or not, so that it stays
+	// aligned with the order of texts
+	pos := 0
+	for _, key := range days {
+		sum := float32(0)
+		comments := make([]string, 0, len(sent.commentsByDay[key]))
+		for _, comment := range sent.commentsByDay[key] {
+			// keep only the comments outside of the neutral band
+			if weights[pos] < 0.5*(1-sent.Gap) || weights[pos] > 0.5*(1+sent.Gap) {
+				sum += weights[pos]
+				comments = append(comments, comment)
+			}
+			pos++
+		}
+		if len(comments) > 0 {
+			result.EmotionsByDay[key] = sum / float32(len(comments))
+			result.CommentsByDay[key] = comments
+		}
+	}
+	return result
+}
+
+// Serialize writes the analysis result, as returned by Finalize(), to writer.
+// The output is YAML text unless binary is set, in which case it is Protocol Buffers.
+// Only the binary serialization can report an error.
+func (sent *CommentSentimentAnalysis) Serialize(result interface{}, binary bool, writer io.Writer) error {
+	typed := result.(CommentSentimentResult)
+	if !binary {
+		sent.serializeText(&typed, writer)
+		return nil
+	}
+	return sent.serializeBinary(&typed, writer)
+}
+
+// serializeText writes the result as YAML, one line per day in the ascending day order:
+// the mean sentiment, the comma-separated commit hashes of that day and the retained
+// comments joined with "|".
+// The commits are read from the result itself, consistently with serializeBinary(), so
+// that the output does not depend on the state of the receiver.
+func (sent *CommentSentimentAnalysis) serializeText(result *CommentSentimentResult, writer io.Writer) {
+	days := make([]int, 0, len(result.EmotionsByDay))
+	for day := range result.EmotionsByDay {
+		days = append(days, day)
+	}
+	// map iteration order is random, sort for a deterministic output
+	sort.Ints(days)
+	for _, day := range days {
+		commits := result.commitsByDay[day]
+		hashes := make([]string, len(commits))
+		for i, hash := range commits {
+			hashes[i] = hash.String()
+		}
+		fmt.Fprintf(writer, "  %d: [%.4f, [%s], \"%s\"]\n",
+			day, result.EmotionsByDay[day], strings.Join(hashes, ","),
+			strings.Join(result.CommentsByDay[day], "|"))
+	}
+}
+
+// serializeBinary writes the result as a Protocol Buffers pb.CommentSentimentResults
+// message: each day is mapped to its mean sentiment, the retained comments and the
+// hashes of the commits made on that day.
+// It returns the marshalling error or the error of writing to writer, if any.
+func (sent *CommentSentimentAnalysis) serializeBinary(
+	result *CommentSentimentResult, writer io.Writer) error {
+	message := pb.CommentSentimentResults{
+		SentimentByDay: map[int32]*pb.Sentiment{},
+	}
+	for key, val := range result.EmotionsByDay {
+		commits := make([]string, len(result.commitsByDay[key]))
+		for i, commit := range result.commitsByDay[key] {
+			commits[i] = commit.String()
+		}
+		message.SentimentByDay[int32(key)] = &pb.Sentiment{
+			Value:    val,
+			Comments: result.CommentsByDay[key],
+			Commits:  commits,
+		}
+	}
+	serialized, err := proto.Marshal(&message)
+	if err != nil {
+		return err
+	}
+	// a failed write must not be reported as a success
+	_, err = writer.Write(serialized)
+	return err
+}
+
+// mergeComments joins the comment nodes which occupy adjacent lines into single
+// comments and returns those which pass the filtering heuristics: the first character
+// must be alphanumeric, the cleaned text must contain at least MinCommentLength bytes,
+// ASCII letters must make up at least CommentLettersRatio of the text, and the text
+// must not look like a license or copyright notice.
+func (sent *CommentSentimentAnalysis) mergeComments(nodes []*uast.Node) []string {
+	mergedComments := []string{}
+	// group the nodes by the line they start at
+	// NOTE(review): StartPosition is dereferenced without a nil check, unlike
+	// EndPosition below - confirm that comment nodes always carry a start position.
+	lines := map[int][]*uast.Node{}
+	for _, node := range nodes {
+		lineno := int(node.StartPosition.Line)
+		subnodes := lines[lineno]
+		if subnodes == nil {
+			subnodes = []*uast.Node{}
+		}
+		subnodes = append(subnodes, node)
+		lines[lineno] = subnodes
+	}
+	lineNums := make([]int, 0, len(lines))
+	for line := range lines {
+		lineNums = append(lineNums, line)
+	}
+	sort.Ints(lineNums)
+	// buffer holds the tokens of the comment which is currently being assembled
+	buffer := []string{}
+	for i, line := range lineNums {
+		lineNodes := lines[line]
+		// maxEnd is the last line covered by the nodes which start at this line
+		maxEnd := line
+		for _, node := range lineNodes {
+			if node.EndPosition != nil && maxEnd < int(node.EndPosition.Line) {
+				maxEnd = int(node.EndPosition.Line)
+			}
+			token := strings.TrimSpace(node.Token)
+			if token != "" {
+				buffer = append(buffer, token)
+			}
+		}
+		// the next group starts not later than on the following line => same comment
+		if i < len(lineNums)-1 && lineNums[i+1] <= maxEnd+1 {
+			continue
+		}
+		mergedComments = append(mergedComments, strings.Join(buffer, "\n"))
+		buffer = buffer[:0]
+	}
+	// We remove unneeded chars and filter too short comments
+	filteredComments := make([]string, 0, len(mergedComments))
+	for _, comment := range mergedComments {
+		comment = strings.TrimSpace(comment)
+		// comment[:1] is the first byte, not the first rune
+		if comment == "" || filteredFirstCharRE.MatchString(comment[:1]) {
+			// heuristic - we discard docstrings
+			continue
+		}
+		// heuristic - remove function names
+		comment = functionNameRE.ReplaceAllString(comment, "")
+		comment = filteredCharsRE.ReplaceAllString(comment, "")
+		// the length is checked before the whitespace is collapsed
+		if len(comment) < sent.MinCommentLength {
+			continue
+		}
+		// collapse whitespace
+		comment = whitespaceRE.ReplaceAllString(comment, " ")
+		// heuristic - number of letters must be at least 60%
+		charsCount := 0
+		for _, match := range charsRE.FindAllStringIndex(comment, -1) {
+			charsCount += match[1] - match[0]
+		}
+		if charsCount < int(float32(len(comment))*CommentLettersRatio) {
+			continue
+		}
+		// heuristic - license
+		if licenseRE.MatchString(comment) {
+			continue
+		}
+		filteredComments = append(filteredComments, comment)
+	}
+	return filteredComments
+}
+
+// init registers CommentSentimentAnalysis in the global registry of pipeline items.
+func init() {
+	Registry.Register(new(CommentSentimentAnalysis))
+}

+ 55 - 0
comment_sentiment_test.go

@@ -0,0 +1,55 @@
+// +build tensorflow
+
+package hercules
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"gopkg.in/src-d/go-git.v4/plumbing"
+)
+
+// TestCommentSentimentMeta checks the static metadata of CommentSentimentAnalysis:
+// the name, the provided and required entities, the configuration options, the command
+// line flag and the features. The expected value goes first in assert.Equal so that
+// failure messages label the values correctly.
+func TestCommentSentimentMeta(t *testing.T) {
+	sent := CommentSentimentAnalysis{}
+	assert.Equal(t, "Sentiment", sent.Name())
+	assert.Len(t, sent.Provides(), 0)
+	required := [...]string{DependencyUastChanges, DependencyDay}
+	for _, name := range required {
+		assert.Contains(t, sent.Requires(), name)
+	}
+	opts := sent.ListConfigurationOptions()
+	// every listed option must be one of the known ones
+	matches := 0
+	for _, opt := range opts {
+		switch opt.Name {
+		case ConfigCommentSentimentMinLength, ConfigCommentSentimentGap:
+			matches++
+		}
+	}
+	assert.Len(t, opts, matches)
+	assert.Equal(t, "sentiment", sent.Flag())
+	assert.Len(t, sent.Features(), 1)
+	assert.Equal(t, FeatureUast, sent.Features()[0])
+}
+
+// TestCommentSentimentConfigure checks that Configure() applies valid option values and
+// resets invalid ones to the defaults. The expected value goes first in assert.Equal so
+// that failure messages label the values correctly.
+func TestCommentSentimentConfigure(t *testing.T) {
+	sent := CommentSentimentAnalysis{}
+	facts := map[string]interface{}{}
+	facts[ConfigCommentSentimentMinLength] = 77
+	facts[ConfigCommentSentimentGap] = float32(0.77)
+	facts[FactCommitsByDay] = map[int][]plumbing.Hash{}
+	sent.Configure(facts)
+	assert.Equal(t, float32(0.77), sent.Gap)
+	assert.Equal(t, 77, sent.MinCommentLength)
+	// out-of-range values must fall back to the defaults
+	facts[ConfigCommentSentimentMinLength] = -10
+	facts[ConfigCommentSentimentGap] = float32(2)
+	sent.Configure(facts)
+	assert.Equal(t, DefaultCommentSentimentGap, sent.Gap)
+	assert.Equal(t, DefaultCommentSentimentCommentMinLength, sent.MinCommentLength)
+}
+
+// TestCommentSentimentRegistration checks that CommentSentimentAnalysis is registered
+// in the global Registry both by its name and by its command line flag. The expected
+// value goes first in assert.Equal so that failure messages label the values correctly.
+func TestCommentSentimentRegistration(t *testing.T) {
+	tp, exists := Registry.registered[(&CommentSentimentAnalysis{}).Name()]
+	assert.True(t, exists)
+	assert.Equal(t, "CommentSentimentAnalysis", tp.Elem().Name())
+	tp, exists = Registry.flags[(&CommentSentimentAnalysis{}).Flag()]
+	assert.True(t, exists)
+	assert.Equal(t, "CommentSentimentAnalysis", tp.Elem().Name())
+}

+ 10 - 0
pb/pb.proto

@@ -105,6 +105,16 @@ message FileHistoryResultMessage {
     map<string, FileHistory> files = 1;
 }
 
+// Sentiment is the comment sentiment of a single day, as written by
+// CommentSentimentAnalysis.
+message Sentiment {
+    // mean sentiment of the retained comments
+    float value = 1;
+    // the retained comments
+    repeated string comments = 2;
+    // hashes of the commits made on that day
+    repeated string commits = 3;
+}
+
+// CommentSentimentResults is the binary form of the CommentSentimentAnalysis result.
+message CommentSentimentResults {
+    // day index -> sentiment of that day
+    map<int32, Sentiment> sentiment_by_day = 1;
+}
+
 message AnalysisResults {
     Metadata header = 1;
     // the mapped values are dynamic messages which require the second parsing pass.