123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372 |
- // +build tensorflow
- package leaves
- import (
- "fmt"
- "io"
- "os"
- "regexp"
- "sort"
- "strings"
- "github.com/gogo/protobuf/proto"
- "gopkg.in/bblfsh/sdk.v2/uast"
- "gopkg.in/bblfsh/sdk.v2/uast/nodes"
- progress "gopkg.in/cheggaaa/pb.v1"
- "gopkg.in/src-d/go-git.v4"
- "gopkg.in/src-d/go-git.v4/plumbing"
- "gopkg.in/src-d/hercules.v10/internal/core"
- "gopkg.in/src-d/hercules.v10/internal/pb"
- items "gopkg.in/src-d/hercules.v10/internal/plumbing"
- uast_items "gopkg.in/src-d/hercules.v10/internal/plumbing/uast"
- sentiment "gopkg.in/vmarkovtsev/BiDiSentiment.v1"
- )
- // CommentSentimentAnalysis measures comment sentiment through time.
- type CommentSentimentAnalysis struct {
- core.NoopMerger
- core.OneShotMergeProcessor
- MinCommentLength int
- Gap float32
- commentsByTick map[int][]string
- commitsByTick map[int][]plumbing.Hash
- xpather *uast_items.ChangesXPather
- l core.Logger
- }
- // CommentSentimentResult contains the sentiment values per tick, where 1 means very negative
- // and 0 means very positive.
- type CommentSentimentResult struct {
- EmotionsByTick map[int]float32
- CommentsByTick map[int][]string
- commitsByTick map[int][]plumbing.Hash
- }
const (
	// ConfigCommentSentimentMinLength is the name of the configuration option
	// (CommentSentimentAnalysis.Configure()) which sets the minimum comment length.
	ConfigCommentSentimentMinLength = "CommentSentiment.MinLength"
	// ConfigCommentSentimentGap is the name of the configuration option
	// (CommentSentimentAnalysis.Configure()) which sets the neutral sentiment gap.
	ConfigCommentSentimentGap = "CommentSentiment.Gap"

	// DefaultCommentSentimentCommentMinLength is the default minimum comment length.
	DefaultCommentSentimentCommentMinLength = 20
	// DefaultCommentSentimentGap is the default neutral sentiment gap.
	DefaultCommentSentimentGap float32 = 0.5

	// CommentLettersRatio is the threshold to filter impure comments which contain code.
	CommentLettersRatio = 0.6
)
// Regular expressions used by mergeComments() to clean up and filter the comments.
var (
	// filteredFirstCharRE matches a character which is neither a Latin letter nor a digit;
	// it is applied to the first character of a comment.
	filteredFirstCharRE = regexp.MustCompile("[^a-zA-Z0-9]")
	// filteredCharsRE matches runs of characters which are removed from the comment text.
	filteredCharsRE = regexp.MustCompile("[^-a-zA-Z0-9_:;,./?!#&%+*=\\n \\t()]+")
	// charsRE matches runs of Latin letters; used to measure the share of letters.
	charsRE = regexp.MustCompile("[a-zA-Z]+")
	// functionNameRE matches an identifier followed by "()" with optional leading whitespace.
	functionNameRE = regexp.MustCompile("\\s*[a-zA-Z_][a-zA-Z_0-9]*\\(\\)")
	// whitespaceRE matches runs of whitespace which are collapsed into a single space.
	whitespaceRE = regexp.MustCompile("\\s+")
	// licenseRE matches license headers: "license"/"licence" and their derivatives,
	// "copyright" and the copyright sign, case-insensitively.
	// The previous pattern began with a stray "[", which turned "[li[cs]" into a single
	// character class and made unrelated words such as "sense" or "absence" match.
	licenseRE = regexp.MustCompile("(?i)li[cs]en[cs][ei]|copyright|©")
)
- // Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
- func (sent *CommentSentimentAnalysis) Name() string {
- return "Sentiment"
- }
- // Provides returns the list of names of entities which are produced by this PipelineItem.
- // Each produced entity will be inserted into `deps` of dependent Consume()-s according
- // to this list. Also used by core.Registry to build the global map of providers.
- func (sent *CommentSentimentAnalysis) Provides() []string {
- return []string{}
- }
- // Requires returns the list of names of entities which are needed by this PipelineItem.
- // Each requested entity will be inserted into `deps` of Consume(). In turn, those
- // entities are Provides() upstream.
- func (sent *CommentSentimentAnalysis) Requires() []string {
- return []string{uast_items.DependencyUastChanges, items.DependencyTick}
- }
- // ListConfigurationOptions returns the list of changeable public properties of this PipelineItem.
- func (sent *CommentSentimentAnalysis) ListConfigurationOptions() []core.ConfigurationOption {
- options := [...]core.ConfigurationOption{{
- Name: ConfigCommentSentimentMinLength,
- Description: "Minimum length of the comment to be analyzed.",
- Flag: "min-comment-len",
- Type: core.IntConfigurationOption,
- Default: DefaultCommentSentimentCommentMinLength}, {
- Name: ConfigCommentSentimentGap,
- Description: "Sentiment value threshold, values between 0.5 - X/2 and 0.5 + x/2 will not be " +
- "considered. Must be >= 0 and < 1. The purpose is to exclude neutral comments.",
- Flag: "sentiment-gap",
- Type: core.FloatConfigurationOption,
- Default: DefaultCommentSentimentGap},
- }
- return options[:]
- }
- // Flag returns the command line switch which activates the analysis.
- func (sent *CommentSentimentAnalysis) Flag() string {
- return "sentiment"
- }
- // Description returns the text which explains what the analysis is doing.
- func (sent *CommentSentimentAnalysis) Description() string {
- return "Classifies each new or changed comment per commit as containing positive or " +
- "negative emotions. The classifier outputs a real number between 0 and 1," +
- "1 is the most positive and 0 is the most negative."
- }
- // Configure sets the properties previously published by ListConfigurationOptions().
- func (sent *CommentSentimentAnalysis) Configure(facts map[string]interface{}) error {
- if l, exists := facts[core.ConfigLogger].(core.Logger); exists {
- sent.l = l
- }
- if val, exists := facts[ConfigCommentSentimentGap]; exists {
- sent.Gap = val.(float32)
- }
- if val, exists := facts[ConfigCommentSentimentMinLength]; exists {
- sent.MinCommentLength = val.(int)
- }
- sent.validate()
- sent.commitsByTick = facts[items.FactCommitsByTick].(map[int][]plumbing.Hash)
- return nil
- }
- func (sent *CommentSentimentAnalysis) validate() {
- if sent.Gap < 0 || sent.Gap >= 1 {
- sent.l.Warnf("Sentiment gap is too big: %f => reset to the default %f",
- sent.Gap, DefaultCommentSentimentGap)
- sent.Gap = DefaultCommentSentimentGap
- }
- if sent.MinCommentLength < 10 {
- sent.l.Warnf("Comment minimum length is too small: %d => reset to the default %d",
- sent.MinCommentLength, DefaultCommentSentimentCommentMinLength)
- sent.MinCommentLength = DefaultCommentSentimentCommentMinLength
- }
- }
- // Initialize resets the temporary caches and prepares this PipelineItem for a series of Consume()
- // calls. The repository which is going to be analysed is supplied as an argument.
- func (sent *CommentSentimentAnalysis) Initialize(repository *git.Repository) error {
- sent.l = core.NewLogger()
- sent.commentsByTick = map[int][]string{}
- sent.xpather = &uast_items.ChangesXPather{XPath: "//uast:Comment"}
- sent.validate()
- sent.OneShotMergeProcessor.Initialize()
- return nil
- }
- // Consume runs this PipelineItem on the next commit data.
- // `deps` contain all the results from upstream PipelineItem-s as requested by Requires().
- // Additionally, DependencyCommit is always present there and represents the analysed *object.Commit.
- // This function returns the mapping with analysis results. The keys must be the same as
- // in Provides(). If there was an error, nil is returned.
- func (sent *CommentSentimentAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
- if !sent.ShouldConsumeCommit(deps) {
- return nil, nil
- }
- changes := deps[uast_items.DependencyUastChanges].([]uast_items.Change)
- tick := deps[items.DependencyTick].(int)
- commentNodes, _ := sent.xpather.Extract(changes)
- comments := sent.mergeComments(commentNodes)
- tickComments := sent.commentsByTick[tick]
- if tickComments == nil {
- tickComments = []string{}
- }
- tickComments = append(tickComments, comments...)
- sent.commentsByTick[tick] = tickComments
- return nil, nil
- }
- // Finalize returns the result of the analysis. Further Consume() calls are not expected.
- func (sent *CommentSentimentAnalysis) Finalize() interface{} {
- result := CommentSentimentResult{
- EmotionsByTick: map[int]float32{},
- CommentsByTick: map[int][]string{},
- commitsByTick: sent.commitsByTick,
- }
- ticks := make([]int, 0, len(sent.commentsByTick))
- for tick := range sent.commentsByTick {
- ticks = append(ticks, tick)
- }
- sort.Ints(ticks)
- var texts []string
- for _, key := range ticks {
- texts = append(texts, sent.commentsByTick[key]...)
- }
- session, err := sentiment.OpenSession()
- if err != nil {
- panic(err)
- }
- defer session.Close()
- var bar *progress.ProgressBar
- callback := func(pos int, total int) {
- if bar == nil {
- bar = progress.New(total)
- bar.Callback = func(msg string) {
- os.Stderr.WriteString("\r" + msg)
- }
- bar.NotPrint = true
- bar.ShowPercent = false
- bar.ShowSpeed = false
- bar.SetMaxWidth(80)
- bar.Start()
- }
- bar.Set(pos)
- }
- // we run the bulk evaluation in the end for efficiency
- weights, err := sentiment.EvaluateWithProgress(texts, session, callback)
- if bar != nil {
- bar.Finish()
- }
- if err != nil {
- panic(err)
- }
- pos := 0
- for _, key := range ticks {
- sum := float32(0)
- comments := make([]string, 0, len(sent.commentsByTick[key]))
- for _, comment := range sent.commentsByTick[key] {
- if weights[pos] < 0.5*(1-sent.Gap) || weights[pos] > 0.5*(1+sent.Gap) {
- sum += weights[pos]
- comments = append(comments, comment)
- }
- pos++
- }
- if len(comments) > 0 {
- result.EmotionsByTick[key] = sum / float32(len(comments))
- result.CommentsByTick[key] = comments
- }
- }
- return result
- }
- // Fork clones this PipelineItem.
- func (sent *CommentSentimentAnalysis) Fork(n int) []core.PipelineItem {
- return core.ForkSamePipelineItem(sent, n)
- }
- // Serialize converts the analysis result as returned by Finalize() to text or bytes.
- // The text format is YAML and the bytes format is Protocol Buffers.
- func (sent *CommentSentimentAnalysis) Serialize(result interface{}, binary bool, writer io.Writer) error {
- sentimentResult := result.(CommentSentimentResult)
- if binary {
- return sent.serializeBinary(&sentimentResult, writer)
- }
- sent.serializeText(&sentimentResult, writer)
- return nil
- }
- func (sent *CommentSentimentAnalysis) serializeText(result *CommentSentimentResult, writer io.Writer) {
- ticks := make([]int, 0, len(result.EmotionsByTick))
- for tick := range result.EmotionsByTick {
- ticks = append(ticks, tick)
- }
- sort.Ints(ticks)
- for _, tick := range ticks {
- commits := result.commitsByTick[tick]
- hashes := make([]string, len(commits))
- for i, hash := range commits {
- hashes[i] = hash.String()
- }
- fmt.Fprintf(writer, " %d: [%.4f, [%s], \"%s\"]\n",
- tick, result.EmotionsByTick[tick], strings.Join(hashes, ","),
- strings.Join(result.CommentsByTick[tick], "|"))
- }
- }
- func (sent *CommentSentimentAnalysis) serializeBinary(
- result *CommentSentimentResult, writer io.Writer) error {
- message := pb.CommentSentimentResults{
- SentimentByTick: map[int32]*pb.Sentiment{},
- }
- for key, val := range result.EmotionsByTick {
- commits := make([]string, len(result.commitsByTick[key]))
- for i, commit := range result.commitsByTick[key] {
- commits[i] = commit.String()
- }
- message.SentimentByTick[int32(key)] = &pb.Sentiment{
- Value: val,
- Comments: result.CommentsByTick[key],
- Commits: commits,
- }
- }
- serialized, err := proto.Marshal(&message)
- if err != nil {
- return err
- }
- writer.Write(serialized)
- return nil
- }
- func (sent *CommentSentimentAnalysis) mergeComments(extracted []nodes.Node) []string {
- var mergedComments []string
- lines := map[int][]nodes.Node{}
- for _, node := range extracted {
- pos := uast.PositionsOf(node.(nodes.Object))
- if pos.Start() == nil {
- continue
- }
- lineno := int(pos.Start().Line)
- lines[lineno] = append(lines[lineno], node)
- }
- lineNums := make([]int, 0, len(lines))
- for line := range lines {
- lineNums = append(lineNums, line)
- }
- sort.Ints(lineNums)
- var buffer []string
- for i, line := range lineNums {
- lineNodes := lines[line]
- maxEnd := line
- for _, node := range lineNodes {
- pos := uast.PositionsOf(node.(nodes.Object))
- if pos.End() != nil && maxEnd < int(pos.End().Line) {
- maxEnd = int(pos.End().Line)
- }
- token := strings.TrimSpace(string(node.(nodes.Object)["Text"].(nodes.String)))
- if token != "" {
- buffer = append(buffer, token)
- }
- }
- if i < len(lineNums)-1 && lineNums[i+1] <= maxEnd+1 {
- continue
- }
- mergedComments = append(mergedComments, strings.Join(buffer, "\n"))
- buffer = make([]string, 0, len(buffer))
- }
- // We remove unneeded chars and filter too short comments
- filteredComments := make([]string, 0, len(mergedComments))
- for _, comment := range mergedComments {
- comment = strings.TrimSpace(comment)
- if comment == "" || filteredFirstCharRE.MatchString(comment[:1]) {
- // heuristic - we discard docstrings
- continue
- }
- // heuristic - remove function names
- comment = functionNameRE.ReplaceAllString(comment, "")
- comment = filteredCharsRE.ReplaceAllString(comment, "")
- if len(comment) < sent.MinCommentLength {
- continue
- }
- // collapse whitespace
- comment = whitespaceRE.ReplaceAllString(comment, " ")
- // heuristic - number of letters must be at least 60%
- charsCount := 0
- for _, match := range charsRE.FindAllStringIndex(comment, -1) {
- charsCount += match[1] - match[0]
- }
- if charsCount < int(float32(len(comment))*CommentLettersRatio) {
- continue
- }
- // heuristic - license
- if licenseRE.MatchString(comment) {
- continue
- }
- filteredComments = append(filteredComments, comment)
- }
- return filteredComments
- }
- func init() {
- core.Registry.Register(&CommentSentimentAnalysis{})
- }
|