comment_sentiment.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372
  1. // +build tensorflow
  2. package leaves
  3. import (
  4. "fmt"
  5. "io"
  6. "os"
  7. "regexp"
  8. "sort"
  9. "strings"
  10. "github.com/gogo/protobuf/proto"
  11. "gopkg.in/bblfsh/sdk.v2/uast"
  12. "gopkg.in/bblfsh/sdk.v2/uast/nodes"
  13. progress "gopkg.in/cheggaaa/pb.v1"
  14. "gopkg.in/src-d/go-git.v4"
  15. "gopkg.in/src-d/go-git.v4/plumbing"
  16. "gopkg.in/src-d/hercules.v10/internal/core"
  17. "gopkg.in/src-d/hercules.v10/internal/pb"
  18. items "gopkg.in/src-d/hercules.v10/internal/plumbing"
  19. uast_items "gopkg.in/src-d/hercules.v10/internal/plumbing/uast"
  20. sentiment "gopkg.in/vmarkovtsev/BiDiSentiment.v1"
  21. )
// CommentSentimentAnalysis measures comment sentiment through time.
// Comments extracted from the UAST changes of every consumed commit are grouped by tick
// and classified in one batch when Finalize() is called.
type CommentSentimentAnalysis struct {
	core.NoopMerger
	core.OneShotMergeProcessor
	// MinCommentLength is the minimum length, in bytes, which a cleaned comment must have
	// to be kept. Values below 10 are reset to the default by validate().
	MinCommentLength int
	// Gap is the width of the neutral band centered at 0.5: comments whose sentiment
	// lies within [0.5 - Gap/2, 0.5 + Gap/2] are ignored in Finalize().
	// Values outside [0, 1) are reset to the default by validate().
	Gap float32
	// commentsByTick accumulates the filtered comment texts per tick during Consume().
	commentsByTick map[int][]string
	// commitsByTick is taken from the configuration facts and is passed through
	// to the result for serialization.
	commitsByTick map[int][]plumbing.Hash
	// xpather extracts the comment nodes ("//uast:Comment") from the UAST changes.
	xpather *uast_items.ChangesXPather
	// l is the logger used to report rejected configuration values.
	l core.Logger
}
// CommentSentimentResult contains the sentiment values per tick, where 1 means very negative
// and 0 means very positive.
// NOTE(review): Description() in this file states the opposite polarity (1 is the most
// positive) - confirm against the classifier which of the two is correct.
type CommentSentimentResult struct {
	// EmotionsByTick maps a tick to the mean sentiment of the comments kept for that tick.
	EmotionsByTick map[int]float32
	// CommentsByTick maps a tick to the comments which contributed to EmotionsByTick.
	CommentsByTick map[int][]string
	// commitsByTick maps a tick to commit hashes; it is only used during serialization.
	commitsByTick map[int][]plumbing.Hash
}
const (
	// ConfigCommentSentimentMinLength is the name of the configuration option
	// (CommentSentimentAnalysis.Configure()) which sets the minimum comment length.
	ConfigCommentSentimentMinLength = "CommentSentiment.MinLength"
	// ConfigCommentSentimentGap is the name of the configuration option
	// (CommentSentimentAnalysis.Configure()) which sets the neutral sentiment gap.
	ConfigCommentSentimentGap = "CommentSentiment.Gap"
	// DefaultCommentSentimentCommentMinLength is the default value of
	// CommentSentimentAnalysis.MinCommentLength.
	DefaultCommentSentimentCommentMinLength = 20
	// DefaultCommentSentimentGap is the default value of CommentSentimentAnalysis.Gap.
	DefaultCommentSentimentGap = float32(0.5)
	// CommentLettersRatio is the threshold to filter impure comments which contain code.
	CommentLettersRatio = 0.6
)
  48. var (
  49. filteredFirstCharRE = regexp.MustCompile("[^a-zA-Z0-9]")
  50. filteredCharsRE = regexp.MustCompile("[^-a-zA-Z0-9_:;,./?!#&%+*=\\n \\t()]+")
  51. charsRE = regexp.MustCompile("[a-zA-Z]+")
  52. functionNameRE = regexp.MustCompile("\\s*[a-zA-Z_][a-zA-Z_0-9]*\\(\\)")
  53. whitespaceRE = regexp.MustCompile("\\s+")
  54. licenseRE = regexp.MustCompile("(?i)[li[cs]en[cs][ei]|copyright|©")
  55. )
  56. // Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
  57. func (sent *CommentSentimentAnalysis) Name() string {
  58. return "Sentiment"
  59. }
  60. // Provides returns the list of names of entities which are produced by this PipelineItem.
  61. // Each produced entity will be inserted into `deps` of dependent Consume()-s according
  62. // to this list. Also used by core.Registry to build the global map of providers.
  63. func (sent *CommentSentimentAnalysis) Provides() []string {
  64. return []string{}
  65. }
  66. // Requires returns the list of names of entities which are needed by this PipelineItem.
  67. // Each requested entity will be inserted into `deps` of Consume(). In turn, those
  68. // entities are Provides() upstream.
  69. func (sent *CommentSentimentAnalysis) Requires() []string {
  70. return []string{uast_items.DependencyUastChanges, items.DependencyTick}
  71. }
  72. // ListConfigurationOptions returns the list of changeable public properties of this PipelineItem.
  73. func (sent *CommentSentimentAnalysis) ListConfigurationOptions() []core.ConfigurationOption {
  74. options := [...]core.ConfigurationOption{{
  75. Name: ConfigCommentSentimentMinLength,
  76. Description: "Minimum length of the comment to be analyzed.",
  77. Flag: "min-comment-len",
  78. Type: core.IntConfigurationOption,
  79. Default: DefaultCommentSentimentCommentMinLength}, {
  80. Name: ConfigCommentSentimentGap,
  81. Description: "Sentiment value threshold, values between 0.5 - X/2 and 0.5 + x/2 will not be " +
  82. "considered. Must be >= 0 and < 1. The purpose is to exclude neutral comments.",
  83. Flag: "sentiment-gap",
  84. Type: core.FloatConfigurationOption,
  85. Default: DefaultCommentSentimentGap},
  86. }
  87. return options[:]
  88. }
  89. // Flag returns the command line switch which activates the analysis.
  90. func (sent *CommentSentimentAnalysis) Flag() string {
  91. return "sentiment"
  92. }
  93. // Description returns the text which explains what the analysis is doing.
  94. func (sent *CommentSentimentAnalysis) Description() string {
  95. return "Classifies each new or changed comment per commit as containing positive or " +
  96. "negative emotions. The classifier outputs a real number between 0 and 1," +
  97. "1 is the most positive and 0 is the most negative."
  98. }
  99. // Configure sets the properties previously published by ListConfigurationOptions().
  100. func (sent *CommentSentimentAnalysis) Configure(facts map[string]interface{}) error {
  101. if l, exists := facts[core.ConfigLogger].(core.Logger); exists {
  102. sent.l = l
  103. }
  104. if val, exists := facts[ConfigCommentSentimentGap]; exists {
  105. sent.Gap = val.(float32)
  106. }
  107. if val, exists := facts[ConfigCommentSentimentMinLength]; exists {
  108. sent.MinCommentLength = val.(int)
  109. }
  110. sent.validate()
  111. sent.commitsByTick = facts[items.FactCommitsByTick].(map[int][]plumbing.Hash)
  112. return nil
  113. }
  114. func (sent *CommentSentimentAnalysis) validate() {
  115. if sent.Gap < 0 || sent.Gap >= 1 {
  116. sent.l.Warnf("Sentiment gap is too big: %f => reset to the default %f",
  117. sent.Gap, DefaultCommentSentimentGap)
  118. sent.Gap = DefaultCommentSentimentGap
  119. }
  120. if sent.MinCommentLength < 10 {
  121. sent.l.Warnf("Comment minimum length is too small: %d => reset to the default %d",
  122. sent.MinCommentLength, DefaultCommentSentimentCommentMinLength)
  123. sent.MinCommentLength = DefaultCommentSentimentCommentMinLength
  124. }
  125. }
  126. // Initialize resets the temporary caches and prepares this PipelineItem for a series of Consume()
  127. // calls. The repository which is going to be analysed is supplied as an argument.
  128. func (sent *CommentSentimentAnalysis) Initialize(repository *git.Repository) error {
  129. sent.l = core.NewLogger()
  130. sent.commentsByTick = map[int][]string{}
  131. sent.xpather = &uast_items.ChangesXPather{XPath: "//uast:Comment"}
  132. sent.validate()
  133. sent.OneShotMergeProcessor.Initialize()
  134. return nil
  135. }
  136. // Consume runs this PipelineItem on the next commit data.
  137. // `deps` contain all the results from upstream PipelineItem-s as requested by Requires().
  138. // Additionally, DependencyCommit is always present there and represents the analysed *object.Commit.
  139. // This function returns the mapping with analysis results. The keys must be the same as
  140. // in Provides(). If there was an error, nil is returned.
  141. func (sent *CommentSentimentAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  142. if !sent.ShouldConsumeCommit(deps) {
  143. return nil, nil
  144. }
  145. changes := deps[uast_items.DependencyUastChanges].([]uast_items.Change)
  146. tick := deps[items.DependencyTick].(int)
  147. commentNodes, _ := sent.xpather.Extract(changes)
  148. comments := sent.mergeComments(commentNodes)
  149. tickComments := sent.commentsByTick[tick]
  150. if tickComments == nil {
  151. tickComments = []string{}
  152. }
  153. tickComments = append(tickComments, comments...)
  154. sent.commentsByTick[tick] = tickComments
  155. return nil, nil
  156. }
  157. // Finalize returns the result of the analysis. Further Consume() calls are not expected.
  158. func (sent *CommentSentimentAnalysis) Finalize() interface{} {
  159. result := CommentSentimentResult{
  160. EmotionsByTick: map[int]float32{},
  161. CommentsByTick: map[int][]string{},
  162. commitsByTick: sent.commitsByTick,
  163. }
  164. ticks := make([]int, 0, len(sent.commentsByTick))
  165. for tick := range sent.commentsByTick {
  166. ticks = append(ticks, tick)
  167. }
  168. sort.Ints(ticks)
  169. var texts []string
  170. for _, key := range ticks {
  171. texts = append(texts, sent.commentsByTick[key]...)
  172. }
  173. session, err := sentiment.OpenSession()
  174. if err != nil {
  175. panic(err)
  176. }
  177. defer session.Close()
  178. var bar *progress.ProgressBar
  179. callback := func(pos int, total int) {
  180. if bar == nil {
  181. bar = progress.New(total)
  182. bar.Callback = func(msg string) {
  183. os.Stderr.WriteString("\r" + msg)
  184. }
  185. bar.NotPrint = true
  186. bar.ShowPercent = false
  187. bar.ShowSpeed = false
  188. bar.SetMaxWidth(80)
  189. bar.Start()
  190. }
  191. bar.Set(pos)
  192. }
  193. // we run the bulk evaluation in the end for efficiency
  194. weights, err := sentiment.EvaluateWithProgress(texts, session, callback)
  195. if bar != nil {
  196. bar.Finish()
  197. }
  198. if err != nil {
  199. panic(err)
  200. }
  201. pos := 0
  202. for _, key := range ticks {
  203. sum := float32(0)
  204. comments := make([]string, 0, len(sent.commentsByTick[key]))
  205. for _, comment := range sent.commentsByTick[key] {
  206. if weights[pos] < 0.5*(1-sent.Gap) || weights[pos] > 0.5*(1+sent.Gap) {
  207. sum += weights[pos]
  208. comments = append(comments, comment)
  209. }
  210. pos++
  211. }
  212. if len(comments) > 0 {
  213. result.EmotionsByTick[key] = sum / float32(len(comments))
  214. result.CommentsByTick[key] = comments
  215. }
  216. }
  217. return result
  218. }
  219. // Fork clones this PipelineItem.
  220. func (sent *CommentSentimentAnalysis) Fork(n int) []core.PipelineItem {
  221. return core.ForkSamePipelineItem(sent, n)
  222. }
  223. // Serialize converts the analysis result as returned by Finalize() to text or bytes.
  224. // The text format is YAML and the bytes format is Protocol Buffers.
  225. func (sent *CommentSentimentAnalysis) Serialize(result interface{}, binary bool, writer io.Writer) error {
  226. sentimentResult := result.(CommentSentimentResult)
  227. if binary {
  228. return sent.serializeBinary(&sentimentResult, writer)
  229. }
  230. sent.serializeText(&sentimentResult, writer)
  231. return nil
  232. }
  233. func (sent *CommentSentimentAnalysis) serializeText(result *CommentSentimentResult, writer io.Writer) {
  234. ticks := make([]int, 0, len(result.EmotionsByTick))
  235. for tick := range result.EmotionsByTick {
  236. ticks = append(ticks, tick)
  237. }
  238. sort.Ints(ticks)
  239. for _, tick := range ticks {
  240. commits := result.commitsByTick[tick]
  241. hashes := make([]string, len(commits))
  242. for i, hash := range commits {
  243. hashes[i] = hash.String()
  244. }
  245. fmt.Fprintf(writer, " %d: [%.4f, [%s], \"%s\"]\n",
  246. tick, result.EmotionsByTick[tick], strings.Join(hashes, ","),
  247. strings.Join(result.CommentsByTick[tick], "|"))
  248. }
  249. }
  250. func (sent *CommentSentimentAnalysis) serializeBinary(
  251. result *CommentSentimentResult, writer io.Writer) error {
  252. message := pb.CommentSentimentResults{
  253. SentimentByTick: map[int32]*pb.Sentiment{},
  254. }
  255. for key, val := range result.EmotionsByTick {
  256. commits := make([]string, len(result.commitsByTick[key]))
  257. for i, commit := range result.commitsByTick[key] {
  258. commits[i] = commit.String()
  259. }
  260. message.SentimentByTick[int32(key)] = &pb.Sentiment{
  261. Value: val,
  262. Comments: result.CommentsByTick[key],
  263. Commits: commits,
  264. }
  265. }
  266. serialized, err := proto.Marshal(&message)
  267. if err != nil {
  268. return err
  269. }
  270. writer.Write(serialized)
  271. return nil
  272. }
  273. func (sent *CommentSentimentAnalysis) mergeComments(extracted []nodes.Node) []string {
  274. var mergedComments []string
  275. lines := map[int][]nodes.Node{}
  276. for _, node := range extracted {
  277. pos := uast.PositionsOf(node.(nodes.Object))
  278. if pos.Start() == nil {
  279. continue
  280. }
  281. lineno := int(pos.Start().Line)
  282. lines[lineno] = append(lines[lineno], node)
  283. }
  284. lineNums := make([]int, 0, len(lines))
  285. for line := range lines {
  286. lineNums = append(lineNums, line)
  287. }
  288. sort.Ints(lineNums)
  289. var buffer []string
  290. for i, line := range lineNums {
  291. lineNodes := lines[line]
  292. maxEnd := line
  293. for _, node := range lineNodes {
  294. pos := uast.PositionsOf(node.(nodes.Object))
  295. if pos.End() != nil && maxEnd < int(pos.End().Line) {
  296. maxEnd = int(pos.End().Line)
  297. }
  298. token := strings.TrimSpace(string(node.(nodes.Object)["Text"].(nodes.String)))
  299. if token != "" {
  300. buffer = append(buffer, token)
  301. }
  302. }
  303. if i < len(lineNums)-1 && lineNums[i+1] <= maxEnd+1 {
  304. continue
  305. }
  306. mergedComments = append(mergedComments, strings.Join(buffer, "\n"))
  307. buffer = make([]string, 0, len(buffer))
  308. }
  309. // We remove unneeded chars and filter too short comments
  310. filteredComments := make([]string, 0, len(mergedComments))
  311. for _, comment := range mergedComments {
  312. comment = strings.TrimSpace(comment)
  313. if comment == "" || filteredFirstCharRE.MatchString(comment[:1]) {
  314. // heuristic - we discard docstrings
  315. continue
  316. }
  317. // heuristic - remove function names
  318. comment = functionNameRE.ReplaceAllString(comment, "")
  319. comment = filteredCharsRE.ReplaceAllString(comment, "")
  320. if len(comment) < sent.MinCommentLength {
  321. continue
  322. }
  323. // collapse whitespace
  324. comment = whitespaceRE.ReplaceAllString(comment, " ")
  325. // heuristic - number of letters must be at least 60%
  326. charsCount := 0
  327. for _, match := range charsRE.FindAllStringIndex(comment, -1) {
  328. charsCount += match[1] - match[0]
  329. }
  330. if charsCount < int(float32(len(comment))*CommentLettersRatio) {
  331. continue
  332. }
  333. // heuristic - license
  334. if licenseRE.MatchString(comment) {
  335. continue
  336. }
  337. filteredComments = append(filteredComments, comment)
  338. }
  339. return filteredComments
  340. }
  341. func init() {
  342. core.Registry.Register(&CommentSentimentAnalysis{})
  343. }