comment_sentiment.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376
  1. // +build tensorflow
  2. package leaves
  3. import (
  4. "fmt"
  5. "io"
  6. "log"
  7. "os"
  8. "regexp"
  9. "sort"
  10. "strings"
  11. "github.com/gogo/protobuf/proto"
  12. "gopkg.in/bblfsh/sdk.v1/uast"
  13. progress "gopkg.in/cheggaaa/pb.v1"
  14. "gopkg.in/src-d/go-git.v4"
  15. "gopkg.in/src-d/go-git.v4/plumbing"
  16. "gopkg.in/src-d/hercules.v7/internal/core"
  17. "gopkg.in/src-d/hercules.v7/internal/pb"
  18. items "gopkg.in/src-d/hercules.v7/internal/plumbing"
  19. uast_items "gopkg.in/src-d/hercules.v7/internal/plumbing/uast"
  20. "gopkg.in/vmarkovtsev/BiDiSentiment.v1"
  21. )
// CommentSentimentAnalysis measures comment sentiment through time.
// It gathers the comments extracted from UAST changes per day in Consume() and
// scores all of them in one batch in Finalize().
type CommentSentimentAnalysis struct {
	core.NoopMerger
	core.OneShotMergeProcessor
	// MinCommentLength is the minimum length in bytes, measured after the character
	// filtering, which a merged comment must have to be kept by mergeComments().
	// validate() resets values below 10 to DefaultCommentSentimentCommentMinLength.
	MinCommentLength int
	// Gap is the width of the neutral band around 0.5: Finalize() ignores comments
	// whose score lies within [0.5*(1-Gap), 0.5*(1+Gap)].
	// validate() resets values outside of [0, 1) to DefaultCommentSentimentGap.
	Gap float32
	// commentsByDay maps the day index to the filtered comments collected by Consume().
	commentsByDay map[int][]string
	// commitsByDay maps the day index to the commit hashes; it is taken from
	// facts[items.FactCommitsByDay] in Configure().
	commitsByDay map[int][]plumbing.Hash
	// xpather extracts the comment nodes ("//*[@roleComment]") from the UAST changes;
	// it is created in Initialize().
	xpather *uast_items.ChangesXPather
}
// CommentSentimentResult contains the sentiment values per day, where 1 means very negative
// and 0 means very positive.
// NOTE(review): CommentSentimentAnalysis.Description() states the opposite polarity
// (1 is the most positive) - confirm against the classifier and align the two texts.
type CommentSentimentResult struct {
	// EmotionsByDay maps the day index to the mean score of the comments which fell
	// outside of the neutral gap on that day. Days without such comments are absent.
	EmotionsByDay map[int]float32
	// CommentsByDay maps the day index to the comments which contributed to EmotionsByDay.
	CommentsByDay map[int][]string
	// commitsByDay maps the day index to the commit hashes; it is only used by the
	// serializers to list the commits next to each day.
	commitsByDay map[int][]plumbing.Hash
}
const (
	// ConfigCommentSentimentMinLength is the name of the configuration option
	// (CommentSentimentAnalysis.Configure()) which sets the minimum length of the
	// comments to analyze.
	ConfigCommentSentimentMinLength = "CommentSentiment.MinLength"
	// ConfigCommentSentimentGap is the name of the configuration option
	// (CommentSentimentAnalysis.Configure()) which sets the width of the neutral
	// sentiment band that is excluded from the results.
	ConfigCommentSentimentGap = "CommentSentiment.Gap"
	// DefaultCommentSentimentCommentMinLength is the default minimum comment length.
	// It is also the value restored when the configured length is below 10.
	DefaultCommentSentimentCommentMinLength = 20
	// DefaultCommentSentimentGap is the default sentiment gap. It is also the value
	// restored when the configured gap is outside of [0, 1).
	DefaultCommentSentimentGap = float32(0.5)
	// CommentLettersRatio is the threshold to filter impure comments which contain code.
	// A comment is dropped when fewer than this fraction of its bytes are ASCII letters.
	CommentLettersRatio = 0.6
)
  47. var (
  48. filteredFirstCharRE = regexp.MustCompile("[^a-zA-Z0-9]")
  49. filteredCharsRE = regexp.MustCompile("[^-a-zA-Z0-9_:;,./?!#&%+*=\\n \\t()]+")
  50. charsRE = regexp.MustCompile("[a-zA-Z]+")
  51. functionNameRE = regexp.MustCompile("\\s*[a-zA-Z_][a-zA-Z_0-9]*\\(\\)")
  52. whitespaceRE = regexp.MustCompile("\\s+")
  53. licenseRE = regexp.MustCompile("(?i)[li[cs]en[cs][ei]|copyright|©")
  54. )
  55. // Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
  56. func (sent *CommentSentimentAnalysis) Name() string {
  57. return "Sentiment"
  58. }
  59. // Provides returns the list of names of entities which are produced by this PipelineItem.
  60. // Each produced entity will be inserted into `deps` of dependent Consume()-s according
  61. // to this list. Also used by core.Registry to build the global map of providers.
  62. func (sent *CommentSentimentAnalysis) Provides() []string {
  63. return []string{}
  64. }
  65. // Requires returns the list of names of entities which are needed by this PipelineItem.
  66. // Each requested entity will be inserted into `deps` of Consume(). In turn, those
  67. // entities are Provides() upstream.
  68. func (sent *CommentSentimentAnalysis) Requires() []string {
  69. arr := [...]string{uast_items.DependencyUastChanges, items.DependencyDay}
  70. return arr[:]
  71. }
  72. // Features which must be enabled for this PipelineItem to be automatically inserted into the DAG.
  73. func (sent *CommentSentimentAnalysis) Features() []string {
  74. arr := [...]string{uast_items.FeatureUast}
  75. return arr[:]
  76. }
  77. // ListConfigurationOptions returns the list of changeable public properties of this PipelineItem.
  78. func (sent *CommentSentimentAnalysis) ListConfigurationOptions() []core.ConfigurationOption {
  79. options := [...]core.ConfigurationOption{{
  80. Name: ConfigCommentSentimentMinLength,
  81. Description: "Minimum length of the comment to be analyzed.",
  82. Flag: "min-comment-len",
  83. Type: core.IntConfigurationOption,
  84. Default: DefaultCommentSentimentCommentMinLength}, {
  85. Name: ConfigCommentSentimentGap,
  86. Description: "Sentiment value threshold, values between 0.5 - X/2 and 0.5 + x/2 will not be " +
  87. "considered. Must be >= 0 and < 1. The purpose is to exclude neutral comments.",
  88. Flag: "sentiment-gap",
  89. Type: core.FloatConfigurationOption,
  90. Default: DefaultCommentSentimentGap},
  91. }
  92. return options[:]
  93. }
  94. // Flag returns the command line switch which activates the analysis.
  95. func (sent *CommentSentimentAnalysis) Flag() string {
  96. return "sentiment"
  97. }
  98. // Description returns the text which explains what the analysis is doing.
  99. func (sent *CommentSentimentAnalysis) Description() string {
  100. return "Classifies each new or changed comment per commit as containing positive or " +
  101. "negative emotions. The classifier outputs a real number between 0 and 1," +
  102. "1 is the most positive and 0 is the most negative."
  103. }
  104. // Configure sets the properties previously published by ListConfigurationOptions().
  105. func (sent *CommentSentimentAnalysis) Configure(facts map[string]interface{}) error {
  106. if val, exists := facts[ConfigCommentSentimentGap]; exists {
  107. sent.Gap = val.(float32)
  108. }
  109. if val, exists := facts[ConfigCommentSentimentMinLength]; exists {
  110. sent.MinCommentLength = val.(int)
  111. }
  112. sent.validate()
  113. sent.commitsByDay = facts[items.FactCommitsByDay].(map[int][]plumbing.Hash)
  114. return nil
  115. }
  116. func (sent *CommentSentimentAnalysis) validate() {
  117. if sent.Gap < 0 || sent.Gap >= 1 {
  118. log.Printf("Sentiment gap is too big: %f => reset to the default %f",
  119. sent.Gap, DefaultCommentSentimentGap)
  120. sent.Gap = DefaultCommentSentimentGap
  121. }
  122. if sent.MinCommentLength < 10 {
  123. log.Printf("Comment minimum length is too small: %d => reset to the default %d",
  124. sent.MinCommentLength, DefaultCommentSentimentCommentMinLength)
  125. sent.MinCommentLength = DefaultCommentSentimentCommentMinLength
  126. }
  127. }
  128. // Initialize resets the temporary caches and prepares this PipelineItem for a series of Consume()
  129. // calls. The repository which is going to be analysed is supplied as an argument.
  130. func (sent *CommentSentimentAnalysis) Initialize(repository *git.Repository) error {
  131. sent.commentsByDay = map[int][]string{}
  132. sent.xpather = &uast_items.ChangesXPather{XPath: "//*[@roleComment]"}
  133. sent.validate()
  134. sent.OneShotMergeProcessor.Initialize()
  135. return nil
  136. }
  137. // Consume runs this PipelineItem on the next commit data.
  138. // `deps` contain all the results from upstream PipelineItem-s as requested by Requires().
  139. // Additionally, DependencyCommit is always present there and represents the analysed *object.Commit.
  140. // This function returns the mapping with analysis results. The keys must be the same as
  141. // in Provides(). If there was an error, nil is returned.
  142. func (sent *CommentSentimentAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  143. if !sent.ShouldConsumeCommit(deps) {
  144. return nil, nil
  145. }
  146. changes := deps[uast_items.DependencyUastChanges].([]uast_items.Change)
  147. day := deps[items.DependencyDay].(int)
  148. commentNodes := sent.xpather.Extract(changes)
  149. comments := sent.mergeComments(commentNodes)
  150. dayComments := sent.commentsByDay[day]
  151. if dayComments == nil {
  152. dayComments = []string{}
  153. }
  154. dayComments = append(dayComments, comments...)
  155. sent.commentsByDay[day] = dayComments
  156. return nil, nil
  157. }
  158. // Finalize returns the result of the analysis. Further Consume() calls are not expected.
  159. func (sent *CommentSentimentAnalysis) Finalize() interface{} {
  160. result := CommentSentimentResult{
  161. EmotionsByDay: map[int]float32{},
  162. CommentsByDay: map[int][]string{},
  163. commitsByDay: sent.commitsByDay,
  164. }
  165. days := make([]int, 0, len(sent.commentsByDay))
  166. for day := range sent.commentsByDay {
  167. days = append(days, day)
  168. }
  169. sort.Ints(days)
  170. var texts []string
  171. for _, key := range days {
  172. texts = append(texts, sent.commentsByDay[key]...)
  173. }
  174. session, err := sentiment.OpenSession()
  175. if err != nil {
  176. panic(err)
  177. }
  178. defer session.Close()
  179. var bar *progress.ProgressBar
  180. callback := func(pos int, total int) {
  181. if bar == nil {
  182. bar = progress.New(total)
  183. bar.Callback = func(msg string) {
  184. os.Stderr.WriteString("\r" + msg)
  185. }
  186. bar.NotPrint = true
  187. bar.ShowPercent = false
  188. bar.ShowSpeed = false
  189. bar.SetMaxWidth(80)
  190. bar.Start()
  191. }
  192. bar.Set(pos)
  193. }
  194. // we run the bulk evaluation in the end for efficiency
  195. weights, err := sentiment.EvaluateWithProgress(texts, session, callback)
  196. if bar != nil {
  197. bar.Finish()
  198. }
  199. if err != nil {
  200. panic(err)
  201. }
  202. pos := 0
  203. for _, key := range days {
  204. sum := float32(0)
  205. comments := make([]string, 0, len(sent.commentsByDay[key]))
  206. for _, comment := range sent.commentsByDay[key] {
  207. if weights[pos] < 0.5*(1-sent.Gap) || weights[pos] > 0.5*(1+sent.Gap) {
  208. sum += weights[pos]
  209. comments = append(comments, comment)
  210. }
  211. pos++
  212. }
  213. if len(comments) > 0 {
  214. result.EmotionsByDay[key] = sum / float32(len(comments))
  215. result.CommentsByDay[key] = comments
  216. }
  217. }
  218. return result
  219. }
  220. // Fork clones this PipelineItem.
  221. func (sent *CommentSentimentAnalysis) Fork(n int) []core.PipelineItem {
  222. return core.ForkSamePipelineItem(sent, n)
  223. }
  224. // Serialize converts the analysis result as returned by Finalize() to text or bytes.
  225. // The text format is YAML and the bytes format is Protocol Buffers.
  226. func (sent *CommentSentimentAnalysis) Serialize(result interface{}, binary bool, writer io.Writer) error {
  227. sentimentResult := result.(CommentSentimentResult)
  228. if binary {
  229. return sent.serializeBinary(&sentimentResult, writer)
  230. }
  231. sent.serializeText(&sentimentResult, writer)
  232. return nil
  233. }
  234. func (sent *CommentSentimentAnalysis) serializeText(result *CommentSentimentResult, writer io.Writer) {
  235. days := make([]int, 0, len(result.EmotionsByDay))
  236. for day := range result.EmotionsByDay {
  237. days = append(days, day)
  238. }
  239. sort.Ints(days)
  240. for _, day := range days {
  241. commits := result.commitsByDay[day]
  242. hashes := make([]string, len(commits))
  243. for i, hash := range commits {
  244. hashes[i] = hash.String()
  245. }
  246. fmt.Fprintf(writer, " %d: [%.4f, [%s], \"%s\"]\n",
  247. day, result.EmotionsByDay[day], strings.Join(hashes, ","),
  248. strings.Join(result.CommentsByDay[day], "|"))
  249. }
  250. }
  251. func (sent *CommentSentimentAnalysis) serializeBinary(
  252. result *CommentSentimentResult, writer io.Writer) error {
  253. message := pb.CommentSentimentResults{
  254. SentimentByDay: map[int32]*pb.Sentiment{},
  255. }
  256. for key, val := range result.EmotionsByDay {
  257. commits := make([]string, len(result.commitsByDay[key]))
  258. for i, commit := range result.commitsByDay[key] {
  259. commits[i] = commit.String()
  260. }
  261. message.SentimentByDay[int32(key)] = &pb.Sentiment{
  262. Value: val,
  263. Comments: result.CommentsByDay[key],
  264. Commits: commits,
  265. }
  266. }
  267. serialized, err := proto.Marshal(&message)
  268. if err != nil {
  269. return err
  270. }
  271. writer.Write(serialized)
  272. return nil
  273. }
  274. func (sent *CommentSentimentAnalysis) mergeComments(nodes []*uast.Node) []string {
  275. var mergedComments []string
  276. lines := map[int][]*uast.Node{}
  277. for _, node := range nodes {
  278. if node.StartPosition == nil {
  279. continue
  280. }
  281. lineno := int(node.StartPosition.Line)
  282. subnodes := lines[lineno]
  283. if subnodes == nil {
  284. subnodes = []*uast.Node{}
  285. }
  286. subnodes = append(subnodes, node)
  287. lines[lineno] = subnodes
  288. }
  289. lineNums := make([]int, 0, len(lines))
  290. for line := range lines {
  291. lineNums = append(lineNums, line)
  292. }
  293. sort.Ints(lineNums)
  294. var buffer []string
  295. for i, line := range lineNums {
  296. lineNodes := lines[line]
  297. maxEnd := line
  298. for _, node := range lineNodes {
  299. if node.EndPosition != nil && maxEnd < int(node.EndPosition.Line) {
  300. maxEnd = int(node.EndPosition.Line)
  301. }
  302. token := strings.TrimSpace(node.Token)
  303. if token != "" {
  304. buffer = append(buffer, token)
  305. }
  306. }
  307. if i < len(lineNums)-1 && lineNums[i+1] <= maxEnd+1 {
  308. continue
  309. }
  310. mergedComments = append(mergedComments, strings.Join(buffer, "\n"))
  311. buffer = make([]string, 0, len(buffer))
  312. }
  313. // We remove unneeded chars and filter too short comments
  314. filteredComments := make([]string, 0, len(mergedComments))
  315. for _, comment := range mergedComments {
  316. comment = strings.TrimSpace(comment)
  317. if comment == "" || filteredFirstCharRE.MatchString(comment[:1]) {
  318. // heuristic - we discard docstrings
  319. continue
  320. }
  321. // heuristic - remove function names
  322. comment = functionNameRE.ReplaceAllString(comment, "")
  323. comment = filteredCharsRE.ReplaceAllString(comment, "")
  324. if len(comment) < sent.MinCommentLength {
  325. continue
  326. }
  327. // collapse whitespace
  328. comment = whitespaceRE.ReplaceAllString(comment, " ")
  329. // heuristic - number of letters must be at least 60%
  330. charsCount := 0
  331. for _, match := range charsRE.FindAllStringIndex(comment, -1) {
  332. charsCount += match[1] - match[0]
  333. }
  334. if charsCount < int(float32(len(comment))*CommentLettersRatio) {
  335. continue
  336. }
  337. // heuristic - license
  338. if licenseRE.MatchString(comment) {
  339. continue
  340. }
  341. filteredComments = append(filteredComments, comment)
  342. }
  343. return filteredComments
  344. }
  345. func init() {
  346. core.Registry.Register(&CommentSentimentAnalysis{})
  347. }