comment_sentiment.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347
  1. // +build tensorflow
  2. package hercules
  3. import (
  4. "fmt"
  5. "io"
  6. "log"
  7. "os"
  8. "regexp"
  9. "sort"
  10. "strings"
  11. "github.com/gogo/protobuf/proto"
  12. progress "gopkg.in/cheggaaa/pb.v1"
  13. "gopkg.in/bblfsh/sdk.v1/uast"
  14. "gopkg.in/src-d/go-git.v4"
  15. "gopkg.in/src-d/go-git.v4/plumbing"
  16. "gopkg.in/src-d/hercules.v3/pb"
  17. "gopkg.in/vmarkovtsev/BiDiSentiment.v1"
  18. )
  19. // CommentSentimentAnalysis measures comment sentiment through time.
  20. type CommentSentimentAnalysis struct {
  21. MinCommentLength int
  22. Gap float32
  23. commentsByDay map[int][]string
  24. commitsByDay map[int][]plumbing.Hash
  25. xpather *ChangesXPather
  26. }
  27. // CommentSentimentResult contains the sentiment values per day, where 1 means very negative
  28. // and 0 means very positive.
  29. type CommentSentimentResult struct {
  30. EmotionsByDay map[int]float32
  31. CommentsByDay map[int][]string
  32. commitsByDay map[int][]plumbing.Hash
  33. }
  34. const (
  35. ConfigCommentSentimentMinLength = "CommentSentiment.MinLength"
  36. ConfigCommentSentimentGap = "CommentSentiment.Gap"
  37. DefaultCommentSentimentCommentMinLength = 20
  38. DefaultCommentSentimentGap = float32(0.5)
  39. // CommentLettersRatio is the threshold to filter impure comments which contain code.
  40. CommentLettersRatio = 0.6
  41. )
  42. var (
  43. filteredFirstCharRE = regexp.MustCompile("[^a-zA-Z0-9]")
  44. filteredCharsRE = regexp.MustCompile("[^-a-zA-Z0-9_:;,./?!#&%+*=\\n \\t()]+")
  45. charsRE = regexp.MustCompile("[a-zA-Z]+")
  46. functionNameRE = regexp.MustCompile("\\s*[a-zA-Z_][a-zA-Z_0-9]*\\(\\)")
  47. whitespaceRE = regexp.MustCompile("\\s+")
  48. licenseRE = regexp.MustCompile("(?i)[li[cs]en[cs][ei]|copyright|©")
  49. )
  50. // Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
  51. func (sent *CommentSentimentAnalysis) Name() string {
  52. return "Sentiment"
  53. }
  54. // Provides returns the list of names of entities which are produced by this PipelineItem.
  55. // Each produced entity will be inserted into `deps` of dependent Consume()-s according
  56. // to this list. Also used by hercules.Registry to build the global map of providers.
  57. func (sent *CommentSentimentAnalysis) Provides() []string {
  58. return []string{}
  59. }
  60. // Requires returns the list of names of entities which are needed by this PipelineItem.
  61. // Each requested entity will be inserted into `deps` of Consume(). In turn, those
  62. // entities are Provides() upstream.
  63. func (sent *CommentSentimentAnalysis) Requires() []string {
  64. arr := [...]string{DependencyUastChanges, DependencyDay}
  65. return arr[:]
  66. }
  67. // Features which must be enabled for this PipelineItem to be automatically inserted into the DAG.
  68. func (sent *CommentSentimentAnalysis) Features() []string {
  69. arr := [...]string{FeatureUast}
  70. return arr[:]
  71. }
  72. // ListConfigurationOptions returns the list of changeable public properties of this PipelineItem.
  73. func (sent *CommentSentimentAnalysis) ListConfigurationOptions() []ConfigurationOption {
  74. options := [...]ConfigurationOption{{
  75. Name: ConfigCommentSentimentMinLength,
  76. Description: "Minimum length of the comment to be analyzed.",
  77. Flag: "min-comment-len",
  78. Type: IntConfigurationOption,
  79. Default: DefaultCommentSentimentCommentMinLength}, {
  80. Name: ConfigCommentSentimentGap,
  81. Description: "Sentiment value threshold, values between 0.5 - X/2 and 0.5 + x/2 will not be " +
  82. "considered. Must be >= 0 and < 1. The purpose is to exclude neutral comments.",
  83. Flag: "sentiment-gap",
  84. Type: FloatConfigurationOption,
  85. Default: DefaultCommentSentimentGap},
  86. }
  87. return options[:]
  88. }
  89. // Flag returns the command line switch which activates the analysis.
  90. func (sent *CommentSentimentAnalysis) Flag() string {
  91. return "sentiment"
  92. }
  93. // Configure sets the properties previously published by ListConfigurationOptions().
  94. func (sent *CommentSentimentAnalysis) Configure(facts map[string]interface{}) {
  95. if val, exists := facts[ConfigCommentSentimentGap]; exists {
  96. sent.Gap = val.(float32)
  97. if sent.Gap < 0 || sent.Gap >= 1 {
  98. log.Printf("Sentiment gap is too big: %f => reset to the default %f",
  99. sent.Gap, DefaultCommentSentimentGap)
  100. sent.Gap = DefaultCommentSentimentGap
  101. }
  102. }
  103. if val, exists := facts[ConfigCommentSentimentMinLength]; exists {
  104. sent.MinCommentLength = val.(int)
  105. if sent.MinCommentLength < 10 {
  106. log.Printf("Comment minimum length is too small: %d => reset to the default %d",
  107. sent.MinCommentLength, DefaultCommentSentimentCommentMinLength)
  108. sent.MinCommentLength = DefaultCommentSentimentCommentMinLength
  109. }
  110. }
  111. sent.commitsByDay = facts[FactCommitsByDay].(map[int][]plumbing.Hash)
  112. }
  113. // Initialize resets the temporary caches and prepares this PipelineItem for a series of Consume()
  114. // calls. The repository which is going to be analysed is supplied as an argument.
  115. func (sent *CommentSentimentAnalysis) Initialize(repository *git.Repository) {
  116. sent.commentsByDay = map[int][]string{}
  117. sent.xpather = &ChangesXPather{XPath: "//*[@roleComment]"}
  118. }
  119. // Consume runs this PipelineItem on the next commit data.
  120. // `deps` contain all the results from upstream PipelineItem-s as requested by Requires().
  121. // Additionally, "commit" is always present there and represents the analysed *object.Commit.
  122. // This function returns the mapping with analysis results. The keys must be the same as
  123. // in Provides(). If there was an error, nil is returned.
  124. func (sent *CommentSentimentAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  125. changes := deps[DependencyUastChanges].([]UASTChange)
  126. day := deps[DependencyDay].(int)
  127. commentNodes := sent.xpather.Extract(changes)
  128. comments := sent.mergeComments(commentNodes)
  129. dayComments := sent.commentsByDay[day]
  130. if dayComments == nil {
  131. dayComments = []string{}
  132. }
  133. dayComments = append(dayComments, comments...)
  134. sent.commentsByDay[day] = dayComments
  135. return nil, nil
  136. }
  137. // Finalize returns the result of the analysis. Further Consume() calls are not expected.
  138. func (sent *CommentSentimentAnalysis) Finalize() interface{} {
  139. result := CommentSentimentResult{
  140. EmotionsByDay: map[int]float32{},
  141. CommentsByDay: map[int][]string{},
  142. commitsByDay: sent.commitsByDay,
  143. }
  144. texts := []string{}
  145. days := make([]int, 0, len(sent.commentsByDay))
  146. for day := range sent.commentsByDay {
  147. days = append(days, day)
  148. }
  149. sort.Ints(days)
  150. for _, key := range days {
  151. for _, val := range sent.commentsByDay[key] {
  152. texts = append(texts, val)
  153. }
  154. }
  155. session, err := sentiment.OpenSession()
  156. if err != nil {
  157. panic(err)
  158. }
  159. defer session.Close()
  160. var bar *progress.ProgressBar
  161. callback := func(pos int, total int) {
  162. if bar == nil {
  163. bar = progress.New(total)
  164. bar.Callback = func(msg string) {
  165. os.Stderr.WriteString("\r" + msg)
  166. }
  167. bar.NotPrint = true
  168. bar.ShowPercent = false
  169. bar.ShowSpeed = false
  170. bar.SetMaxWidth(80)
  171. bar.Start()
  172. }
  173. bar.Set(pos)
  174. }
  175. // we run the bulk evaluation in the end for efficiency
  176. weights, err := sentiment.EvaluateWithProgress(texts, session, callback)
  177. if bar != nil {
  178. bar.Finish()
  179. }
  180. if err != nil {
  181. panic(err)
  182. }
  183. pos := 0
  184. for _, key := range days {
  185. sum := float32(0)
  186. comments := make([]string, 0, len(sent.commentsByDay[key]))
  187. for _, comment := range sent.commentsByDay[key] {
  188. if weights[pos] < 0.5*(1-sent.Gap) || weights[pos] > 0.5*(1+sent.Gap) {
  189. sum += weights[pos]
  190. comments = append(comments, comment)
  191. }
  192. pos++
  193. }
  194. if len(comments) > 0 {
  195. result.EmotionsByDay[key] = sum / float32(len(comments))
  196. result.CommentsByDay[key] = comments
  197. }
  198. }
  199. return result
  200. }
  201. // Serialize converts the analysis result as returned by Finalize() to text or bytes.
  202. // The text format is YAML and the bytes format is Protocol Buffers.
  203. func (sent *CommentSentimentAnalysis) Serialize(result interface{}, binary bool, writer io.Writer) error {
  204. sentimentResult := result.(CommentSentimentResult)
  205. if binary {
  206. return sent.serializeBinary(&sentimentResult, writer)
  207. }
  208. sent.serializeText(&sentimentResult, writer)
  209. return nil
  210. }
  211. func (sent *CommentSentimentAnalysis) serializeText(result *CommentSentimentResult, writer io.Writer) {
  212. days := make([]int, 0, len(result.EmotionsByDay))
  213. for day := range result.EmotionsByDay {
  214. days = append(days, day)
  215. }
  216. sort.Ints(days)
  217. for _, day := range days {
  218. commits := sent.commitsByDay[day]
  219. hashes := make([]string, len(commits))
  220. for i, hash := range commits {
  221. hashes[i] = hash.String()
  222. }
  223. fmt.Fprintf(writer, " %d: [%.4f, [%s], \"%s\"]\n",
  224. day, result.EmotionsByDay[day], strings.Join(hashes, ","),
  225. strings.Join(result.CommentsByDay[day], "|"))
  226. }
  227. }
  228. func (sent *CommentSentimentAnalysis) serializeBinary(
  229. result *CommentSentimentResult, writer io.Writer) error {
  230. message := pb.CommentSentimentResults{
  231. SentimentByDay: map[int32]*pb.Sentiment{},
  232. }
  233. for key, val := range result.EmotionsByDay {
  234. commits := make([]string, len(result.commitsByDay[key]))
  235. for i, commit := range result.commitsByDay[key] {
  236. commits[i] = commit.String()
  237. }
  238. message.SentimentByDay[int32(key)] = &pb.Sentiment{
  239. Value: val,
  240. Comments: result.CommentsByDay[key],
  241. Commits: commits,
  242. }
  243. }
  244. serialized, err := proto.Marshal(&message)
  245. if err != nil {
  246. return err
  247. }
  248. writer.Write(serialized)
  249. return nil
  250. }
  251. func (sent *CommentSentimentAnalysis) mergeComments(nodes []*uast.Node) []string {
  252. mergedComments := []string{}
  253. lines := map[int][]*uast.Node{}
  254. for _, node := range nodes {
  255. lineno := int(node.StartPosition.Line)
  256. subnodes := lines[lineno]
  257. if subnodes == nil {
  258. subnodes = []*uast.Node{}
  259. }
  260. subnodes = append(subnodes, node)
  261. lines[lineno] = subnodes
  262. }
  263. lineNums := make([]int, 0, len(lines))
  264. for line := range lines {
  265. lineNums = append(lineNums, line)
  266. }
  267. sort.Ints(lineNums)
  268. buffer := []string{}
  269. for i, line := range lineNums {
  270. lineNodes := lines[line]
  271. maxEnd := line
  272. for _, node := range lineNodes {
  273. if node.EndPosition != nil && maxEnd < int(node.EndPosition.Line) {
  274. maxEnd = int(node.EndPosition.Line)
  275. }
  276. token := strings.TrimSpace(node.Token)
  277. if token != "" {
  278. buffer = append(buffer, token)
  279. }
  280. }
  281. if i < len(lineNums)-1 && lineNums[i+1] <= maxEnd+1 {
  282. continue
  283. }
  284. mergedComments = append(mergedComments, strings.Join(buffer, "\n"))
  285. buffer = buffer[:0]
  286. }
  287. // We remove unneeded chars and filter too short comments
  288. filteredComments := make([]string, 0, len(mergedComments))
  289. for _, comment := range mergedComments {
  290. comment = strings.TrimSpace(comment)
  291. if comment == "" || filteredFirstCharRE.MatchString(comment[:1]) {
  292. // heuristic - we discard docstrings
  293. continue
  294. }
  295. // heuristic - remove function names
  296. comment = functionNameRE.ReplaceAllString(comment, "")
  297. comment = filteredCharsRE.ReplaceAllString(comment, "")
  298. if len(comment) < sent.MinCommentLength {
  299. continue
  300. }
  301. // collapse whitespace
  302. comment = whitespaceRE.ReplaceAllString(comment, " ")
  303. // heuristic - number of letters must be at least 60%
  304. charsCount := 0
  305. for _, match := range charsRE.FindAllStringIndex(comment, -1) {
  306. charsCount += match[1] - match[0]
  307. }
  308. if charsCount < int(float32(len(comment))*CommentLettersRatio) {
  309. continue
  310. }
  311. // heuristic - license
  312. if licenseRE.MatchString(comment) {
  313. continue
  314. }
  315. filteredComments = append(filteredComments, comment)
  316. }
  317. return filteredComments
  318. }
  319. func init() {
  320. Registry.Register(&CommentSentimentAnalysis{})
  321. }