typos.go 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277
  1. package research
  2. import (
  3. "bytes"
  4. "fmt"
  5. "io"
  6. "unicode/utf8"
  7. "github.com/gogo/protobuf/proto"
  8. "github.com/sergi/go-diff/diffmatchpatch"
  9. "gopkg.in/bblfsh/sdk.v2/uast"
  10. "gopkg.in/bblfsh/sdk.v2/uast/nodes"
  11. "gopkg.in/src-d/go-git.v4"
  12. "gopkg.in/src-d/go-git.v4/plumbing"
  13. "gopkg.in/src-d/go-git.v4/plumbing/object"
  14. "gopkg.in/src-d/hercules.v9/internal/core"
  15. "gopkg.in/src-d/hercules.v9/internal/levenshtein"
  16. "gopkg.in/src-d/hercules.v9/internal/pb"
  17. items "gopkg.in/src-d/hercules.v9/internal/plumbing"
  18. uast_items "gopkg.in/src-d/hercules.v9/internal/plumbing/uast"
  19. )
  20. // TyposDatasetBuilder collects pairs of typo-fix in source code identifiers.
  21. type TyposDatasetBuilder struct {
  22. core.NoopMerger
  23. // MaximumAllowedDistance is the maximum Levenshtein distance between two identifiers
  24. // to consider them a typo-fix pair.
  25. MaximumAllowedDistance int
  26. // typos stores the found typo-fix pairs.
  27. typos []Typo
  28. // lcontext is the Context for measuring Levenshtein distance between lines.
  29. lcontext *levenshtein.Context
  30. // xpather filters identifiers.
  31. xpather uast_items.ChangesXPather
  32. }
  33. // TyposResult is returned by TyposDatasetBuilder.Finalize() and carries the found typo-fix
  34. // pairs of identifiers.
  35. type TyposResult struct {
  36. Typos []Typo
  37. }
  38. // Typo carries the information about a typo-fix pair.
  39. type Typo struct {
  40. Wrong string
  41. Correct string
  42. Commit plumbing.Hash
  43. File string
  44. Line int
  45. }
  46. const (
  47. // DefaultMaximumAllowedTypoDistance is the default value of the maximum Levenshtein distance
  48. // between two identifiers to consider them a typo-fix pair.
  49. DefaultMaximumAllowedTypoDistance = 4
  50. )
  51. // Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
  52. func (tdb *TyposDatasetBuilder) Name() string {
  53. return "TyposDataset"
  54. }
  55. // Provides returns the list of names of entities which are produced by this PipelineItem.
  56. // Each produced entity will be inserted into `deps` of dependent Consume()-s according
  57. // to this list. Also used by core.Registry to build the global map of providers.
  58. func (tdb *TyposDatasetBuilder) Provides() []string {
  59. return []string{}
  60. }
  61. // Requires returns the list of names of entities which are needed by this PipelineItem.
  62. // Each requested entity will be inserted into `deps` of Consume(). In turn, those
  63. // entities are Provides() upstream.
  64. func (tdb *TyposDatasetBuilder) Requires() []string {
  65. arr := [...]string{
  66. uast_items.DependencyUastChanges, items.DependencyFileDiff, items.DependencyBlobCache}
  67. return arr[:]
  68. }
  69. // ListConfigurationOptions returns the list of changeable public properties of this PipelineItem.
  70. func (tdb *TyposDatasetBuilder) ListConfigurationOptions() []core.ConfigurationOption {
  71. return nil
  72. }
  73. // Configure sets the properties previously published by ListConfigurationOptions().
  74. func (tdb *TyposDatasetBuilder) Configure(facts map[string]interface{}) error {
  75. return nil
  76. }
  77. // Flag for the command line switch which enables this analysis.
  78. func (tdb *TyposDatasetBuilder) Flag() string {
  79. return "typos-dataset"
  80. }
  81. // Description returns the text which explains what the analysis is doing.
  82. func (tdb *TyposDatasetBuilder) Description() string {
  83. return "Extracts typo-fix identifier pairs from source code in commit diffs."
  84. }
  85. // Initialize resets the temporary caches and prepares this PipelineItem for a series of Consume()
  86. // calls. The repository which is going to be analysed is supplied as an argument.
  87. func (tdb *TyposDatasetBuilder) Initialize(repository *git.Repository) error {
  88. if tdb.MaximumAllowedDistance == 0 {
  89. tdb.MaximumAllowedDistance = DefaultMaximumAllowedTypoDistance
  90. }
  91. tdb.lcontext = &levenshtein.Context{}
  92. tdb.xpather.XPath = "//uast:Identifier"
  93. return nil
  94. }
  95. type candidate struct {
  96. Before int
  97. After int
  98. }
  99. // Consume runs this PipelineItem on the next commit data.
  100. // `deps` contain all the results from upstream PipelineItem-s as requested by Requires().
  101. // Additionally, DependencyCommit is always present there and represents the analysed *object.Commit.
  102. // This function returns the mapping with analysis results. The keys must be the same as
  103. // in Provides(). If there was an error, nil is returned.
  104. func (tdb *TyposDatasetBuilder) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  105. if deps[core.DependencyIsMerge].(bool) {
  106. return nil, nil
  107. }
  108. commit := deps[core.DependencyCommit].(*object.Commit).Hash
  109. cache := deps[items.DependencyBlobCache].(map[plumbing.Hash]*items.CachedBlob)
  110. diffs := deps[items.DependencyFileDiff].(map[string]items.FileDiffData)
  111. changes := deps[uast_items.DependencyUastChanges].([]uast_items.Change)
  112. for _, change := range changes {
  113. if change.Before == nil || change.After == nil {
  114. continue
  115. }
  116. linesBefore := bytes.Split(cache[change.Change.From.TreeEntry.Hash].Data, []byte{'\n'})
  117. linesAfter := bytes.Split(cache[change.Change.To.TreeEntry.Hash].Data, []byte{'\n'})
  118. diff := diffs[change.Change.To.Name]
  119. var lineNumBefore, lineNumAfter int
  120. clear := false
  121. var candidates []candidate
  122. focusedLinesBefore := map[int]bool{}
  123. focusedLinesAfter := map[int]bool{}
  124. for _, edit := range diff.Diffs {
  125. size := utf8.RuneCountInString(edit.Text)
  126. switch edit.Type {
  127. case diffmatchpatch.DiffDelete:
  128. lineNumBefore += size
  129. clear = size == 1
  130. case diffmatchpatch.DiffInsert:
  131. if size == 1 && clear {
  132. dist := tdb.lcontext.Distance(
  133. string(linesBefore[lineNumBefore-1]),
  134. string(linesAfter[lineNumAfter]))
  135. if dist <= tdb.MaximumAllowedDistance {
  136. candidates = append(candidates, candidate{lineNumBefore - 1, lineNumAfter})
  137. focusedLinesBefore[lineNumBefore-1] = true
  138. focusedLinesAfter[lineNumAfter] = true
  139. }
  140. }
  141. lineNumAfter += size
  142. clear = false
  143. case diffmatchpatch.DiffEqual:
  144. lineNumBefore += size
  145. lineNumAfter += size
  146. clear = false
  147. }
  148. }
  149. if len(candidates) == 0 {
  150. continue
  151. }
  152. // at this point we have pairs of very similar lines
  153. // we need to build the line mappings of the identifiers before/after the change
  154. // we should keep only those which are present on those focused lines
  155. nodesAdded, nodesRemoved := tdb.xpather.Extract([]uast_items.Change{change})
  156. addedIdentifiers := map[int][]nodes.Node{}
  157. removedIdentifiers := map[int][]nodes.Node{}
  158. for _, n := range nodesAdded {
  159. pos := uast.PositionsOf(n.(nodes.Object))
  160. if pos.Start() != nil {
  161. line := int(pos.Start().Line) - 1
  162. if focusedLinesAfter[line] {
  163. addedIdentifiers[line] = append(addedIdentifiers[line], n)
  164. }
  165. }
  166. }
  167. for _, n := range nodesRemoved {
  168. pos := uast.PositionsOf(n.(nodes.Object))
  169. line := int(pos.Start().Line) - 1
  170. if pos.Start() != nil {
  171. if focusedLinesBefore[line] {
  172. removedIdentifiers[line] = append(removedIdentifiers[line], n)
  173. }
  174. }
  175. }
  176. for _, c := range candidates {
  177. nodesBefore := addedIdentifiers[c.Before]
  178. nodesAfter := removedIdentifiers[c.After]
  179. if len(nodesBefore) == 1 && len(nodesAfter) == 1 {
  180. idBefore := string(nodesBefore[0].(nodes.Object)["Name"].(nodes.String))
  181. idAfter := string(nodesAfter[0].(nodes.Object)["Name"].(nodes.String))
  182. tdb.typos = append(tdb.typos, Typo{
  183. Wrong: idBefore,
  184. Correct: idAfter,
  185. Commit: commit,
  186. File: change.Change.To.Name,
  187. Line: c.After,
  188. })
  189. }
  190. }
  191. }
  192. return nil, nil
  193. }
  194. // Finalize returns the result of the analysis. Further Consume() calls are not expected.
  195. func (tdb *TyposDatasetBuilder) Finalize() interface{} {
  196. // deduplicate
  197. typos := make([]Typo, 0, len(tdb.typos))
  198. pairs := map[string]bool{}
  199. for _, t := range tdb.typos {
  200. id := t.Wrong + "|" + t.Correct
  201. if _, exists := pairs[id]; !exists {
  202. pairs[id] = true
  203. typos = append(typos, t)
  204. }
  205. }
  206. return TyposResult{Typos: typos}
  207. }
  208. // Fork clones this pipeline item.
  209. func (tdb *TyposDatasetBuilder) Fork(n int) []core.PipelineItem {
  210. return core.ForkSamePipelineItem(tdb, n)
  211. }
  212. // Serialize converts the analysis result as returned by Finalize() to text or bytes.
  213. // The text format is YAML and the bytes format is Protocol Buffers.
  214. func (tdb *TyposDatasetBuilder) Serialize(result interface{}, binary bool, writer io.Writer) error {
  215. commitsResult := result.(TyposResult)
  216. if binary {
  217. return tdb.serializeBinary(&commitsResult, writer)
  218. }
  219. tdb.serializeText(&commitsResult, writer)
  220. return nil
  221. }
  222. func (tdb *TyposDatasetBuilder) serializeText(result *TyposResult, writer io.Writer) {
  223. for _, t := range result.Typos {
  224. fmt.Fprintf(writer, " - wrong: %s\n", t.Wrong)
  225. fmt.Fprintf(writer, " correct: %s\n", t.Correct)
  226. fmt.Fprintf(writer, " commit: %s\n", t.Commit.String())
  227. fmt.Fprintf(writer, " file: %s\n", t.File)
  228. fmt.Fprintf(writer, " line: %d\n", t.Line)
  229. }
  230. }
  231. func (tdb *TyposDatasetBuilder) serializeBinary(result *TyposResult, writer io.Writer) error {
  232. message := pb.TyposDataset{}
  233. message.Typos = make([]*pb.Typo, len(result.Typos))
  234. for i, t := range result.Typos {
  235. message.Typos[i] = &pb.Typo{
  236. Wrong: t.Wrong,
  237. Correct: t.Correct,
  238. Commit: t.Commit.String(),
  239. File: t.File,
  240. Line: int32(t.Line),
  241. }
  242. }
  243. serialized, err := proto.Marshal(&message)
  244. if err != nil {
  245. return err
  246. }
  247. _, err = writer.Write(serialized)
  248. return err
  249. }
  250. func init() {
  251. core.Registry.Register(&TyposDatasetBuilder{})
  252. }