typos.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312
  1. package research
  2. import (
  3. "bytes"
  4. "fmt"
  5. "io"
  6. "unicode/utf8"
  7. "github.com/gogo/protobuf/proto"
  8. "github.com/sergi/go-diff/diffmatchpatch"
  9. "gopkg.in/bblfsh/sdk.v2/uast"
  10. "gopkg.in/bblfsh/sdk.v2/uast/nodes"
  11. "gopkg.in/src-d/go-git.v4"
  12. "gopkg.in/src-d/go-git.v4/plumbing"
  13. "gopkg.in/src-d/go-git.v4/plumbing/object"
  14. "gopkg.in/src-d/hercules.v10/internal/core"
  15. "gopkg.in/src-d/hercules.v10/internal/levenshtein"
  16. "gopkg.in/src-d/hercules.v10/internal/pb"
  17. items "gopkg.in/src-d/hercules.v10/internal/plumbing"
  18. uast_items "gopkg.in/src-d/hercules.v10/internal/plumbing/uast"
  19. "gopkg.in/src-d/hercules.v10/internal/yaml"
  20. )
  21. // TyposDatasetBuilder collects pairs of typo-fix in source code identifiers.
  22. type TyposDatasetBuilder struct {
  23. core.NoopMerger
  24. // MaximumAllowedDistance is the maximum Levenshtein distance between two identifiers
  25. // to consider them a typo-fix pair.
  26. MaximumAllowedDistance int
  27. // typos stores the found typo-fix pairs.
  28. typos []Typo
  29. // lcontext is the Context for measuring Levenshtein distance between lines.
  30. lcontext *levenshtein.Context
  31. // xpather filters identifiers.
  32. xpather uast_items.ChangesXPather
  33. // remote carries the repository remote URL (for debugging)
  34. remote string
  35. l core.Logger
  36. }
  37. // TyposResult is returned by TyposDatasetBuilder.Finalize() and carries the found typo-fix
  38. // pairs of identifiers.
  39. type TyposResult struct {
  40. Typos []Typo
  41. }
  42. // Typo carries the information about a typo-fix pair.
  43. type Typo struct {
  44. Wrong string
  45. Correct string
  46. Commit plumbing.Hash
  47. File string
  48. Line int
  49. }
  const (
	// ConfigTyposDatasetMaximumAllowedDistance names the `TyposDatasetBuilder.Configure()`
	// option which overrides the maximum Levenshtein distance between two identifiers
	// that still counts as a typo-fix pair.
	ConfigTyposDatasetMaximumAllowedDistance = "TyposDatasetBuilder.MaximumAllowedDistance"

	// DefaultMaximumAllowedTypoDistance is the maximum Levenshtein distance between two
	// identifiers that counts as a typo-fix pair when nothing else is configured.
	DefaultMaximumAllowedTypoDistance = 4
)
  59. // Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
  60. func (tdb *TyposDatasetBuilder) Name() string {
  61. return "TyposDataset"
  62. }
  63. // Provides returns the list of names of entities which are produced by this PipelineItem.
  64. // Each produced entity will be inserted into `deps` of dependent Consume()-s according
  65. // to this list. Also used by core.Registry to build the global map of providers.
  66. func (tdb *TyposDatasetBuilder) Provides() []string {
  67. return []string{}
  68. }
  69. // Requires returns the list of names of entities which are needed by this PipelineItem.
  70. // Each requested entity will be inserted into `deps` of Consume(). In turn, those
  71. // entities are Provides() upstream.
  72. func (tdb *TyposDatasetBuilder) Requires() []string {
  73. arr := [...]string{
  74. uast_items.DependencyUastChanges, items.DependencyFileDiff, items.DependencyBlobCache}
  75. return arr[:]
  76. }
  77. // ListConfigurationOptions returns the list of changeable public properties of this PipelineItem.
  78. func (tdb *TyposDatasetBuilder) ListConfigurationOptions() []core.ConfigurationOption {
  79. options := [...]core.ConfigurationOption{{
  80. Name: ConfigTyposDatasetMaximumAllowedDistance,
  81. Description: "Maximum Levenshtein distance between two identifiers to consider them " +
  82. "a typo-fix pair.",
  83. Flag: "typos-max-distance",
  84. Type: core.IntConfigurationOption,
  85. Default: DefaultMaximumAllowedTypoDistance},
  86. }
  87. return options[:]
  88. }
  89. // Configure sets the properties previously published by ListConfigurationOptions().
  90. func (tdb *TyposDatasetBuilder) Configure(facts map[string]interface{}) error {
  91. if l, exists := facts[core.ConfigLogger].(core.Logger); exists {
  92. tdb.l = l
  93. }
  94. if val, exists := facts[ConfigTyposDatasetMaximumAllowedDistance].(int); exists {
  95. tdb.MaximumAllowedDistance = val
  96. }
  97. return nil
  98. }
  99. // Flag for the command line switch which enables this analysis.
  100. func (tdb *TyposDatasetBuilder) Flag() string {
  101. return "typos-dataset"
  102. }
  103. // Description returns the text which explains what the analysis is doing.
  104. func (tdb *TyposDatasetBuilder) Description() string {
  105. return "Extracts typo-fix identifier pairs from source code in commit diffs."
  106. }
  107. // Initialize resets the temporary caches and prepares this PipelineItem for a series of Consume()
  108. // calls. The repository which is going to be analysed is supplied as an argument.
  109. func (tdb *TyposDatasetBuilder) Initialize(repository *git.Repository) error {
  110. tdb.l = core.NewLogger()
  111. if tdb.MaximumAllowedDistance <= 0 {
  112. tdb.MaximumAllowedDistance = DefaultMaximumAllowedTypoDistance
  113. }
  114. tdb.lcontext = &levenshtein.Context{}
  115. tdb.xpather.XPath = "//uast:Identifier"
  116. tdb.remote = core.GetSensibleRemote(repository)
  117. return nil
  118. }
  // candidate is a pair of similar lines: Before indexes the line in the old version of the
// file and After indexes the line which replaced it in the new version.
type candidate struct {
	Before, After int
}
  123. // Consume runs this PipelineItem on the next commit data.
  124. // `deps` contain all the results from upstream PipelineItem-s as requested by Requires().
  125. // Additionally, DependencyCommit is always present there and represents the analysed *object.Commit.
  126. // This function returns the mapping with analysis results. The keys must be the same as
  127. // in Provides(). If there was an error, nil is returned.
  128. func (tdb *TyposDatasetBuilder) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  129. if deps[core.DependencyIsMerge].(bool) {
  130. return nil, nil
  131. }
  132. commit := deps[core.DependencyCommit].(*object.Commit).Hash
  133. cache := deps[items.DependencyBlobCache].(map[plumbing.Hash]*items.CachedBlob)
  134. diffs := deps[items.DependencyFileDiff].(map[string]items.FileDiffData)
  135. changes := deps[uast_items.DependencyUastChanges].([]uast_items.Change)
  136. for _, change := range changes {
  137. if change.Before == nil || change.After == nil {
  138. continue
  139. }
  140. linesBefore := bytes.Split(cache[change.Change.From.TreeEntry.Hash].Data, []byte{'\n'})
  141. linesAfter := bytes.Split(cache[change.Change.To.TreeEntry.Hash].Data, []byte{'\n'})
  142. diff := diffs[change.Change.To.Name]
  143. var lineNumBefore, lineNumAfter int
  144. var candidates []candidate
  145. focusedLinesBefore := map[int]bool{}
  146. focusedLinesAfter := map[int]bool{}
  147. removedSize := 0
  148. for _, edit := range diff.Diffs {
  149. size := utf8.RuneCountInString(edit.Text)
  150. switch edit.Type {
  151. case diffmatchpatch.DiffDelete:
  152. lineNumBefore += size
  153. removedSize = size
  154. case diffmatchpatch.DiffInsert:
  155. if size == removedSize {
  156. for i := 0; i < size; i++ {
  157. lb := lineNumBefore - size + i
  158. la := lineNumAfter + i
  159. dist := tdb.lcontext.Distance(string(linesBefore[lb]), string(linesAfter[la]))
  160. if dist <= tdb.MaximumAllowedDistance {
  161. candidates = append(candidates, candidate{lb, la})
  162. focusedLinesBefore[lb] = true
  163. focusedLinesAfter[la] = true
  164. }
  165. }
  166. }
  167. lineNumAfter += size
  168. removedSize = 0
  169. case diffmatchpatch.DiffEqual:
  170. lineNumBefore += size
  171. lineNumAfter += size
  172. removedSize = 0
  173. }
  174. }
  175. if len(candidates) == 0 {
  176. continue
  177. }
  178. // at this point we have pairs of very similar lines
  179. // we need to build the line mappings of the identifiers before/after the change
  180. // we should keep only those which are present on those focused lines
  181. nodesAdded, nodesRemoved := tdb.xpather.Extract([]uast_items.Change{change})
  182. addedIdentifiers := map[int][]nodes.Node{}
  183. removedIdentifiers := map[int][]nodes.Node{}
  184. for _, n := range nodesAdded {
  185. pos := uast.PositionsOf(n.(nodes.Object))
  186. if pos.Start() == nil {
  187. tdb.l.Infof("repo %s commit %s file %s adds identifier %s with no position",
  188. tdb.remote, commit.String(), change.Change.To.Name,
  189. n.(nodes.Object)["Name"].(nodes.String))
  190. continue
  191. }
  192. line := int(pos.Start().Line) - 1
  193. if focusedLinesAfter[line] {
  194. addedIdentifiers[line] = append(addedIdentifiers[line], n)
  195. }
  196. }
  197. for _, n := range nodesRemoved {
  198. pos := uast.PositionsOf(n.(nodes.Object))
  199. if pos.Start() == nil {
  200. tdb.l.Infof("repo %s commit %s file %s removes identifier %s with no position",
  201. tdb.remote, commit.String(), change.Change.To.Name,
  202. n.(nodes.Object)["Name"].(nodes.String))
  203. continue
  204. }
  205. line := int(pos.Start().Line) - 1
  206. if focusedLinesBefore[line] {
  207. removedIdentifiers[line] = append(removedIdentifiers[line], n)
  208. }
  209. }
  210. for _, c := range candidates {
  211. nodesBefore := removedIdentifiers[c.Before]
  212. nodesAfter := addedIdentifiers[c.After]
  213. if len(nodesBefore) == 1 && len(nodesAfter) == 1 {
  214. idBefore := string(nodesBefore[0].(nodes.Object)["Name"].(nodes.String))
  215. idAfter := string(nodesAfter[0].(nodes.Object)["Name"].(nodes.String))
  216. tdb.typos = append(tdb.typos, Typo{
  217. Wrong: idBefore,
  218. Correct: idAfter,
  219. Commit: commit,
  220. File: change.Change.To.Name,
  221. Line: c.After,
  222. })
  223. }
  224. }
  225. }
  226. return nil, nil
  227. }
  228. // Finalize returns the result of the analysis. Further Consume() calls are not expected.
  229. func (tdb *TyposDatasetBuilder) Finalize() interface{} {
  230. // deduplicate
  231. typos := make([]Typo, 0, len(tdb.typos))
  232. pairs := map[string]bool{}
  233. for _, t := range tdb.typos {
  234. id := t.Wrong + "|" + t.Correct
  235. if _, exists := pairs[id]; !exists {
  236. pairs[id] = true
  237. typos = append(typos, t)
  238. }
  239. }
  240. return TyposResult{Typos: typos}
  241. }
  242. // Fork clones this pipeline item.
  243. func (tdb *TyposDatasetBuilder) Fork(n int) []core.PipelineItem {
  244. return core.ForkSamePipelineItem(tdb, n)
  245. }
  246. // Serialize converts the analysis result as returned by Finalize() to text or bytes.
  247. // The text format is YAML and the bytes format is Protocol Buffers.
  248. func (tdb *TyposDatasetBuilder) Serialize(result interface{}, binary bool, writer io.Writer) error {
  249. commitsResult := result.(TyposResult)
  250. if binary {
  251. return tdb.serializeBinary(&commitsResult, writer)
  252. }
  253. tdb.serializeText(&commitsResult, writer)
  254. return nil
  255. }
  256. func (tdb *TyposDatasetBuilder) serializeText(result *TyposResult, writer io.Writer) {
  257. for _, t := range result.Typos {
  258. fmt.Fprintf(writer, " - wrong: %s\n", yaml.SafeString(t.Wrong))
  259. fmt.Fprintf(writer, " correct: %s\n", yaml.SafeString(t.Correct))
  260. fmt.Fprintf(writer, " commit: %s\n", t.Commit.String())
  261. fmt.Fprintf(writer, " file: %s\n", yaml.SafeString(t.File))
  262. fmt.Fprintf(writer, " line: %d\n", t.Line)
  263. }
  264. }
  265. func (tdb *TyposDatasetBuilder) serializeBinary(result *TyposResult, writer io.Writer) error {
  266. message := pb.TyposDataset{}
  267. message.Typos = make([]*pb.Typo, len(result.Typos))
  268. for i, t := range result.Typos {
  269. message.Typos[i] = &pb.Typo{
  270. Wrong: t.Wrong,
  271. Correct: t.Correct,
  272. Commit: t.Commit.String(),
  273. File: t.File,
  274. Line: int32(t.Line),
  275. }
  276. }
  277. serialized, err := proto.Marshal(&message)
  278. if err != nil {
  279. return err
  280. }
  281. _, err = writer.Write(serialized)
  282. return err
  283. }
  284. func init() {
  285. core.Registry.Register(&TyposDatasetBuilder{})
  286. }