identity.go 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284
  1. package hercules
  2. import (
  3. "bufio"
  4. "os"
  5. "sort"
  6. "strings"
  7. "gopkg.in/src-d/go-git.v4"
  8. "gopkg.in/src-d/go-git.v4/plumbing/object"
  9. )
  10. // IdentityDetector determines the author of a commit. Same person can commit under different
  11. // signatures, and we apply some heuristics to merge those together.
  12. // It is a PipelineItem.
  13. type IdentityDetector struct {
  14. // PeopleDict maps email || name -> developer id.
  15. PeopleDict map[string]int
  16. // ReversedPeopleDict maps developer id -> description
  17. ReversedPeopleDict []string
  18. }
  19. const (
  20. // AuthorMissing is the internal author index which denotes any unmatched identities
  21. // (IdentityDetector.Consume()).
  22. AuthorMissing = (1 << 18) - 1
  23. // AuthorMissingName is the string name which corresponds to AuthorMissing.
  24. AuthorMissingName = "<unmatched>"
  25. // FactIdentityDetectorPeopleDict is the name of the fact which is inserted in
  26. // IdentityDetector.Configure(). It corresponds to IdentityDetector.PeopleDict - the mapping
  27. // from the signatures to the author indices.
  28. FactIdentityDetectorPeopleDict = "IdentityDetector.PeopleDict"
  29. // FactIdentityDetectorReversedPeopleDict is the name of the fact which is inserted in
  30. // IdentityDetector.Configure(). It corresponds to IdentityDetector.ReversedPeopleDict -
  31. // the mapping from the author indices to the main signature.
  32. FactIdentityDetectorReversedPeopleDict = "IdentityDetector.ReversedPeopleDict"
  33. // ConfigIdentityDetectorPeopleDictPath is the name of the configuration option
  34. // (IdentityDetector.Configure()) which allows to set the external PeopleDict mapping from a file.
  35. ConfigIdentityDetectorPeopleDictPath = "IdentityDetector.PeopleDictPath"
  36. // FactIdentityDetectorPeopleCount is the name of the fact which is inserted in
  37. // IdentityDetector.Configure(). It is equal to the overall number of unique authors
  38. // (the length of ReversedPeopleDict).
  39. FactIdentityDetectorPeopleCount = "IdentityDetector.PeopleCount"
  40. // DependencyAuthor is the name of the dependency provided by IdentityDetector.
  41. DependencyAuthor = "author"
  42. )
  43. // Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
  44. func (id *IdentityDetector) Name() string {
  45. return "IdentityDetector"
  46. }
  47. // Provides returns the list of names of entities which are produced by this PipelineItem.
  48. // Each produced entity will be inserted into `deps` of dependent Consume()-s according
  49. // to this list. Also used by hercules.Registry to build the global map of providers.
  50. func (id *IdentityDetector) Provides() []string {
  51. arr := [...]string{DependencyAuthor}
  52. return arr[:]
  53. }
  54. // Requires returns the list of names of entities which are needed by this PipelineItem.
  55. // Each requested entity will be inserted into `deps` of Consume(). In turn, those
  56. // entities are Provides() upstream.
  57. func (id *IdentityDetector) Requires() []string {
  58. return []string{}
  59. }
  60. // ListConfigurationOptions returns the list of changeable public properties of this PipelineItem.
  61. func (id *IdentityDetector) ListConfigurationOptions() []ConfigurationOption {
  62. options := [...]ConfigurationOption{{
  63. Name: ConfigIdentityDetectorPeopleDictPath,
  64. Description: "Path to the developers' email associations.",
  65. Flag: "people-dict",
  66. Type: StringConfigurationOption,
  67. Default: ""},
  68. }
  69. return options[:]
  70. }
  71. // Configure sets the properties previously published by ListConfigurationOptions().
  72. func (id *IdentityDetector) Configure(facts map[string]interface{}) {
  73. if val, exists := facts[FactIdentityDetectorPeopleDict].(map[string]int); exists {
  74. id.PeopleDict = val
  75. }
  76. if val, exists := facts[FactIdentityDetectorReversedPeopleDict].([]string); exists {
  77. id.ReversedPeopleDict = val
  78. }
  79. if id.PeopleDict == nil || id.ReversedPeopleDict == nil {
  80. peopleDictPath, _ := facts[ConfigIdentityDetectorPeopleDictPath].(string)
  81. if peopleDictPath != "" {
  82. id.LoadPeopleDict(peopleDictPath)
  83. facts[FactIdentityDetectorPeopleCount] = len(id.ReversedPeopleDict) - 1
  84. } else {
  85. if _, exists := facts[ConfigPipelineCommits]; !exists {
  86. panic("IdentityDetector needs a list of commits to initialize.")
  87. }
  88. id.GeneratePeopleDict(facts[ConfigPipelineCommits].([]*object.Commit))
  89. facts[FactIdentityDetectorPeopleCount] = len(id.ReversedPeopleDict)
  90. }
  91. } else {
  92. facts[FactIdentityDetectorPeopleCount] = len(id.ReversedPeopleDict)
  93. }
  94. facts[FactIdentityDetectorPeopleDict] = id.PeopleDict
  95. facts[FactIdentityDetectorReversedPeopleDict] = id.ReversedPeopleDict
  96. }
  97. // Initialize resets the temporary caches and prepares this PipelineItem for a series of Consume()
  98. // calls. The repository which is going to be analysed is supplied as an argument.
  99. func (id *IdentityDetector) Initialize(repository *git.Repository) {
  100. }
  101. // Consume runs this PipelineItem on the next commit data.
  102. // `deps` contain all the results from upstream PipelineItem-s as requested by Requires().
  103. // Additionally, "commit" is always present there and represents the analysed *object.Commit.
  104. // This function returns the mapping with analysis results. The keys must be the same as
  105. // in Provides(). If there was an error, nil is returned.
  106. func (id *IdentityDetector) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  107. commit := deps["commit"].(*object.Commit)
  108. signature := commit.Author
  109. authorID, exists := id.PeopleDict[strings.ToLower(signature.Email)]
  110. if !exists {
  111. authorID, exists = id.PeopleDict[strings.ToLower(signature.Name)]
  112. if !exists {
  113. authorID = AuthorMissing
  114. }
  115. }
  116. return map[string]interface{}{DependencyAuthor: authorID}, nil
  117. }
  118. // LoadPeopleDict loads author signatures from a text file.
  119. // The format is one signature per line, and the signature consists of several
  120. // keys separated by "|". The first key is the main one and used to reference all the rest.
  121. func (id *IdentityDetector) LoadPeopleDict(path string) error {
  122. file, err := os.Open(path)
  123. if err != nil {
  124. return err
  125. }
  126. defer file.Close()
  127. scanner := bufio.NewScanner(file)
  128. dict := make(map[string]int)
  129. reverseDict := []string{}
  130. size := 0
  131. for scanner.Scan() {
  132. ids := strings.Split(scanner.Text(), "|")
  133. for _, id := range ids {
  134. dict[strings.ToLower(id)] = size
  135. }
  136. reverseDict = append(reverseDict, ids[0])
  137. size++
  138. }
  139. reverseDict = append(reverseDict, AuthorMissingName)
  140. id.PeopleDict = dict
  141. id.ReversedPeopleDict = reverseDict
  142. return nil
  143. }
  144. // GeneratePeopleDict loads author signatures from the specified list of Git commits.
  145. func (id *IdentityDetector) GeneratePeopleDict(commits []*object.Commit) {
  146. dict := map[string]int{}
  147. emails := map[int][]string{}
  148. names := map[int][]string{}
  149. size := 0
  150. mailmapFile, err := commits[len(commits)-1].File(".mailmap")
  151. if err == nil {
  152. mailMapContents, err := mailmapFile.Contents()
  153. if err == nil {
  154. mailmap := ParseMailmap(mailMapContents)
  155. for key, val := range mailmap {
  156. key = strings.ToLower(key)
  157. toEmail := strings.ToLower(val.Email)
  158. toName := strings.ToLower(val.Name)
  159. id, exists := dict[toEmail]
  160. if !exists {
  161. id, exists = dict[toName]
  162. }
  163. if exists {
  164. dict[key] = id
  165. } else {
  166. id = size
  167. size++
  168. if toEmail != "" {
  169. dict[toEmail] = id
  170. emails[id] = append(emails[id], toEmail)
  171. }
  172. if toName != "" {
  173. dict[toName] = id
  174. names[id] = append(names[id], toName)
  175. }
  176. dict[key] = id
  177. }
  178. if strings.Contains(key, "@") {
  179. exists := false
  180. for _, val := range emails[id] {
  181. if key == val {
  182. exists = true
  183. break
  184. }
  185. }
  186. if !exists {
  187. emails[id] = append(emails[id], key)
  188. }
  189. } else {
  190. exists := false
  191. for _, val := range names[id] {
  192. if key == val {
  193. exists = true
  194. break
  195. }
  196. }
  197. if !exists {
  198. names[id] = append(names[id], key)
  199. }
  200. }
  201. }
  202. }
  203. }
  204. for _, commit := range commits {
  205. email := strings.ToLower(commit.Author.Email)
  206. name := strings.ToLower(commit.Author.Name)
  207. id, exists := dict[email]
  208. if exists {
  209. _, exists := dict[name]
  210. if !exists {
  211. dict[name] = id
  212. names[id] = append(names[id], name)
  213. }
  214. continue
  215. }
  216. id, exists = dict[name]
  217. if exists {
  218. dict[email] = id
  219. emails[id] = append(emails[id], email)
  220. continue
  221. }
  222. dict[email] = size
  223. dict[name] = size
  224. emails[size] = append(emails[size], email)
  225. names[size] = append(names[size], name)
  226. size++
  227. }
  228. reverseDict := make([]string, size)
  229. for _, val := range dict {
  230. sort.Strings(names[val])
  231. sort.Strings(emails[val])
  232. reverseDict[val] = strings.Join(names[val], "|") + "|" + strings.Join(emails[val], "|")
  233. }
  234. id.PeopleDict = dict
  235. id.ReversedPeopleDict = reverseDict
  236. }
  237. // MergeReversedDicts joins two identity lists together, excluding duplicates, in-order.
  238. func (id IdentityDetector) MergeReversedDicts(rd1, rd2 []string) (map[string][3]int, []string) {
  239. people := map[string][3]int{}
  240. for i, pid := range rd1 {
  241. ptrs := people[pid]
  242. ptrs[0] = len(people)
  243. ptrs[1] = i
  244. ptrs[2] = -1
  245. people[pid] = ptrs
  246. }
  247. for i, pid := range rd2 {
  248. ptrs, exists := people[pid]
  249. if !exists {
  250. ptrs[0] = len(people)
  251. ptrs[1] = -1
  252. }
  253. ptrs[2] = i
  254. people[pid] = ptrs
  255. }
  256. mrd := make([]string, len(people))
  257. for name, ptrs := range people {
  258. mrd[ptrs[0]] = name
  259. }
  260. return people, mrd
  261. }
  262. func init() {
  263. Registry.Register(&IdentityDetector{})
  264. }