identity.go 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264
  1. package hercules
  2. import (
  3. "bufio"
  4. "os"
  5. "sort"
  6. "strings"
  7. "gopkg.in/src-d/go-git.v4"
  8. "gopkg.in/src-d/go-git.v4/plumbing/object"
  9. )
  10. // IdentityDetector determines the author of a commit. Same person can commit under different
  11. // signatures, and we apply some heuristics to merge those together.
  12. // It is a PipelineItem.
  13. type IdentityDetector struct {
  14. // PeopleDict maps email || name -> developer id.
  15. PeopleDict map[string]int
  16. // ReversedPeopleDict maps developer id -> description
  17. ReversedPeopleDict []string
  18. }
  19. const (
  20. // AuthorMissing is the internal author index which denotes any unmatched identities
  21. // (IdentityDetector.Consume()).
  22. AuthorMissing = (1 << 18) - 1
  23. // AuthorMissingName is the string name which corresponds to AuthorMissing.
  24. AuthorMissingName = "<unmatched>"
  25. // FactIdentityDetectorPeopleDict is the name of the fact which is inserted in
  26. // IdentityDetector.Configure(). It corresponds to IdentityDetector.PeopleDict - the mapping
  27. // from the signatures to the author indices.
  28. FactIdentityDetectorPeopleDict = "IdentityDetector.PeopleDict"
  29. // FactIdentityDetectorReversedPeopleDict is the name of the fact which is inserted in
  30. // IdentityDetector.Configure(). It corresponds to IdentityDetector.ReversedPeopleDict -
  31. // the mapping from the author indices to the main signature.
  32. FactIdentityDetectorReversedPeopleDict = "IdentityDetector.ReversedPeopleDict"
  33. // ConfigIdentityDetectorPeopleDictPath is the name of the configuration option
  34. // (IdentityDetector.Configure()) which allows to set the external PeopleDict mapping from a file.
  35. ConfigIdentityDetectorPeopleDictPath = "IdentityDetector.PeopleDictPath"
  36. // FactIdentityDetectorPeopleCount is the name of the fact which is inserted in
  37. // IdentityDetector.Configure(). It is equal to the overall number of unique authors
  38. // (the length of ReversedPeopleDict).
  39. FactIdentityDetectorPeopleCount = "IdentityDetector.PeopleCount"
  40. // DependencyAuthor is the name of the dependency provided by IdentityDetector.
  41. DependencyAuthor = "author"
  42. )
  43. func (id *IdentityDetector) Name() string {
  44. return "IdentityDetector"
  45. }
  46. func (id *IdentityDetector) Provides() []string {
  47. arr := [...]string{DependencyAuthor}
  48. return arr[:]
  49. }
  50. func (id *IdentityDetector) Requires() []string {
  51. return []string{}
  52. }
  53. func (id *IdentityDetector) ListConfigurationOptions() []ConfigurationOption {
  54. options := [...]ConfigurationOption{{
  55. Name: ConfigIdentityDetectorPeopleDictPath,
  56. Description: "Path to the developers' email associations.",
  57. Flag: "people-dict",
  58. Type: StringConfigurationOption,
  59. Default: ""},
  60. }
  61. return options[:]
  62. }
  63. func (id *IdentityDetector) Configure(facts map[string]interface{}) {
  64. if val, exists := facts[FactIdentityDetectorPeopleDict].(map[string]int); exists {
  65. id.PeopleDict = val
  66. }
  67. if val, exists := facts[FactIdentityDetectorReversedPeopleDict].([]string); exists {
  68. id.ReversedPeopleDict = val
  69. }
  70. if id.PeopleDict == nil || id.ReversedPeopleDict == nil {
  71. peopleDictPath, _ := facts[ConfigIdentityDetectorPeopleDictPath].(string)
  72. if peopleDictPath != "" {
  73. id.LoadPeopleDict(peopleDictPath)
  74. facts[FactIdentityDetectorPeopleCount] = len(id.ReversedPeopleDict) - 1
  75. } else {
  76. if _, exists := facts[ConfigPipelineCommits]; !exists {
  77. panic("IdentityDetector needs a list of commits to initialize.")
  78. }
  79. id.GeneratePeopleDict(facts[ConfigPipelineCommits].([]*object.Commit))
  80. facts[FactIdentityDetectorPeopleCount] = len(id.ReversedPeopleDict)
  81. }
  82. } else {
  83. facts[FactIdentityDetectorPeopleCount] = len(id.ReversedPeopleDict)
  84. }
  85. facts[FactIdentityDetectorPeopleDict] = id.PeopleDict
  86. facts[FactIdentityDetectorReversedPeopleDict] = id.ReversedPeopleDict
  87. }
  88. func (id *IdentityDetector) Initialize(repository *git.Repository) {
  89. }
  90. func (id *IdentityDetector) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  91. commit := deps["commit"].(*object.Commit)
  92. signature := commit.Author
  93. authorID, exists := id.PeopleDict[strings.ToLower(signature.Email)]
  94. if !exists {
  95. authorID, exists = id.PeopleDict[strings.ToLower(signature.Name)]
  96. if !exists {
  97. authorID = AuthorMissing
  98. }
  99. }
  100. return map[string]interface{}{DependencyAuthor: authorID}, nil
  101. }
  102. func (id *IdentityDetector) LoadPeopleDict(path string) error {
  103. file, err := os.Open(path)
  104. if err != nil {
  105. return err
  106. }
  107. defer file.Close()
  108. scanner := bufio.NewScanner(file)
  109. dict := make(map[string]int)
  110. reverseDict := []string{}
  111. size := 0
  112. for scanner.Scan() {
  113. ids := strings.Split(scanner.Text(), "|")
  114. for _, id := range ids {
  115. dict[strings.ToLower(id)] = size
  116. }
  117. reverseDict = append(reverseDict, ids[0])
  118. size++
  119. }
  120. reverseDict = append(reverseDict, AuthorMissingName)
  121. id.PeopleDict = dict
  122. id.ReversedPeopleDict = reverseDict
  123. return nil
  124. }
  125. func (id *IdentityDetector) GeneratePeopleDict(commits []*object.Commit) {
  126. dict := map[string]int{}
  127. emails := map[int][]string{}
  128. names := map[int][]string{}
  129. size := 0
  130. mailmapFile, err := commits[len(commits)-1].File(".mailmap")
  131. if err == nil {
  132. mailMapContents, err := mailmapFile.Contents()
  133. if err == nil {
  134. mailmap := ParseMailmap(mailMapContents)
  135. for key, val := range mailmap {
  136. key = strings.ToLower(key)
  137. toEmail := strings.ToLower(val.Email)
  138. toName := strings.ToLower(val.Name)
  139. id, exists := dict[toEmail]
  140. if !exists {
  141. id, exists = dict[toName]
  142. }
  143. if exists {
  144. dict[key] = id
  145. } else {
  146. id = size
  147. size++
  148. if toEmail != "" {
  149. dict[toEmail] = id
  150. emails[id] = append(emails[id], toEmail)
  151. }
  152. if toName != "" {
  153. dict[toName] = id
  154. names[id] = append(names[id], toName)
  155. }
  156. dict[key] = id
  157. }
  158. if strings.Contains(key, "@") {
  159. exists := false
  160. for _, val := range emails[id] {
  161. if key == val {
  162. exists = true
  163. break
  164. }
  165. }
  166. if !exists {
  167. emails[id] = append(emails[id], key)
  168. }
  169. } else {
  170. exists := false
  171. for _, val := range names[id] {
  172. if key == val {
  173. exists = true
  174. break
  175. }
  176. }
  177. if !exists {
  178. names[id] = append(names[id], key)
  179. }
  180. }
  181. }
  182. }
  183. }
  184. for _, commit := range commits {
  185. email := strings.ToLower(commit.Author.Email)
  186. name := strings.ToLower(commit.Author.Name)
  187. id, exists := dict[email]
  188. if exists {
  189. _, exists := dict[name]
  190. if !exists {
  191. dict[name] = id
  192. names[id] = append(names[id], name)
  193. }
  194. continue
  195. }
  196. id, exists = dict[name]
  197. if exists {
  198. dict[email] = id
  199. emails[id] = append(emails[id], email)
  200. continue
  201. }
  202. dict[email] = size
  203. dict[name] = size
  204. emails[size] = append(emails[size], email)
  205. names[size] = append(names[size], name)
  206. size++
  207. }
  208. reverseDict := make([]string, size)
  209. for _, val := range dict {
  210. sort.Strings(names[val])
  211. sort.Strings(emails[val])
  212. reverseDict[val] = strings.Join(names[val], "|") + "|" + strings.Join(emails[val], "|")
  213. }
  214. id.PeopleDict = dict
  215. id.ReversedPeopleDict = reverseDict
  216. }
  217. // MergeReversedDicts joins two identity lists together, excluding duplicates, in-order.
  218. func (id IdentityDetector) MergeReversedDicts(rd1, rd2 []string) (map[string][3]int, []string) {
  219. people := map[string][3]int{}
  220. for i, pid := range rd1 {
  221. ptrs := people[pid]
  222. ptrs[0] = len(people)
  223. ptrs[1] = i
  224. ptrs[2] = -1
  225. people[pid] = ptrs
  226. }
  227. for i, pid := range rd2 {
  228. ptrs, exists := people[pid]
  229. if !exists {
  230. ptrs[0] = len(people)
  231. ptrs[1] = -1
  232. }
  233. ptrs[2] = i
  234. people[pid] = ptrs
  235. }
  236. mrd := make([]string, len(people))
  237. for name, ptrs := range people {
  238. mrd[ptrs[0]] = name
  239. }
  240. return people, mrd
  241. }
  242. func init() {
  243. Registry.Register(&IdentityDetector{})
  244. }