identity.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464
  1. package identity
  2. import (
  3. "bufio"
  4. "os"
  5. "sort"
  6. "strings"
  7. "github.com/pkg/errors"
  8. "gopkg.in/src-d/go-git.v4"
  9. "gopkg.in/src-d/go-git.v4/plumbing/object"
  10. "gopkg.in/src-d/hercules.v10/internal/core"
  11. )
  12. // Detector determines the author of a commit. Same person can commit under different
  13. // signatures, and we apply some heuristics to merge those together.
  14. // It is a PipelineItem.
  15. type Detector struct {
  16. core.NoopMerger
  17. // PeopleDict maps email || name -> developer id
  18. PeopleDict map[string]int
  19. // ReversedPeopleDict maps developer id -> description
  20. ReversedPeopleDict []string
  21. l core.Logger
  22. }
  23. const (
  24. // AuthorMissing is the internal author index which denotes any unmatched identities
  25. // (Detector.Consume()). It may *not* be (1 << 18) - 1, see BurndownAnalysis.packPersonWithDay().
  26. AuthorMissing = (1 << 18) - 2
  27. // AuthorMissingName is the string name which corresponds to AuthorMissing.
  28. AuthorMissingName = "<unmatched>"
  29. // FactIdentityDetectorPeopleDict is the name of the fact which is inserted in
  30. // Detector.Configure(). It corresponds to Detector.PeopleDict - the mapping
  31. // from the signatures to the author indices.
  32. FactIdentityDetectorPeopleDict = "IdentityDetector.PeopleDict"
  33. // FactIdentityDetectorReversedPeopleDict is the name of the fact which is inserted in
  34. // Detector.Configure(). It corresponds to Detector.ReversedPeopleDict -
  35. // the mapping from the author indices to the main signature.
  36. FactIdentityDetectorReversedPeopleDict = "IdentityDetector.ReversedPeopleDict"
  37. // ConfigIdentityDetectorPeopleDictPath is the name of the configuration option
  38. // (Detector.Configure()) which allows to set the external PeopleDict mapping from a file.
  39. ConfigIdentityDetectorPeopleDictPath = "IdentityDetector.PeopleDictPath"
  40. // FactIdentityDetectorPeopleCount is the name of the fact which is inserted in
  41. // Detector.Configure(). It is equal to the overall number of unique authors
  42. // (the length of ReversedPeopleDict).
  43. FactIdentityDetectorPeopleCount = "IdentityDetector.PeopleCount"
  44. // DependencyAuthor is the name of the dependency provided by Detector.
  45. DependencyAuthor = "author"
  46. )
  47. // Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
  48. func (detector *Detector) Name() string {
  49. return "IdentityDetector"
  50. }
  51. // Provides returns the list of names of entities which are produced by this PipelineItem.
  52. // Each produced entity will be inserted into `deps` of dependent Consume()-s according
  53. // to this list. Also used by core.Registry to build the global map of providers.
  54. func (detector *Detector) Provides() []string {
  55. return []string{DependencyAuthor}
  56. }
  57. // Requires returns the list of names of entities which are needed by this PipelineItem.
  58. // Each requested entity will be inserted into `deps` of Consume(). In turn, those
  59. // entities are Provides() upstream.
  60. func (detector *Detector) Requires() []string {
  61. return []string{}
  62. }
  63. // ListConfigurationOptions returns the list of changeable public properties of this PipelineItem.
  64. func (detector *Detector) ListConfigurationOptions() []core.ConfigurationOption {
  65. options := [...]core.ConfigurationOption{{
  66. Name: ConfigIdentityDetectorPeopleDictPath,
  67. Description: "Path to the file with developer -> name|email associations.",
  68. Flag: "people-dict",
  69. Type: core.PathConfigurationOption,
  70. Default: ""},
  71. }
  72. return options[:]
  73. }
  74. // Configure sets the properties previously published by ListConfigurationOptions().
  75. func (detector *Detector) Configure(facts map[string]interface{}) error {
  76. if l, exists := facts[core.ConfigLogger].(core.Logger); exists {
  77. detector.l = l
  78. } else {
  79. detector.l = core.NewLogger()
  80. }
  81. if val, exists := facts[FactIdentityDetectorPeopleDict].(map[string]int); exists {
  82. detector.PeopleDict = val
  83. }
  84. if val, exists := facts[FactIdentityDetectorReversedPeopleDict].([]string); exists {
  85. detector.ReversedPeopleDict = val
  86. }
  87. if detector.PeopleDict == nil || detector.ReversedPeopleDict == nil {
  88. peopleDictPath, _ := facts[ConfigIdentityDetectorPeopleDictPath].(string)
  89. if peopleDictPath != "" {
  90. err := detector.LoadPeopleDict(peopleDictPath)
  91. if err != nil {
  92. return errors.Errorf("failed to load %s: %v", peopleDictPath, err)
  93. }
  94. facts[FactIdentityDetectorPeopleCount] = len(detector.ReversedPeopleDict) - 1
  95. } else {
  96. if _, exists := facts[core.ConfigPipelineCommits]; !exists {
  97. panic("IdentityDetector needs a list of commits to initialize.")
  98. }
  99. detector.GeneratePeopleDict(facts[core.ConfigPipelineCommits].([]*object.Commit))
  100. facts[FactIdentityDetectorPeopleCount] = len(detector.ReversedPeopleDict)
  101. }
  102. } else {
  103. facts[FactIdentityDetectorPeopleCount] = len(detector.ReversedPeopleDict)
  104. }
  105. facts[FactIdentityDetectorPeopleDict] = detector.PeopleDict
  106. facts[FactIdentityDetectorReversedPeopleDict] = detector.ReversedPeopleDict
  107. return nil
  108. }
  109. // Initialize resets the temporary caches and prepares this PipelineItem for a series of Consume()
  110. // calls. The repository which is going to be analysed is supplied as an argument.
  111. func (detector *Detector) Initialize(repository *git.Repository) error {
  112. detector.l = core.NewLogger()
  113. return nil
  114. }
  115. // Consume runs this PipelineItem on the next commit data.
  116. // `deps` contain all the results from upstream PipelineItem-s as requested by Requires().
  117. // Additionally, DependencyCommit is always present there and represents the analysed *object.Commit.
  118. // This function returns the mapping with analysis results. The keys must be the same as
  119. // in Provides(). If there was an error, nil is returned.
  120. func (detector *Detector) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  121. commit := deps[core.DependencyCommit].(*object.Commit)
  122. signature := commit.Author
  123. authorID, exists := detector.PeopleDict[strings.ToLower(signature.Email)]
  124. if !exists {
  125. authorID, exists = detector.PeopleDict[strings.ToLower(signature.Name)]
  126. if !exists {
  127. authorID = AuthorMissing
  128. }
  129. }
  130. return map[string]interface{}{DependencyAuthor: authorID}, nil
  131. }
  132. // Fork clones this PipelineItem.
  133. func (detector *Detector) Fork(n int) []core.PipelineItem {
  134. return core.ForkSamePipelineItem(detector, n)
  135. }
  136. // LoadPeopleDict loads author signatures from a text file.
  137. // The format is one signature per line, and the signature consists of several
  138. // keys separated by "|". The first key is the main one and used to reference all the rest.
  139. func (detector *Detector) LoadPeopleDict(path string) error {
  140. file, err := os.Open(path)
  141. if err != nil {
  142. return err
  143. }
  144. defer file.Close()
  145. scanner := bufio.NewScanner(file)
  146. dict := make(map[string]int)
  147. var reverseDict []string
  148. size := 0
  149. for scanner.Scan() {
  150. ids := strings.Split(scanner.Text(), "|")
  151. for _, id := range ids {
  152. dict[strings.ToLower(id)] = size
  153. }
  154. reverseDict = append(reverseDict, ids[0])
  155. size++
  156. }
  157. reverseDict = append(reverseDict, AuthorMissingName)
  158. detector.PeopleDict = dict
  159. detector.ReversedPeopleDict = reverseDict
  160. return nil
  161. }
  162. // GeneratePeopleDict loads author signatures from the specified list of Git commits.
  163. func (detector *Detector) GeneratePeopleDict(commits []*object.Commit) {
  164. dict := map[string]int{}
  165. emails := map[int][]string{}
  166. names := map[int][]string{}
  167. size := 0
  168. mailmapFile, err := commits[len(commits)-1].File(".mailmap")
  169. if err == nil {
  170. mailMapContents, err := mailmapFile.Contents()
  171. if err == nil {
  172. mailmap := ParseMailmap(mailMapContents)
  173. for key, val := range mailmap {
  174. key = strings.ToLower(key)
  175. toEmail := strings.ToLower(val.Email)
  176. toName := strings.ToLower(val.Name)
  177. id, exists := dict[toEmail]
  178. if !exists {
  179. id, exists = dict[toName]
  180. }
  181. if exists {
  182. dict[key] = id
  183. } else {
  184. id = size
  185. size++
  186. if toEmail != "" {
  187. dict[toEmail] = id
  188. emails[id] = append(emails[id], toEmail)
  189. }
  190. if toName != "" {
  191. dict[toName] = id
  192. names[id] = append(names[id], toName)
  193. }
  194. dict[key] = id
  195. }
  196. if strings.Contains(key, "@") {
  197. exists := false
  198. for _, val := range emails[id] {
  199. if key == val {
  200. exists = true
  201. break
  202. }
  203. }
  204. if !exists {
  205. emails[id] = append(emails[id], key)
  206. }
  207. } else {
  208. exists := false
  209. for _, val := range names[id] {
  210. if key == val {
  211. exists = true
  212. break
  213. }
  214. }
  215. if !exists {
  216. names[id] = append(names[id], key)
  217. }
  218. }
  219. }
  220. }
  221. }
  222. for _, commit := range commits {
  223. email := strings.ToLower(commit.Author.Email)
  224. name := strings.ToLower(commit.Author.Name)
  225. id, exists := dict[email]
  226. if exists {
  227. _, exists := dict[name]
  228. if !exists {
  229. dict[name] = id
  230. names[id] = append(names[id], name)
  231. }
  232. continue
  233. }
  234. id, exists = dict[name]
  235. if exists {
  236. dict[email] = id
  237. emails[id] = append(emails[id], email)
  238. continue
  239. }
  240. dict[email] = size
  241. dict[name] = size
  242. emails[size] = append(emails[size], email)
  243. names[size] = append(names[size], name)
  244. size++
  245. }
  246. reverseDict := make([]string, size)
  247. for _, val := range dict {
  248. sort.Strings(names[val])
  249. sort.Strings(emails[val])
  250. reverseDict[val] = strings.Join(names[val], "|") + "|" + strings.Join(emails[val], "|")
  251. }
  252. detector.PeopleDict = dict
  253. detector.ReversedPeopleDict = reverseDict
  254. }
  255. // MergedIndex is the result of merging `rd1[First]` and `rd2[Second]`: the index in the final reversed
  256. // dictionary. -1 for `First` or `Second` means that the corresponding string does not exist
  257. // in respectively `rd1` and `rd2`.
  258. // See also:
  259. // * MergeReversedDictsLiteral()
  260. // * MergeReversedDictsIdentities()
  261. type MergedIndex struct {
  262. Final int
  263. First int
  264. Second int
  265. }
  266. // MergeReversedDictsLiteral joins two string lists together, excluding duplicates, in-order.
  267. // The string comparisons are the usual ones.
  268. // The returned mapping's keys are the unique strings in `rd1 ∪ rd2`, and the values are:
  269. // 1. Index after merging.
  270. // 2. Corresponding index in the first array - `rd1`. -1 means that it does not exist.
  271. // 3. Corresponding index in the second array - `rd2`. -1 means that it does not exist.
  272. func MergeReversedDictsLiteral(rd1, rd2 []string) (map[string]MergedIndex, []string) {
  273. people := map[string]MergedIndex{}
  274. for i, pid := range rd1 {
  275. people[pid] = MergedIndex{len(people), i, -1}
  276. }
  277. for i, pid := range rd2 {
  278. if ptrs, exists := people[pid]; !exists {
  279. people[pid] = MergedIndex{len(people), -1, i}
  280. } else {
  281. people[pid] = MergedIndex{ptrs.Final, ptrs.First, i}
  282. }
  283. }
  284. mrd := make([]string, len(people))
  285. for name, ptrs := range people {
  286. mrd[ptrs.Final] = name
  287. }
  288. return people, mrd
  289. }
  290. type identityPair struct {
  291. Index1 int
  292. Index2 int
  293. }
  294. // MergeReversedDictsIdentities joins two identity lists together, excluding duplicates.
  295. // The strings are split by "|" and we find the connected components..
  296. // The returned mapping's keys are the unique strings in `rd1 ∪ rd2`, and the values are:
  297. // 1. Index after merging.
  298. // 2. Corresponding index in the first array - `rd1`. -1 means that it does not exist.
  299. // 3. Corresponding index in the second array - `rd2`. -1 means that it does not exist.
  300. func MergeReversedDictsIdentities(rd1, rd2 []string) (map[string]MergedIndex, []string) {
  301. vocabulary := map[string]identityPair{}
  302. vertices1 := make([][]string, len(rd1))
  303. for i, s := range rd1 {
  304. parts := strings.Split(s, "|")
  305. vertices1[i] = parts
  306. for _, p := range parts {
  307. vocabulary[p] = identityPair{i, -1}
  308. }
  309. }
  310. vertices2 := make([][]string, len(rd2))
  311. for i, s := range rd2 {
  312. parts := strings.Split(s, "|")
  313. vertices2[i] = parts
  314. for _, p := range parts {
  315. if ip, exists := vocabulary[p]; !exists {
  316. vocabulary[p] = identityPair{-1, i}
  317. } else {
  318. ip.Index2 = i
  319. vocabulary[p] = ip
  320. }
  321. }
  322. }
  323. // find the connected components by walking the graph
  324. var walks []map[string]bool
  325. visited := map[string]bool{}
  326. walkFromVertex := func(root []string) {
  327. walk := map[string]bool{}
  328. pending := map[string]bool{}
  329. for _, p := range root {
  330. pending[p] = true
  331. }
  332. for len(pending) > 0 {
  333. var element string
  334. for e := range pending {
  335. element = e
  336. delete(pending, e)
  337. break
  338. }
  339. if !walk[element] {
  340. walk[element] = true
  341. ip := vocabulary[element]
  342. if ip.Index1 >= 0 {
  343. for _, p := range vertices1[ip.Index1] {
  344. if !walk[p] {
  345. pending[p] = true
  346. }
  347. }
  348. }
  349. if ip.Index2 >= 0 {
  350. for _, p := range vertices2[ip.Index2] {
  351. if !walk[p] {
  352. pending[p] = true
  353. }
  354. }
  355. }
  356. }
  357. }
  358. for e := range walk {
  359. visited[e] = true
  360. }
  361. walks = append(walks, walk)
  362. }
  363. for i1 := range rd1 {
  364. var skip bool
  365. for _, p := range vertices1[i1] {
  366. if visited[p] {
  367. skip = true
  368. break
  369. }
  370. }
  371. if skip {
  372. continue
  373. }
  374. walkFromVertex(vertices1[i1])
  375. }
  376. for i2 := range rd2 {
  377. var skip bool
  378. for _, p := range vertices2[i2] {
  379. if visited[p] {
  380. skip = true
  381. break
  382. }
  383. }
  384. if skip {
  385. continue
  386. }
  387. walkFromVertex(vertices2[i2])
  388. }
  389. mergedStrings := make([]string, 0, len(walks))
  390. mergedIndex := map[string]MergedIndex{}
  391. // convert each walk from strings to indexes
  392. for walkIndex, walk := range walks {
  393. ids := make([]string, 0, len(walk))
  394. for key := range walk {
  395. ids = append(ids, key)
  396. }
  397. // place emails after names
  398. sort.Slice(ids, func(i, j int) bool {
  399. iid := ids[i]
  400. jid := ids[j]
  401. iHasAt := strings.ContainsRune(iid, '@')
  402. jHasAt := strings.ContainsRune(jid, '@')
  403. if iHasAt == jHasAt {
  404. return iid < jid
  405. }
  406. return jHasAt
  407. })
  408. mergedStrings = append(mergedStrings, strings.Join(ids, "|"))
  409. for _, key := range ids {
  410. ipair := vocabulary[key]
  411. if ipair.Index1 >= 0 {
  412. s1 := rd1[ipair.Index1]
  413. if mi, exists := mergedIndex[s1]; !exists {
  414. mergedIndex[s1] = MergedIndex{walkIndex, ipair.Index1, -1}
  415. } else {
  416. mergedIndex[s1] = MergedIndex{walkIndex, ipair.Index1, mi.Second}
  417. }
  418. }
  419. if ipair.Index2 >= 0 {
  420. s2 := rd2[ipair.Index2]
  421. if mi, exists := mergedIndex[s2]; !exists {
  422. mergedIndex[s2] = MergedIndex{walkIndex, -1, ipair.Index2}
  423. } else {
  424. mergedIndex[s2] = MergedIndex{walkIndex, mi.First, ipair.Index2}
  425. }
  426. }
  427. }
  428. }
  429. return mergedIndex, mergedStrings
  430. }
  431. func init() {
  432. core.Registry.Register(&Detector{})
  433. }