identity.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465
  1. package identity
  2. import (
  3. "bufio"
  4. "os"
  5. "sort"
  6. "strings"
  7. "github.com/pkg/errors"
  8. "gopkg.in/src-d/go-git.v4"
  9. "gopkg.in/src-d/go-git.v4/plumbing/object"
  10. "gopkg.in/src-d/hercules.v10/internal/core"
  11. )
  // Detector determines the author of a commit. The same person can commit under different
  // signatures, and we apply some heuristics to merge those together.
  // It is a PipelineItem.
  type Detector struct {
  	core.NoopMerger
  	// PeopleDict maps email || name -> developer id. Consume() lower-cases the
  	// signature's email and name before looking them up here.
  	PeopleDict map[string]int
  	// ReversedPeopleDict maps developer id -> description.
  	ReversedPeopleDict []string
  	// l is the logger; it is assigned in Configure() and in Initialize().
  	l core.Logger
  }
  const (
  	// AuthorMissing is the internal author index which Detector.Consume() reports for
  	// any identity that is matched neither by email nor by name.
  	// It may *not* be (1 << 18) - 1, see BurndownAnalysis.packPersonWithDay().
  	AuthorMissing = 1<<18 - 2
  	// AuthorMissingName is the string name which corresponds to AuthorMissing.
  	AuthorMissingName = "<unmatched>"

  	// FactIdentityDetectorPeopleDict names the fact written by Detector.Configure() which
  	// holds Detector.PeopleDict - the mapping from the signatures to the author indices.
  	FactIdentityDetectorPeopleDict = "IdentityDetector.PeopleDict"
  	// FactIdentityDetectorReversedPeopleDict names the fact written by Detector.Configure()
  	// which holds Detector.ReversedPeopleDict - the mapping from the author indices
  	// to the main signature.
  	FactIdentityDetectorReversedPeopleDict = "IdentityDetector.ReversedPeopleDict"
  	// ConfigIdentityDetectorPeopleDictPath names the configuration option read by
  	// Detector.Configure() which points at a file with an external PeopleDict mapping.
  	ConfigIdentityDetectorPeopleDictPath = "IdentityDetector.PeopleDictPath"
  	// FactIdentityDetectorPeopleCount names the fact written by Detector.Configure()
  	// which holds the overall number of unique authors.
  	FactIdentityDetectorPeopleCount = "IdentityDetector.PeopleCount"

  	// DependencyAuthor is the name of the dependency provided by Detector.
  	DependencyAuthor = "author"
  )
  47. // Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
  48. func (detector *Detector) Name() string {
  49. return "IdentityDetector"
  50. }
  51. // Provides returns the list of names of entities which are produced by this PipelineItem.
  52. // Each produced entity will be inserted into `deps` of dependent Consume()-s according
  53. // to this list. Also used by core.Registry to build the global map of providers.
  54. func (detector *Detector) Provides() []string {
  55. arr := [...]string{DependencyAuthor}
  56. return arr[:]
  57. }
  58. // Requires returns the list of names of entities which are needed by this PipelineItem.
  59. // Each requested entity will be inserted into `deps` of Consume(). In turn, those
  60. // entities are Provides() upstream.
  61. func (detector *Detector) Requires() []string {
  62. return []string{}
  63. }
  64. // ListConfigurationOptions returns the list of changeable public properties of this PipelineItem.
  65. func (detector *Detector) ListConfigurationOptions() []core.ConfigurationOption {
  66. options := [...]core.ConfigurationOption{{
  67. Name: ConfigIdentityDetectorPeopleDictPath,
  68. Description: "Path to the file with developer -> name|email associations.",
  69. Flag: "people-dict",
  70. Type: core.PathConfigurationOption,
  71. Default: ""},
  72. }
  73. return options[:]
  74. }
  75. // Configure sets the properties previously published by ListConfigurationOptions().
  76. func (detector *Detector) Configure(facts map[string]interface{}) error {
  77. if l, exists := facts[core.ConfigLogger].(core.Logger); exists {
  78. detector.l = l
  79. } else {
  80. detector.l = core.NewLogger()
  81. }
  82. if val, exists := facts[FactIdentityDetectorPeopleDict].(map[string]int); exists {
  83. detector.PeopleDict = val
  84. }
  85. if val, exists := facts[FactIdentityDetectorReversedPeopleDict].([]string); exists {
  86. detector.ReversedPeopleDict = val
  87. }
  88. if detector.PeopleDict == nil || detector.ReversedPeopleDict == nil {
  89. peopleDictPath, _ := facts[ConfigIdentityDetectorPeopleDictPath].(string)
  90. if peopleDictPath != "" {
  91. err := detector.LoadPeopleDict(peopleDictPath)
  92. if err != nil {
  93. return errors.Errorf("failed to load %s: %v", peopleDictPath, err)
  94. }
  95. facts[FactIdentityDetectorPeopleCount] = len(detector.ReversedPeopleDict) - 1
  96. } else {
  97. if _, exists := facts[core.ConfigPipelineCommits]; !exists {
  98. panic("IdentityDetector needs a list of commits to initialize.")
  99. }
  100. detector.GeneratePeopleDict(facts[core.ConfigPipelineCommits].([]*object.Commit))
  101. facts[FactIdentityDetectorPeopleCount] = len(detector.ReversedPeopleDict)
  102. }
  103. } else {
  104. facts[FactIdentityDetectorPeopleCount] = len(detector.ReversedPeopleDict)
  105. }
  106. facts[FactIdentityDetectorPeopleDict] = detector.PeopleDict
  107. facts[FactIdentityDetectorReversedPeopleDict] = detector.ReversedPeopleDict
  108. return nil
  109. }
  110. // Initialize resets the temporary caches and prepares this PipelineItem for a series of Consume()
  111. // calls. The repository which is going to be analysed is supplied as an argument.
  112. func (detector *Detector) Initialize(repository *git.Repository) error {
  113. detector.l = core.NewLogger()
  114. return nil
  115. }
  116. // Consume runs this PipelineItem on the next commit data.
  117. // `deps` contain all the results from upstream PipelineItem-s as requested by Requires().
  118. // Additionally, DependencyCommit is always present there and represents the analysed *object.Commit.
  119. // This function returns the mapping with analysis results. The keys must be the same as
  120. // in Provides(). If there was an error, nil is returned.
  121. func (detector *Detector) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  122. commit := deps[core.DependencyCommit].(*object.Commit)
  123. signature := commit.Author
  124. authorID, exists := detector.PeopleDict[strings.ToLower(signature.Email)]
  125. if !exists {
  126. authorID, exists = detector.PeopleDict[strings.ToLower(signature.Name)]
  127. if !exists {
  128. authorID = AuthorMissing
  129. }
  130. }
  131. return map[string]interface{}{DependencyAuthor: authorID}, nil
  132. }
  133. // Fork clones this PipelineItem.
  134. func (detector *Detector) Fork(n int) []core.PipelineItem {
  135. return core.ForkSamePipelineItem(detector, n)
  136. }
  137. // LoadPeopleDict loads author signatures from a text file.
  138. // The format is one signature per line, and the signature consists of several
  139. // keys separated by "|". The first key is the main one and used to reference all the rest.
  140. func (detector *Detector) LoadPeopleDict(path string) error {
  141. file, err := os.Open(path)
  142. if err != nil {
  143. return err
  144. }
  145. defer file.Close()
  146. scanner := bufio.NewScanner(file)
  147. dict := make(map[string]int)
  148. var reverseDict []string
  149. size := 0
  150. for scanner.Scan() {
  151. ids := strings.Split(scanner.Text(), "|")
  152. for _, id := range ids {
  153. dict[strings.ToLower(id)] = size
  154. }
  155. reverseDict = append(reverseDict, ids[0])
  156. size++
  157. }
  158. reverseDict = append(reverseDict, AuthorMissingName)
  159. detector.PeopleDict = dict
  160. detector.ReversedPeopleDict = reverseDict
  161. return nil
  162. }
  163. // GeneratePeopleDict loads author signatures from the specified list of Git commits.
  164. func (detector *Detector) GeneratePeopleDict(commits []*object.Commit) {
  165. dict := map[string]int{}
  166. emails := map[int][]string{}
  167. names := map[int][]string{}
  168. size := 0
  169. mailmapFile, err := commits[len(commits)-1].File(".mailmap")
  170. if err == nil {
  171. mailMapContents, err := mailmapFile.Contents()
  172. if err == nil {
  173. mailmap := ParseMailmap(mailMapContents)
  174. for key, val := range mailmap {
  175. key = strings.ToLower(key)
  176. toEmail := strings.ToLower(val.Email)
  177. toName := strings.ToLower(val.Name)
  178. id, exists := dict[toEmail]
  179. if !exists {
  180. id, exists = dict[toName]
  181. }
  182. if exists {
  183. dict[key] = id
  184. } else {
  185. id = size
  186. size++
  187. if toEmail != "" {
  188. dict[toEmail] = id
  189. emails[id] = append(emails[id], toEmail)
  190. }
  191. if toName != "" {
  192. dict[toName] = id
  193. names[id] = append(names[id], toName)
  194. }
  195. dict[key] = id
  196. }
  197. if strings.Contains(key, "@") {
  198. exists := false
  199. for _, val := range emails[id] {
  200. if key == val {
  201. exists = true
  202. break
  203. }
  204. }
  205. if !exists {
  206. emails[id] = append(emails[id], key)
  207. }
  208. } else {
  209. exists := false
  210. for _, val := range names[id] {
  211. if key == val {
  212. exists = true
  213. break
  214. }
  215. }
  216. if !exists {
  217. names[id] = append(names[id], key)
  218. }
  219. }
  220. }
  221. }
  222. }
  223. for _, commit := range commits {
  224. email := strings.ToLower(commit.Author.Email)
  225. name := strings.ToLower(commit.Author.Name)
  226. id, exists := dict[email]
  227. if exists {
  228. _, exists := dict[name]
  229. if !exists {
  230. dict[name] = id
  231. names[id] = append(names[id], name)
  232. }
  233. continue
  234. }
  235. id, exists = dict[name]
  236. if exists {
  237. dict[email] = id
  238. emails[id] = append(emails[id], email)
  239. continue
  240. }
  241. dict[email] = size
  242. dict[name] = size
  243. emails[size] = append(emails[size], email)
  244. names[size] = append(names[size], name)
  245. size++
  246. }
  247. reverseDict := make([]string, size)
  248. for _, val := range dict {
  249. sort.Strings(names[val])
  250. sort.Strings(emails[val])
  251. reverseDict[val] = strings.Join(names[val], "|") + "|" + strings.Join(emails[val], "|")
  252. }
  253. detector.PeopleDict = dict
  254. detector.ReversedPeopleDict = reverseDict
  255. }
  // MergedIndex is the result of merging `rd1[First]` and `rd2[Second]`.
  // Final is the index in the final reversed dictionary. First and Second are the indexes
  // of the corresponding string in `rd1` and `rd2` respectively; -1 means that the string
  // does not exist in that list.
  // See also:
  // * MergeReversedDictsLiteral()
  // * MergeReversedDictsIdentities()
  type MergedIndex struct {
  	Final, First, Second int
  }
  267. // MergeReversedDictsLiteral joins two string lists together, excluding duplicates, in-order.
  268. // The string comparisons are the usual ones.
  269. // The returned mapping's keys are the unique strings in `rd1 ∪ rd2`, and the values are:
  270. // 1. Index after merging.
  271. // 2. Corresponding index in the first array - `rd1`. -1 means that it does not exist.
  272. // 3. Corresponding index in the second array - `rd2`. -1 means that it does not exist.
  273. func MergeReversedDictsLiteral(rd1, rd2 []string) (map[string]MergedIndex, []string) {
  274. people := map[string]MergedIndex{}
  275. for i, pid := range rd1 {
  276. people[pid] = MergedIndex{len(people), i, -1}
  277. }
  278. for i, pid := range rd2 {
  279. if ptrs, exists := people[pid]; !exists {
  280. people[pid] = MergedIndex{len(people), -1, i}
  281. } else {
  282. people[pid] = MergedIndex{ptrs.Final, ptrs.First, i}
  283. }
  284. }
  285. mrd := make([]string, len(people))
  286. for name, ptrs := range people {
  287. mrd[ptrs.Final] = name
  288. }
  289. return people, mrd
  290. }
  // identityPair records where an identity key occurs: Index1 is the index of the entry
  // in the first list and Index2 is the index of the entry in the second list.
  // MergeReversedDictsIdentities() stores -1 when the key is absent from that list.
  type identityPair struct {
  	Index1, Index2 int
  }
  295. // MergeReversedDictsIdentities joins two identity lists together, excluding duplicates.
  296. // The strings are split by "|" and we find the connected components..
  297. // The returned mapping's keys are the unique strings in `rd1 ∪ rd2`, and the values are:
  298. // 1. Index after merging.
  299. // 2. Corresponding index in the first array - `rd1`. -1 means that it does not exist.
  300. // 3. Corresponding index in the second array - `rd2`. -1 means that it does not exist.
  301. func MergeReversedDictsIdentities(rd1, rd2 []string) (map[string]MergedIndex, []string) {
  302. vocabulary := map[string]identityPair{}
  303. vertices1 := make([][]string, len(rd1))
  304. for i, s := range rd1 {
  305. parts := strings.Split(s, "|")
  306. vertices1[i] = parts
  307. for _, p := range parts {
  308. vocabulary[p] = identityPair{i, -1}
  309. }
  310. }
  311. vertices2 := make([][]string, len(rd2))
  312. for i, s := range rd2 {
  313. parts := strings.Split(s, "|")
  314. vertices2[i] = parts
  315. for _, p := range parts {
  316. if ip, exists := vocabulary[p]; !exists {
  317. vocabulary[p] = identityPair{-1, i}
  318. } else {
  319. ip.Index2 = i
  320. vocabulary[p] = ip
  321. }
  322. }
  323. }
  324. // find the connected components by walking the graph
  325. var walks []map[string]bool
  326. visited := map[string]bool{}
  327. walkFromVertex := func(root []string) {
  328. walk := map[string]bool{}
  329. pending := map[string]bool{}
  330. for _, p := range root {
  331. pending[p] = true
  332. }
  333. for len(pending) > 0 {
  334. var element string
  335. for e := range pending {
  336. element = e
  337. delete(pending, e)
  338. break
  339. }
  340. if !walk[element] {
  341. walk[element] = true
  342. ip := vocabulary[element]
  343. if ip.Index1 >= 0 {
  344. for _, p := range vertices1[ip.Index1] {
  345. if !walk[p] {
  346. pending[p] = true
  347. }
  348. }
  349. }
  350. if ip.Index2 >= 0 {
  351. for _, p := range vertices2[ip.Index2] {
  352. if !walk[p] {
  353. pending[p] = true
  354. }
  355. }
  356. }
  357. }
  358. }
  359. for e := range walk {
  360. visited[e] = true
  361. }
  362. walks = append(walks, walk)
  363. }
  364. for i1 := range rd1 {
  365. var skip bool
  366. for _, p := range vertices1[i1] {
  367. if visited[p] {
  368. skip = true
  369. break
  370. }
  371. }
  372. if skip {
  373. continue
  374. }
  375. walkFromVertex(vertices1[i1])
  376. }
  377. for i2 := range rd2 {
  378. var skip bool
  379. for _, p := range vertices2[i2] {
  380. if visited[p] {
  381. skip = true
  382. break
  383. }
  384. }
  385. if skip {
  386. continue
  387. }
  388. walkFromVertex(vertices2[i2])
  389. }
  390. mergedStrings := make([]string, 0, len(walks))
  391. mergedIndex := map[string]MergedIndex{}
  392. // convert each walk from strings to indexes
  393. for walkIndex, walk := range walks {
  394. ids := make([]string, 0, len(walk))
  395. for key := range walk {
  396. ids = append(ids, key)
  397. }
  398. // place emails after names
  399. sort.Slice(ids, func(i, j int) bool {
  400. iid := ids[i]
  401. jid := ids[j]
  402. iHasAt := strings.ContainsRune(iid, '@')
  403. jHasAt := strings.ContainsRune(jid, '@')
  404. if iHasAt == jHasAt {
  405. return iid < jid
  406. }
  407. return jHasAt
  408. })
  409. mergedStrings = append(mergedStrings, strings.Join(ids, "|"))
  410. for _, key := range ids {
  411. ipair := vocabulary[key]
  412. if ipair.Index1 >= 0 {
  413. s1 := rd1[ipair.Index1]
  414. if mi, exists := mergedIndex[s1]; !exists {
  415. mergedIndex[s1] = MergedIndex{walkIndex, ipair.Index1, -1}
  416. } else {
  417. mergedIndex[s1] = MergedIndex{walkIndex, ipair.Index1, mi.Second}
  418. }
  419. }
  420. if ipair.Index2 >= 0 {
  421. s2 := rd2[ipair.Index2]
  422. if mi, exists := mergedIndex[s2]; !exists {
  423. mergedIndex[s2] = MergedIndex{walkIndex, -1, ipair.Index2}
  424. } else {
  425. mergedIndex[s2] = MergedIndex{walkIndex, mi.First, ipair.Index2}
  426. }
  427. }
  428. }
  429. }
  430. return mergedIndex, mergedStrings
  431. }
  432. func init() {
  433. core.Registry.Register(&Detector{})
  434. }