identity.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501
  1. package identity
  2. import (
  3. "bufio"
  4. "os"
  5. "sort"
  6. "strings"
  7. "github.com/pkg/errors"
  8. "gopkg.in/src-d/go-git.v4"
  9. "gopkg.in/src-d/go-git.v4/plumbing/object"
  10. "gopkg.in/src-d/hercules.v10/internal/core"
  11. )
  12. // Detector determines the author of a commit. Same person can commit under different
  13. // signatures, and we apply some heuristics to merge those together.
  14. // It is a PipelineItem.
  15. type Detector struct {
  16. core.NoopMerger
  17. // PeopleDict maps email || name -> developer id
  18. PeopleDict map[string]int
  19. // ReversedPeopleDict maps developer id -> description
  20. ReversedPeopleDict []string
  21. // ExactSignatures chooses the matching algorithm: opportunistic email || name
  22. // or exact email && name
  23. ExactSignatures bool
  24. l core.Logger
  25. }
  const (
  	// AuthorMissing is the internal author index assigned to every identity that
  	// Detector.Consume() could not match. It must *not* be equal to (1 << 18) - 1,
  	// see BurndownAnalysis.packPersonWithDay().
  	AuthorMissing = 1<<18 - 2

  	// AuthorMissingName is the human-readable name that goes with AuthorMissing.
  	AuthorMissingName = "<unmatched>"

  	// FactIdentityDetectorPeopleDict names the fact published by Detector.Configure()
  	// which holds Detector.PeopleDict: the mapping from the signatures to the
  	// author indices.
  	FactIdentityDetectorPeopleDict = "IdentityDetector.PeopleDict"

  	// FactIdentityDetectorReversedPeopleDict names the fact published by
  	// Detector.Configure() which holds Detector.ReversedPeopleDict: the mapping
  	// from the author indices to the main signature.
  	FactIdentityDetectorReversedPeopleDict = "IdentityDetector.ReversedPeopleDict"

  	// ConfigIdentityDetectorPeopleDictPath names the configuration option
  	// (Detector.Configure()) which points at a file with an external PeopleDict mapping.
  	ConfigIdentityDetectorPeopleDictPath = "IdentityDetector.PeopleDictPath"

  	// ConfigIdentityDetectorExactSignatures names the configuration option
  	// (Detector.Configure()) which switches the matching algorithm to the exact
  	// signature (name + email) correspondence.
  	ConfigIdentityDetectorExactSignatures = "IdentityDetector.ExactSignatures"

  	// FactIdentityDetectorPeopleCount names the fact published by Detector.Configure()
  	// which holds the overall number of unique authors.
  	FactIdentityDetectorPeopleCount = "IdentityDetector.PeopleCount"

  	// DependencyAuthor is the name of the dependency provided by Detector.
  	DependencyAuthor = "author"
  )
  54. // Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
  55. func (detector *Detector) Name() string {
  56. return "IdentityDetector"
  57. }
  58. // Provides returns the list of names of entities which are produced by this PipelineItem.
  59. // Each produced entity will be inserted into `deps` of dependent Consume()-s according
  60. // to this list. Also used by core.Registry to build the global map of providers.
  61. func (detector *Detector) Provides() []string {
  62. return []string{DependencyAuthor}
  63. }
  64. // Requires returns the list of names of entities which are needed by this PipelineItem.
  65. // Each requested entity will be inserted into `deps` of Consume(). In turn, those
  66. // entities are Provides() upstream.
  67. func (detector *Detector) Requires() []string {
  68. return []string{}
  69. }
  70. // ListConfigurationOptions returns the list of changeable public properties of this PipelineItem.
  71. func (detector *Detector) ListConfigurationOptions() []core.ConfigurationOption {
  72. options := [...]core.ConfigurationOption{{
  73. Name: ConfigIdentityDetectorPeopleDictPath,
  74. Description: "Path to the file with developer -> name|email associations.",
  75. Flag: "people-dict",
  76. Type: core.PathConfigurationOption,
  77. Default: ""}, {
  78. Name: ConfigIdentityDetectorExactSignatures,
  79. Description: "Disable separate name/email matching. This will lead to considerbly more " +
  80. "identities and should not be normally used.",
  81. Flag: "exact-signatures",
  82. Type: core.BoolConfigurationOption,
  83. Default: false},
  84. }
  85. return options[:]
  86. }
  87. // Configure sets the properties previously published by ListConfigurationOptions().
  88. func (detector *Detector) Configure(facts map[string]interface{}) error {
  89. if l, exists := facts[core.ConfigLogger].(core.Logger); exists {
  90. detector.l = l
  91. } else {
  92. detector.l = core.NewLogger()
  93. }
  94. if val, exists := facts[FactIdentityDetectorPeopleDict].(map[string]int); exists {
  95. detector.PeopleDict = val
  96. }
  97. if val, exists := facts[FactIdentityDetectorReversedPeopleDict].([]string); exists {
  98. detector.ReversedPeopleDict = val
  99. }
  100. if val, exists := facts[ConfigIdentityDetectorExactSignatures].(bool); exists {
  101. detector.ExactSignatures = val
  102. }
  103. if detector.PeopleDict == nil || detector.ReversedPeopleDict == nil {
  104. peopleDictPath, _ := facts[ConfigIdentityDetectorPeopleDictPath].(string)
  105. if peopleDictPath != "" {
  106. err := detector.LoadPeopleDict(peopleDictPath)
  107. if err != nil {
  108. return errors.Errorf("failed to load %s: %v", peopleDictPath, err)
  109. }
  110. facts[FactIdentityDetectorPeopleCount] = len(detector.ReversedPeopleDict) - 1
  111. } else {
  112. if _, exists := facts[core.ConfigPipelineCommits]; !exists {
  113. panic("IdentityDetector needs a list of commits to initialize.")
  114. }
  115. detector.GeneratePeopleDict(facts[core.ConfigPipelineCommits].([]*object.Commit))
  116. facts[FactIdentityDetectorPeopleCount] = len(detector.ReversedPeopleDict)
  117. }
  118. } else {
  119. facts[FactIdentityDetectorPeopleCount] = len(detector.ReversedPeopleDict)
  120. }
  121. facts[FactIdentityDetectorPeopleDict] = detector.PeopleDict
  122. facts[FactIdentityDetectorReversedPeopleDict] = detector.ReversedPeopleDict
  123. return nil
  124. }
  125. // Initialize resets the temporary caches and prepares this PipelineItem for a series of Consume()
  126. // calls. The repository which is going to be analysed is supplied as an argument.
  127. func (detector *Detector) Initialize(repository *git.Repository) error {
  128. detector.l = core.NewLogger()
  129. return nil
  130. }
  131. // Consume runs this PipelineItem on the next commit data.
  132. // `deps` contain all the results from upstream PipelineItem-s as requested by Requires().
  133. // Additionally, DependencyCommit is always present there and represents the analysed *object.Commit.
  134. // This function returns the mapping with analysis results. The keys must be the same as
  135. // in Provides(). If there was an error, nil is returned.
  136. func (detector *Detector) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  137. commit := deps[core.DependencyCommit].(*object.Commit)
  138. var authorID int
  139. var exists bool
  140. signature := commit.Author
  141. if !detector.ExactSignatures {
  142. authorID, exists = detector.PeopleDict[strings.ToLower(signature.Email)]
  143. if !exists {
  144. authorID, exists = detector.PeopleDict[strings.ToLower(signature.Name)]
  145. }
  146. } else {
  147. authorID, exists = detector.PeopleDict[strings.ToLower(signature.String())]
  148. }
  149. if !exists {
  150. authorID = AuthorMissing
  151. }
  152. return map[string]interface{}{DependencyAuthor: authorID}, nil
  153. }
  154. // Fork clones this PipelineItem.
  155. func (detector *Detector) Fork(n int) []core.PipelineItem {
  156. return core.ForkSamePipelineItem(detector, n)
  157. }
  158. // LoadPeopleDict loads author signatures from a text file.
  159. // The format is one signature per line, and the signature consists of several
  160. // keys separated by "|". The first key is the main one and used to reference all the rest.
  161. func (detector *Detector) LoadPeopleDict(path string) error {
  162. file, err := os.Open(path)
  163. if err != nil {
  164. return err
  165. }
  166. defer file.Close()
  167. scanner := bufio.NewScanner(file)
  168. dict := make(map[string]int)
  169. var reverseDict []string
  170. size := 0
  171. for scanner.Scan() {
  172. ids := strings.Split(scanner.Text(), "|")
  173. for _, id := range ids {
  174. dict[strings.ToLower(id)] = size
  175. }
  176. reverseDict = append(reverseDict, ids[0])
  177. size++
  178. }
  179. reverseDict = append(reverseDict, AuthorMissingName)
  180. detector.PeopleDict = dict
  181. detector.ReversedPeopleDict = reverseDict
  182. return nil
  183. }
  184. // GeneratePeopleDict loads author signatures from the specified list of Git commits.
  185. func (detector *Detector) GeneratePeopleDict(commits []*object.Commit) {
  186. dict := map[string]int{}
  187. emails := map[int][]string{}
  188. names := map[int][]string{}
  189. size := 0
  190. mailmapFile, err := commits[len(commits)-1].File(".mailmap")
  191. // TODO(vmarkovtsev): properly handle .mailmap if ExactSignatures
  192. if !detector.ExactSignatures && err == nil {
  193. mailMapContents, err := mailmapFile.Contents()
  194. if err == nil {
  195. mailmap := ParseMailmap(mailMapContents)
  196. for key, val := range mailmap {
  197. key = strings.ToLower(key)
  198. toEmail := strings.ToLower(val.Email)
  199. toName := strings.ToLower(val.Name)
  200. id, exists := dict[toEmail]
  201. if !exists {
  202. id, exists = dict[toName]
  203. }
  204. if exists {
  205. dict[key] = id
  206. } else {
  207. id = size
  208. size++
  209. if toEmail != "" {
  210. dict[toEmail] = id
  211. emails[id] = append(emails[id], toEmail)
  212. }
  213. if toName != "" {
  214. dict[toName] = id
  215. names[id] = append(names[id], toName)
  216. }
  217. dict[key] = id
  218. }
  219. if strings.Contains(key, "@") {
  220. exists := false
  221. for _, val := range emails[id] {
  222. if key == val {
  223. exists = true
  224. break
  225. }
  226. }
  227. if !exists {
  228. emails[id] = append(emails[id], key)
  229. }
  230. } else {
  231. exists := false
  232. for _, val := range names[id] {
  233. if key == val {
  234. exists = true
  235. break
  236. }
  237. }
  238. if !exists {
  239. names[id] = append(names[id], key)
  240. }
  241. }
  242. }
  243. }
  244. }
  245. for _, commit := range commits {
  246. if !detector.ExactSignatures {
  247. email := strings.ToLower(commit.Author.Email)
  248. name := strings.ToLower(commit.Author.Name)
  249. id, exists := dict[email]
  250. if exists {
  251. _, exists := dict[name]
  252. if !exists {
  253. dict[name] = id
  254. names[id] = append(names[id], name)
  255. }
  256. continue
  257. }
  258. id, exists = dict[name]
  259. if exists {
  260. dict[email] = id
  261. emails[id] = append(emails[id], email)
  262. continue
  263. }
  264. dict[email] = size
  265. dict[name] = size
  266. emails[size] = append(emails[size], email)
  267. names[size] = append(names[size], name)
  268. size++
  269. } else { // !detector.ExactSignatures
  270. sig := strings.ToLower(commit.Author.String())
  271. if _, exists := dict[sig]; !exists {
  272. dict[sig] = size
  273. size++
  274. }
  275. }
  276. }
  277. reverseDict := make([]string, size)
  278. if !detector.ExactSignatures {
  279. for _, val := range dict {
  280. sort.Strings(names[val])
  281. sort.Strings(emails[val])
  282. reverseDict[val] = strings.Join(names[val], "|") + "|" + strings.Join(emails[val], "|")
  283. }
  284. } else {
  285. for key, val := range dict {
  286. reverseDict[val] = key
  287. }
  288. }
  289. detector.PeopleDict = dict
  290. detector.ReversedPeopleDict = reverseDict
  291. }
  // MergedIndex describes where an entry ended up after merging `rd1[First]` and
  // `rd2[Second]` into the final reversed dictionary. -1 for `First` or `Second`
  // means that the corresponding string does not exist in respectively `rd1` and `rd2`.
  // See also:
  // * MergeReversedDictsLiteral()
  // * MergeReversedDictsIdentities()
  type MergedIndex struct {
  	// Final is the index in the merged reversed dictionary.
  	Final int
  	// First is the index in the first reversed dictionary, or -1.
  	First int
  	// Second is the index in the second reversed dictionary, or -1.
  	Second int
  }
  303. // MergeReversedDictsLiteral joins two string lists together, excluding duplicates, in-order.
  304. // The string comparisons are the usual ones.
  305. // The returned mapping's keys are the unique strings in `rd1 ∪ rd2`, and the values are:
  306. // 1. Index after merging.
  307. // 2. Corresponding index in the first array - `rd1`. -1 means that it does not exist.
  308. // 3. Corresponding index in the second array - `rd2`. -1 means that it does not exist.
  309. func MergeReversedDictsLiteral(rd1, rd2 []string) (map[string]MergedIndex, []string) {
  310. people := map[string]MergedIndex{}
  311. for i, pid := range rd1 {
  312. people[pid] = MergedIndex{len(people), i, -1}
  313. }
  314. for i, pid := range rd2 {
  315. if ptrs, exists := people[pid]; !exists {
  316. people[pid] = MergedIndex{len(people), -1, i}
  317. } else {
  318. people[pid] = MergedIndex{ptrs.Final, ptrs.First, i}
  319. }
  320. }
  321. mrd := make([]string, len(people))
  322. for name, ptrs := range people {
  323. mrd[ptrs.Final] = name
  324. }
  325. return people, mrd
  326. }
  // identityPair holds the positions of a single identity key in two reversed
  // dictionaries; MergeReversedDictsIdentities() uses -1 for "absent".
  type identityPair struct {
  	// Index1 is the position in the first reversed dictionary.
  	Index1 int
  	// Index2 is the position in the second reversed dictionary.
  	Index2 int
  }
  331. // MergeReversedDictsIdentities joins two identity lists together, excluding duplicates.
  332. // The strings are split by "|" and we find the connected components..
  333. // The returned mapping's keys are the unique strings in `rd1 ∪ rd2`, and the values are:
  334. // 1. Index after merging.
  335. // 2. Corresponding index in the first array - `rd1`. -1 means that it does not exist.
  336. // 3. Corresponding index in the second array - `rd2`. -1 means that it does not exist.
  337. func MergeReversedDictsIdentities(rd1, rd2 []string) (map[string]MergedIndex, []string) {
  338. vocabulary := map[string]identityPair{}
  339. vertices1 := make([][]string, len(rd1))
  340. for i, s := range rd1 {
  341. parts := strings.Split(s, "|")
  342. vertices1[i] = parts
  343. for _, p := range parts {
  344. vocabulary[p] = identityPair{i, -1}
  345. }
  346. }
  347. vertices2 := make([][]string, len(rd2))
  348. for i, s := range rd2 {
  349. parts := strings.Split(s, "|")
  350. vertices2[i] = parts
  351. for _, p := range parts {
  352. if ip, exists := vocabulary[p]; !exists {
  353. vocabulary[p] = identityPair{-1, i}
  354. } else {
  355. ip.Index2 = i
  356. vocabulary[p] = ip
  357. }
  358. }
  359. }
  360. // find the connected components by walking the graph
  361. var walks []map[string]bool
  362. visited := map[string]bool{}
  363. walkFromVertex := func(root []string) {
  364. walk := map[string]bool{}
  365. pending := map[string]bool{}
  366. for _, p := range root {
  367. pending[p] = true
  368. }
  369. for len(pending) > 0 {
  370. var element string
  371. for e := range pending {
  372. element = e
  373. delete(pending, e)
  374. break
  375. }
  376. if !walk[element] {
  377. walk[element] = true
  378. ip := vocabulary[element]
  379. if ip.Index1 >= 0 {
  380. for _, p := range vertices1[ip.Index1] {
  381. if !walk[p] {
  382. pending[p] = true
  383. }
  384. }
  385. }
  386. if ip.Index2 >= 0 {
  387. for _, p := range vertices2[ip.Index2] {
  388. if !walk[p] {
  389. pending[p] = true
  390. }
  391. }
  392. }
  393. }
  394. }
  395. for e := range walk {
  396. visited[e] = true
  397. }
  398. walks = append(walks, walk)
  399. }
  400. for i1 := range rd1 {
  401. var skip bool
  402. for _, p := range vertices1[i1] {
  403. if visited[p] {
  404. skip = true
  405. break
  406. }
  407. }
  408. if skip {
  409. continue
  410. }
  411. walkFromVertex(vertices1[i1])
  412. }
  413. for i2 := range rd2 {
  414. var skip bool
  415. for _, p := range vertices2[i2] {
  416. if visited[p] {
  417. skip = true
  418. break
  419. }
  420. }
  421. if skip {
  422. continue
  423. }
  424. walkFromVertex(vertices2[i2])
  425. }
  426. mergedStrings := make([]string, 0, len(walks))
  427. mergedIndex := map[string]MergedIndex{}
  428. // convert each walk from strings to indexes
  429. for walkIndex, walk := range walks {
  430. ids := make([]string, 0, len(walk))
  431. for key := range walk {
  432. ids = append(ids, key)
  433. }
  434. // place emails after names
  435. sort.Slice(ids, func(i, j int) bool {
  436. iid := ids[i]
  437. jid := ids[j]
  438. iHasAt := strings.ContainsRune(iid, '@')
  439. jHasAt := strings.ContainsRune(jid, '@')
  440. if iHasAt == jHasAt {
  441. return iid < jid
  442. }
  443. return jHasAt
  444. })
  445. mergedStrings = append(mergedStrings, strings.Join(ids, "|"))
  446. for _, key := range ids {
  447. ipair := vocabulary[key]
  448. if ipair.Index1 >= 0 {
  449. s1 := rd1[ipair.Index1]
  450. if mi, exists := mergedIndex[s1]; !exists {
  451. mergedIndex[s1] = MergedIndex{walkIndex, ipair.Index1, -1}
  452. } else {
  453. mergedIndex[s1] = MergedIndex{walkIndex, ipair.Index1, mi.Second}
  454. }
  455. }
  456. if ipair.Index2 >= 0 {
  457. s2 := rd2[ipair.Index2]
  458. if mi, exists := mergedIndex[s2]; !exists {
  459. mergedIndex[s2] = MergedIndex{walkIndex, -1, ipair.Index2}
  460. } else {
  461. mergedIndex[s2] = MergedIndex{walkIndex, mi.First, ipair.Index2}
  462. }
  463. }
  464. }
  465. }
  466. return mergedIndex, mergedStrings
  467. }
  468. func init() {
  469. core.Registry.Register(&Detector{})
  470. }