couples.go 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602
  1. package leaves
  2. import (
  3. "fmt"
  4. "io"
  5. "sort"
  6. "github.com/gogo/protobuf/proto"
  7. "gopkg.in/src-d/go-git.v4"
  8. "gopkg.in/src-d/go-git.v4/plumbing/object"
  9. "gopkg.in/src-d/go-git.v4/utils/merkletrie"
  10. "gopkg.in/src-d/hercules.v8/internal/core"
  11. "gopkg.in/src-d/hercules.v8/internal/pb"
  12. items "gopkg.in/src-d/hercules.v8/internal/plumbing"
  13. "gopkg.in/src-d/hercules.v8/internal/plumbing/identity"
  14. "gopkg.in/src-d/hercules.v8/internal/yaml"
  15. )
  16. // CouplesAnalysis calculates the number of common commits for files and authors.
  17. // The results are matrices, where cell at row X and column Y is the number of commits which
  18. // changed X and Y together. In case with people, the numbers are summed for every common file.
  19. type CouplesAnalysis struct {
  20. core.NoopMerger
  21. core.OneShotMergeProcessor
  22. // PeopleNumber is the number of developers for which to build the matrix. 0 disables this analysis.
  23. PeopleNumber int
  24. // people store how many times every developer committed to every file.
  25. people []map[string]int
  26. // peopleCommits is the number of commits each author made.
  27. peopleCommits []int
  28. // files store every file occurred in the same commit with every other file.
  29. files map[string]map[string]int
  30. // renames point from new file name to old file name.
  31. renames *[]rename
  32. // lastCommit is the last commit which was consumed.
  33. lastCommit *object.Commit
  34. // reversedPeopleDict references IdentityDetector.ReversedPeopleDict
  35. reversedPeopleDict []string
  36. }
  37. // CouplesResult is returned by CouplesAnalysis.Finalize() and carries couples matrices from
  38. // authors and files.
  39. type CouplesResult struct {
  40. PeopleMatrix []map[int]int64
  41. PeopleFiles [][]int
  42. FilesMatrix []map[int]int64
  43. Files []string
  44. // reversedPeopleDict references IdentityDetector.ReversedPeopleDict
  45. reversedPeopleDict []string
  46. }
  47. const (
  48. // CouplesMaximumMeaningfulContextSize is the threshold on the number of files in a commit to
  49. // consider them as grouped together.
  50. CouplesMaximumMeaningfulContextSize = 1000
  51. )
  52. type rename struct {
  53. FromName string
  54. ToName string
  55. }
  56. // Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
  57. func (couples *CouplesAnalysis) Name() string {
  58. return "Couples"
  59. }
  60. // Provides returns the list of names of entities which are produced by this PipelineItem.
  61. // Each produced entity will be inserted into `deps` of dependent Consume()-s according
  62. // to this list. Also used by core.Registry to build the global map of providers.
  63. func (couples *CouplesAnalysis) Provides() []string {
  64. return []string{}
  65. }
  66. // Requires returns the list of names of entities which are needed by this PipelineItem.
  67. // Each requested entity will be inserted into `deps` of Consume(). In turn, those
  68. // entities are Provides() upstream.
  69. func (couples *CouplesAnalysis) Requires() []string {
  70. arr := [...]string{identity.DependencyAuthor, items.DependencyTreeChanges}
  71. return arr[:]
  72. }
  73. // ListConfigurationOptions returns the list of changeable public properties of this PipelineItem.
  74. func (couples *CouplesAnalysis) ListConfigurationOptions() []core.ConfigurationOption {
  75. return []core.ConfigurationOption{}
  76. }
  77. // Configure sets the properties previously published by ListConfigurationOptions().
  78. func (couples *CouplesAnalysis) Configure(facts map[string]interface{}) error {
  79. if val, exists := facts[identity.FactIdentityDetectorPeopleCount].(int); exists {
  80. couples.PeopleNumber = val
  81. couples.reversedPeopleDict = facts[identity.FactIdentityDetectorReversedPeopleDict].([]string)
  82. }
  83. return nil
  84. }
  85. // Flag for the command line switch which enables this analysis.
  86. func (couples *CouplesAnalysis) Flag() string {
  87. return "couples"
  88. }
  89. // Description returns the text which explains what the analysis is doing.
  90. func (couples *CouplesAnalysis) Description() string {
  91. return "The result is a square matrix, the value in each cell corresponds to the number " +
  92. "of times the pair of files appeared in the same commit or pair of developers " +
  93. "committed to the same file."
  94. }
  95. // Initialize resets the temporary caches and prepares this PipelineItem for a series of Consume()
  96. // calls. The repository which is going to be analysed is supplied as an argument.
  97. func (couples *CouplesAnalysis) Initialize(repository *git.Repository) error {
  98. couples.people = make([]map[string]int, couples.PeopleNumber+1)
  99. for i := range couples.people {
  100. couples.people[i] = map[string]int{}
  101. }
  102. couples.peopleCommits = make([]int, couples.PeopleNumber+1)
  103. couples.files = map[string]map[string]int{}
  104. couples.renames = &[]rename{}
  105. couples.OneShotMergeProcessor.Initialize()
  106. return nil
  107. }
  108. // Consume runs this PipelineItem on the next commit data.
  109. // `deps` contain all the results from upstream PipelineItem-s as requested by Requires().
  110. // Additionally, DependencyCommit is always present there and represents the analysed *object.Commit.
  111. // This function returns the mapping with analysis results. The keys must be the same as
  112. // in Provides(). If there was an error, nil is returned.
  113. func (couples *CouplesAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  114. firstMerge := couples.ShouldConsumeCommit(deps)
  115. mergeMode := deps[core.DependencyIsMerge].(bool)
  116. couples.lastCommit = deps[core.DependencyCommit].(*object.Commit)
  117. author := deps[identity.DependencyAuthor].(int)
  118. if author == identity.AuthorMissing {
  119. author = couples.PeopleNumber
  120. }
  121. if firstMerge {
  122. couples.peopleCommits[author]++
  123. }
  124. treeDiff := deps[items.DependencyTreeChanges].(object.Changes)
  125. context := make([]string, 0, len(treeDiff))
  126. for _, change := range treeDiff {
  127. action, err := change.Action()
  128. if err != nil {
  129. return nil, err
  130. }
  131. toName := change.To.Name
  132. fromName := change.From.Name
  133. switch action {
  134. case merkletrie.Insert:
  135. if !mergeMode || couples.files[toName] == nil {
  136. context = append(context, toName)
  137. couples.people[author][toName]++
  138. }
  139. case merkletrie.Delete:
  140. if !mergeMode {
  141. couples.people[author][fromName]++
  142. }
  143. case merkletrie.Modify:
  144. if fromName != toName {
  145. // renamed
  146. *couples.renames = append(
  147. *couples.renames, rename{ToName: toName, FromName: fromName})
  148. }
  149. if !mergeMode || couples.files[toName] == nil {
  150. context = append(context, toName)
  151. couples.people[author][toName]++
  152. }
  153. }
  154. }
  155. if len(context) <= CouplesMaximumMeaningfulContextSize {
  156. for _, file := range context {
  157. for _, otherFile := range context {
  158. lane, exists := couples.files[file]
  159. if !exists {
  160. lane = map[string]int{}
  161. couples.files[file] = lane
  162. }
  163. lane[otherFile]++
  164. }
  165. }
  166. }
  167. return nil, nil
  168. }
  169. // Finalize returns the result of the analysis. Further Consume() calls are not expected.
  170. func (couples *CouplesAnalysis) Finalize() interface{} {
  171. files, people := couples.propagateRenames(couples.currentFiles())
  172. filesSequence := make([]string, len(files))
  173. i := 0
  174. for file := range files {
  175. filesSequence[i] = file
  176. i++
  177. }
  178. sort.Strings(filesSequence)
  179. filesIndex := map[string]int{}
  180. for i, file := range filesSequence {
  181. filesIndex[file] = i
  182. }
  183. peopleMatrix := make([]map[int]int64, couples.PeopleNumber+1)
  184. peopleFiles := make([][]int, couples.PeopleNumber+1)
  185. for i := range peopleMatrix {
  186. peopleMatrix[i] = map[int]int64{}
  187. for file, commits := range people[i] {
  188. if fi, exists := filesIndex[file]; exists {
  189. peopleFiles[i] = append(peopleFiles[i], fi)
  190. }
  191. for j, otherFiles := range people {
  192. otherCommits := otherFiles[file]
  193. delta := otherCommits
  194. if otherCommits > commits {
  195. delta = commits
  196. }
  197. if delta > 0 {
  198. peopleMatrix[i][j] += int64(delta)
  199. }
  200. }
  201. }
  202. sort.Ints(peopleFiles[i])
  203. }
  204. filesMatrix := make([]map[int]int64, len(filesIndex))
  205. for i := range filesMatrix {
  206. filesMatrix[i] = map[int]int64{}
  207. for otherFile, cooccs := range files[filesSequence[i]] {
  208. filesMatrix[i][filesIndex[otherFile]] = int64(cooccs)
  209. }
  210. }
  211. return CouplesResult{
  212. PeopleMatrix: peopleMatrix,
  213. PeopleFiles: peopleFiles,
  214. Files: filesSequence,
  215. FilesMatrix: filesMatrix,
  216. reversedPeopleDict: couples.reversedPeopleDict,
  217. }
  218. }
  219. // Fork clones this pipeline item.
  220. func (couples *CouplesAnalysis) Fork(n int) []core.PipelineItem {
  221. return core.ForkCopyPipelineItem(couples, n)
  222. }
  223. // Serialize converts the analysis result as returned by Finalize() to text or bytes.
  224. // The text format is YAML and the bytes format is Protocol Buffers.
  225. func (couples *CouplesAnalysis) Serialize(result interface{}, binary bool, writer io.Writer) error {
  226. couplesResult := result.(CouplesResult)
  227. if binary {
  228. return couples.serializeBinary(&couplesResult, writer)
  229. }
  230. couples.serializeText(&couplesResult, writer)
  231. return nil
  232. }
  233. // Deserialize converts the specified protobuf bytes to CouplesResult.
  234. func (couples *CouplesAnalysis) Deserialize(pbmessage []byte) (interface{}, error) {
  235. message := pb.CouplesAnalysisResults{}
  236. err := proto.Unmarshal(pbmessage, &message)
  237. if err != nil {
  238. return nil, err
  239. }
  240. result := CouplesResult{
  241. Files: message.FileCouples.Index,
  242. FilesMatrix: make([]map[int]int64, message.FileCouples.Matrix.NumberOfRows),
  243. PeopleFiles: make([][]int, len(message.PeopleCouples.Index)),
  244. PeopleMatrix: make([]map[int]int64, message.PeopleCouples.Matrix.NumberOfRows),
  245. reversedPeopleDict: message.PeopleCouples.Index,
  246. }
  247. for i, files := range message.PeopleFiles {
  248. result.PeopleFiles[i] = make([]int, len(files.Files))
  249. for j, val := range files.Files {
  250. result.PeopleFiles[i][j] = int(val)
  251. }
  252. }
  253. convertCSR := func(dest []map[int]int64, src *pb.CompressedSparseRowMatrix) {
  254. for indptr := range src.Indptr {
  255. if indptr == 0 {
  256. continue
  257. }
  258. dest[indptr-1] = map[int]int64{}
  259. for j := src.Indptr[indptr-1]; j < src.Indptr[indptr]; j++ {
  260. dest[indptr-1][int(src.Indices[j])] = src.Data[j]
  261. }
  262. }
  263. }
  264. convertCSR(result.FilesMatrix, message.FileCouples.Matrix)
  265. convertCSR(result.PeopleMatrix, message.PeopleCouples.Matrix)
  266. return result, nil
  267. }
  268. // MergeResults combines two CouplesAnalysis-s together.
  269. func (couples *CouplesAnalysis) MergeResults(r1, r2 interface{}, c1, c2 *core.CommonAnalysisResult) interface{} {
  270. cr1 := r1.(CouplesResult)
  271. cr2 := r2.(CouplesResult)
  272. merged := CouplesResult{}
  273. var people, files map[string][3]int
  274. people, merged.reversedPeopleDict = identity.Detector{}.MergeReversedDicts(
  275. cr1.reversedPeopleDict, cr2.reversedPeopleDict)
  276. files, merged.Files = identity.Detector{}.MergeReversedDicts(cr1.Files, cr2.Files)
  277. merged.PeopleFiles = make([][]int, len(merged.reversedPeopleDict))
  278. peopleFilesDicts := make([]map[int]bool, len(merged.reversedPeopleDict))
  279. addPeopleFiles := func(peopleFiles [][]int, reversedPeopleDict []string,
  280. reversedFilesDict []string) {
  281. for pi, fs := range peopleFiles {
  282. idx := people[reversedPeopleDict[pi]][0]
  283. m := peopleFilesDicts[idx]
  284. if m == nil {
  285. m = map[int]bool{}
  286. peopleFilesDicts[idx] = m
  287. }
  288. for _, f := range fs {
  289. m[files[reversedFilesDict[f]][0]] = true
  290. }
  291. }
  292. }
  293. addPeopleFiles(cr1.PeopleFiles, cr1.reversedPeopleDict, cr1.Files)
  294. addPeopleFiles(cr2.PeopleFiles, cr2.reversedPeopleDict, cr2.Files)
  295. for i, m := range peopleFilesDicts {
  296. merged.PeopleFiles[i] = make([]int, len(m))
  297. j := 0
  298. for f := range m {
  299. merged.PeopleFiles[i][j] = f
  300. j++
  301. }
  302. sort.Ints(merged.PeopleFiles[i])
  303. }
  304. merged.PeopleMatrix = make([]map[int]int64, len(merged.reversedPeopleDict)+1)
  305. addPeople := func(peopleMatrix []map[int]int64, reversedPeopleDict []string,
  306. reversedFilesDict []string) {
  307. for pi, pc := range peopleMatrix {
  308. var idx int
  309. if pi < len(reversedPeopleDict) {
  310. idx = people[reversedPeopleDict[pi]][0]
  311. } else {
  312. idx = len(merged.reversedPeopleDict)
  313. }
  314. m := merged.PeopleMatrix[idx]
  315. if m == nil {
  316. m = map[int]int64{}
  317. merged.PeopleMatrix[idx] = m
  318. }
  319. for file, val := range pc {
  320. m[files[reversedFilesDict[file]][0]] += val
  321. }
  322. }
  323. }
  324. addPeople(cr1.PeopleMatrix, cr1.reversedPeopleDict, cr1.Files)
  325. addPeople(cr2.PeopleMatrix, cr2.reversedPeopleDict, cr2.Files)
  326. merged.FilesMatrix = make([]map[int]int64, len(merged.Files))
  327. addFiles := func(filesMatrix []map[int]int64, reversedFilesDict []string) {
  328. for fi, fc := range filesMatrix {
  329. idx := people[reversedFilesDict[fi]][0]
  330. m := merged.FilesMatrix[idx]
  331. if m == nil {
  332. m = map[int]int64{}
  333. merged.FilesMatrix[idx] = m
  334. }
  335. for file, val := range fc {
  336. m[files[reversedFilesDict[file]][0]] += val
  337. }
  338. }
  339. }
  340. addFiles(cr1.FilesMatrix, cr1.Files)
  341. addFiles(cr2.FilesMatrix, cr2.Files)
  342. return merged
  343. }
  344. func (couples *CouplesAnalysis) serializeText(result *CouplesResult, writer io.Writer) {
  345. fmt.Fprintln(writer, " files_coocc:")
  346. fmt.Fprintln(writer, " index:")
  347. for _, file := range result.Files {
  348. fmt.Fprintf(writer, " - %s\n", yaml.SafeString(file))
  349. }
  350. fmt.Fprintln(writer, " matrix:")
  351. for _, files := range result.FilesMatrix {
  352. fmt.Fprint(writer, " - {")
  353. var indices []int
  354. for file := range files {
  355. indices = append(indices, file)
  356. }
  357. sort.Ints(indices)
  358. for i, file := range indices {
  359. fmt.Fprintf(writer, "%d: %d", file, files[file])
  360. if i < len(indices)-1 {
  361. fmt.Fprint(writer, ", ")
  362. }
  363. }
  364. fmt.Fprintln(writer, "}")
  365. }
  366. fmt.Fprintln(writer, " people_coocc:")
  367. fmt.Fprintln(writer, " index:")
  368. for _, person := range result.reversedPeopleDict {
  369. fmt.Fprintf(writer, " - %s\n", yaml.SafeString(person))
  370. }
  371. fmt.Fprintln(writer, " matrix:")
  372. for _, people := range result.PeopleMatrix {
  373. fmt.Fprint(writer, " - {")
  374. var indices []int
  375. for file := range people {
  376. indices = append(indices, file)
  377. }
  378. sort.Ints(indices)
  379. for i, person := range indices {
  380. fmt.Fprintf(writer, "%d: %d", person, people[person])
  381. if i < len(indices)-1 {
  382. fmt.Fprint(writer, ", ")
  383. }
  384. }
  385. fmt.Fprintln(writer, "}")
  386. }
  387. fmt.Fprintln(writer, " author_files:") // sorted by number of files each author changed
  388. peopleFiles := sortByNumberOfFiles(result.PeopleFiles, result.reversedPeopleDict, result.Files)
  389. for _, authorFiles := range peopleFiles {
  390. fmt.Fprintf(writer, " - %s:\n", yaml.SafeString(authorFiles.Author))
  391. sort.Strings(authorFiles.Files)
  392. for _, file := range authorFiles.Files {
  393. fmt.Fprintf(writer, " - %s\n", yaml.SafeString(file)) // sorted by path
  394. }
  395. }
  396. }
  397. func sortByNumberOfFiles(
  398. peopleFiles [][]int, peopleDict []string, filesDict []string) authorFilesList {
  399. var pfl authorFilesList
  400. for peopleIdx, files := range peopleFiles {
  401. if peopleIdx < len(peopleDict) {
  402. fileNames := make([]string, len(files))
  403. for i, fi := range files {
  404. fileNames[i] = filesDict[fi]
  405. }
  406. pfl = append(pfl, authorFiles{peopleDict[peopleIdx], fileNames})
  407. }
  408. }
  409. sort.Sort(pfl)
  410. return pfl
  411. }
  412. type authorFiles struct {
  413. Author string
  414. Files []string
  415. }
  416. type authorFilesList []authorFiles
  417. func (s authorFilesList) Len() int {
  418. return len(s)
  419. }
  420. func (s authorFilesList) Swap(i, j int) {
  421. s[i], s[j] = s[j], s[i]
  422. }
  423. func (s authorFilesList) Less(i, j int) bool {
  424. return len(s[i].Files) < len(s[j].Files)
  425. }
  426. func (couples *CouplesAnalysis) serializeBinary(result *CouplesResult, writer io.Writer) error {
  427. message := pb.CouplesAnalysisResults{}
  428. message.FileCouples = &pb.Couples{
  429. Index: result.Files,
  430. Matrix: pb.MapToCompressedSparseRowMatrix(result.FilesMatrix),
  431. }
  432. message.PeopleCouples = &pb.Couples{
  433. Index: result.reversedPeopleDict,
  434. Matrix: pb.MapToCompressedSparseRowMatrix(result.PeopleMatrix),
  435. }
  436. message.PeopleFiles = make([]*pb.TouchedFiles, len(result.reversedPeopleDict))
  437. for key := range result.reversedPeopleDict {
  438. files := result.PeopleFiles[key]
  439. int32Files := make([]int32, len(files))
  440. for i, f := range files {
  441. int32Files[i] = int32(f)
  442. }
  443. message.PeopleFiles[key] = &pb.TouchedFiles{
  444. Files: int32Files,
  445. }
  446. }
  447. serialized, err := proto.Marshal(&message)
  448. if err != nil {
  449. return err
  450. }
  451. _, err = writer.Write(serialized)
  452. return err
  453. }
  454. // currentFiles return the list of files in the last consumed commit.
  455. func (couples *CouplesAnalysis) currentFiles() map[string]bool {
  456. files := map[string]bool{}
  457. if couples.lastCommit == nil {
  458. for key := range couples.files {
  459. files[key] = true
  460. }
  461. }
  462. tree, _ := couples.lastCommit.Tree()
  463. fileIter := tree.Files()
  464. fileIter.ForEach(func(fobj *object.File) error {
  465. files[fobj.Name] = true
  466. return nil
  467. })
  468. return files
  469. }
  470. // propagateRenames applies `renames` over the files from `lastCommit`.
  471. func (couples *CouplesAnalysis) propagateRenames(files map[string]bool) (
  472. map[string]map[string]int, []map[string]int) {
  473. renames := *couples.renames
  474. reducedFiles := map[string]map[string]int{}
  475. for file := range files {
  476. fmap := map[string]int{}
  477. refmap := couples.files[file]
  478. for other := range files {
  479. refval := refmap[other]
  480. if refval > 0 {
  481. fmap[other] = refval
  482. }
  483. }
  484. if len(fmap) > 0 {
  485. reducedFiles[file] = fmap
  486. }
  487. }
  488. // propagate renames
  489. aliases := map[string]map[string]bool{}
  490. pointers := map[string]string{}
  491. for i := range renames {
  492. rename := renames[len(renames)-i-1]
  493. toName := rename.ToName
  494. if newTo, exists := pointers[toName]; exists {
  495. toName = newTo
  496. }
  497. if _, exists := reducedFiles[toName]; exists {
  498. if rename.FromName != toName {
  499. var set map[string]bool
  500. if set, exists = aliases[toName]; !exists {
  501. set = map[string]bool{}
  502. aliases[toName] = set
  503. }
  504. set[rename.FromName] = true
  505. pointers[rename.FromName] = toName
  506. }
  507. continue
  508. }
  509. }
  510. adjustments := map[string]map[string]int{}
  511. for final, set := range aliases {
  512. adjustment := map[string]int{}
  513. for alias := range set {
  514. for k, v := range couples.files[alias] {
  515. adjustment[k] += v
  516. }
  517. }
  518. adjustments[final] = adjustment
  519. }
  520. for _, adjustment := range adjustments {
  521. for final, set := range aliases {
  522. for alias := range set {
  523. adjustment[final] += adjustment[alias]
  524. delete(adjustment, alias)
  525. }
  526. }
  527. }
  528. for final, adjustment := range adjustments {
  529. for key, val := range adjustment {
  530. if coocc, exists := reducedFiles[final][key]; exists {
  531. reducedFiles[final][key] = coocc + val
  532. reducedFiles[key][final] = coocc + val
  533. }
  534. }
  535. }
  536. people := make([]map[string]int, len(couples.people))
  537. for i, counts := range couples.people {
  538. reducedCounts := map[string]int{}
  539. people[i] = reducedCounts
  540. for file := range files {
  541. count := counts[file]
  542. for alias := range aliases[file] {
  543. count += counts[alias]
  544. }
  545. if count > 0 {
  546. reducedCounts[file] = count
  547. }
  548. }
  549. for key, val := range counts {
  550. if _, exists := files[key]; !exists {
  551. if _, exists = pointers[key]; !exists {
  552. reducedCounts[key] = val
  553. }
  554. }
  555. }
  556. }
  557. return reducedFiles, people
  558. }
  559. func init() {
  560. core.Registry.Register(&CouplesAnalysis{})
  561. }