couples.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469
  1. package leaves
  2. import (
  3. "fmt"
  4. "io"
  5. "sort"
  6. "github.com/gogo/protobuf/proto"
  7. "gopkg.in/src-d/go-git.v4"
  8. "gopkg.in/src-d/go-git.v4/plumbing/object"
  9. "gopkg.in/src-d/go-git.v4/utils/merkletrie"
  10. "gopkg.in/src-d/hercules.v4/internal/core"
  11. "gopkg.in/src-d/hercules.v4/internal/pb"
  12. items "gopkg.in/src-d/hercules.v4/internal/plumbing"
  13. "gopkg.in/src-d/hercules.v4/internal/plumbing/identity"
  14. "gopkg.in/src-d/hercules.v4/yaml"
  15. )
// CouplesAnalysis calculates the number of common commits for files and authors.
// The results are matrices, where cell at row X and column Y is the number of commits which
// changed X and Y together. In case with people, the numbers are summed for every common file.
type CouplesAnalysis struct {
	// PeopleNumber is the number of developers for which to build the matrix. 0 disables this analysis.
	// Initialize() allocates PeopleNumber+1 slots; Consume() books commits whose author is
	// identity.AuthorMissing under the extra, last slot.
	PeopleNumber int

	// people is indexed by author; each map goes from a file name to the number of
	// changes to that file made by the author, as counted in Consume().
	people []map[string]int
	// peopleCommits is the number of commits each author made (one increment per Consume() call).
	peopleCommits []int
	// files maps a file name to its "lane": for every other file name, the number of
	// commits in which both were inserted or modified. A lane also counts the file itself.
	files map[string]map[string]int
	// reversedPeopleDict references IdentityDetector.ReversedPeopleDict; it is taken
	// from the facts in Configure().
	reversedPeopleDict []string
}
// CouplesResult is returned by CouplesAnalysis.Finalize() and carries couples matrices from
// authors and files.
type CouplesResult struct {
	// PeopleMatrix is a sparse author-by-author matrix: PeopleMatrix[i][j] is the coupling
	// between authors i and j accumulated over the files they both changed.
	PeopleMatrix []map[int]int64
	// PeopleFiles holds, for every author, the sorted indices into Files of the files
	// that author changed.
	PeopleFiles [][]int
	// FilesMatrix is a sparse file-by-file matrix; rows and columns are indices into Files.
	FilesMatrix []map[int]int64
	// Files is the index of file names; Finalize() fills it in sorted order.
	Files []string

	// reversedPeopleDict references IdentityDetector.ReversedPeopleDict
	reversedPeopleDict []string
}
  41. // Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
  42. func (couples *CouplesAnalysis) Name() string {
  43. return "Couples"
  44. }
  45. // Provides returns the list of names of entities which are produced by this PipelineItem.
  46. // Each produced entity will be inserted into `deps` of dependent Consume()-s according
  47. // to this list. Also used by core.Registry to build the global map of providers.
  48. func (couples *CouplesAnalysis) Provides() []string {
  49. return []string{}
  50. }
  51. // Requires returns the list of names of entities which are needed by this PipelineItem.
  52. // Each requested entity will be inserted into `deps` of Consume(). In turn, those
  53. // entities are Provides() upstream.
  54. func (couples *CouplesAnalysis) Requires() []string {
  55. arr := [...]string{identity.DependencyAuthor, items.DependencyTreeChanges}
  56. return arr[:]
  57. }
  58. // ListConfigurationOptions returns the list of changeable public properties of this PipelineItem.
  59. func (couples *CouplesAnalysis) ListConfigurationOptions() []core.ConfigurationOption {
  60. return []core.ConfigurationOption{}
  61. }
  62. // Configure sets the properties previously published by ListConfigurationOptions().
  63. func (couples *CouplesAnalysis) Configure(facts map[string]interface{}) {
  64. if val, exists := facts[identity.FactIdentityDetectorPeopleCount].(int); exists {
  65. couples.PeopleNumber = val
  66. couples.reversedPeopleDict = facts[identity.FactIdentityDetectorReversedPeopleDict].([]string)
  67. }
  68. }
  69. // Flag for the command line switch which enables this analysis.
  70. func (couples *CouplesAnalysis) Flag() string {
  71. return "couples"
  72. }
  73. // Initialize resets the temporary caches and prepares this PipelineItem for a series of Consume()
  74. // calls. The repository which is going to be analysed is supplied as an argument.
  75. func (couples *CouplesAnalysis) Initialize(repository *git.Repository) {
  76. couples.people = make([]map[string]int, couples.PeopleNumber+1)
  77. for i := range couples.people {
  78. couples.people[i] = map[string]int{}
  79. }
  80. couples.peopleCommits = make([]int, couples.PeopleNumber+1)
  81. couples.files = map[string]map[string]int{}
  82. }
  83. // Consume runs this PipelineItem on the next commit data.
  84. // `deps` contain all the results from upstream PipelineItem-s as requested by Requires().
  85. // Additionally, "commit" is always present there and represents the analysed *object.Commit.
  86. // This function returns the mapping with analysis results. The keys must be the same as
  87. // in Provides(). If there was an error, nil is returned.
  88. func (couples *CouplesAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  89. author := deps[identity.DependencyAuthor].(int)
  90. if author == identity.AuthorMissing {
  91. author = couples.PeopleNumber
  92. }
  93. couples.peopleCommits[author]++
  94. treeDiff := deps[items.DependencyTreeChanges].(object.Changes)
  95. context := make([]string, 0)
  96. deleteFile := func(name string) {
  97. // we do not remove the file from people - the context does not expire
  98. delete(couples.files, name)
  99. for _, otherFiles := range couples.files {
  100. delete(otherFiles, name)
  101. }
  102. }
  103. for _, change := range treeDiff {
  104. action, err := change.Action()
  105. if err != nil {
  106. return nil, err
  107. }
  108. toName := change.To.Name
  109. fromName := change.From.Name
  110. switch action {
  111. case merkletrie.Insert:
  112. context = append(context, toName)
  113. couples.people[author][toName]++
  114. case merkletrie.Delete:
  115. deleteFile(fromName)
  116. couples.people[author][fromName]++
  117. case merkletrie.Modify:
  118. if fromName != toName {
  119. // renamed
  120. couples.files[toName] = couples.files[fromName]
  121. for _, otherFiles := range couples.files {
  122. val, exists := otherFiles[fromName]
  123. if exists {
  124. otherFiles[toName] = val
  125. }
  126. }
  127. deleteFile(fromName)
  128. for _, authorFiles := range couples.people {
  129. val, exists := authorFiles[fromName]
  130. if exists {
  131. authorFiles[toName] = val
  132. delete(authorFiles, fromName)
  133. }
  134. }
  135. }
  136. context = append(context, toName)
  137. couples.people[author][toName]++
  138. }
  139. }
  140. for _, file := range context {
  141. for _, otherFile := range context {
  142. lane, exists := couples.files[file]
  143. if !exists {
  144. lane = map[string]int{}
  145. couples.files[file] = lane
  146. }
  147. lane[otherFile]++
  148. }
  149. }
  150. return nil, nil
  151. }
  152. // Finalize returns the result of the analysis. Further Consume() calls are not expected.
  153. func (couples *CouplesAnalysis) Finalize() interface{} {
  154. filesSequence := make([]string, len(couples.files))
  155. i := 0
  156. for file := range couples.files {
  157. filesSequence[i] = file
  158. i++
  159. }
  160. sort.Strings(filesSequence)
  161. filesIndex := map[string]int{}
  162. for i, file := range filesSequence {
  163. filesIndex[file] = i
  164. }
  165. peopleMatrix := make([]map[int]int64, couples.PeopleNumber+1)
  166. peopleFiles := make([][]int, couples.PeopleNumber+1)
  167. for i := range peopleMatrix {
  168. peopleMatrix[i] = map[int]int64{}
  169. for file, commits := range couples.people[i] {
  170. fi, exists := filesIndex[file]
  171. if exists {
  172. peopleFiles[i] = append(peopleFiles[i], fi)
  173. }
  174. for j, otherFiles := range couples.people {
  175. otherCommits := otherFiles[file]
  176. delta := otherCommits
  177. if otherCommits > commits {
  178. delta = commits
  179. }
  180. if delta > 0 {
  181. peopleMatrix[i][j] += int64(delta)
  182. }
  183. }
  184. }
  185. sort.Ints(peopleFiles[i])
  186. }
  187. filesMatrix := make([]map[int]int64, len(filesIndex))
  188. for i := range filesMatrix {
  189. filesMatrix[i] = map[int]int64{}
  190. for otherFile, cooccs := range couples.files[filesSequence[i]] {
  191. filesMatrix[i][filesIndex[otherFile]] = int64(cooccs)
  192. }
  193. }
  194. return CouplesResult{
  195. PeopleMatrix: peopleMatrix,
  196. PeopleFiles: peopleFiles,
  197. Files: filesSequence,
  198. FilesMatrix: filesMatrix,
  199. reversedPeopleDict: couples.reversedPeopleDict,
  200. }
  201. }
  202. // Serialize converts the analysis result as returned by Finalize() to text or bytes.
  203. // The text format is YAML and the bytes format is Protocol Buffers.
  204. func (couples *CouplesAnalysis) Serialize(result interface{}, binary bool, writer io.Writer) error {
  205. couplesResult := result.(CouplesResult)
  206. if binary {
  207. return couples.serializeBinary(&couplesResult, writer)
  208. }
  209. couples.serializeText(&couplesResult, writer)
  210. return nil
  211. }
  212. // Deserialize converts the specified protobuf bytes to CouplesResult.
  213. func (couples *CouplesAnalysis) Deserialize(pbmessage []byte) (interface{}, error) {
  214. message := pb.CouplesAnalysisResults{}
  215. err := proto.Unmarshal(pbmessage, &message)
  216. if err != nil {
  217. return nil, err
  218. }
  219. result := CouplesResult{
  220. Files: message.FileCouples.Index,
  221. FilesMatrix: make([]map[int]int64, message.FileCouples.Matrix.NumberOfRows),
  222. PeopleFiles: make([][]int, len(message.PeopleCouples.Index)),
  223. PeopleMatrix: make([]map[int]int64, message.PeopleCouples.Matrix.NumberOfRows),
  224. reversedPeopleDict: message.PeopleCouples.Index,
  225. }
  226. for i, files := range message.PeopleFiles {
  227. result.PeopleFiles[i] = make([]int, len(files.Files))
  228. for j, val := range files.Files {
  229. result.PeopleFiles[i][j] = int(val)
  230. }
  231. }
  232. convertCSR := func(dest []map[int]int64, src *pb.CompressedSparseRowMatrix) {
  233. for indptr := range src.Indptr {
  234. if indptr == 0 {
  235. continue
  236. }
  237. dest[indptr-1] = map[int]int64{}
  238. for j := src.Indptr[indptr-1]; j < src.Indptr[indptr]; j++ {
  239. dest[indptr-1][int(src.Indices[j])] = src.Data[j]
  240. }
  241. }
  242. }
  243. convertCSR(result.FilesMatrix, message.FileCouples.Matrix)
  244. convertCSR(result.PeopleMatrix, message.PeopleCouples.Matrix)
  245. return result, nil
  246. }
  247. // MergeResults combines two CouplesAnalysis-s together.
  248. func (couples *CouplesAnalysis) MergeResults(r1, r2 interface{}, c1, c2 *core.CommonAnalysisResult) interface{} {
  249. cr1 := r1.(CouplesResult)
  250. cr2 := r2.(CouplesResult)
  251. merged := CouplesResult{}
  252. var people, files map[string][3]int
  253. people, merged.reversedPeopleDict = identity.Detector{}.MergeReversedDicts(
  254. cr1.reversedPeopleDict, cr2.reversedPeopleDict)
  255. files, merged.Files = identity.Detector{}.MergeReversedDicts(cr1.Files, cr2.Files)
  256. merged.PeopleFiles = make([][]int, len(merged.reversedPeopleDict))
  257. peopleFilesDicts := make([]map[int]bool, len(merged.reversedPeopleDict))
  258. addPeopleFiles := func(peopleFiles [][]int, reversedPeopleDict []string,
  259. reversedFilesDict []string) {
  260. for pi, fs := range peopleFiles {
  261. idx := people[reversedPeopleDict[pi]][0]
  262. m := peopleFilesDicts[idx]
  263. if m == nil {
  264. m = map[int]bool{}
  265. peopleFilesDicts[idx] = m
  266. }
  267. for _, f := range fs {
  268. m[files[reversedFilesDict[f]][0]] = true
  269. }
  270. }
  271. }
  272. addPeopleFiles(cr1.PeopleFiles, cr1.reversedPeopleDict, cr1.Files)
  273. addPeopleFiles(cr2.PeopleFiles, cr2.reversedPeopleDict, cr2.Files)
  274. for i, m := range peopleFilesDicts {
  275. merged.PeopleFiles[i] = make([]int, len(m))
  276. j := 0
  277. for f := range m {
  278. merged.PeopleFiles[i][j] = f
  279. j++
  280. }
  281. sort.Ints(merged.PeopleFiles[i])
  282. }
  283. merged.PeopleMatrix = make([]map[int]int64, len(merged.reversedPeopleDict)+1)
  284. addPeople := func(peopleMatrix []map[int]int64, reversedPeopleDict []string,
  285. reversedFilesDict []string) {
  286. for pi, pc := range peopleMatrix {
  287. var idx int
  288. if pi < len(reversedPeopleDict) {
  289. idx = people[reversedPeopleDict[pi]][0]
  290. } else {
  291. idx = len(merged.reversedPeopleDict)
  292. }
  293. m := merged.PeopleMatrix[idx]
  294. if m == nil {
  295. m = map[int]int64{}
  296. merged.PeopleMatrix[idx] = m
  297. }
  298. for file, val := range pc {
  299. m[files[reversedFilesDict[file]][0]] += val
  300. }
  301. }
  302. }
  303. addPeople(cr1.PeopleMatrix, cr1.reversedPeopleDict, cr1.Files)
  304. addPeople(cr2.PeopleMatrix, cr2.reversedPeopleDict, cr2.Files)
  305. merged.FilesMatrix = make([]map[int]int64, len(merged.Files))
  306. addFiles := func(filesMatrix []map[int]int64, reversedFilesDict []string) {
  307. for fi, fc := range filesMatrix {
  308. idx := people[reversedFilesDict[fi]][0]
  309. m := merged.FilesMatrix[idx]
  310. if m == nil {
  311. m = map[int]int64{}
  312. merged.FilesMatrix[idx] = m
  313. }
  314. for file, val := range fc {
  315. m[files[reversedFilesDict[file]][0]] += val
  316. }
  317. }
  318. }
  319. addFiles(cr1.FilesMatrix, cr1.Files)
  320. addFiles(cr2.FilesMatrix, cr2.Files)
  321. return merged
  322. }
  323. func (couples *CouplesAnalysis) serializeText(result *CouplesResult, writer io.Writer) {
  324. fmt.Fprintln(writer, " files_coocc:")
  325. fmt.Fprintln(writer, " index:")
  326. for _, file := range result.Files {
  327. fmt.Fprintf(writer, " - %s\n", yaml.SafeString(file))
  328. }
  329. fmt.Fprintln(writer, " matrix:")
  330. for _, files := range result.FilesMatrix {
  331. fmt.Fprint(writer, " - {")
  332. indices := []int{}
  333. for file := range files {
  334. indices = append(indices, file)
  335. }
  336. sort.Ints(indices)
  337. for i, file := range indices {
  338. fmt.Fprintf(writer, "%d: %d", file, files[file])
  339. if i < len(indices)-1 {
  340. fmt.Fprint(writer, ", ")
  341. }
  342. }
  343. fmt.Fprintln(writer, "}")
  344. }
  345. fmt.Fprintln(writer, " people_coocc:")
  346. fmt.Fprintln(writer, " index:")
  347. for _, person := range couples.reversedPeopleDict {
  348. fmt.Fprintf(writer, " - %s\n", yaml.SafeString(person))
  349. }
  350. fmt.Fprintln(writer, " matrix:")
  351. for _, people := range result.PeopleMatrix {
  352. fmt.Fprint(writer, " - {")
  353. indices := []int{}
  354. for file := range people {
  355. indices = append(indices, file)
  356. }
  357. sort.Ints(indices)
  358. for i, person := range indices {
  359. fmt.Fprintf(writer, "%d: %d", person, people[person])
  360. if i < len(indices)-1 {
  361. fmt.Fprint(writer, ", ")
  362. }
  363. }
  364. fmt.Fprintln(writer, "}")
  365. }
  366. fmt.Fprintln(writer, " author_files:") // sorted by number of files each author changed
  367. peopleFiles := sortByNumberOfFiles(result.PeopleFiles, couples.reversedPeopleDict, result.Files)
  368. for _, authorFiles := range peopleFiles {
  369. fmt.Fprintf(writer, " - %s:\n", yaml.SafeString(authorFiles.Author))
  370. sort.Strings(authorFiles.Files)
  371. for _, file := range authorFiles.Files {
  372. fmt.Fprintf(writer, " - %s\n", yaml.SafeString(file)) // sorted by path
  373. }
  374. }
  375. }
  376. func sortByNumberOfFiles(
  377. peopleFiles [][]int, peopleDict []string, filesDict []string) authorFilesList {
  378. var pfl authorFilesList
  379. for peopleIdx, files := range peopleFiles {
  380. if peopleIdx < len(peopleDict) {
  381. fileNames := make([]string, len(files))
  382. for i, fi := range files {
  383. fileNames[i] = filesDict[fi]
  384. }
  385. pfl = append(pfl, authorFiles{peopleDict[peopleIdx], fileNames})
  386. }
  387. }
  388. sort.Sort(pfl)
  389. return pfl
  390. }
  391. type authorFiles struct {
  392. Author string
  393. Files []string
  394. }
  395. type authorFilesList []authorFiles
  396. func (s authorFilesList) Len() int {
  397. return len(s)
  398. }
  399. func (s authorFilesList) Swap(i, j int) {
  400. s[i], s[j] = s[j], s[i]
  401. }
  402. func (s authorFilesList) Less(i, j int) bool {
  403. return len(s[i].Files) < len(s[j].Files)
  404. }
  405. func (couples *CouplesAnalysis) serializeBinary(result *CouplesResult, writer io.Writer) error {
  406. message := pb.CouplesAnalysisResults{}
  407. message.FileCouples = &pb.Couples{
  408. Index: result.Files,
  409. Matrix: pb.MapToCompressedSparseRowMatrix(result.FilesMatrix),
  410. }
  411. message.PeopleCouples = &pb.Couples{
  412. Index: result.reversedPeopleDict,
  413. Matrix: pb.MapToCompressedSparseRowMatrix(result.PeopleMatrix),
  414. }
  415. message.PeopleFiles = make([]*pb.TouchedFiles, len(result.reversedPeopleDict))
  416. for key := range result.reversedPeopleDict {
  417. files := result.PeopleFiles[key]
  418. int32Files := make([]int32, len(files))
  419. for i, f := range files {
  420. int32Files[i] = int32(f)
  421. }
  422. message.PeopleFiles[key] = &pb.TouchedFiles{
  423. Files: int32Files,
  424. }
  425. }
  426. serialized, err := proto.Marshal(&message)
  427. if err != nil {
  428. return err
  429. }
  430. writer.Write(serialized)
  431. return nil
  432. }
  433. func init() {
  434. core.Registry.Register(&CouplesAnalysis{})
  435. }