couples.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466
  1. package hercules
  2. import (
  3. "fmt"
  4. "io"
  5. "sort"
  6. "github.com/gogo/protobuf/proto"
  7. "gopkg.in/src-d/go-git.v4"
  8. "gopkg.in/src-d/go-git.v4/plumbing/object"
  9. "gopkg.in/src-d/go-git.v4/utils/merkletrie"
  10. "gopkg.in/src-d/hercules.v3/pb"
  11. "gopkg.in/src-d/hercules.v3/yaml"
  12. )
  // CouplesAnalysis counts how often files, and how often developers, are changed together.
  // Finalize() converts the accumulated counters into matrices in which the cell at row X and
  // column Y is the number of commits which changed X and Y together. For developers, the
  // numbers are summed over every file which both of them touched.
  type CouplesAnalysis struct {
  	// PeopleNumber is the number of developers for which to build the matrix.
  	// Initialize() allocates PeopleNumber+1 slots; Consume() routes commits whose author
  	// is AuthorMissing to the extra, last slot.
  	// NOTE(review): an earlier comment stated that 0 disables this analysis; nothing in
  	// this file checks for that - confirm against the pipeline.
  	PeopleNumber int

  	// people[i][file] is how many changes developer i made to file.
  	people []map[string]int
  	// peopleCommits[i] is how many commits developer i made (one per Consume() call).
  	peopleCommits []int
  	// files[a][b] is how many commits inserted or modified files a and b together.
  	files map[string]map[string]int

  	// reversedPeopleDict references IdentityDetector.ReversedPeopleDict.
  	reversedPeopleDict []string
  }
  // CouplesResult is what CouplesAnalysis.Finalize() returns: the coupling matrices for
  // developers and for files, together with the dictionaries needed to decode the indices.
  type CouplesResult struct {
  	// PeopleMatrix[i][j] is the coupling between developers i and j.
  	PeopleMatrix []map[int]int64
  	// PeopleFiles[i] lists, in ascending order, the indices in Files which developer i touched.
  	PeopleFiles [][]int
  	// FilesMatrix[a][b] is the number of common commits of the files Files[a] and Files[b].
  	FilesMatrix []map[int]int64
  	// Files maps a file index to the file name.
  	Files []string

  	// reversedPeopleDict references IdentityDetector.ReversedPeopleDict.
  	reversedPeopleDict []string
  }
  38. // Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
  39. func (couples *CouplesAnalysis) Name() string {
  40. return "Couples"
  41. }
  42. // Provides returns the list of names of entities which are produced by this PipelineItem.
  43. // Each produced entity will be inserted into `deps` of dependent Consume()-s according
  44. // to this list. Also used by hercules.Registry to build the global map of providers.
  45. func (couples *CouplesAnalysis) Provides() []string {
  46. return []string{}
  47. }
  48. // Requires returns the list of names of entities which are needed by this PipelineItem.
  49. // Each requested entity will be inserted into `deps` of Consume(). In turn, those
  50. // entities are Provides() upstream.
  51. func (couples *CouplesAnalysis) Requires() []string {
  52. arr := [...]string{DependencyAuthor, DependencyTreeChanges}
  53. return arr[:]
  54. }
  55. // ListConfigurationOptions returns the list of changeable public properties of this PipelineItem.
  56. func (couples *CouplesAnalysis) ListConfigurationOptions() []ConfigurationOption {
  57. return []ConfigurationOption{}
  58. }
  59. // Configure sets the properties previously published by ListConfigurationOptions().
  60. func (couples *CouplesAnalysis) Configure(facts map[string]interface{}) {
  61. if val, exists := facts[FactIdentityDetectorPeopleCount].(int); exists {
  62. couples.PeopleNumber = val
  63. couples.reversedPeopleDict = facts[FactIdentityDetectorReversedPeopleDict].([]string)
  64. }
  65. }
  66. // Flag for the command line switch which enables this analysis.
  67. func (couples *CouplesAnalysis) Flag() string {
  68. return "couples"
  69. }
  70. // Initialize resets the temporary caches and prepares this PipelineItem for a series of Consume()
  71. // calls. The repository which is going to be analysed is supplied as an argument.
  72. func (couples *CouplesAnalysis) Initialize(repository *git.Repository) {
  73. couples.people = make([]map[string]int, couples.PeopleNumber+1)
  74. for i := range couples.people {
  75. couples.people[i] = map[string]int{}
  76. }
  77. couples.peopleCommits = make([]int, couples.PeopleNumber+1)
  78. couples.files = map[string]map[string]int{}
  79. }
  80. // Consume runs this PipelineItem on the next commit data.
  81. // `deps` contain all the results from upstream PipelineItem-s as requested by Requires().
  82. // Additionally, "commit" is always present there and represents the analysed *object.Commit.
  83. // This function returns the mapping with analysis results. The keys must be the same as
  84. // in Provides(). If there was an error, nil is returned.
  85. func (couples *CouplesAnalysis) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  86. author := deps[DependencyAuthor].(int)
  87. if author == AuthorMissing {
  88. author = couples.PeopleNumber
  89. }
  90. couples.peopleCommits[author]++
  91. treeDiff := deps[DependencyTreeChanges].(object.Changes)
  92. context := make([]string, 0)
  93. deleteFile := func(name string) {
  94. // we do not remove the file from people - the context does not expire
  95. delete(couples.files, name)
  96. for _, otherFiles := range couples.files {
  97. delete(otherFiles, name)
  98. }
  99. }
  100. for _, change := range treeDiff {
  101. action, err := change.Action()
  102. if err != nil {
  103. return nil, err
  104. }
  105. toName := change.To.Name
  106. fromName := change.From.Name
  107. switch action {
  108. case merkletrie.Insert:
  109. context = append(context, toName)
  110. couples.people[author][toName]++
  111. case merkletrie.Delete:
  112. deleteFile(fromName)
  113. couples.people[author][fromName]++
  114. case merkletrie.Modify:
  115. if fromName != toName {
  116. // renamed
  117. couples.files[toName] = couples.files[fromName]
  118. for _, otherFiles := range couples.files {
  119. val, exists := otherFiles[fromName]
  120. if exists {
  121. otherFiles[toName] = val
  122. }
  123. }
  124. deleteFile(fromName)
  125. for _, authorFiles := range couples.people {
  126. val, exists := authorFiles[fromName]
  127. if exists {
  128. authorFiles[toName] = val
  129. delete(authorFiles, fromName)
  130. }
  131. }
  132. }
  133. context = append(context, toName)
  134. couples.people[author][toName]++
  135. }
  136. }
  137. for _, file := range context {
  138. for _, otherFile := range context {
  139. lane, exists := couples.files[file]
  140. if !exists {
  141. lane = map[string]int{}
  142. couples.files[file] = lane
  143. }
  144. lane[otherFile]++
  145. }
  146. }
  147. return nil, nil
  148. }
  149. // Finalize returns the result of the analysis. Further Consume() calls are not expected.
  150. func (couples *CouplesAnalysis) Finalize() interface{} {
  151. filesSequence := make([]string, len(couples.files))
  152. i := 0
  153. for file := range couples.files {
  154. filesSequence[i] = file
  155. i++
  156. }
  157. sort.Strings(filesSequence)
  158. filesIndex := map[string]int{}
  159. for i, file := range filesSequence {
  160. filesIndex[file] = i
  161. }
  162. peopleMatrix := make([]map[int]int64, couples.PeopleNumber+1)
  163. peopleFiles := make([][]int, couples.PeopleNumber+1)
  164. for i := range peopleMatrix {
  165. peopleMatrix[i] = map[int]int64{}
  166. for file, commits := range couples.people[i] {
  167. fi, exists := filesIndex[file]
  168. if exists {
  169. peopleFiles[i] = append(peopleFiles[i], fi)
  170. }
  171. for j, otherFiles := range couples.people {
  172. otherCommits := otherFiles[file]
  173. delta := otherCommits
  174. if otherCommits > commits {
  175. delta = commits
  176. }
  177. if delta > 0 {
  178. peopleMatrix[i][j] += int64(delta)
  179. }
  180. }
  181. }
  182. sort.Ints(peopleFiles[i])
  183. }
  184. filesMatrix := make([]map[int]int64, len(filesIndex))
  185. for i := range filesMatrix {
  186. filesMatrix[i] = map[int]int64{}
  187. for otherFile, cooccs := range couples.files[filesSequence[i]] {
  188. filesMatrix[i][filesIndex[otherFile]] = int64(cooccs)
  189. }
  190. }
  191. return CouplesResult{
  192. PeopleMatrix: peopleMatrix,
  193. PeopleFiles: peopleFiles,
  194. Files: filesSequence,
  195. FilesMatrix: filesMatrix,
  196. reversedPeopleDict: couples.reversedPeopleDict,
  197. }
  198. }
  199. // Serialize converts the analysis result as returned by Finalize() to text or bytes.
  200. // The text format is YAML and the bytes format is Protocol Buffers.
  201. func (couples *CouplesAnalysis) Serialize(result interface{}, binary bool, writer io.Writer) error {
  202. couplesResult := result.(CouplesResult)
  203. if binary {
  204. return couples.serializeBinary(&couplesResult, writer)
  205. }
  206. couples.serializeText(&couplesResult, writer)
  207. return nil
  208. }
  209. // Deserialize converts the specified protobuf bytes to CouplesResult.
  210. func (couples *CouplesAnalysis) Deserialize(pbmessage []byte) (interface{}, error) {
  211. message := pb.CouplesAnalysisResults{}
  212. err := proto.Unmarshal(pbmessage, &message)
  213. if err != nil {
  214. return nil, err
  215. }
  216. result := CouplesResult{
  217. Files: message.FileCouples.Index,
  218. FilesMatrix: make([]map[int]int64, message.FileCouples.Matrix.NumberOfRows),
  219. PeopleFiles: make([][]int, len(message.PeopleCouples.Index)),
  220. PeopleMatrix: make([]map[int]int64, message.PeopleCouples.Matrix.NumberOfRows),
  221. reversedPeopleDict: message.PeopleCouples.Index,
  222. }
  223. for i, files := range message.PeopleFiles {
  224. result.PeopleFiles[i] = make([]int, len(files.Files))
  225. for j, val := range files.Files {
  226. result.PeopleFiles[i][j] = int(val)
  227. }
  228. }
  229. convertCSR := func(dest []map[int]int64, src *pb.CompressedSparseRowMatrix) {
  230. for indptr := range src.Indptr {
  231. if indptr == 0 {
  232. continue
  233. }
  234. dest[indptr-1] = map[int]int64{}
  235. for j := src.Indptr[indptr-1]; j < src.Indptr[indptr]; j++ {
  236. dest[indptr-1][int(src.Indices[j])] = src.Data[j]
  237. }
  238. }
  239. }
  240. convertCSR(result.FilesMatrix, message.FileCouples.Matrix)
  241. convertCSR(result.PeopleMatrix, message.PeopleCouples.Matrix)
  242. return result, nil
  243. }
  244. // MergeResults combines two CouplesAnalysis-s together.
  245. func (couples *CouplesAnalysis) MergeResults(r1, r2 interface{}, c1, c2 *CommonAnalysisResult) interface{} {
  246. cr1 := r1.(CouplesResult)
  247. cr2 := r2.(CouplesResult)
  248. merged := CouplesResult{}
  249. var people, files map[string][3]int
  250. people, merged.reversedPeopleDict = IdentityDetector{}.MergeReversedDicts(
  251. cr1.reversedPeopleDict, cr2.reversedPeopleDict)
  252. files, merged.Files = IdentityDetector{}.MergeReversedDicts(cr1.Files, cr2.Files)
  253. merged.PeopleFiles = make([][]int, len(merged.reversedPeopleDict))
  254. peopleFilesDicts := make([]map[int]bool, len(merged.reversedPeopleDict))
  255. addPeopleFiles := func(peopleFiles [][]int, reversedPeopleDict []string,
  256. reversedFilesDict []string) {
  257. for pi, fs := range peopleFiles {
  258. idx := people[reversedPeopleDict[pi]][0]
  259. m := peopleFilesDicts[idx]
  260. if m == nil {
  261. m = map[int]bool{}
  262. peopleFilesDicts[idx] = m
  263. }
  264. for _, f := range fs {
  265. m[files[reversedFilesDict[f]][0]] = true
  266. }
  267. }
  268. }
  269. addPeopleFiles(cr1.PeopleFiles, cr1.reversedPeopleDict, cr1.Files)
  270. addPeopleFiles(cr2.PeopleFiles, cr2.reversedPeopleDict, cr2.Files)
  271. for i, m := range peopleFilesDicts {
  272. merged.PeopleFiles[i] = make([]int, len(m))
  273. j := 0
  274. for f := range m {
  275. merged.PeopleFiles[i][j] = f
  276. j++
  277. }
  278. sort.Ints(merged.PeopleFiles[i])
  279. }
  280. merged.PeopleMatrix = make([]map[int]int64, len(merged.reversedPeopleDict)+1)
  281. addPeople := func(peopleMatrix []map[int]int64, reversedPeopleDict []string,
  282. reversedFilesDict []string) {
  283. for pi, pc := range peopleMatrix {
  284. var idx int
  285. if pi < len(reversedPeopleDict) {
  286. idx = people[reversedPeopleDict[pi]][0]
  287. } else {
  288. idx = len(merged.reversedPeopleDict)
  289. }
  290. m := merged.PeopleMatrix[idx]
  291. if m == nil {
  292. m = map[int]int64{}
  293. merged.PeopleMatrix[idx] = m
  294. }
  295. for file, val := range pc {
  296. m[files[reversedFilesDict[file]][0]] += val
  297. }
  298. }
  299. }
  300. addPeople(cr1.PeopleMatrix, cr1.reversedPeopleDict, cr1.Files)
  301. addPeople(cr2.PeopleMatrix, cr2.reversedPeopleDict, cr2.Files)
  302. merged.FilesMatrix = make([]map[int]int64, len(merged.Files))
  303. addFiles := func(filesMatrix []map[int]int64, reversedFilesDict []string) {
  304. for fi, fc := range filesMatrix {
  305. idx := people[reversedFilesDict[fi]][0]
  306. m := merged.FilesMatrix[idx]
  307. if m == nil {
  308. m = map[int]int64{}
  309. merged.FilesMatrix[idx] = m
  310. }
  311. for file, val := range fc {
  312. m[files[reversedFilesDict[file]][0]] += val
  313. }
  314. }
  315. }
  316. addFiles(cr1.FilesMatrix, cr1.Files)
  317. addFiles(cr2.FilesMatrix, cr2.Files)
  318. return merged
  319. }
  320. func (couples *CouplesAnalysis) serializeText(result *CouplesResult, writer io.Writer) {
  321. fmt.Fprintln(writer, " files_coocc:")
  322. fmt.Fprintln(writer, " index:")
  323. for _, file := range result.Files {
  324. fmt.Fprintf(writer, " - %s\n", yaml.SafeString(file))
  325. }
  326. fmt.Fprintln(writer, " matrix:")
  327. for _, files := range result.FilesMatrix {
  328. fmt.Fprint(writer, " - {")
  329. indices := []int{}
  330. for file := range files {
  331. indices = append(indices, file)
  332. }
  333. sort.Ints(indices)
  334. for i, file := range indices {
  335. fmt.Fprintf(writer, "%d: %d", file, files[file])
  336. if i < len(indices)-1 {
  337. fmt.Fprint(writer, ", ")
  338. }
  339. }
  340. fmt.Fprintln(writer, "}")
  341. }
  342. fmt.Fprintln(writer, " people_coocc:")
  343. fmt.Fprintln(writer, " index:")
  344. for _, person := range couples.reversedPeopleDict {
  345. fmt.Fprintf(writer, " - %s\n", yaml.SafeString(person))
  346. }
  347. fmt.Fprintln(writer, " matrix:")
  348. for _, people := range result.PeopleMatrix {
  349. fmt.Fprint(writer, " - {")
  350. indices := []int{}
  351. for file := range people {
  352. indices = append(indices, file)
  353. }
  354. sort.Ints(indices)
  355. for i, person := range indices {
  356. fmt.Fprintf(writer, "%d: %d", person, people[person])
  357. if i < len(indices)-1 {
  358. fmt.Fprint(writer, ", ")
  359. }
  360. }
  361. fmt.Fprintln(writer, "}")
  362. }
  363. fmt.Fprintln(writer, " author_files:") // sorted by number of files each author changed
  364. peopleFiles := sortByNumberOfFiles(result.PeopleFiles, couples.reversedPeopleDict, result.Files)
  365. for _, authorFiles := range peopleFiles {
  366. fmt.Fprintf(writer, " - %s:\n", yaml.SafeString(authorFiles.Author))
  367. sort.Strings(authorFiles.Files)
  368. for _, file := range authorFiles.Files {
  369. fmt.Fprintf(writer, " - %s\n", yaml.SafeString(file)) // sorted by path
  370. }
  371. }
  372. }
  // sortByNumberOfFiles resolves the file indices touched by every author into file names
  // and returns the authors ordered by the ascending number of touched files.
  // peopleFiles[i] holds the indices into filesDict for the author named peopleDict[i];
  // entries of peopleFiles which have no name in peopleDict are skipped. The result is nil
  // when no author qualifies.
  func sortByNumberOfFiles(
  	peopleFiles [][]int, peopleDict []string, filesDict []string) authorFilesList {
  	var ranked authorFilesList
  	for author, indices := range peopleFiles {
  		if author >= len(peopleDict) {
  			continue
  		}
  		names := make([]string, 0, len(indices))
  		for _, idx := range indices {
  			names = append(names, filesDict[idx])
  		}
  		ranked = append(ranked, authorFiles{Author: peopleDict[author], Files: names})
  	}
  	sort.Sort(ranked)
  	return ranked
  }

  // authorFiles binds an author name to the names of the files which that author touched.
  type authorFiles struct {
  	Author string
  	Files  []string
  }

  // authorFilesList implements sort.Interface, ordering authors by the number of their files.
  type authorFilesList []authorFiles

  // Len returns the number of authors in the list.
  func (list authorFilesList) Len() int {
  	return len(list)
  }

  // Swap exchanges the authors at positions i and j.
  func (list authorFilesList) Swap(i, j int) {
  	list[j], list[i] = list[i], list[j]
  }

  // Less reports whether the author at position i touched fewer files than the one at j.
  func (list authorFilesList) Less(i, j int) bool {
  	left, right := len(list[i].Files), len(list[j].Files)
  	return left < right
  }
  402. func (couples *CouplesAnalysis) serializeBinary(result *CouplesResult, writer io.Writer) error {
  403. message := pb.CouplesAnalysisResults{}
  404. message.FileCouples = &pb.Couples{
  405. Index: result.Files,
  406. Matrix: pb.MapToCompressedSparseRowMatrix(result.FilesMatrix),
  407. }
  408. message.PeopleCouples = &pb.Couples{
  409. Index: result.reversedPeopleDict,
  410. Matrix: pb.MapToCompressedSparseRowMatrix(result.PeopleMatrix),
  411. }
  412. message.PeopleFiles = make([]*pb.TouchedFiles, len(result.reversedPeopleDict))
  413. for key := range result.reversedPeopleDict {
  414. files := result.PeopleFiles[key]
  415. int32Files := make([]int32, len(files))
  416. for i, f := range files {
  417. int32Files[i] = int32(f)
  418. }
  419. message.PeopleFiles[key] = &pb.TouchedFiles{
  420. Files: int32Files,
  421. }
  422. }
  423. serialized, err := proto.Marshal(&message)
  424. if err != nil {
  425. return err
  426. }
  427. writer.Write(serialized)
  428. return nil
  429. }
  430. func init() {
  431. Registry.Register(&CouplesAnalysis{})
  432. }