浏览代码

Merge pull request #7 from bzz/add-files-per-author

people_cooc: add files-per-author information to .yml
Vadim Markovtsev 7 年之前
父节点
当前提交
db325a212d
共有 4 个文件被更改,包括 114 次插入42 次删除
  1. 18 3
      README.md
  2. 76 37
      cmd/hercules/main.go
  3. 8 2
      couples.go
  4. 12 0
      requirements.txt

+ 18 - 3
README.md

@@ -38,7 +38,7 @@ There is a [presentation](http://vmarkovtsev.github.io/techtalks-2017-moscow-lig
 You are going to need Go and Python 2 or 3.
 You are going to need Go and Python 2 or 3.
 ```
 ```
 go get gopkg.in/src-d/hercules.v1/cmd/hercules
 go get gopkg.in/src-d/hercules.v1/cmd/hercules
-pip install pandas seaborn
+pip install -r requirements.txt
 wget https://github.com/src-d/hercules/raw/master/labours.py
 wget https://github.com/src-d/hercules/raw/master/labours.py
 ```
 ```
 
 
@@ -137,7 +137,7 @@ co-occurrence probability through the Euclidean distance. The training requires
 [Tensorflow](http://tensorflow.org) installation. The intermediate files are stored in the
 [Tensorflow](http://tensorflow.org) installation. The intermediate files are stored in the
 system temporary directory or `--couples-tmp-dir` if it is specified. The trained embeddings are
 system temporary directory or `--couples-tmp-dir` if it is specified. The trained embeddings are
 written to the current working directory with the name depending on `-o`. The output format is TSV
 written to the current working directory with the name depending on `-o`. The output format is TSV
-and matches [Tensorflow Projector])(http://projector.tensorflow.org/) so that the files and people
+and matches [Tensorflow Projector](http://projector.tensorflow.org/) so that the files and people
 can be visualized with t-SNE implemented in TF Projector.
 can be visualized with t-SNE implemented in TF Projector.
 
 
 #### Everything in a single pass
 #### Everything in a single pass
@@ -168,6 +168,11 @@ python3 labours.py [--style=white|black] [--backend=]
 `--style` changes the background to be either white ("black" foreground) or black ("white" foreground).
 `--style` changes the background to be either white ("black" foreground) or black ("white" foreground).
 `--backend` chooses the Matplotlib backend.
 `--backend` chooses the Matplotlib backend.
 
 
+To use matplotlib on macOS and avoid runtime errors, one can pin the default backend with:
+```
+echo "backend: TkAgg" > ~/.matplotlib/matplotlibrc
+```
+
 These options are effective in burndown charts only:
 These options are effective in burndown charts only:
 
 
 ```
 ```
@@ -181,9 +186,19 @@ python3 labours.py [--text-size] [--relative]
 1. Currently, go-git's file system storage backend is considerably slower than the in-memory one,
 1. Currently, go-git's file system storage backend is considerably slower than the in-memory one,
 so you should clone repos instead of reading them from disk whenever possible. Please note that the
 so you should clone repos instead of reading them from disk whenever possible. Please note that the
 in-memory storage may require much RAM, for example, the Linux kernel takes over 200GB in 2017.
 in-memory storage may require much RAM, for example, the Linux kernel takes over 200GB in 2017.
-2. Parsing YAML in Python is slow when the number of internal objects is big. `hercules`' output
+1. Parsing YAML in Python is slow when the number of internal objects is big. `hercules`' output
 for the Linux kernel in "couples" mode is 1.5 GB and takes more than an hour / 180GB RAM to be
 for the Linux kernel in "couples" mode is 1.5 GB and takes more than an hour / 180GB RAM to be
 parsed. However, most of the repositories are parsed within a minute.
 parsed. However, most of the repositories are parsed within a minute.
+1. To speed up YAML parsing:
+   ```
+   apt-get install yaml-cpp-dev
+   #or
+   brew install yaml-cpp libyaml
+
+   #you might need to re-install pyyaml for changes to take effect
+   pip uninstall pyyaml
+   pip --no-cache-dir install pyyaml
+   ```
 
 
 ### License
 ### License
 MIT.
 MIT.

+ 76 - 37
cmd/hercules/main.go

@@ -78,47 +78,86 @@ func printMatrix(matrix [][]int64, name string, fixNegative bool) {
 
 
 func printCouples(result *hercules.CouplesResult, peopleDict []string) {
 func printCouples(result *hercules.CouplesResult, peopleDict []string) {
 	fmt.Println("files_coocc:")
 	fmt.Println("files_coocc:")
-		fmt.Println("  index:")
-		for _, file := range result.Files {
-			fmt.Printf("    - %s\n", safeString(file))
+	fmt.Println("  index:")
+	for _, file := range result.Files {
+		fmt.Printf("    - %s\n", safeString(file))
+	}
+
+	fmt.Println("  matrix:")
+	for _, files := range result.FilesMatrix {
+		fmt.Print("    - {")
+		indices := []int{}
+		for file := range files {
+			indices = append(indices, file)
 		}
 		}
-		fmt.Println("  matrix:")
-		for _, files := range result.FilesMatrix {
-			fmt.Print("    - {")
-			indices := []int{}
-			for file := range files {
-				indices = append(indices, file)
-			}
-			sort.Ints(indices)
-			for i, file := range indices {
-				fmt.Printf("%d: %d", file, files[file])
-				if i < len(indices) - 1 {
-					fmt.Print(", ")
-				}
+		sort.Ints(indices)
+		for i, file := range indices {
+			fmt.Printf("%d: %d", file, files[file])
+			if i < len(indices)-1 {
+				fmt.Print(", ")
 			}
 			}
-			fmt.Println("}")
 		}
 		}
-		fmt.Println("people_coocc:")
-	  fmt.Println("  index:")
-	  for _, person := range peopleDict {
-		  fmt.Printf("    - %s\n", safeString(person))
-	  }
-	  fmt.Println("  matrix:")
-		for _, people := range result.PeopleMatrix {
-			fmt.Print("    - {")
-			indices := []int{}
-			for file := range people {
-				indices = append(indices, file)
-			}
-			sort.Ints(indices)
-			for i, person := range indices {
-				fmt.Printf("%d: %d", person, people[person])
-				if i < len(indices) - 1 {
-					fmt.Print(", ")
-				}
+		fmt.Println("}")
+	}
+
+	fmt.Println("people_coocc:")
+	fmt.Println("  index:")
+	for _, person := range peopleDict {
+		fmt.Printf("    - %s\n", safeString(person))
+	}
+
+	fmt.Println("  matrix:")
+	for _, people := range result.PeopleMatrix {
+		fmt.Print("    - {")
+		indices := []int{}
+		for file := range people {
+			indices = append(indices, file)
+		}
+		sort.Ints(indices)
+		for i, person := range indices {
+			fmt.Printf("%d: %d", person, people[person])
+			if i < len(indices)-1 {
+				fmt.Print(", ")
 			}
 			}
-			fmt.Println("}")
 		}
 		}
+		fmt.Println("}")
+	}
+
+	fmt.Println("  author_files:") // sorted by number of files each author changed
+	peopleFiles := sortByNumberOfFiles(result.PeopleFiles, peopleDict)
+	for _, authorFiles := range peopleFiles {
+		fmt.Printf("    - %s:\n", safeString(authorFiles.Author))
+		sort.Strings(authorFiles.Files)
+		for _, file := range authorFiles.Files {
+			fmt.Printf("      - %s\n", safeString(file)) // sorted by path
+		}
+	}
+}
+
+func sortByNumberOfFiles(peopleFiles [][]string, peopleDict []string) AuthorFilesList {
+	var pfl AuthorFilesList
+	for peopleIdx, files := range peopleFiles {
+		pfl = append(pfl, AuthorFiles{peopleDict[peopleIdx], files})
+	}
+	sort.Sort(pfl)
+	return pfl
+}
+
+type AuthorFiles struct {
+	Author string
+	Files  []string
+}
+
+type AuthorFilesList []AuthorFiles
+
+func (s AuthorFilesList) Len() int {
+	return len(s)
+}
+func (s AuthorFilesList) Swap(i, j int) {
+	s[i], s[j] = s[j], s[i]
+}
+func (s AuthorFilesList) Less(i, j int) bool {
+	return len(s[i].Files) < len(s[j].Files)
 }
 }
 
 
 func sortedKeys(m map[string][][]int64) []string {
 func sortedKeys(m map[string][][]int64) []string {
@@ -282,6 +321,6 @@ func main() {
 		printMatrix(burndown_results.PeopleMatrix, "", false)
 		printMatrix(burndown_results.PeopleMatrix, "", false)
 	}
 	}
 	if with_couples {
 	if with_couples {
-    printCouples(&couples_result, id_matcher.ReversePeopleDict)
+		printCouples(&couples_result, id_matcher.ReversePeopleDict)
 	}
 	}
 }
 }

+ 8 - 2
couples.go

@@ -1,10 +1,11 @@
 package hercules
 package hercules
 
 
 import (
 import (
+	"sort"
+
 	"gopkg.in/src-d/go-git.v4"
 	"gopkg.in/src-d/go-git.v4"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
 	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
 	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
-	"sort"
 )
 )
 
 
 type Couples struct {
 type Couples struct {
@@ -21,6 +22,7 @@ type Couples struct {
 
 
 type CouplesResult struct {
 type CouplesResult struct {
 	PeopleMatrix []map[int]int64
 	PeopleMatrix []map[int]int64
+	PeopleFiles  [][]string
 	FilesMatrix  []map[int]int
 	FilesMatrix  []map[int]int
 	Files        []string
 	Files        []string
 }
 }
@@ -103,9 +105,13 @@ func (couples *Couples) Consume(deps map[string]interface{}) (map[string]interfa
 
 
 func (couples *Couples) Finalize() interface{} {
 func (couples *Couples) Finalize() interface{} {
 	peopleMatrix := make([]map[int]int64, couples.PeopleNumber)
 	peopleMatrix := make([]map[int]int64, couples.PeopleNumber)
+	peopleFiles := make([][]string, couples.PeopleNumber)
 	for i := range peopleMatrix {
 	for i := range peopleMatrix {
 		peopleMatrix[i] = map[int]int64{}
 		peopleMatrix[i] = map[int]int64{}
 		for file, commits := range couples.people[i] {
 		for file, commits := range couples.people[i] {
+			//could be normalized further, by replacing file with idx in fileSequence
+			//but this would trade the space for readability of result
+			peopleFiles[i] = append(peopleFiles[i], file)
 			for j, otherFiles := range couples.people {
 			for j, otherFiles := range couples.people {
 				if i == j {
 				if i == j {
 					continue
 					continue
@@ -140,5 +146,5 @@ func (couples *Couples) Finalize() interface{} {
 			filesMatrix[i][filesIndex[otherFile]] = cooccs
 			filesMatrix[i][filesIndex[otherFile]] = cooccs
 		}
 		}
 	}
 	}
-	return CouplesResult{PeopleMatrix: peopleMatrix, Files: filesSequence, FilesMatrix: filesMatrix}
+	return CouplesResult{PeopleMatrix: peopleMatrix, PeopleFiles: peopleFiles, Files: filesSequence, FilesMatrix: filesMatrix}
 }
 }

+ 12 - 0
requirements.txt

@@ -0,0 +1,12 @@
+args==0.1.0
+clint==0.5.1
+cycler==0.10.0
+matplotlib==2.0.2
+numpy==1.13.1
+pandas==0.20.3
+pyparsing==2.2.0
+python-dateutil==2.6.1
+pytz==2017.2
+PyYAML==3.12
+scipy==0.19.1
+six==1.10.0