Преглед изворни кода

Merge pull request #7 from bzz/add-files-per-author

people_cooc: add files-per-author information to .yml
Vadim Markovtsev пре 7 година
родитељ
комит
db325a212d
4 измењених фајлова са 114 додато и 42 уклоњено
  1. 18 3
      README.md
  2. 76 37
      cmd/hercules/main.go
  3. 8 2
      couples.go
  4. 12 0
      requirements.txt

+ 18 - 3
README.md

@@ -38,7 +38,7 @@ There is a [presentation](http://vmarkovtsev.github.io/techtalks-2017-moscow-lig
 You are going to need Go and Python 2 or 3.
 ```
 go get gopkg.in/src-d/hercules.v1/cmd/hercules
-pip install pandas seaborn
+pip install -r requirements.txt
 wget https://github.com/src-d/hercules/raw/master/labours.py
 ```
 
@@ -137,7 +137,7 @@ co-occurrence probability through the Euclidean distance. The training requires
 [Tensorflow](http://tensorflow.org) installation. The intermediate files are stored in the
 system temporary directory or `--couples-tmp-dir` if it is specified. The trained embeddings are
 written to the current working directory with the name depending on `-o`. The output format is TSV
-and matches [Tensorflow Projector])(http://projector.tensorflow.org/) so that the files and people
+and matches [Tensorflow Projector](http://projector.tensorflow.org/) so that the files and people
 can be visualized with t-SNE implemented in TF Projector.
 
 #### Everything in a single pass
@@ -168,6 +168,11 @@ python3 labours.py [--style=white|black] [--backend=]
 `--style` changes the background to be either white ("black" foreground) or black ("white" foreground).
 `--backend` chooses the Matplotlib backend.
 
+To use Matplotlib on macOS and avoid runtime errors, pin the default backend with:
+```
+echo "backend: TkAgg" > ~/.matplotlib/matplotlibrc
+```
+
 These options are effective in burndown charts only:
 
 ```
@@ -181,9 +186,19 @@ python3 labours.py [--text-size] [--relative]
 1. Currently, go-git's file system storage backend is considerably slower than the in-memory one,
 so you should clone repos instead of reading them from disk whenever possible. Please note that the
 in-memory storage may require much RAM, for example, the Linux kernel takes over 200GB in 2017.
-2. Parsing YAML in Python is slow when the number of internal objects is big. `hercules`' output
+1. Parsing YAML in Python is slow when the number of internal objects is big. `hercules`' output
 for the Linux kernel in "couples" mode is 1.5 GB and takes more than an hour / 180GB RAM to be
 parsed. However, most of the repositories are parsed within a minute.
+1. To speed up YAML parsing:
+   ```
+   apt-get install yaml-cpp-dev
+   #or
+   brew install yaml-cpp libyaml
+
+   # you might need to reinstall pyyaml for the changes to take effect
+   pip uninstall pyyaml
+   pip --no-cache-dir install pyyaml
+   ```
 
 ### License
 MIT.

+ 76 - 37
cmd/hercules/main.go

@@ -78,47 +78,86 @@ func printMatrix(matrix [][]int64, name string, fixNegative bool) {
 
 func printCouples(result *hercules.CouplesResult, peopleDict []string) {
 	fmt.Println("files_coocc:")
-		fmt.Println("  index:")
-		for _, file := range result.Files {
-			fmt.Printf("    - %s\n", safeString(file))
+	fmt.Println("  index:")
+	for _, file := range result.Files {
+		fmt.Printf("    - %s\n", safeString(file))
+	}
+
+	fmt.Println("  matrix:")
+	for _, files := range result.FilesMatrix {
+		fmt.Print("    - {")
+		indices := []int{}
+		for file := range files {
+			indices = append(indices, file)
 		}
-		fmt.Println("  matrix:")
-		for _, files := range result.FilesMatrix {
-			fmt.Print("    - {")
-			indices := []int{}
-			for file := range files {
-				indices = append(indices, file)
-			}
-			sort.Ints(indices)
-			for i, file := range indices {
-				fmt.Printf("%d: %d", file, files[file])
-				if i < len(indices) - 1 {
-					fmt.Print(", ")
-				}
+		sort.Ints(indices)
+		for i, file := range indices {
+			fmt.Printf("%d: %d", file, files[file])
+			if i < len(indices)-1 {
+				fmt.Print(", ")
 			}
-			fmt.Println("}")
 		}
-		fmt.Println("people_coocc:")
-	  fmt.Println("  index:")
-	  for _, person := range peopleDict {
-		  fmt.Printf("    - %s\n", safeString(person))
-	  }
-	  fmt.Println("  matrix:")
-		for _, people := range result.PeopleMatrix {
-			fmt.Print("    - {")
-			indices := []int{}
-			for file := range people {
-				indices = append(indices, file)
-			}
-			sort.Ints(indices)
-			for i, person := range indices {
-				fmt.Printf("%d: %d", person, people[person])
-				if i < len(indices) - 1 {
-					fmt.Print(", ")
-				}
+		fmt.Println("}")
+	}
+
+	fmt.Println("people_coocc:")
+	fmt.Println("  index:")
+	for _, person := range peopleDict {
+		fmt.Printf("    - %s\n", safeString(person))
+	}
+
+	fmt.Println("  matrix:")
+	for _, people := range result.PeopleMatrix {
+		fmt.Print("    - {")
+		indices := []int{}
+		for file := range people {
+			indices = append(indices, file)
+		}
+		sort.Ints(indices)
+		for i, person := range indices {
+			fmt.Printf("%d: %d", person, people[person])
+			if i < len(indices)-1 {
+				fmt.Print(", ")
 			}
-			fmt.Println("}")
 		}
+		fmt.Println("}")
+	}
+
+	fmt.Println("  author_files:") // sorted by number of files each author changed
+	peopleFiles := sortByNumberOfFiles(result.PeopleFiles, peopleDict)
+	for _, authorFiles := range peopleFiles {
+		fmt.Printf("    - %s:\n", safeString(authorFiles.Author))
+		sort.Strings(authorFiles.Files)
+		for _, file := range authorFiles.Files {
+			fmt.Printf("      - %s\n", safeString(file)) // sorted by path
+		}
+	}
+}
+
+func sortByNumberOfFiles(peopleFiles [][]string, peopleDict []string) AuthorFilesList {
+	var pfl AuthorFilesList
+	for peopleIdx, files := range peopleFiles {
+		pfl = append(pfl, AuthorFiles{peopleDict[peopleIdx], files})
+	}
+	sort.Sort(pfl)
+	return pfl
+}
+
+type AuthorFiles struct {
+	Author string
+	Files  []string
+}
+
+type AuthorFilesList []AuthorFiles
+
+func (s AuthorFilesList) Len() int {
+	return len(s)
+}
+func (s AuthorFilesList) Swap(i, j int) {
+	s[i], s[j] = s[j], s[i]
+}
+func (s AuthorFilesList) Less(i, j int) bool {
+	return len(s[i].Files) < len(s[j].Files)
 }
 
 func sortedKeys(m map[string][][]int64) []string {
@@ -282,6 +321,6 @@ func main() {
 		printMatrix(burndown_results.PeopleMatrix, "", false)
 	}
 	if with_couples {
-    printCouples(&couples_result, id_matcher.ReversePeopleDict)
+		printCouples(&couples_result, id_matcher.ReversePeopleDict)
 	}
 }

+ 8 - 2
couples.go

@@ -1,10 +1,11 @@
 package hercules
 
 import (
+	"sort"
+
 	"gopkg.in/src-d/go-git.v4"
 	"gopkg.in/src-d/go-git.v4/plumbing/object"
 	"gopkg.in/src-d/go-git.v4/utils/merkletrie"
-	"sort"
 )
 
 type Couples struct {
@@ -21,6 +22,7 @@ type Couples struct {
 
 type CouplesResult struct {
 	PeopleMatrix []map[int]int64
+	PeopleFiles  [][]string
 	FilesMatrix  []map[int]int
 	Files        []string
 }
@@ -103,9 +105,13 @@ func (couples *Couples) Consume(deps map[string]interface{}) (map[string]interfa
 
 func (couples *Couples) Finalize() interface{} {
 	peopleMatrix := make([]map[int]int64, couples.PeopleNumber)
+	peopleFiles := make([][]string, couples.PeopleNumber)
 	for i := range peopleMatrix {
 		peopleMatrix[i] = map[int]int64{}
 		for file, commits := range couples.people[i] {
+			// Could be normalized further by replacing file with its index in filesSequence,
+			// but that would trade readability of the result for space.
+			peopleFiles[i] = append(peopleFiles[i], file)
 			for j, otherFiles := range couples.people {
 				if i == j {
 					continue
@@ -140,5 +146,5 @@ func (couples *Couples) Finalize() interface{} {
 			filesMatrix[i][filesIndex[otherFile]] = cooccs
 		}
 	}
-	return CouplesResult{PeopleMatrix: peopleMatrix, Files: filesSequence, FilesMatrix: filesMatrix}
+	return CouplesResult{PeopleMatrix: peopleMatrix, PeopleFiles: peopleFiles, Files: filesSequence, FilesMatrix: filesMatrix}
 }

+ 12 - 0
requirements.txt

@@ -0,0 +1,12 @@
+args==0.1.0
+clint==0.5.1
+cycler==0.10.0
+matplotlib==2.0.2
+numpy==1.13.1
+pandas==0.20.3
+pyparsing==2.2.0
+python-dateutil==2.6.1
+pytz==2017.2
+PyYAML==3.12
+scipy==0.19.1
+six==1.10.0