Vadim Markovtsev 7 years ago
commit b45345ff04
2 changed files with 132 additions and 22 deletions
  1. README.md (+124 -14)
  2. labours.py (+8 -8)

+ 124 - 14
README.md

@@ -3,19 +3,19 @@
 Hercules
 --------
 
-This tool calculates the lines burnout stats in a Git repository.
+This project calculates and plots the lines burndown and other fun stats in Git repositories.
 Exactly the same as what [git-of-theseus](https://github.com/erikbern/git-of-theseus)
 does, but using [go-git](https://github.com/src-d/go-git).
 Why? [source{d}](http://sourced.tech) builds its own data pipeline to
 process every git repository in the world and the calculation of the
-annual burnout ratio will be embedded into it. This project is an
+annual burnout ratio will be embedded into it. `hercules` contains an
 open source implementation of the specific `git blame` flavour on top
-of go-git. Blaming is done incrementally using the custom RB tree tracking
+of go-git. Blaming is performed incrementally using the custom RB tree tracking
 algorithm; only the last modification date is recorded.
 
 There are two tools: `hercules` and `labours.py`. The first is the program
-written in Go which collects the burnout stats from a Git repository.
-The second is the Python script which draws the stack area plot and optionally
+written in Go which collects the burndown and other stats from a Git repository.
+The second is the Python script which draws the stack area plots and optionally
 resamples the time series. These two tools are normally used together through
 a pipe. `hercules` prints results in plain text. The first line is four numbers:
 UNIX timestamp which corresponds to the time the repository was created,
@@ -54,26 +54,136 @@ hercules https://github.com/git/git /tmp/repo-cache | python3 labours.py --resam
 # Now something fun
 # Get the linear history from git rev-list, reverse it
 # Pipe to hercules, produce the snapshots for every 30 days grouped by 30 days
-# Save the raw data to cache.txt, so that later simply cat cache.txt | python3 labours.py
+# Save the raw data to cache.yaml, so that later it is possible to run python3 labours.py -i cache.yaml
 # Pipe the raw data to labours.py, set text font size to 16pt, use Agg matplotlib backend and save the plot to git.png
-git rev-list HEAD | tac | hercules -commits - https://github.com/git/git | tee cache.txt | python3 labours.py --font-size 16 --backend Agg --output git.png
+git rev-list HEAD | tac | hercules -commits - https://github.com/git/git | tee cache.yaml | python3 labours.py --font-size 16 --backend Agg --output git.png
 ```
 
+`labours.py -i /path/to/yaml` reads the output of `hercules` that was previously saved on disk.
+
 ### Extensions
 
-Option `-files` additionally prints the corresponding burndown table for every
-file in the repository. `-people` does the same for the developers; `-people-dict` allows to specify
-the custom identity matching.
+#### Files
+
+```
+hercules -files
+python3 labours.py -m file
+```
+
+Burndown statistics for every file in the repository that is alive in the latest revision.
+
+#### People
+
+```
+hercules -people [-people-dict=/path/to/identities]
+python3 labours.py -m person
+```
+
+Burndown statistics for developers. If `-people-dict` is not specified, the identities are
+discovered by the following algorithm:
+
+0. We start from the root commit towards the HEAD. Emails and names are converted to lower case.
+1. If we process an unknown email and name, record them as a new developer.
+2. If we process a known email but unknown name, match to the developer with the matching email,
+and add the unknown name to the list of that developer's names.
+3. If we process an unknown email but known name, match to the developer with the matching name,
+and add the unknown email to the list of that developer's emails.
+
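+As an illustration only, the discovery steps above can be sketched in Python roughly as follows; the
+real implementation lives in the Go code of `hercules`, and the commit attributes used here are
+hypothetical:
+
+```
+# Rough sketch of the identity discovery, not the actual hercules code.
+def discover_identities(commits):
+    by_email, by_name, people = {}, {}, []
+    for commit in commits:  # from the root commit towards HEAD
+        email = commit.author_email.lower()
+        name = commit.author_name.lower()
+        if email in by_email:
+            dev = by_email[email]
+            if name not in dev["names"]:
+                dev["names"].append(name)
+                by_name[name] = dev
+        elif name in by_name:
+            dev = by_name[name]
+            dev["emails"].append(email)
+            by_email[email] = dev
+        else:
+            dev = {"emails": [email], "names": [name]}
+            people.append(dev)
+            by_email[email] = dev
+            by_name[name] = dev
+    return people
+```
+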
+If `-people-dict` is specified, it should point to a text file with the custom identities. The
+format is: every line describes a single developer and contains all the matching emails and names separated
+by `|`. The case is ignored.
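+
+For example, a hypothetical identities file (all names and emails here are made up):
+
+```
+john doe|john.doe@example.com|jdoe@corp.example.com
+jane smith|jane|jane.smith@example.com
+```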
+
+#### Churn matrix
+
+```
+hercules -people [-people-dict=/path/to/identities]
+python3 labours.py -m churn_matrix
+```
+
+Besides the burndown information, `-people` collects the added and deleted line statistics per
+developer. The churn matrix shows how many lines written by developer A are removed by developer B. It is
+a matrix with N rows and (N+2) columns, where N is the number of developers.
+
+1. The first column is the number of lines the developer wrote.
+2. The second column is how many lines were written by the developer and deleted by unidentified developers
+(if `-people-dict` is not specified, it is always 0).
+3. The rest of the columns show how many lines were written by the developer and deleted by identified
+developers.
+
+The sequence of developers is stored in the `people_sequence` YAML node.
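+
+For plotting, `labours.py` normalizes the rows to the number of individual insertions (the first
+column). A minimal sketch of that normalization, assuming the matrix is already loaded into a numpy
+array:
+
+```
+import numpy
+
+def normalize_churn(matrix):
+    # matrix is an N x (N + 2) churn matrix as described above (a numpy integer array).
+    matrix = matrix.astype(float)
+    written = numpy.copy(matrix[:, 0])
+    written[written == 0] = 1  # avoid division by zero for developers with no insertions
+    return matrix / written[:, None]
+```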
+
+#### Code share
+
+```
+hercules -people [-people-dict=/path/to/identities]
+python3 labours.py -m people
+```
+
+`-people` also allows drawing the code share through time as a stacked area plot. That is,
+how many lines are alive at the sampled moments in time for each identified developer.
+
+#### Couples
+
+```
+hercules -couples [-people-dict=/path/to/identities]
+python3 labours.py -m couples -o <name> [--couples-tmp-dir=/tmp]
+```
+
+The files are coupled if they are changed in the same commit. The developers are coupled if they
+change the same file. `hercules` records the number of couples throughout the whole commit history
+and outputs the two corresponding co-occurrence matrices. `labours.py` then trains
+[Swivel embeddings](https://github.com/src-d/tensorflow-swivel) - dense vectors which reflect the
+co-occurrence probability through the Euclidean distance. The training requires a working
+[Tensorflow](http://tensorflow.org) installation. The intermediate files are stored in the
+system temporary directory or `--couples-tmp-dir` if it is specified. The trained embeddings are
+written to the current working directory with the name depending on `-o`. The output format is TSV
+and matches [Tensorflow Projector](http://projector.tensorflow.org/) so that the files and people
+can be visualized with t-SNE implemented in TF Projector.
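+
+For intuition, the file coupling boils down to counting pairwise co-occurrences within commits. A
+rough Python sketch, not the actual `hercules` code (which is written in Go); `commits` is assumed to
+be an iterable of per-commit file path lists:
+
+```
+from collections import Counter
+from itertools import combinations
+
+def file_cooccurrence(commits):
+    # Count how often every pair of files changes in the same commit.
+    counts = Counter()
+    for files in commits:
+        for a, b in combinations(sorted(set(files)), 2):
+            counts[(a, b)] += 1
+    return counts
+```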
 
-Correspondingly, `labours.py` has `--mode` which allows to plot all the burndowns for files,
-people and the overwrite matrix. The latter shows how much code written by a developer is removed
-by other developers, the rows are normalized to the number of individual insertions.
+#### Everything in a single pass
 
+```
+hercules -files -people -couples [-people-dict=/path/to/identities]
+python3 labours.py -m all
+```
+
+### Bad Unicode errors
+
+YAML does not support the whole range of Unicode characters, and the parser on the `labours.py` side
+may raise exceptions. Filter the output from `hercules` through `fix_yaml_unicode.py` to discard
+such offending characters.
+
+```
+hercules -people https://github.com/... | python3 fix_yaml_unicode.py | python3 labours.py -m people
+```
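+
+A rough Python approximation of such a filter, not the actual `fix_yaml_unicode.py`; the idea is to
+drop the control characters which the YAML parser rejects:
+
+```
+import sys
+
+ALLOWED_CONTROLS = {"\t", "\n", "\r"}
+
+for line in sys.stdin:
+    sys.stdout.write("".join(
+        c for c in line if c in ALLOWED_CONTROLS or ord(c) >= 0x20))
+```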
+
+### Plotting
+
+These options affect all plots:
+
+```
+python3 labours.py [--style=white|black] [--backend=]
+```
+
+`--style` changes the background to be either white ("black" foreground) or black ("white" foreground).
+`--backend` chooses the Matplotlib backend.
+
+These options are effective in burndown charts only:
+
+```
+python3 labours.py [--text-size] [--relative]
+```
 
+`--text-size` changes the font size, and `--relative` activates the stretched burndown layout.
 
 ### Caveats
 
-1. Currently, go-git's "file system" backend is considerably slower than the in-memory one, so you should clone repos instead of reading them from disk whenever possible.
+1. Currently, go-git's file system storage backend is considerably slower than the in-memory one,
+so you should clone repos instead of reading them from disk whenever possible. Please note that the
+in-memory storage may require a lot of RAM; for example, the Linux kernel takes over 200 GB as of 2017.
+2. Parsing YAML in Python is slow when the number of internal objects is large. `hercules`' output
+for the Linux kernel in "couples" mode is 1.5 GB and takes more than an hour and 180 GB of RAM to
+parse. However, most repositories are parsed within a minute.
 
 ### License
 MIT.

+ 8 - 8
labours.py

@@ -36,7 +36,7 @@ def parse_args():
                         help="Occupy 100%% height for every measurement.")
     parser.add_argument("--couples-tmp-dir", help="Temporary directory to work with couples.")
     parser.add_argument("-m", "--mode",
-                        choices=["project", "file", "person", "matrix", "people", "couples",
+                        choices=["project", "file", "person", "churn_matrix", "people", "couples",
                                  "all"],
                         default="project", help="What to plot.")
     parser.add_argument(
@@ -177,7 +177,7 @@ def load_main(header, name, matrix, resample):
     return name, matrix, date_range_sampling, labels, granularity, sampling, resample
 
 
-def load_matrix(contents):
+def load_churn_matrix(contents):
     matrix = numpy.array([numpy.fromstring(line, dtype=int, sep=" ")
                           for line in contents.split("\n")])
     return matrix
@@ -324,7 +324,7 @@ def plot_many(args, target, header, parts):
     sys.stdout.write(stdout.getvalue())
 
 
-def plot_matrix(args, repo, people, matrix):
+def plot_churn_matrix(args, repo, people, matrix):
     matrix = matrix.astype(float)
     zeros = matrix[:, 0] == 0
     matrix[zeros, :] = 1
@@ -471,10 +471,10 @@ def train_embeddings(coocc_tree, tmpdir, shard_size=4096):
         swivel.FLAGS.submatrix_cols = shard_size
         if len(meta_index) < 10000:
             embedding_size = 50
-            num_epochs = 100
+            num_epochs = 200
         elif len(meta_index) < 100000:
             embedding_size = 100
-            num_epochs = 200
+            num_epochs = 250
         elif len(meta_index) < 500000:
             embedding_size = 200
             num_epochs = 300
@@ -541,11 +541,11 @@ def main():
             print(people_warning)
             return
         plot_many(args, "person", header, people_contents)
-    elif args.mode == "matrix":
+    elif args.mode == "churn_matrix":
         if not people_contents:
             print(people_warning)
             return
-        plot_matrix(args, name, people_sequence, load_matrix(people_matrix))
+        plot_churn_matrix(args, name, people_sequence, load_churn_matrix(people_matrix))
     elif args.mode == "people":
         if not people_contents:
             print(people_warning)
@@ -563,7 +563,7 @@ def main():
             plot_many(args, "file", header, files_contents)
         if people_contents:
             plot_many(args, "person", header, people_contents)
-            plot_matrix(args, name, people_sequence, load_matrix(people_matrix))
+            plot_churn_matrix(args, name, people_sequence, load_churn_matrix(people_matrix))
             plot_people(args, name, *load_people(header, people_sequence, people_contents))
         if people_coocc:
             assert files_coocc