瀏覽代碼

Add --devs visualization

Signed-off-by: Vadim Markovtsev <vadim@sourced.tech>
Vadim Markovtsev 6 年之前
父節點
當前提交
9549b732f2
共有 5 個文件被更改,包括 245 次插入,48 次刪除
  1. 4 3
      .travis.yml
  2. 1 0
      Dockerfile
  3. 11 11
      README.md
  4. 226 34
      labours.py
  5. 3 0
      requirements.txt

+ 4 - 3
.travis.yml

@@ -49,6 +49,7 @@ before_install:
   - export PATH=~/usr/bin:$GOPATH/bin:$PATH
   - make --version
   - pip3 --version
+  - pip3 install --user cython
   - pip3 install --user --no-build-isolation -r requirements.txt tensorflow flake8
   - docker run -d --privileged -p 9432:9432 --name bblfshd bblfsh/bblfshd
   - docker exec -it bblfshd bblfshctl driver install python bblfsh/python-driver:latest
@@ -67,13 +68,13 @@ script:
   - flake8
   - go test -coverpkg=all -v -coverprofile=coverage.txt -covermode=count gopkg.in/src-d/hercules.v5/... && sed -i '/cmd\/hercules\|core.go/d' coverage.txt
   - $GOPATH/bin/hercules version
-  - $GOPATH/bin/hercules --burndown --couples --quiet --pb https://github.com/src-d/hercules > 1.pb
+  - $GOPATH/bin/hercules --burndown --couples --devs --quiet --pb https://github.com/src-d/hercules > 1.pb
   - cp 1.pb 2.pb
   - $GOPATH/bin/hercules combine 1.pb 2.pb > 12.pb
   - ($GOPATH/bin/hercules generate-plugin -n MyPlug -o myplug && cd myplug && make)
   - (cd contrib/_plugin_example && make)
-  - $GOPATH/bin/hercules --burndown --burndown-files --burndown-people --couples --quiet https://github.com/src-d/hercules | python3 labours.py -m all -o out --backend Agg --disable-projector
-  - $GOPATH/bin/hercules --burndown --burndown-files --burndown-people --couples --quiet --pb https://github.com/src-d/hercules | python3 labours.py -f pb -m all -o out --backend Agg --disable-projector
+  - $GOPATH/bin/hercules --burndown --burndown-files --burndown-people --couples --devs --quiet https://github.com/src-d/hercules | python3 labours.py -m all -o out --backend Agg --disable-projector
+  - $GOPATH/bin/hercules --burndown --burndown-files --burndown-people --couples --devs --quiet --pb https://github.com/src-d/hercules | python3 labours.py -f pb -m all -o out --backend Agg --disable-projector
   - # $GOPATH/bin/hercules --sentiment --quiet --languages Python https://github.com/src-d/hercules > /dev/null
   - set +e
   - if [ $TRAVIS_GO_VERSION = "1.11.*" ]; then bash <(curl -s https://codecov.io/bash); fi

+ 1 - 0
Dockerfile

@@ -29,6 +29,7 @@ echo "	$@"\n\
 echo\n\' > /browser && \
     chmod +x /browser && \
     curl https://bootstrap.pypa.io/get-pip.py | python3 && \
+    pip3 install --no-cache-dir --no-build-isolation cython && \
     pip3 install --no-cache-dir --no-build-isolation -r /root/src/gopkg.in/src-d/hercules.v5/requirements.txt https://github.com/mind/wheels/releases/download/tf1.7-cpu/tensorflow-1.7.0-cp36-cp36m-linux_x86_64.whl && \
     rm -rf /root/* && \
     apt-get remove -y software-properties-common golang-1.10-go python3-dev libyaml-dev libxml2-dev curl git make unzip g++ && \

+ 11 - 11
README.md

@@ -40,7 +40,7 @@ Blog posts: [1](https://blog.sourced.tech/post/hercules.v4), [2](https://blog.so
 <p align="center">The DAG of burndown and couples analyses with UAST diff refining. Generated with <code>hercules --burndown --burndown-people --couples --feature=uast --dry-run --dump-dag doc/dag.dot https://github.com/src-d/hercules</code></p>
 
 ![git/git image](doc/linux.png)
-<p align="center">torvalds/linux line burndown (granularity 30, sampling 30, resampled by year). Generated with <code>hercules --burndown --first-parent --pb https://github.com/torvalds/linux | python3 labours.py -f pb -m project</code></p>
+<p align="center">torvalds/linux line burndown (granularity 30, sampling 30, resampled by year). Generated with <code>hercules --burndown --first-parent --pb https://github.com/torvalds/linux | python3 labours.py -f pb -m burndown-project</code></p>
 
 ## Installation
 
@@ -85,18 +85,18 @@ Some examples:
 
 ```
 # Use "memory" go-git backend and display the burndown plot. "memory" is the fastest but the repository's git data must fit into RAM.
-hercules --burndown https://github.com/src-d/go-git | python3 labours.py -m project --resample month
+hercules --burndown https://github.com/src-d/go-git | python3 labours.py -m burndown-project --resample month
 # Use "file system" go-git backend and print some basic information about the repository.
 hercules /path/to/cloned/go-git
 # Use "file system" go-git backend, cache the cloned repository to /tmp/repo-cache, use Protocol Buffers and display the burndown plot without resampling.
-hercules --burndown --pb https://github.com/git/git /tmp/repo-cache | python3 labours.py -m project -f pb --resample raw
+hercules --burndown --pb https://github.com/git/git /tmp/repo-cache | python3 labours.py -m burndown-project -f pb --resample raw
 
 # Now something fun
 # Get the linear history from git rev-list, reverse it
 # Pipe to hercules, produce burndown snapshots for every 30 days grouped by 30 days
 # Save the raw data to cache.yaml, so that later is possible to python3 labours.py -i cache.yaml
 # Pipe the raw data to labours.py, set text font size to 16pt, use Agg matplotlib backend and save the plot to output.png
-git rev-list HEAD | tac | hercules --commits - --burndown https://github.com/git/git | tee cache.yaml | python3 labours.py -m project --font-size 16 --backend Agg --output git.png
+git rev-list HEAD | tac | hercules --commits - --burndown https://github.com/git/git | tee cache.yaml | python3 labours.py -m burndown-project --font-size 16 --backend Agg --output git.png
 ```
 
 `labours.py -i /path/to/yaml` allows to read the output from `hercules` which was saved on disk.
@@ -117,7 +117,7 @@ hercules --some-analysis /tmp/repo-cache
 #### Docker image
 
 ```
-docker run --rm srcd/hercules hercules --burndown --pb https://github.com/git/git | docker run --rm -i -v $(pwd):/io srcd/hercules labours.py -f pb -m project -o /io/git_git.png
+docker run --rm srcd/hercules hercules --burndown --pb https://github.com/git/git | docker run --rm -i -v $(pwd):/io srcd/hercules labours.py -f pb -m burndown-project -o /io/git_git.png
 ```
 
 ### Built-in analyses
@@ -126,7 +126,7 @@ docker run --rm srcd/hercules hercules --burndown --pb https://github.com/git/gi
 
 ```
 hercules --burndown
-python3 labours.py -m project
+python3 labours.py -m burndown-project
 ```
 
 Line burndown statistics for the whole repository.
@@ -148,7 +148,7 @@ Unresampled bands are apparently not aligned and start from the project's birth
 
 ```
 hercules --burndown --burndown-files
-python3 labours.py -m file
+python3 labours.py -m burndown-file
 ```
 
 Burndown statistics for every file in the repository which is alive in the latest revision.
@@ -159,7 +159,7 @@ Note: it will generate separate graph for every file. You might don't want to ru
 
 ```
 hercules --burndown --burndown-people [-people-dict=/path/to/identities]
-python3 labours.py -m person
+python3 labours.py -m burndown-person
 ```
 
 Burndown statistics for the repository's contributors. If `-people-dict` is not specified, the identities are
@@ -183,7 +183,7 @@ by `|`. The case is ignored.
 
 ```
 hercules --burndown --burndown-people [-people-dict=/path/to/identities]
-python3 labours.py -m churn_matrix
+python3 labours.py -m churn-matrix
 ```
 
 Besides the burndown information, `-people` collects the added and deleted line statistics per
@@ -287,7 +287,7 @@ Such a build requires [`libtensorflow`](https://www.tensorflow.org/install/insta
 #### Everything in a single pass
 
 ```
-hercules --burndown --burndown-files --burndown-people --couples --shotness [-people-dict=/path/to/identities]
+hercules --burndown --burndown-files --burndown-people --couples --shotness --devs [-people-dict=/path/to/identities]
 python3 labours.py -m all
 ```
 
@@ -302,7 +302,7 @@ Hercules has a plugin system and allows to run custom analyses. See [PLUGINS.md]
 ```
 hercules --burndown --pb https://github.com/src-d/go-git > go-git.pb
 hercules --burndown --pb https://github.com/src-d/hercules > hercules.pb
-hercules combine go-git.pb hercules.pb | python3 labours.py -f pb -m project --resample M
+hercules combine go-git.pb hercules.pb | python3 labours.py -f pb -m burndown-project --resample M
 ```
 
 ### Bad unicode errors

+ 226 - 34
labours.py

@@ -1,5 +1,8 @@
 #!/usr/bin/env python3
 import argparse
+from collections import defaultdict, namedtuple
+from datetime import datetime, timedelta
+from importlib import import_module
 import io
 import json
 import os
@@ -11,8 +14,7 @@ import tempfile
 import threading
 import time
 import warnings
-from datetime import datetime, timedelta
-from importlib import import_module
+
 
 try:
     from clint.textui import progress
@@ -28,13 +30,6 @@ if sys.version_info[0] < 3:
     input = raw_input  # noqa: F821
 
 
-PB_MESSAGES = {
-    "Burndown": "internal.pb.pb_pb2.BurndownAnalysisResults",
-    "Couples": "internal.pb.pb_pb2.CouplesAnalysisResults",
-    "Shotness": "internal.pb.pb_pb2.ShotnessAnalysisResults",
-}
-
-
 def list_matplotlib_styles():
     script = "import sys; from matplotlib import pyplot; " \
              "sys.stdout.write(repr(pyplot.style.available))"
@@ -64,8 +59,9 @@ def parse_args():
                         help="Occupy 100%% height for every measurement.")
     parser.add_argument("--couples-tmp-dir", help="Temporary directory to work with couples.")
     parser.add_argument("-m", "--mode",
-                        choices=["project", "file", "person", "churn_matrix", "ownership",
-                                 "couples", "shotness", "sentiment", "all", "run_times"],
+                        choices=["burndown-project", "burndown-file", "burndown-person",
+                                 "churn-matrix", "ownership", "couples", "shotness", "sentiment",
+                                 "devs", "all", "run-times"],
                         help="What to plot.")
     parser.add_argument(
         "--resample", default="year",
@@ -121,6 +117,12 @@ class Reader(object):
     def get_shotness(self):
         raise NotImplementedError
 
+    def get_sentiment(self):
+        raise NotImplementedError
+
+    def get_devs(self):
+        raise NotImplementedError
+
 
 class YamlReader(Reader):
     def read(self, file):
@@ -224,6 +226,12 @@ class YamlReader(Reader):
             "Value": float(vals[0])
         } for key, vals in self.data["Sentiment"].items()})
 
+    def get_devs(self):
+        people = self.data["Devs"]["people"]
+        days = {int(d): {int(dev): DevDay(*(int(x) for x in day)) for dev, day in devs.items()}
+                for d, devs in self.data["Devs"]["days"].items()}
+        return days, people
+
     def _parse_burndown_matrix(self, matrix):
         return numpy.array([numpy.fromstring(line, dtype=int, sep=" ")
                             for line in matrix.split("\n")])
@@ -336,6 +344,13 @@ class ProtobufReader(Reader):
             raise KeyError
         return byday
 
+    def get_devs(self):
+        people = list(self.contents["Devs"].dev_index)
+        days = {d: {dev: DevDay(stats.commits, stats.added, stats.removed, stats.changed)
+                    for dev, stats in day.devs.items()}
+                for d, day in self.contents["Devs"].days.items()}
+        return days, people
+
     def _parse_burndown_matrix(self, matrix):
         dense = numpy.zeros((matrix.number_of_rows, matrix.number_of_columns), dtype=int)
         for y, row in enumerate(matrix.rows):
@@ -350,6 +365,12 @@ class ProtobufReader(Reader):
 
 
 READERS = {"yaml": YamlReader, "yml": YamlReader, "pb": ProtobufReader}
+PB_MESSAGES = {
+    "Burndown": "internal.pb.pb_pb2.BurndownAnalysisResults",
+    "Couples": "internal.pb.pb_pb2.CouplesAnalysisResults",
+    "Shotness": "internal.pb.pb_pb2.ShotnessAnalysisResults",
+    "Devs": "internal.pb.pb_pb2.DevsAnalysisResults",
+}
 
 
 def read_input(args):
@@ -366,6 +387,9 @@ def read_input(args):
     return reader
 
 
+DevDay = namedtuple("DevDay", ("Commits", "Added", "Removed", "Changed"))
+
+
 def calculate_average_lifetime(matrix):
     lifetimes = numpy.zeros(matrix.shape[1] - 1)
     for band in matrix:
@@ -717,7 +741,6 @@ def plot_burndown(args, target, name, matrix, date_range_sampling, labels, granu
         legend_loc = 3
     else:
         legend_loc = 2
-    pyplot.style.use("ggplot")
     legend = pyplot.legend(loc=legend_loc, fontsize=args.font_size)
     pyplot.ylabel("Lines of code")
     pyplot.xlabel("Time")
@@ -1086,12 +1109,12 @@ def show_shotness_stats(data):
         print("%8d  %s:%s [%s]" % (count, r.file, r.name, r.internal_role))
 
 
-def show_sentiment_stats(args, name, resample, start, data):
+def show_sentiment_stats(args, name, resample, start_date, data):
     matplotlib, pyplot = import_pyplot(args.backend, args.style)
 
-    start = datetime.fromtimestamp(start)
+    start_date = datetime.fromtimestamp(start_date)
     data = sorted(data.items())
-    xdates = [start + timedelta(days=d[0]) for d in data]
+    xdates = [start_date + timedelta(days=d[0]) for d in data]
     xpos = []
     ypos = []
     xneg = []
@@ -1152,6 +1175,168 @@ def show_sentiment_stats(args, name, resample, start, data):
     deploy_plot(title, args.output, args.style)
 
 
+def show_devs(args, name, start_date, end_date, data):
+    try:
+        from fastdtw import fastdtw
+    except ImportError as e:
+        print("Cannot import fastdtw: %s\nInstall it from https://github.com/slaypni/fastdtw" % e)
+        sys.exit(1)
+    try:
+        from ortools.constraint_solver import pywrapcp, routing_enums_pb2
+    except ImportError as e:
+        print("Cannot import ortools: %s\nInstall it from "
+              "https://developers.google.com/optimization/install/python/" % e)
+        sys.exit(1)
+    try:
+        from hdbscan import HDBSCAN
+    except ImportError as e:
+        print("Cannot import hdbscan: %s\nInstall it from "
+              "https://github.com/scikit-learn-contrib/hdbscan" % e)
+        sys.exit(1)
+    from scipy.signal import convolve, slepian
+
+    days, people = data
+    max_people = 50
+    if len(people) > max_people:
+        print("Picking top %d developers by commit count" % max_people)
+        # pick top N developers by commit count
+        commits = defaultdict(int)
+        for devs in days.values():
+            for dev, stats in devs.items():
+                commits[dev] += stats.Commits
+        commits = sorted(((v, k) for k, v in commits.items()), reverse=True)
+        chosen_people = {people[k] for _, k in commits[:max_people]}
+    else:
+        chosen_people = set(people)
+    devseries = defaultdict(list)
+    for day, devs in sorted(days.items()):
+        for dev, stats in devs.items():
+            if people[dev] in chosen_people:
+                devseries[dev].append((day, stats.Commits))
+    print("Calculating the distance matrix")
+    # max-normalize the time series using a sliding window
+    keys = list(devseries.keys())
+    series = list(devseries.values())
+    for i, s in enumerate(series):
+        arr = numpy.array(s).transpose().astype(numpy.float32)
+        commits = arr[1]
+        if len(commits) < 7:
+            commits /= commits.max()
+        else:
+            # 4 is sizeof(float32)
+            windows = numpy.lib.stride_tricks.as_strided(commits, [len(commits) - 6, 7], [4, 4])
+            commits = numpy.concatenate((
+                [windows[0, 0] / windows[0].max(),
+                 windows[0, 1] / windows[0].max(),
+                 windows[0, 2] / windows[0].max()],
+                windows[:, 3] / windows.max(axis=1),
+                [windows[-1, 4] / windows[-1].max(),
+                 windows[-1, 5] / windows[-1].max(),
+                 windows[-1, 6] / windows[-1].max()]
+            ))
+        arr[1] = commits * 7  # 7 is a pure heuristic here and is not related to window size
+        series[i] = list(arr.transpose())
+    # calculate the distance matrix using dynamic time warping metric
+    dists = numpy.full((len(series)+1, len(series)+1), -100500, dtype=numpy.float32)
+    for x in range(len(series)):
+        dists[x, x] = 0
+        for y in range(x + 1, len(series)):
+            # L1 norm
+            dist, _ = fastdtw(series[x], series[y], radius=5, dist=1)
+            dists[x, y] = dists[y, x] = dist
+    # preparation for seriation ordering
+    dists[len(series), :] = 0
+    dists[:, len(series)] = 0
+    assert (dists >= 0).all()
+    print("Ordering the series")
+    # solve the TSP on the distance matrix
+    routing = pywrapcp.RoutingModel(dists.shape[0], 1, len(series))
+
+    def dist_callback(x, y):
+        # ortools wants integers, so we approximate here
+        return int(dists[x][y] * 1000)
+
+    routing.SetArcCostEvaluatorOfAllVehicles(dist_callback)
+    search_parameters = pywrapcp.RoutingModel.DefaultSearchParameters()
+    search_parameters.local_search_metaheuristic = (
+        routing_enums_pb2.LocalSearchMetaheuristic.GUIDED_LOCAL_SEARCH)
+    search_parameters.time_limit_ms = 2000
+    assignment = routing.SolveWithParameters(search_parameters)
+    index = routing.Start(0)
+    route = []
+    while not routing.IsEnd(index):
+        node = routing.IndexToNode(index)
+        if node < len(keys):
+            route.append(node)
+        index = assignment.Value(routing.NextVar(index))
+    route_map = {v: i for i, v in enumerate(route)}
+
+    # determine clusters
+    opt_dist_chain = numpy.cumsum(numpy.array(
+        [0] + [dists[route[i], route[i + 1]] for i in range(len(route)-1)]))
+    clusters = HDBSCAN(min_cluster_size=2).fit_predict(opt_dist_chain[:, numpy.newaxis])
+    route = [keys[node] for node in route]
+
+    print("Plotting")
+    # smooth time series
+    start_date = datetime.fromtimestamp(start_date)
+    start_date = datetime(start_date.year, start_date.month, start_date.day)
+    end_date = datetime.fromtimestamp(end_date)
+    end_date = datetime(end_date.year, end_date.month, end_date.day)
+    size = (end_date - start_date).days + 1
+    plot_x = [start_date + timedelta(days=i) for i in range(size)]
+    resolution = 64
+    window = slepian(size // resolution, 0.5)
+    series = list(devseries.values())
+    final = numpy.zeros((len(devseries), size), dtype=numpy.float32)
+    for i, s in enumerate(series):
+        arr = numpy.array(s).transpose()
+        full_history = numpy.zeros(size, dtype=numpy.float32)
+        full_history[arr[0]] = arr[1]
+        final[route_map[i]] = convolve(full_history, window, "same")
+
+    matplotlib, pyplot = import_pyplot(args.backend, args.style)
+    prop_cycle = pyplot.rcParams["axes.prop_cycle"]
+    colors = prop_cycle.by_key()["color"]
+    fig, axes = pyplot.subplots(final.shape[0], 1)
+    for ax, series, cluster, dev_i in zip(axes, final, clusters, route):
+        if cluster >= 0:
+            color = colors[cluster % len(colors)]
+        else:
+            # outlier
+            color = "grey"
+        ax.plot(plot_x, series, color=color)
+        ax.set_axis_off()
+        author = people[dev_i]
+        ax.text(0.03, 0.5, author[:36] + (author[36:] and "..."),
+                horizontalalignment="right", verticalalignment="center",
+                transform=ax.transAxes, fontsize=14)
+        ax.text(0.97, 0.5, sum(p[1] for p in devseries[dev_i]),
+                horizontalalignment="left", verticalalignment="center",
+                transform=ax.transAxes, fontsize=14)
+    axes[-1].set_axis_on()
+    target_num_labels = 12
+    num_months = (end_date.year - start_date.year) * 12 + end_date.month - start_date.month
+    interval = int(numpy.ceil(num_months / target_num_labels))
+    if interval >= 8:
+        interval = int(numpy.ceil(num_months / (12 * target_num_labels)))
+        axes[-1].xaxis.set_major_locator(matplotlib.dates.YearLocator(interval=interval))
+        axes[-1].xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y"))
+    else:
+        axes[-1].xaxis.set_major_locator(matplotlib.dates.MonthLocator(interval=interval))
+        axes[-1].xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y-%m"))
+    for tick in axes[-1].xaxis.get_major_ticks():
+        tick.label.set_fontsize(16)
+    axes[-1].spines["left"].set_visible(False)
+    axes[-1].spines["right"].set_visible(False)
+    axes[-1].spines["top"].set_visible(False)
+    axes[-1].get_yaxis().set_visible(False)
+    axes[-1].set_facecolor((1.0,) * 3 + (0.0,))
+
+    title = "%s commits" % name
+    deploy_plot(title, args.output, args.style)
+
+
 def main():
     args = parse_args()
     reader = read_input(args)
@@ -1169,6 +1354,7 @@ def main():
     shotness_warning = "Structural hotness stats were not collected. Re-run hercules with " \
                        "--shotness. Also check --languages - the output may be empty."
     sentiment_warning = "Sentiment stats were not collected. Re-run hercules with --sentiment."
+    devs_warning = "Devs stats were not collected. Re-run hercules with --devs."
 
     def run_times():
         rt = reader.get_run_times()
@@ -1262,25 +1448,30 @@ def main():
             return
         show_sentiment_stats(args, reader.get_name(), args.resample, reader.get_header()[0], data)
 
-    if args.mode == "run_times":
-        run_times()
-    elif args.mode == "project":
-        project_burndown()
-    elif args.mode == "file":
-        files_burndown()
-    elif args.mode == "person":
-        people_burndown()
-    elif args.mode == "churn_matrix":
-        churn_matrix()
-    elif args.mode == "ownership":
-        ownership_burndown()
-    elif args.mode == "couples":
-        couples()
-    elif args.mode == "shotness":
-        shotness()
-    elif args.mode == "sentiment":
-        sentiment()
-    elif args.mode == "all":
+    def devs():
+        try:
+            data = reader.get_devs()
+        except KeyError:
+            print(devs_warning)
+            return
+        show_devs(args, reader.get_name(), *reader.get_header(), data)
+
+    modes = {
+        "run-times": run_times,
+        "burndown-project": project_burndown,
+        "burndown-file": files_burndown,
+        "burndown-person": people_burndown,
+        "churn-matrix": churn_matrix,
+        "ownership": ownership_burndown,
+        "couples": couples,
+        "shotness": shotness,
+        "sentiment": sentiment,
+        "devs": devs,
+    }
+    try:
+        modes[args.mode]()
+    except KeyError:
+        assert args.mode == "all"
         project_burndown()
         files_burndown()
         people_burndown()
@@ -1289,6 +1480,7 @@ def main():
         couples()
         shotness()
         sentiment()
+        devs()
 
     if web_server.running:
         secs = int(os.getenv("COUPLES_SERVER_TIME", "60"))

+ 3 - 0
requirements.txt

@@ -6,3 +6,6 @@ PyYAML>=3.12,<4.0
 scipy>=0.19.0,<2.0
 protobuf>=3.5.0,<4.0
 munch>=2.0
+hdbscan==0.8.18
+ortools==6.9.5824
+fastdtw==0.3.2