Bläddra i källkod

Merge pull request #135 from vmarkovtsev/master

devs README
Vadim Markovtsev 6 år sedan
förälder
incheckning
5cf526cbee
3 ändrade filer med 86 tillägg och 5 borttagningar
  1. 36 1
      README.md
  2. BIN
      doc/devs_tensorflow.png
  3. 50 4
      labours.py

+ 36 - 1
README.md

@@ -263,12 +263,47 @@ Couples analysis automatically loads "shotness" data if available.
 ![Jinja2 functions grouped by structural hotness](doc/jinja.png)
 <p align="center"><code>hercules --shotness --pb https://github.com/pallets/jinja | python3 labours.py -m couples -f pb</code></p>
 
+#### Aligned commit series
+
+![tensorflow/tensorflow](doc/devs_tensorflow.png)
+<p align="center">tensorflow/tensorflow aligned commit series of the top 50 developers by number of commits.</p>
+
+```
+hercules --devs [-people-dict=/path/to/identities]
+python3 labours.py -m devs -o <name>
+```
+
+We record how many commits were made, as well as the lines added, removed and changed, per day for each developer.
+We plot the resulting commit time series using a few tricks to show the temporal grouping. In other words,
+two adjacent commit series should look similar after normalization.
+
+1. We compute the distance matrix of the commit series. Our distance metric is
+[Dynamic Time Warping](https://en.wikipedia.org/wiki/Dynamic_time_warping).
+We use the [FastDTW](https://cs.fit.edu/~pkc/papers/tdm04.pdf) algorithm, whose complexity is linear
+in the length of the time series. Thus the overall complexity of computing the matrix is quadratic in the number of series.
+2. We compile the linear list of commit series with the
+[Seriation](http://nicolas.kruchten.com/content/2018/02/seriation/) technique.
+Specifically, we solve the [Travelling Salesman Problem](https://en.wikipedia.org/wiki/Travelling_salesman_problem), which is NP-hard.
+However, given that the typical number of developers is less than 1,000, there is a good chance that
+the solution does not take much time. We use the [Google or-tools](https://developers.google.com/optimization/routing/tsp) solver.
+3. We find 1-dimensional clusters in the resulting path with the [HDBSCAN](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html)
+algorithm and assign colors accordingly.
+4. Time series are smoothed by convolving with the [Slepian window](https://en.wikipedia.org/wiki/Window_function#DPSS_or_Slepian_window).
+
+This plot allows one to discover how the development team evolved over time. It also shows "commit flashmobs"
+such as [Hacktoberfest](https://hacktoberfest.digitalocean.com/). For example, here are the revealed
+insights from the `tensorflow/tensorflow` plot above:
+
+1. "Tensorflow Gardener" is classified as the only outlier.
+2. The "blue" group of developers covers the global maintainers and a few people who left (at the top).
+3. The "red" group shows how core developers join the project or become less active.
+
 #### Sentiment (positive and negative code)
 
 ![Django sentiment](doc/sentiment.png)
 <p align="center"><code>hercules --sentiment --pb https://github.com/django/django | python3 labours.py -m sentiment -f pb</code></p>
 
-We extract new or changed comments from source code on every commit, apply [BiDiSentiment]()
+We extract new or changed comments from source code on every commit, apply [BiDiSentiment](https://github.com/vmarkovtsev/bidisentiment)
 general purpose sentiment recurrent neural network and plot the results. Requires
 [libtensorflow](https://www.tensorflow.org/install/install_go).
 E.g. [`sadly, we need to hide the rect from the documentation finder for now`](https://github.com/pygame/pygame/commit/b6091d38c8a5639d311858660b38841d96598509#diff-eae59f175858fcef57cb17e733981c73R27) is negative and

BIN
doc/devs_tensorflow.png


+ 50 - 4
labours.py

@@ -387,7 +387,12 @@ def read_input(args):
     return reader
 
 
-DevDay = namedtuple("DevDay", ("Commits", "Added", "Removed", "Changed"))
class DevDay(namedtuple("DevDay", ("Commits", "Added", "Removed", "Changed"))):
    """Activity counters of one developer: commits made and lines added, removed and changed.

    Instances are immutable tuples; combining two of them yields a new instance.
    """

    # Without empty __slots__ every instance of a namedtuple subclass gets its
    # own __dict__, which wastes memory and lets typos create stray attributes.
    __slots__ = ()

    def add(self, dd):
        """Return a new DevDay holding the field-wise sum of this record and `dd`.

        :param dd: The other record; it must expose the Commits, Added, Removed \
                   and Changed attributes.
        :return: A new DevDay; neither operand is modified.
        """
        return DevDay(Commits=self.Commits + dd.Commits,
                      Added=self.Added + dd.Added,
                      Removed=self.Removed + dd.Removed,
                      Changed=self.Changed + dd.Changed)
 
 
 def calculate_average_lifetime(matrix):
@@ -1209,10 +1214,12 @@ def show_devs(args, name, start_date, end_date, data):
     else:
         chosen_people = set(people)
     devseries = defaultdict(list)
+    devstats = defaultdict(lambda: DevDay(0, 0, 0, 0))
     for day, devs in sorted(days.items()):
         for dev, stats in devs.items():
             if people[dev] in chosen_people:
                 devseries[dev].append((day, stats.Commits))
+                devstats[dev] = devstats[dev].add(stats)
     print("Calculating the distance matrix")
     # max-normalize the time series using a sliding window
     keys = list(devseries.keys())
@@ -1299,6 +1306,7 @@ def show_devs(args, name, start_date, end_date, data):
     prop_cycle = pyplot.rcParams["axes.prop_cycle"]
     colors = prop_cycle.by_key()["color"]
     fig, axes = pyplot.subplots(final.shape[0], 1)
+    backgrounds = ("#C4FFDB", "#FFD0CD") if args.background == "white" else ("#05401C", "#40110E")
     for ax, series, cluster, dev_i in zip(axes, final, clusters, route):
         if cluster >= 0:
             color = colors[cluster % len(colors)]
@@ -1310,10 +1318,19 @@ def show_devs(args, name, start_date, end_date, data):
         author = people[dev_i]
         ax.text(0.03, 0.5, author[:36] + (author[36:] and "..."),
                 horizontalalignment="right", verticalalignment="center",
-                transform=ax.transAxes, fontsize=14)
-        ax.text(0.97, 0.5, sum(p[1] for p in devseries[dev_i]),
+                transform=ax.transAxes, fontsize=14,
+                color="black" if args.background == "white" else "white")
+        ds = devstats[dev_i]
+        stats = "%5d %8s %8s" % (ds[0], _format_number(ds[1] - ds[2]), _format_number(ds[3]))
+        ax.text(0.97, 0.5, stats,
                 horizontalalignment="left", verticalalignment="center",
-                transform=ax.transAxes, fontsize=14)
+                transform=ax.transAxes, fontsize=14, family="monospace",
+                backgroundcolor=backgrounds[ds[1] <= ds[2]],
+                color="black" if args.background == "white" else "white")
+    axes[0].text(0.97, 1.75, " cmts    delta  changed",
+                 horizontalalignment="left", verticalalignment="center",
+                 transform=axes[0].transAxes, fontsize=14, family="monospace",
+                 color="black" if args.background == "white" else "white")
     axes[-1].set_axis_on()
     target_num_labels = 12
     num_months = (end_date.year - start_date.year) * 12 + end_date.month - start_date.month
@@ -1337,6 +1354,35 @@ def show_devs(args, name, start_date, end_date, data):
     deploy_plot(title, args.output, args.style)
 
 
+def _format_number(n):
+    if n == 0:
+        return "0"
+    assert n > 0
+    power = int(numpy.log10(abs(n)))
+    if power >= 6:
+        n = n / 1000000
+        if n >= 10:
+            n = str(int(n))
+        else:
+            n = "%.1f" % n
+            if n.endswith("0"):
+                n = n[:-2]
+        suffix = "M"
+    elif power >= 3:
+        n = n / 1000
+        if n >= 10:
+            n = str(int(n))
+        else:
+            n = "%.1f" % n
+            if n.endswith("0"):
+                n = n[:-2]
+        suffix = "K"
+    else:
+        n = str(n)
+        suffix = ""
+    return n + suffix
+
+
 def main():
     args = parse_args()
     reader = read_input(args)