#!/usr/bin/env python3
import argparse
from collections import defaultdict, namedtuple
from datetime import datetime, timedelta
from importlib import import_module
import io
from itertools import chain
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
import threading
import time
import warnings


try:
    from clint.textui import progress
except ImportError:
    print("Warning: clint is not installed, no fancy progressbars in the terminal for you.")
    progress = None
import numpy
import yaml


if sys.version_info[0] < 3:
    # OK, ancients, I will support Python 2, but you owe me a beer
    input = raw_input  # noqa: F821


def list_matplotlib_styles():
    script = "import sys; from matplotlib import pyplot; " \
             "sys.stdout.write(repr(pyplot.style.available))"
    styles = eval(subprocess.check_output([sys.executable, "-c", script]))
    styles.remove("classic")
    return ["default", "classic"] + styles


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--output", default="",
                        help="Path to the output file/directory (empty for display). "
                             "If the extension is JSON, the data is saved instead of "
                             "the real image.")
    parser.add_argument("-i", "--input", default="-",
                        help="Path to the input file (- for stdin).")
    parser.add_argument("-f", "--input-format", default="auto", choices=["yaml", "pb", "auto"])
    parser.add_argument("--font-size", default=12, type=int,
                        help="Size of the labels and legend.")
    parser.add_argument("--style", default="ggplot", choices=list_matplotlib_styles(),
                        help="Plot style to use.")
    parser.add_argument("--backend", help="Matplotlib backend to use.")
    parser.add_argument("--background", choices=["black", "white"], default="white",
                        help="Plot's general color scheme.")
    parser.add_argument("--size", help="Axes' size in inches, for example \"12,9\"")
    parser.add_argument("--relative", action="store_true",
                        help="Occupy 100%% height for every measurement.")
    parser.add_argument("--couples-tmp-dir", help="Temporary directory to work with couples.")
    parser.add_argument("-m", "--mode",
                        choices=["burndown-project", "burndown-file", "burndown-person",
                                 "churn-matrix", "ownership", "couples-files", "couples-people",
                                 "couples-shotness", "shotness", "sentiment", "devs",
                                 "devs-efforts", "old-vs-new", "all", "run-times", "languages",
                                 "devs-parallel"],
                        help="What to plot.")
    parser.add_argument(
        "--resample", default="year",
        help="The way to resample the time series. Possible values are: "
             "\"month\", \"year\", \"no\", \"raw\" and pandas offset aliases ("
             "http://pandas.pydata.org/pandas-docs/stable/timeseries.html"
             "#offset-aliases).")
    dateutil_url = "https://dateutil.readthedocs.io/en/stable/parser.html#dateutil.parser.parse"
    parser.add_argument("--start-date",
                        help="Start date of time-based plots. Any format is accepted which is "
                             "supported by %s" % dateutil_url)
    parser.add_argument("--end-date",
                        help="End date of time-based plots. Any format is accepted which is "
                             "supported by %s" % dateutil_url)
    parser.add_argument("--disable-projector", action="store_true",
                        help="Do not run Tensorflow Projector on couples.")
    parser.add_argument("--max-people", default=20, type=int,
                        help="Maximum number of developers in churn matrix and people plots.")
    args = parser.parse_args()
    return args


class Reader(object):
    def read(self, file):
        raise NotImplementedError

    def get_name(self):
        raise NotImplementedError

    def get_header(self):
        raise NotImplementedError

    def get_burndown_parameters(self):
        raise NotImplementedError

    def get_project_burndown(self):
        raise NotImplementedError

    def get_files_burndown(self):
        raise NotImplementedError

    def get_people_burndown(self):
        raise NotImplementedError

    def get_ownership_burndown(self):
        raise NotImplementedError

    def get_people_interaction(self):
        raise NotImplementedError

    def get_files_coocc(self):
        raise NotImplementedError

    def get_people_coocc(self):
        raise NotImplementedError

    def get_shotness_coocc(self):
        raise NotImplementedError

    def get_shotness(self):
        raise NotImplementedError

    def get_sentiment(self):
        raise NotImplementedError

    def get_devs(self):
        raise NotImplementedError


class YamlReader(Reader):
    def read(self, file):
        yaml.reader.Reader.NON_PRINTABLE = re.compile(r"(?!x)x")
        try:
            loader = yaml.CLoader
        except AttributeError:
            print("Warning: failed to import yaml.CLoader, falling back to slow yaml.Loader")
            loader = yaml.Loader
        try:
            if file != "-":
                with open(file) as fin:
                    data = yaml.load(fin, Loader=loader)
            else:
                data = yaml.load(sys.stdin, Loader=loader)
        except (UnicodeEncodeError, yaml.reader.ReaderError) as e:
            print("\nInvalid unicode in the input: %s\nPlease filter it through "
                  "fix_yaml_unicode.py" % e)
            sys.exit(1)
        if data is None:
            print("\nNo data has been read - has Hercules crashed?")
            sys.exit(1)
        self.data = data

    def get_run_times(self):
        return {}

    def get_name(self):
        return self.data["hercules"]["repository"]

    def get_header(self):
        header = self.data["hercules"]
        return header["begin_unix_time"], header["end_unix_time"]

    def get_burndown_parameters(self):
        header = self.data["Burndown"]
        return header["sampling"], header["granularity"]

    def get_project_burndown(self):
        return self.data["hercules"]["repository"], \
            self._parse_burndown_matrix(self.data["Burndown"]["project"]).T

    def get_files_burndown(self):
        return [(p[0], self._parse_burndown_matrix(p[1]).T)
                for p in self.data["Burndown"]["files"].items()]

    def get_people_burndown(self):
        return [(p[0], self._parse_burndown_matrix(p[1]).T)
                for p in self.data["Burndown"]["people"].items()]

    def get_ownership_burndown(self):
        return self.data["Burndown"]["people_sequence"].copy(), \
            {p[0]: self._parse_burndown_matrix(p[1])
             for p in self.data["Burndown"]["people"].items()}

    def get_people_interaction(self):
        return self.data["Burndown"]["people_sequence"].copy(), \
            self._parse_burndown_matrix(self.data["Burndown"]["people_interaction"])

    def get_files_coocc(self):
        coocc = self.data["Couples"]["files_coocc"]
        return coocc["index"], self._parse_coocc_matrix(coocc["matrix"])

    def get_people_coocc(self):
        coocc = self.data["Couples"]["people_coocc"]
        return coocc["index"], self._parse_coocc_matrix(coocc["matrix"])

    def get_shotness_coocc(self):
        shotness = self.data["Shotness"]
        index = ["%s:%s" % (i["file"], i["name"]) for i in shotness]
        indptr = numpy.zeros(len(shotness) + 1, dtype=numpy.int64)
        indices = []
        data = []
        for i, record in enumerate(shotness):
            pairs = [(int(k), v) for k, v in record["counters"].items()]
            pairs.sort()
            indptr[i + 1] = indptr[i] + len(pairs)
            for k, v in pairs:
                indices.append(k)
                data.append(v)
        indices = numpy.array(indices, dtype=numpy.int32)
        data = numpy.array(data, dtype=numpy.int32)
        from scipy.sparse import csr_matrix
        return index, csr_matrix((data, indices, indptr), shape=(len(shotness),) * 2)

    def get_shotness(self):
        from munch import munchify
        obj = munchify(self.data["Shotness"])
        # turn strings into ints
        for item in obj:
            item.counters = {int(k): v for k, v in item.counters.items()}
        if len(obj) == 0:
            raise KeyError
        return obj

    def get_sentiment(self):
        from munch import munchify
        return munchify({int(key): {
            "Comments": vals[2].split("|"),
            "Commits": vals[1],
            "Value": float(vals[0])
        } for key, vals in self.data["Sentiment"].items()})

    def get_devs(self):
        people = self.data["Devs"]["people"]
        days = {int(d): {int(dev): DevDay(*(int(x) for x in day[:-1]), day[-1])
                         for dev, day in devs.items()}
                for d, devs in self.data["Devs"]["ticks"].items()}
        return people, days

    def _parse_burndown_matrix(self, matrix):
        return numpy.array([numpy.fromstring(line, dtype=int, sep=" ")
                            for line in matrix.split("\n")])

    def _parse_coocc_matrix(self, matrix):
        from scipy.sparse import csr_matrix
        data = []
        indices = []
        indptr = [0]
        for row in matrix:
            for k, v in sorted(row.items()):
                data.append(v)
                indices.append(k)
            indptr.append(indptr[-1] + len(row))
        return csr_matrix((data, indices, indptr), shape=(len(matrix),) * 2)


class ProtobufReader(Reader):
    def read(self, file):
        try:
            from labours.pb_pb2 import AnalysisResults
        except ImportError as e:
            print("\n\n>>> You need to generate python/hercules/pb/pb_pb2.py - run \"make\"\n",
                  file=sys.stderr)
            raise e from None
        self.data = AnalysisResults()
        if file != "-":
            with open(file, "rb") as fin:
                bytes = fin.read()
        else:
            bytes = sys.stdin.buffer.read()
        if not bytes:
            raise ValueError("empty input")
        self.data.ParseFromString(bytes)
        self.contents = {}
        for key, val in self.data.contents.items():
            try:
                mod, name = PB_MESSAGES[key].rsplit(".", 1)
            except KeyError:
                sys.stderr.write("Warning: there is no registered PB decoder for %s\n" % key)
                continue
            cls = getattr(import_module(mod), name)
            self.contents[key] = msg = cls()
            msg.ParseFromString(val)

    def get_run_times(self):
        return {key: val for key, val in self.data.header.run_time_per_item.items()}

    def get_name(self):
        return self.data.header.repository

    def get_header(self):
        header = self.data.header
        return header.begin_unix_time, header.end_unix_time

    def get_burndown_parameters(self):
        burndown = self.contents["Burndown"]
        return burndown.sampling, burndown.granularity

    def get_project_burndown(self):
        return self._parse_burndown_matrix(self.contents["Burndown"].project)

    def get_files_burndown(self):
        return [self._parse_burndown_matrix(i) for i in self.contents["Burndown"].files]

    def get_people_burndown(self):
        return [self._parse_burndown_matrix(i) for i in self.contents["Burndown"].people]

    def get_ownership_burndown(self):
        people = self.get_people_burndown()
        return [p[0] for p in people], {p[0]: p[1].T for p in people}

    def get_people_interaction(self):
        burndown = self.contents["Burndown"]
        return [i.name for i in burndown.people], \
            self._parse_sparse_matrix(burndown.people_interaction).toarray()

    def get_files_coocc(self):
        node = self.contents["Couples"].file_couples
        return list(node.index), self._parse_sparse_matrix(node.matrix)

    def get_people_coocc(self):
        node = self.contents["Couples"].people_couples
        return list(node.index), self._parse_sparse_matrix(node.matrix)

    def get_shotness_coocc(self):
        shotness = self.get_shotness()
        index = ["%s:%s" % (i.file, i.name) for i in shotness]
        indptr = numpy.zeros(len(shotness) + 1, dtype=numpy.int32)
        indices = []
        data = []
        for i, record in enumerate(shotness):
            pairs = list(record.counters.items())
            pairs.sort()
            indptr[i + 1] = indptr[i] + len(pairs)
            for k, v in pairs:
                indices.append(k)
                data.append(v)
        indices = numpy.array(indices, dtype=numpy.int32)
        data = numpy.array(data, dtype=numpy.int32)
        from scipy.sparse import csr_matrix
        return index, csr_matrix((data, indices, indptr), shape=(len(shotness),) * 2)

    def get_shotness(self):
        records = self.contents["Shotness"].records
        if len(records) == 0:
            raise KeyError
        return records

    def get_sentiment(self):
        byday = self.contents["Sentiment"].SentimentByDay
        if len(byday) == 0:
            raise KeyError
        return byday

    def get_devs(self):
        people = list(self.contents["Devs"].dev_index)
        days = {d: {dev: DevDay(stats.commits, stats.stats.added, stats.stats.removed,
                                stats.stats.changed, {k: [v.added, v.removed, v.changed]
                                                      for k, v in stats.languages.items()})
                    for dev, stats in day.devs.items()}
                for d, day in self.contents["Devs"].ticks.items()}
        return people, days

    def _parse_burndown_matrix(self, matrix):
        dense = numpy.zeros((matrix.number_of_rows, matrix.number_of_columns), dtype=int)
        for y, row in enumerate(matrix.rows):
            for x, col in enumerate(row.columns):
                dense[y, x] = col
        return matrix.name, dense.T

    def _parse_sparse_matrix(self, matrix):
        from scipy.sparse import csr_matrix
        return csr_matrix((list(matrix.data), list(matrix.indices), list(matrix.indptr)),
                          shape=(matrix.number_of_rows, matrix.number_of_columns))


READERS = {"yaml": YamlReader, "yml": YamlReader, "pb": ProtobufReader}
PB_MESSAGES = {
    "Burndown": "labours.pb_pb2.BurndownAnalysisResults",
    "Couples": "labours.pb_pb2.CouplesAnalysisResults",
    "Shotness": "labours.pb_pb2.ShotnessAnalysisResults",
    "Devs": "labours.pb_pb2.DevsAnalysisResults",
}


def read_input(args):
    sys.stdout.write("Reading the input... ")
    sys.stdout.flush()
    if args.input != "-":
        if args.input_format == "auto":
            try:
                args.input_format = args.input.rsplit(".", 1)[1]
            except IndexError:
                try:
                    with open(args.input) as f:
                        f.read(1 << 16)
                    args.input_format = "yaml"
                except UnicodeDecodeError:
                    args.input_format = "pb"
    elif args.input_format == "auto":
        args.input_format = "yaml"
    reader = READERS[args.input_format]()
    reader.read(args.input)
    print("done")
    return reader


class DevDay(namedtuple("DevDay", ("Commits", "Added", "Removed", "Changed", "Languages"))):
    def add(self, dd):
        langs = defaultdict(lambda: [0] * 3)
        for key, val in self.Languages.items():
            for i in range(3):
                langs[key][i] += val[i]
        for key, val in dd.Languages.items():
            for i in range(3):
                langs[key][i] += val[i]
        return DevDay(Commits=self.Commits + dd.Commits,
                      Added=self.Added + dd.Added,
                      Removed=self.Removed + dd.Removed,
                      Changed=self.Changed + dd.Changed,
                      Languages=dict(langs))


def fit_kaplan_meier(matrix):
    from lifelines import KaplanMeierFitter

    T = []
    W = []
    indexes = numpy.arange(matrix.shape[0], dtype=int)
    entries = numpy.zeros(matrix.shape[0], int)
    dead = set()
    for i in range(1, matrix.shape[1]):
        diff = matrix[:, i - 1] - matrix[:, i]
        entries[diff < 0] = i
        mask = diff > 0
        deaths = diff[mask]
        T.append(numpy.full(len(deaths), i) - entries[indexes[mask]])
        W.append(deaths)
        entered = entries > 0
        entered[0] = True
        dead = dead.union(set(numpy.where((matrix[:, i] == 0) & entered)[0]))
    # add the survivors as censored
    nnzind = entries != 0
    nnzind[0] = True
    nnzind[sorted(dead)] = False
    T.append(numpy.full(nnzind.sum(), matrix.shape[1]) - entries[nnzind])
    W.append(matrix[nnzind, -1])
    T = numpy.concatenate(T)
    E = numpy.ones(len(T), bool)
    E[-nnzind.sum():] = 0
    W = numpy.concatenate(W)
    if T.size == 0:
        return None
    kmf = KaplanMeierFitter().fit(T, E, weights=W)
    return kmf


def print_survival_function(kmf, sampling):
    sf = kmf.survival_function_
    sf.index = [timedelta(days=d) for d in sf.index * sampling]
    sf.columns = ["Ratio of survived lines"]
    try:
        print(sf[len(sf) // 6::len(sf) // 6].append(sf.tail(1)))
    except ValueError:
        pass


def interpolate_burndown_matrix(matrix, granularity, sampling):
    daily = numpy.zeros(
        (matrix.shape[0] * granularity, matrix.shape[1] * sampling),
        dtype=numpy.float32)
    """
    ----------> samples, x
    |
    |
    |
    ⌄
    bands, y
    """
    for y in range(matrix.shape[0]):
        for x in range(matrix.shape[1]):
            if y * granularity > (x + 1) * sampling:
                # the future is zeros
                continue

            def decay(start_index: int, start_val: float):
                if start_val == 0:
                    return
                k = matrix[y][x] / start_val  # <= 1
                scale = (x + 1) * sampling - start_index
                for i in range(y * granularity, (y + 1) * granularity):
                    initial = daily[i][start_index - 1]
                    for j in range(start_index, (x + 1) * sampling):
                        daily[i][j] = initial * (
                            1 + (k - 1) * (j - start_index + 1) / scale)

            def grow(finish_index: int, finish_val: float):
                initial = matrix[y][x - 1] if x > 0 else 0
                start_index = x * sampling
                if start_index < y * granularity:
                    start_index = y * granularity
                if finish_index == start_index:
                    return
                avg = (finish_val - initial) / (finish_index - start_index)
                for j in range(x * sampling, finish_index):
                    for i in range(start_index, j + 1):
                        daily[i][j] = avg
                # copy [x*g..y*s)
                for j in range(x * sampling, finish_index):
                    for i in range(y * granularity, x * sampling):
                        daily[i][j] = daily[i][j - 1]

            if (y + 1) * granularity >= (x + 1) * sampling:
                # x*granularity <= (y+1)*sampling
                # 1. x*granularity <= y*sampling
                #    y*sampling..(y+1)sampling
                #
                #       x+1
                #        /
                #       /
                #      / y+1  -|
                #     /        |
                #    / y      -|
                #   /
                #  / x
                #
                # 2. x*granularity > y*sampling
                #    x*granularity..(y+1)sampling
                #
                #       x+1
                #        /
                #       /
                #      / y+1  -|
                #     /        |
                #    / x      -|
                #   /
                #  / y
                if y * granularity <= x * sampling:
                    grow((x + 1) * sampling, matrix[y][x])
                elif (x + 1) * sampling > y * granularity:
                    grow((x + 1) * sampling, matrix[y][x])
                    avg = matrix[y][x] / ((x + 1) * sampling - y * granularity)
                    for j in range(y * granularity, (x + 1) * sampling):
                        for i in range(y * granularity, j + 1):
                            daily[i][j] = avg
            elif (y + 1) * granularity >= x * sampling:
                # y*sampling <= (x+1)*granularity < (y+1)sampling
                # y*sampling..(x+1)*granularity
                # (x+1)*granularity..(y+1)sampling
                #        x+1
                #         /\
                #        /  \
                #       /    \
                #      /    y+1
                #     /
                #    y
                v1 = matrix[y][x - 1]
                v2 = matrix[y][x]
                delta = (y + 1) * granularity - x * sampling
                previous = 0
                if x > 0 and (x - 1) * sampling >= y * granularity:
                    # x*g <= (y-1)*s <= y*s <= (x+1)*g <= (y+1)*s
                    #           |________|.......^
                    if x > 1:
                        previous = matrix[y][x - 2]
                    scale = sampling
                else:
                    # (y-1)*s < x*g <= y*s <= (x+1)*g <= (y+1)*s
                    #            |______|.......^
                    scale = sampling if x == 0 else x * sampling - y * granularity
                peak = v1 + (v1 - previous) / scale * delta
                if v2 > peak:
                    # we need to adjust the peak, it may not be less than the decayed value
                    if x < matrix.shape[1] - 1:
                        # y*s <= (x+1)*g <= (y+1)*s < (y+2)*s
                        #           ^.........|_________|
                        k = (v2 - matrix[y][x + 1]) / sampling  # > 0
                        peak = matrix[y][x] + k * ((x + 1) * sampling - (y + 1) * granularity)
                        # peak > v2 > v1
                    else:
                        peak = v2
                        # not enough data to interpolate; this is at least not restricted
                grow((y + 1) * granularity, peak)
                decay((y + 1) * granularity, peak)
            else:
                # (x+1)*granularity < y*sampling
                # y*sampling..(y+1)sampling
                decay(x * sampling, matrix[y][x - 1])
    return daily


def import_pandas():
    import pandas
    try:
        from pandas.plotting import register_matplotlib_converters
        register_matplotlib_converters()
    except ImportError:
        pass
    return pandas


def load_burndown(header, name, matrix, resample, report_survival=True):
    pandas = import_pandas()

    start, last, sampling, granularity = header
    assert sampling > 0
    assert granularity > 0
    start = datetime.fromtimestamp(start)
    last = datetime.fromtimestamp(last)
    if report_survival:
        kmf = fit_kaplan_meier(matrix)
        if kmf is not None:
            print_survival_function(kmf, sampling)
    finish = start + timedelta(days=matrix.shape[1] * sampling)
    if resample not in ("no", "raw"):
        print("resampling to %s, please wait..." % resample)
        # Interpolate the day x day matrix.
        # Each day brings equal weight in the granularity.
        # Sampling's interpolation is linear.
        daily = interpolate_burndown_matrix(matrix, granularity, sampling)
        daily[(last - start).days:] = 0
        # Resample the bands
        aliases = {
            "year": "A",
            "month": "M"
        }
        resample = aliases.get(resample, resample)
        periods = 0
        date_granularity_sampling = [start]
        while date_granularity_sampling[-1] < finish:
            periods += 1
            date_granularity_sampling = pandas.date_range(
                start, periods=periods, freq=resample)
        if date_granularity_sampling[0] > finish:
            if resample == "A":
                print("too loose resampling - by year, trying by month")
                return load_burndown(header, name, matrix, "month", report_survival=False)
            else:
                raise ValueError("Too loose resampling: %s. Try finer." % resample)
        date_range_sampling = pandas.date_range(
            date_granularity_sampling[0],
            periods=(finish - date_granularity_sampling[0]).days,
            freq="1D")
        # Fill the new square matrix
        matrix = numpy.zeros(
            (len(date_granularity_sampling), len(date_range_sampling)),
            dtype=numpy.float32)
        for i, gdt in enumerate(date_granularity_sampling):
            istart = (date_granularity_sampling[i - 1] - start).days \
                if i > 0 else 0
            ifinish = (gdt - start).days

            for j, sdt in enumerate(date_range_sampling):
                if (sdt - start).days >= istart:
                    break
            matrix[i, j:] = \
                daily[istart:ifinish, (sdt - start).days:].sum(axis=0)
        # Hardcode some cases to improve labels' readability
        if resample in ("year", "A"):
            labels = [dt.year for dt in date_granularity_sampling]
        elif resample in ("month", "M"):
            labels = [dt.strftime("%Y %B") for dt in date_granularity_sampling]
        else:
            labels = [dt.date() for dt in date_granularity_sampling]
    else:
        labels = [
            "%s - %s" % ((start + timedelta(days=i * granularity)).date(),
                         (
                         start + timedelta(days=(i + 1) * granularity)).date())
            for i in range(matrix.shape[0])]
        if len(labels) > 18:
            warnings.warn("Too many labels - consider resampling.")
        resample = "M"  # fake resampling type is checked while plotting
        date_range_sampling = pandas.date_range(
            start + timedelta(days=sampling), periods=matrix.shape[1],
            freq="%dD" % sampling)
    return name, matrix, date_range_sampling, labels, granularity, sampling, resample


def load_ownership(header, sequence, contents, max_people):
    pandas = import_pandas()

    start, last, sampling, _ = header
    start = datetime.fromtimestamp(start)
    last = datetime.fromtimestamp(last)
    people = []
    for name in sequence:
        people.append(contents[name].sum(axis=1))
    people = numpy.array(people)
    date_range_sampling = pandas.date_range(
        start + timedelta(days=sampling), periods=people[0].shape[0],
        freq="%dD" % sampling)

    if people.shape[0] > max_people:
        order = numpy.argsort(-people.sum(axis=1))
        chosen_people = people[order[:max_people + 1]]
        chosen_people[max_people] = people[order[max_people:]].sum(axis=0)
        people = chosen_people
        sequence = [sequence[i] for i in order[:max_people]] + ["others"]
        print("Warning: truncated people to the most owning %d" % max_people)
    for i, name in enumerate(sequence):
        if len(name) > 40:
            sequence[i] = name[:37] + "..."

    return sequence, people, date_range_sampling, last


def load_churn_matrix(people, matrix, max_people):
    matrix = matrix.astype(float)
    if matrix.shape[0] > max_people:
        order = numpy.argsort(-matrix[:, 0])
        matrix = matrix[order[:max_people]][:, [0, 1] + list(2 + order[:max_people])]
        people = [people[i] for i in order[:max_people]]
        print("Warning: truncated people to most productive %d" % max_people)
    zeros = matrix[:, 0] == 0
    matrix[zeros, :] = 1
    matrix /= matrix[:, 0][:, None]
    matrix = -matrix[:, 1:]
    matrix[zeros, :] = 0
    for i, name in enumerate(people):
        if len(name) > 40:
            people[i] = name[:37] + "..."
    return people, matrix


def import_pyplot(backend, style):
    import matplotlib
    if backend:
        matplotlib.use(backend)
    from matplotlib import pyplot
    pyplot.style.use(style)
    print("matplotlib: backend is", matplotlib.get_backend())
    return matplotlib, pyplot


def apply_plot_style(figure, axes, legend, background, font_size, axes_size):
    foreground = "black" if background == "white" else "white"
    if axes_size is None:
        axes_size = (16, 12)
    else:
        axes_size = tuple(float(p) for p in axes_size.split(","))
    figure.set_size_inches(*axes_size)
    for side in ("bottom", "top", "left", "right"):
        axes.spines[side].set_color(foreground)
    for axis in (axes.xaxis, axes.yaxis):
        axis.label.update(dict(fontsize=font_size, color=foreground))
    for axis in ("x", "y"):
        getattr(axes, axis + "axis").get_offset_text().set_size(font_size)
        axes.tick_params(axis=axis, colors=foreground, labelsize=font_size)
    try:
        axes.ticklabel_format(axis="y", style="sci", scilimits=(0, 3))
    except AttributeError:
        pass
    figure.patch.set_facecolor(background)
    axes.set_facecolor(background)
    if legend is not None:
        frame = legend.get_frame()
        for setter in (frame.set_facecolor, frame.set_edgecolor):
            setter(background)
        for text in legend.get_texts():
            text.set_color(foreground)


def get_plot_path(base, name):
    root, ext = os.path.splitext(base)
    if not ext:
        ext = ".png"
    output = os.path.join(root, name + ext)
    os.makedirs(os.path.dirname(output), exist_ok=True)
    return output


def deploy_plot(title, output, background, tight=True):
    import matplotlib.pyplot as pyplot

    if not output:
        pyplot.gcf().canvas.set_window_title(title)
        pyplot.show()
    else:
        if title:
            pyplot.title(title, color="black" if background == "white" else "white")
        if tight:
            try:
                pyplot.tight_layout()
            except:  # noqa: E722
                print("Warning: failed to set the tight layout")
        pyplot.savefig(output, transparent=True)
    pyplot.clf()


def default_json(x):
    if hasattr(x, "tolist"):
        return x.tolist()
    if hasattr(x, "isoformat"):
        return x.isoformat()
    return x


def parse_date(text, default):
    if not text:
        return default
    from dateutil.parser import parse
    return parse(text)


def plot_burndown(args, target, name, matrix, date_range_sampling, labels, granularity,
                  sampling, resample):
    if args.output and args.output.endswith(".json"):
        data = locals().copy()
        del data["args"]
        data["type"] = "burndown"
        if args.mode == "project" and target == "project":
            output = args.output
        else:
            if target == "project":
                name = "project"
            output = get_plot_path(args.output, name)
        with open(output, "w") as fout:
            json.dump(data, fout, sort_keys=True, default=default_json)
        return

    matplotlib, pyplot = import_pyplot(args.backend, args.style)

    pyplot.stackplot(date_range_sampling, matrix, labels=labels)
    if args.relative:
        for i in range(matrix.shape[1]):
            matrix[:, i] /= matrix[:, i].sum()
        pyplot.ylim(0, 1)
        legend_loc = 3
    else:
        legend_loc = 2
    legend = pyplot.legend(loc=legend_loc, fontsize=args.font_size)
    pyplot.ylabel("Lines of code")
    pyplot.xlabel("Time")
    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.background,
                     args.font_size, args.size)
    pyplot.xlim(parse_date(args.start_date, date_range_sampling[0]),
                parse_date(args.end_date, date_range_sampling[-1]))
    locator = pyplot.gca().xaxis.get_major_locator()
    # set the optimal xticks locator
    if "M" not in resample:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
    locs = pyplot.gca().get_xticks().tolist()
    if len(locs) >= 16:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
        locs = pyplot.gca().get_xticks().tolist()
        if len(locs) >= 16:
            pyplot.gca().xaxis.set_major_locator(locator)
    if locs[0] < pyplot.xlim()[0]:
        del locs[0]
    endindex = -1
    if len(locs) >= 2 and pyplot.xlim()[1] - locs[-1] > (locs[-1] - locs[-2]) / 2:
        locs.append(pyplot.xlim()[1])
        endindex = len(locs) - 1
    startindex = -1
    if len(locs) >= 2 and locs[0] - pyplot.xlim()[0] > (locs[1] - locs[0]) / 2:
        locs.append(pyplot.xlim()[0])
        startindex = len(locs) - 1
    pyplot.gca().set_xticks(locs)
    # hacking time!
    labels = pyplot.gca().get_xticklabels()
    if startindex >= 0:
        labels[startindex].set_text(date_range_sampling[0].date())
        labels[startindex].set_text = lambda _: None
        labels[startindex].set_rotation(30)
        labels[startindex].set_ha("right")
    if endindex >= 0:
        labels[endindex].set_text(date_range_sampling[-1].date())
        labels[endindex].set_text = lambda _: None
        labels[endindex].set_rotation(30)
        labels[endindex].set_ha("right")
    title = "%s %d x %d (granularity %d, sampling %d)" % \
        ((name,) + matrix.shape + (granularity, sampling))
    output = args.output
    if output:
        if args.mode == "project" and target == "project":
            output = args.output
        else:
            if target == "project":
                name = "project"
            output = get_plot_path(args.output, name)
    deploy_plot(title, output, args.background)


def plot_many_burndown(args, target, header, parts):
    if not args.output:
        print("Warning: output not set, showing %d plots." % len(parts))
    itercnt = progress.bar(parts, expected_size=len(parts)) \
        if progress is not None else parts
    stdout = io.StringIO()
    for name, matrix in itercnt:
        backup = sys.stdout
        sys.stdout = stdout
        plot_burndown(args, target, *load_burndown(header, name, matrix, args.resample))
        sys.stdout = backup
    sys.stdout.write(stdout.getvalue())


def plot_churn_matrix(args, repo, people, matrix):
    if args.output and args.output.endswith(".json"):
        data = locals().copy()
        del data["args"]
        data["type"] = "churn_matrix"
        if args.mode == "all":
            output = get_plot_path(args.output, "matrix")
        else:
            output = args.output
        with open(output, "w") as fout:
            json.dump(data, fout, sort_keys=True, default=default_json)
        return

    matplotlib, pyplot = import_pyplot(args.backend, args.style)

    s = 4 + matrix.shape[1] * 0.3
    fig = pyplot.figure(figsize=(s, s))
    ax = fig.add_subplot(111)
    ax.xaxis.set_label_position("top")
    ax.matshow(matrix, cmap=pyplot.cm.OrRd)
    ax.set_xticks(numpy.arange(0, matrix.shape[1]))
    ax.set_yticks(numpy.arange(0, matrix.shape[0]))
    ax.set_yticklabels(people, va="center")
    ax.set_xticks(numpy.arange(0.5, matrix.shape[1] + 0.5), minor=True)
    ax.set_xticklabels(["Unidentified"] + people, rotation=45, ha="left",
                       va="bottom", rotation_mode="anchor")
    ax.set_yticks(numpy.arange(0.5, matrix.shape[0] + 0.5), minor=True)
    ax.grid(False)
    ax.grid(which="minor")
    apply_plot_style(fig, ax, None, args.background, args.font_size, args.size)
    if not args.output:
        pos1 = ax.get_position()
        pos2 = (pos1.x0 + 0.15, pos1.y0 - 0.1, pos1.width * 0.9, pos1.height * 0.9)
        ax.set_position(pos2)
    if args.mode == "all" and args.output:
        output = get_plot_path(args.output, "matrix")
    else:
        output = args.output
    title = "%s %d developers overwrite" % (repo, matrix.shape[0])
    if args.output:
        # FIXME(vmarkovtsev): otherwise the title is screwed in savefig()
        title = ""
    deploy_plot(title, output, args.background)


def plot_ownership(args, repo, names, people, date_range, last):
    if args.output and args.output.endswith(".json"):
        data = locals().copy()
        del data["args"]
        data["type"] = "ownership"
        if args.mode == "all" and args.output:
            output = get_plot_path(args.output, "people")
        else:
            output = args.output
        with open(output, "w") as fout:
            json.dump(data, fout, sort_keys=True, default=default_json)
        return

    matplotlib, pyplot = import_pyplot(args.backend, args.style)

    polys = pyplot.stackplot(date_range, people, labels=names)
    if names[-1] == "others":
        polys[-1].set_hatch("/")
    pyplot.xlim(parse_date(args.start_date, date_range[0]), parse_date(args.end_date, last))

    if args.relative:
        for i in range(people.shape[1]):
            people[:, i] /= people[:, i].sum()
        pyplot.ylim(0, 1)
        legend_loc = 3
    else:
        legend_loc = 2
    ncol = 1 if len(names) < 15 else 2
    legend = pyplot.legend(loc=legend_loc, fontsize=args.font_size, ncol=ncol)
    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.background,
                     args.font_size, args.size)
    if args.mode == "all" and args.output:
        output = get_plot_path(args.output, "people")
    else:
        output = args.output
    deploy_plot("%s code ownership through time" % repo, output, args.background)


IDEAL_SHARD_SIZE = 4096


def train_embeddings(index, matrix, tmpdir, shard_size=IDEAL_SHARD_SIZE):
    try:
        from . import swivel
    except (SystemError, ImportError):
        import swivel
    import tensorflow as tf

    assert matrix.shape[0] == matrix.shape[1]
    assert len(index) <= matrix.shape[0]
    outlier_threshold = numpy.percentile(matrix.data, 99)
    matrix.data[matrix.data > outlier_threshold] = outlier_threshold
    nshards = len(index) // shard_size
    if nshards * shard_size < len(index):
        nshards += 1
        shard_size = len(index) // nshards
        nshards = len(index) // shard_size
    remainder = len(index) - nshards * shard_size
    if remainder > 0:
        lengths = matrix.indptr[1:] - matrix.indptr[:-1]
        filtered = sorted(numpy.argsort(lengths)[remainder:])
    else:
        filtered = list(range(len(index)))
    if len(filtered) < matrix.shape[0]:
        print("Truncating the sparse matrix...")
        matrix = matrix[filtered, :][:, filtered]
    meta_index = []
    for i, j in enumerate(filtered):
        meta_index.append((index[j], matrix[i, i]))
    index = [mi[0] for mi in meta_index]
    with tempfile.TemporaryDirectory(prefix="hercules_labours_", dir=tmpdir or None) as tmproot:
        print("Writing Swivel metadata...")
        vocabulary = "\n".join(index)
        with open(os.path.join(tmproot, "row_vocab.txt"), "w") as out:
            out.write(vocabulary)
        with open(os.path.join(tmproot, "col_vocab.txt"), "w") as out:
            out.write(vocabulary)
        del vocabulary
        bool_sums = matrix.indptr[1:] - matrix.indptr[:-1]
        bool_sums_str = "\n".join(map(str, bool_sums.tolist()))
        with open(os.path.join(tmproot, "row_sums.txt"), "w") as out:
            out.write(bool_sums_str)
        with open(os.path.join(tmproot, "col_sums.txt"), "w") as out:
            out.write(bool_sums_str)
        del bool_sums_str
        reorder = numpy.argsort(-bool_sums)

        print("Writing Swivel shards...")
        for row in range(nshards):
            for col in range(nshards):
                def _int64s(xs):
                    return tf.train.Feature(
                        int64_list=tf.train.Int64List(value=list(xs)))

                def _floats(xs):
                    return tf.train.Feature(
                        float_list=tf.train.FloatList(value=list(xs)))

                indices_row = reorder[row::nshards]
                indices_col = reorder[col::nshards]
                shard = matrix[indices_row][:, indices_col].tocoo()

                example = tf.train.Example(features=tf.train.Features(feature={
                    "global_row": _int64s(indices_row),
                    "global_col": _int64s(indices_col),
                    "sparse_local_row": _int64s(shard.row),
                    "sparse_local_col": _int64s(shard.col),
                    "sparse_value": _floats(shard.data)}))

                with open(os.path.join(tmproot, "shard-%03d-%03d.pb" % (row, col)), "wb") as out:
                    out.write(example.SerializeToString())
        print("Training Swivel model...")
        swivel.FLAGS.submatrix_rows = shard_size
        swivel.FLAGS.submatrix_cols = shard_size
        if len(meta_index) <= IDEAL_SHARD_SIZE / 16:
            embedding_size = 50
            num_epochs = 100000
        elif len(meta_index) <= IDEAL_SHARD_SIZE:
            embedding_size = 50
            num_epochs = 50000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 2:
            embedding_size = 60
            num_epochs = 10000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 4:
            embedding_size = 70
            num_epochs = 8000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 10:
            embedding_size = 80
            num_epochs = 5000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 25:
            embedding_size = 100
            num_epochs = 1000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 100:
            embedding_size = 200
            num_epochs = 600
        else:
            embedding_size = 300
            num_epochs = 300
        if os.getenv("CI"):
            # Travis, AppVeyor etc. during the integration tests
            num_epochs /= 10
        swivel.FLAGS.embedding_size = embedding_size
        swivel.FLAGS.input_base_path = tmproot
        swivel.FLAGS.output_base_path = tmproot
        swivel.FLAGS.loss_multiplier = 1.0 / shard_size
        swivel.FLAGS.num_epochs = num_epochs
        # Tensorflow 1.5 parses sys.argv unconditionally *applause*
        argv_backup = sys.argv[1:]
        del sys.argv[1:]
        swivel.main(None)
        sys.argv.extend(argv_backup)
        print("Reading Swivel embeddings...")
        embeddings = []
        with open(os.path.join(tmproot, "row_embedding.tsv")) as frow:
            with open(os.path.join(tmproot, "col_embedding.tsv")) as fcol:
                for i, (lrow, lcol) in enumerate(zip(frow, fcol)):
                    prow, pcol = (l.split("\t", 1) for l in (lrow, lcol))
                    assert prow[0] == pcol[0]
                    erow, ecol = \
                        (numpy.fromstring(p[1], dtype=numpy.float32, sep="\t")
                         for p in (prow, pcol))
                    embeddings.append((erow + ecol) / 2)
    return meta_index, embeddings


class CORSWebServer(object):
    def __init__(self):
        self.thread = threading.Thread(target=self.serve)
        self.server = None

    def serve(self):
        outer = self

        try:
            from http.server import HTTPServer, SimpleHTTPRequestHandler, test
        except ImportError:  # Python 2
            from BaseHTTPServer import HTTPServer, test
            from SimpleHTTPServer import SimpleHTTPRequestHandler

        class ClojureServer(HTTPServer):
            def __init__(self, *args, **kwargs):
                HTTPServer.__init__(self, *args, **kwargs)
                outer.server = self

        class CORSRequestHandler(SimpleHTTPRequestHandler):
            def end_headers(self):
                self.send_header("Access-Control-Allow-Origin", "*")
                SimpleHTTPRequestHandler.end_headers(self)

        test(CORSRequestHandler, ClojureServer)

    def start(self):
        self.thread.start()

    def stop(self):
        if self.running:
            self.server.shutdown()
            self.thread.join()

    @property
    def running(self):
        return self.server is not None


web_server = CORSWebServer()


def write_embeddings(name, output, run_server, index, embeddings):
    print("Writing Tensorflow Projector files...")
    if not output:
        output = "couples"
    if output.endswith(".json"):
        output = os.path.join(output[:-5], "couples")
        run_server = False
    metaf = "%s_%s_meta.tsv" % (output, name)
    with open(metaf, "w") as fout:
        fout.write("name\tcommits\n")
        for pair in index:
            fout.write("%s\t%s\n" % pair)
    print("Wrote", metaf)
    dataf = "%s_%s_data.tsv" % (output, name)
    with open(dataf, "w") as fout:
        for vec in embeddings:
            fout.write("\t".join(str(v) for v in vec))
            fout.write("\n")
    print("Wrote", dataf)
    jsonf = "%s_%s.json" % (output, name)
    with open(jsonf, "w") as fout:
        fout.write("""{
  "embeddings": [
    {
      "tensorName": "%s %s coupling",
      "tensorShape": [%s, %s],
      "tensorPath": "http://0.0.0.0:8000/%s",
      "metadataPath": "http://0.0.0.0:8000/%s"
    }
  ]
}
""" % (output, name, len(embeddings), len(embeddings[0]), dataf, metaf))
    print("Wrote %s" % jsonf)
    if run_server and not web_server.running:
        web_server.start()
    url = "http://projector.tensorflow.org/?config=http://0.0.0.0:8000/" + jsonf
    print(url)
    if run_server:
        if shutil.which("xdg-open") is not None:
            os.system("xdg-open " + url)
        else:
            browser = os.getenv("BROWSER", "")
            if browser:
                os.system(browser + " " + url)
            else:
                print("\t" + url)


def show_shotness_stats(data):
    top = sorted(((r.counters[i], i) for i, r in enumerate(data)), reverse=True)
    for count, i in top:
        r = data[i]
        print("%8d  %s:%s [%s]" % (count, r.file, r.name, r.internal_role))


def show_sentiment_stats(args, name, resample, start_date, data):
    from scipy.signal import convolve, slepian

    matplotlib, pyplot = import_pyplot(args.backend, args.style)

    start_date = datetime.fromtimestamp(start_date)
    data = sorted(data.items())
    mood = numpy.zeros(data[-1][0] + 1, dtype=numpy.float32)
    timeline = numpy.array([start_date + timedelta(days=i) for i in range(mood.shape[0])])
    for d, val in data:
        mood[d] = (0.5 - val.Value) * 2
    resolution = 32
    window = slepian(len(timeline) // resolution, 0.5)
    window /= window.sum()
    mood_smooth = convolve(mood, window, "same")
    pos = mood_smooth.copy()
    pos[pos < 0] = 0
    neg = mood_smooth.copy()
    neg[neg >= 0] = 0
    resolution = 4
    window = numpy.ones(len(timeline) // resolution)
    window /= window.sum()
    avg = convolve(mood, window, "same")
    pyplot.fill_between(timeline, pos, color="#8DB843", label="Positive")
    pyplot.fill_between(timeline, neg, color="#E14C35", label="Negative")
    pyplot.plot(timeline, avg, color="grey", label="Average", linewidth=5)
    legend = pyplot.legend(loc=1, fontsize=args.font_size)
    pyplot.ylabel("Comment sentiment")
    pyplot.xlabel("Time")
    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.background,
                     args.font_size, args.size)
    pyplot.xlim(parse_date(args.start_date, timeline[0]), parse_date(args.end_date, timeline[-1]))
    locator = pyplot.gca().xaxis.get_major_locator()
    # set the optimal xticks locator
    if "M" not in resample:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
    locs = pyplot.gca().get_xticks().tolist()
    if len(locs) >= 16:
        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())
        locs = pyplot.gca().get_xticks().tolist()
        if len(locs) >= 16:
            pyplot.gca().xaxis.set_major_locator(locator)
    if locs[0] < pyplot.xlim()[0]:
        del locs[0]
    endindex = -1
    if len(locs) >= 2 and pyplot.xlim()[1] - locs[-1] > (locs[-1] - locs[-2]) / 2:
        locs.append(pyplot.xlim()[1])
        endindex = len(locs) - 1
    startindex = -1
    if len(locs) >= 2 and locs[0] - pyplot.xlim()[0] > (locs[1] - locs[0]) / 2:
        locs.append(pyplot.xlim()[0])
        startindex = len(locs) - 1
    pyplot.gca().set_xticks(locs)
    # hacking time!
    labels = pyplot.gca().get_xticklabels()
    if startindex >= 0:
        labels[startindex].set_text(timeline[0].date())
        labels[startindex].set_text = lambda _: None
        labels[startindex].set_rotation(30)
        labels[startindex].set_ha("right")
    if endindex >= 0:
        labels[endindex].set_text(timeline[-1].date())
        labels[endindex].set_text = lambda _: None
        labels[endindex].set_rotation(30)
        labels[endindex].set_ha("right")
    overall_pos = sum(2 * (0.5 - d[1].Value) for d in data if d[1].Value < 0.5)
    overall_neg = sum(2 * (d[1].Value - 0.5) for d in data if d[1].Value > 0.5)
    title = "%s sentiment +%.1f -%.1f δ=%.1f" % (
        name, overall_pos, overall_neg, overall_pos - overall_neg)
    deploy_plot(title, args.output, args.background)


def show_devs(args, name, start_date, end_date, people, days):
    from scipy.signal import convolve, slepian

    max_people = 50
    if len(people) > max_people:
        print("Picking top 100 developers by commit count")
        # pick top N developers by commit count
        commits = defaultdict(int)
        for devs in days.values():
            for dev, stats in devs.items():
                commits[dev] += stats.Commits
        commits = sorted(((v, k) for k, v in commits.items()), reverse=True)
        chosen_people = {people[k] for _, k in commits[:max_people]}
    else:
        chosen_people = set(people)
    dists, devseries, devstats, route = order_commits(chosen_people, days, people)
    route_map = {v: i for i, v in enumerate(route)}
    # determine clusters
    clusters = hdbscan_cluster_routed_series(dists, route)
    keys = list(devseries.keys())
    route = [keys[node] for node in route]
    print("Plotting")
    # smooth time series
    start_date = datetime.fromtimestamp(start_date)
    start_date = datetime(start_date.year, start_date.month, start_date.day)
    end_date = datetime.fromtimestamp(end_date)
    end_date = datetime(end_date.year, end_date.month, end_date.day)
    size = (end_date - start_date).days + 1
    plot_x = [start_date + timedelta(days=i) for i in range(size)]
    resolution = 64
    window = slepian(size // resolution, 0.5)
    final = numpy.zeros((len(devseries), size), dtype=numpy.float32)
    for i, s in enumerate(devseries.values()):
        arr = numpy.array(s).transpose()
        full_history = numpy.zeros(size, dtype=numpy.float32)
        mask = arr[0] < size
        full_history[arr[0][mask]] = arr[1][mask]
        final[route_map[i]] = convolve(full_history, window, "same")

    matplotlib, pyplot = import_pyplot(args.backend, args.style)
    pyplot.rcParams["figure.figsize"] = (32, 16)
    prop_cycle = pyplot.rcParams["axes.prop_cycle"]
    colors = prop_cycle.by_key()["color"]
    fig, axes = pyplot.subplots(final.shape[0], 1)
    backgrounds = ("#C4FFDB", "#FFD0CD") if args.background == "white" else ("#05401C", "#40110E")
    max_cluster = numpy.max(clusters)
    for ax, series, cluster, dev_i in zip(axes, final, clusters, route):
        if cluster >= 0:
            color = colors[cluster % len(colors)]
            i = 1
            while color == "#777777":
                color = colors[(max_cluster + i) % len(colors)]
                i += 1
        else:
            # outlier
            color = "#777777"
        ax.fill_between(plot_x, series, color=color)
        ax.set_axis_off()
        author = people[dev_i]
        ax.text(0.03, 0.5, author[:36] + (author[36:] and "..."),
                horizontalalignment="right", verticalalignment="center",
                transform=ax.transAxes, fontsize=14,
                color="black" if args.background == "white" else "white")
        ds = devstats[dev_i]
        stats = "%5d %8s %8s" % (ds[0], _format_number(ds[1] - ds[2]), _format_number(ds[3]))
        ax.text(0.97, 0.5, stats,
                horizontalalignment="left", verticalalignment="center",
                transform=ax.transAxes, fontsize=14, family="monospace",
                backgroundcolor=backgrounds[ds[1] <= ds[2]],
                color="black" if args.background == "white" else "white")
    axes[0].text(0.97, 1.75, " cmts    delta  changed",
                 horizontalalignment="left", verticalalignment="center",
                 transform=axes[0].transAxes, fontsize=14, family="monospace",
                 color="black" if args.background == "white" else "white")
    axes[-1].set_axis_on()
    target_num_labels = 12
    num_months = (end_date.year - start_date.year) * 12 + end_date.month - start_date.month
    interval = int(numpy.ceil(num_months / target_num_labels))
    if interval >= 8:
        interval = int(numpy.ceil(num_months / (12 * target_num_labels)))
        axes[-1].xaxis.set_major_locator(matplotlib.dates.YearLocator(base=max(1, interval // 12)))
        axes[-1].xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y"))
    else:
        axes[-1].xaxis.set_major_locator(matplotlib.dates.MonthLocator(interval=interval))
        axes[-1].xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y-%m"))
    for tick in axes[-1].xaxis.get_major_ticks():
        tick.label.set_fontsize(args.font_size)
    axes[-1].spines["left"].set_visible(False)
    axes[-1].spines["right"].set_visible(False)
    axes[-1].spines["top"].set_visible(False)
    axes[-1].get_yaxis().set_visible(False)
    axes[-1].set_facecolor((1.0,) * 3 + (0.0,))

    title = ("%s commits" % name) if not args.output else ""
    deploy_plot(title, args.output, args.background)


def order_commits(chosen_people, days, people):
    from seriate import seriate
    try:
        from fastdtw import fastdtw
    except ImportError as e:
        print("Cannot import fastdtw: %s\nInstall it from https://github.com/slaypni/fastdtw" % e)
        sys.exit(1)
    # FIXME(vmarkovtsev): remove once https://github.com/slaypni/fastdtw/pull/28 is merged&released
    try:
        sys.modules["fastdtw.fastdtw"].__norm = lambda p: lambda a, b: numpy.linalg.norm(
            numpy.atleast_1d(a) - numpy.atleast_1d(b), p)
    except KeyError:
        # the native extension does not have this bug
        pass

    devseries = defaultdict(list)
    devstats = defaultdict(lambda: DevDay(0, 0, 0, 0, {}))
    for day, devs in sorted(days.items()):
        for dev, stats in devs.items():
            if people[dev] in chosen_people:
                devseries[dev].append((day, stats.Commits))
                devstats[dev] = devstats[dev].add(stats)
    print("Calculating the distance matrix")
    # max-normalize the time series using a sliding window
    series = list(devseries.values())
    for i, s in enumerate(series):
        arr = numpy.array(s).transpose().astype(numpy.float32)
        commits = arr[1]
        if len(commits) < 7:
            commits /= commits.max()
        else:
            # 4 is sizeof(float32)
            windows = numpy.lib.stride_tricks.as_strided(commits, [len(commits) - 6, 7], [4, 4])
            commits = numpy.concatenate((
                [windows[0, 0] / windows[0].max(),
                 windows[0, 1] / windows[0].max(),
                 windows[0, 2] / windows[0].max()],
                windows[:, 3] / windows.max(axis=1),
                [windows[-1, 4] / windows[-1].max(),
                 windows[-1, 5] / windows[-1].max(),
                 windows[-1, 6] / windows[-1].max()]
            ))
        arr[1] = commits * 7  # 7 is a pure heuristic here and is not related to the window size
        series[i] = arr.transpose()
    # calculate the distance matrix using dynamic time warping metric
    dists = numpy.full((len(series),) * 2, -100500, dtype=numpy.float32)
    for x, serx in enumerate(series):
        dists[x, x] = 0
        for y, sery in enumerate(series[x + 1:], start=x + 1):
            min_day = int(min(serx[0][0], sery[0][0]))
            max_day = int(max(serx[-1][0], sery[-1][0]))
            arrx = numpy.zeros(max_day - min_day + 1, dtype=numpy.float32)
            arry = numpy.zeros_like(arrx)
            arrx[serx[:, 0].astype(int) - min_day] = serx[:, 1]
            arry[sery[:, 0].astype(int) - min_day] = sery[:, 1]
            # L1 norm
            dist, _ = fastdtw(arrx, arry, radius=5, dist=1)
            dists[x, y] = dists[y, x] = dist
    print("Ordering the series")
    route = seriate(dists)
    return dists, devseries, devstats, route


def hdbscan_cluster_routed_series(dists, route):
    try:
        from hdbscan import HDBSCAN
    except ImportError as e:
        print("Cannot import ortools: %s\nInstall it from "
              "https://developers.google.com/optimization/install/python/" % e)
        sys.exit(1)

    opt_dist_chain = numpy.cumsum(numpy.array(
        [0] + [dists[route[i], route[i + 1]] for i in range(len(route) - 1)]))
    clusters = HDBSCAN(min_cluster_size=2).fit_predict(opt_dist_chain[:, numpy.newaxis])
    return clusters


def show_devs_efforts(args, name, start_date, end_date, people, days, max_people):
    from scipy.signal import convolve, slepian

    start_date = datetime.fromtimestamp(start_date)
    start_date = datetime(start_date.year, start_date.month, start_date.day)
    end_date = datetime.fromtimestamp(end_date)
    end_date = datetime(end_date.year, end_date.month, end_date.day)

    efforts_by_dev = defaultdict(int)
    for day, devs in days.items():
        for dev, stats in devs.items():
            efforts_by_dev[dev] += stats.Added + stats.Removed + stats.Changed
    if len(efforts_by_dev) > max_people:
        chosen = {v for k, v in sorted(
            ((v, k) for k, v in efforts_by_dev.items()), reverse=True)[:max_people]}
        print("Warning: truncated people to the most active %d" % max_people)
    else:
        chosen = set(efforts_by_dev)
    chosen_efforts = sorted(((efforts_by_dev[k], k) for k in chosen), reverse=True)
    chosen_order = {k: i for i, (_, k) in enumerate(chosen_efforts)}

    efforts = numpy.zeros((len(chosen) + 1, (end_date - start_date).days + 1), dtype=numpy.float32)
    for day, devs in days.items():
        if day < efforts.shape[1]:
            for dev, stats in devs.items():
                dev = chosen_order.get(dev, len(chosen_order))
                efforts[dev][day] += stats.Added + stats.Removed + stats.Changed
    efforts_cum = numpy.cumsum(efforts, axis=1)
    window = slepian(10, 0.5)
    window /= window.sum()
    for e in (efforts, efforts_cum):
        for i in range(e.shape[0]):
            ending = e[i][-len(window) * 2:].copy()
            e[i] = convolve(e[i], window, "same")
            e[i][-len(ending):] = ending
    matplotlib, pyplot = import_pyplot(args.backend, args.style)
    plot_x = [start_date + timedelta(days=i) for i in range(efforts.shape[1])]

    people = [people[k] for _, k in chosen_efforts] + ["others"]
    for i, name in enumerate(people):
        if len(name) > 40:
            people[i] = name[:37] + "..."

    polys = pyplot.stackplot(plot_x, efforts_cum, labels=people)
    if len(polys) == max_people + 1:
        polys[-1].set_hatch("/")
    polys = pyplot.stackplot(plot_x, -efforts * efforts_cum.max() / efforts.max())
    if len(polys) == max_people + 1:
        polys[-1].set_hatch("/")
    yticks = []
    for tick in pyplot.gca().yaxis.iter_ticks():
        if tick[1] >= 0:
            yticks.append(tick[1])
    pyplot.gca().yaxis.set_ticks(yticks)
    legend = pyplot.legend(loc=2, ncol=2, fontsize=args.font_size)
    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.background,
                     args.font_size, args.size or "16,10")
    deploy_plot("Efforts through time (changed lines of code)", args.output, args.background)


def show_old_vs_new(args, name, start_date, end_date, people, days):
    from scipy.signal import convolve, slepian

    start_date = datetime.fromtimestamp(start_date)
    start_date = datetime(start_date.year, start_date.month, start_date.day)
    end_date = datetime.fromtimestamp(end_date)
    end_date = datetime(end_date.year, end_date.month, end_date.day)
    new_lines = numpy.zeros((end_date - start_date).days + 1)
    old_lines = numpy.zeros_like(new_lines)
    for day, devs in days.items():
        for stats in devs.values():
            new_lines[day] += stats.Added
            old_lines[day] += stats.Removed + stats.Changed
    resolution = 32
    window = slepian(len(new_lines) // resolution, 0.5)
    new_lines = convolve(new_lines, window, "same")
    old_lines = convolve(old_lines, window, "same")
    matplotlib, pyplot = import_pyplot(args.backend, args.style)
    plot_x = [start_date + timedelta(days=i) for i in range(len(new_lines))]
    pyplot.fill_between(plot_x, new_lines, color="#8DB843", label="Changed new lines")
    pyplot.fill_between(plot_x, old_lines, color="#E14C35", label="Changed existing lines")
    pyplot.legend(loc=2, fontsize=args.font_size)
    for tick in chain(pyplot.gca().xaxis.get_major_ticks(), pyplot.gca().yaxis.get_major_ticks()):
        tick.label.set_fontsize(args.font_size)
    deploy_plot("Additions vs changes", args.output, args.background)


def show_languages(args, name, start_date, end_date, people, days):
    devlangs = defaultdict(lambda: defaultdict(lambda: numpy.zeros(3, dtype=int)))
    for day, devs in days.items():
        for dev, stats in devs.items():
            for lang, vals in stats.Languages.items():
                devlangs[dev][lang] += vals
    devlangs = sorted(devlangs.items(), key=lambda p: -sum(x.sum() for x in p[1].values()))
    for dev, ls in devlangs:
        print()
        print("#", people[dev])
        ls = sorted(((vals.sum(), lang) for lang, vals in ls.items()), reverse=True)
        for vals, lang in ls:
            if lang:
                print("%s: %d" % (lang, vals))


class ParallelDevData:
    def __init__(self):
        self.commits_rank = -1
        self.commits = -1
        self.lines_rank = -1
        self.lines = -1
        self.ownership_rank = -1
        self.ownership = -1
        self.couples_index = -1
        self.couples_cluster = -1
        self.commit_coocc_index = -1
        self.commit_coocc_cluster = -1

    def __str__(self):
        return str(self.__dict__)

    def __repr__(self):
        return str(self)


def load_devs_parallel(ownership, couples, devs, max_people):
    from seriate import seriate
    try:
        from hdbscan import HDBSCAN
    except ImportError as e:
        print("Cannot import ortools: %s\nInstall it from "
              "https://developers.google.com/optimization/install/python/" % e)
        sys.exit(1)

    people, owned = ownership
    _, cmatrix = couples
    _, days = devs

    print("calculating - commits")
    commits = defaultdict(int)
    for day, devs in days.items():
        for dev, stats in devs.items():
            commits[people[dev]] += stats.Commits
    chosen = [k for v, k in sorted(((v, k) for k, v in commits.items()),
                                   reverse=True)[:max_people]]
    result = {k: ParallelDevData() for k in chosen}
    for k, v in result.items():
        v.commits_rank = chosen.index(k)
        v.commits = commits[k]

    print("calculating - lines")
    lines = defaultdict(int)
    for day, devs in days.items():
        for dev, stats in devs.items():
            lines[people[dev]] += stats.Added + stats.Removed + stats.Changed
    lines_index = {k: i for i, (_, k) in enumerate(sorted(
        ((v, k) for k, v in lines.items() if k in chosen), reverse=True))}
    for k, v in result.items():
        v.lines_rank = lines_index[k]
        v.lines = lines[k]

    print("calculating - ownership")
    owned_index = {k: i for i, (_, k) in enumerate(sorted(
        ((owned[k][-1].sum(), k) for k in chosen), reverse=True))}
    for k, v in result.items():
        v.ownership_rank = owned_index[k]
        v.ownership = owned[k][-1].sum()

    print("calculating - couples")
    embeddings = numpy.genfromtxt(fname="couples_people_data.tsv", delimiter="\t")[
        [people.index(k) for k in chosen]]
    embeddings /= numpy.linalg.norm(embeddings, axis=1)[:, None]
    cos = embeddings.dot(embeddings.T)
    cos[cos > 1] = 1  # tiny precision faults
    dists = numpy.arccos(cos)
    clusters = HDBSCAN(min_cluster_size=2, metric="precomputed").fit_predict(dists)
    for k, v in result.items():
        v.couples_cluster = clusters[chosen.index(k)]

    couples_order = seriate(dists)
    roll_options = []
    for i in range(len(couples_order)):
        loss = 0
        for k, v in result.items():
            loss += abs(
                v.ownership_rank - (couples_order.index(chosen.index(k)) + i) % len(chosen))
        roll_options.append(loss)
    best_roll = numpy.argmin(roll_options)
    couples_order = list(numpy.roll(couples_order, best_roll))
    for k, v in result.items():
        v.couples_index = couples_order.index(chosen.index(k))

    print("calculating - commit series")
    dists, devseries, _, orig_route = order_commits(chosen, days, people)
    keys = list(devseries.keys())
    route = [keys[node] for node in orig_route]
    for roll in range(len(route)):
        loss = 0
        for k, v in result.items():
            i = route.index(people.index(k))
            loss += abs(v.couples_index - ((i + roll) % len(route)))
        roll_options[roll] = loss
    best_roll = numpy.argmin(roll_options)
    route = list(numpy.roll(route, best_roll))
    orig_route = list(numpy.roll(orig_route, best_roll))
    clusters = hdbscan_cluster_routed_series(dists, orig_route)
    for k, v in result.items():
        v.commit_coocc_index = route.index(people.index(k))
        v.commit_coocc_cluster = clusters[v.commit_coocc_index]

    return result


def show_devs_parallel(args, name, start_date, end_date, devs):
    matplotlib, pyplot = import_pyplot(args.backend, args.style)
    from matplotlib.collections import LineCollection

    def solve_equations(x1, y1, x2, y2):
        xcube = (x1 - x2) ** 3
        a = 2 * (y2 - y1) / xcube
        b = 3 * (y1 - y2) * (x1 + x2) / xcube
        c = 6 * (y2 - y1) * x1 * x2 / xcube
        d = y1 - a * x1 ** 3 - b * x1 ** 2 - c * x1
        return a, b, c, d

    # biggest = {k: max(getattr(d, k) for d in devs.values())
    #            for k in ("commits", "lines", "ownership")}
    for k, dev in devs.items():
        points = numpy.array([
            (1, dev.commits_rank),
            (2, dev.lines_rank),
            (3, dev.ownership_rank),
            (4, dev.couples_index),
            (5, dev.commit_coocc_index)],
            dtype=float)
        points[:, 1] = points[:, 1] / len(devs)
        splines = []
        for i in range(len(points) - 1):
            a, b, c, d = solve_equations(*points[i], *points[i + 1])
            x = numpy.linspace(i + 1, i + 2, 100)
            smooth_points = numpy.array(
                [x, a * x ** 3 + b * x ** 2 + c * x + d]).T.reshape(-1, 1, 2)
            splines.append(smooth_points)
        points = numpy.concatenate(splines)
        segments = numpy.concatenate([points[:-1], points[1:]], axis=1)
        lc = LineCollection(segments)
        lc.set_array(numpy.linspace(0, 0.1, segments.shape[0]))
        pyplot.gca().add_collection(lc)

    pyplot.xlim(0, 6)
    pyplot.ylim(-0.1, 1.1)
    deploy_plot("Developers", args.output, args.background)


def _format_number(n):
    if n == 0:
        return "0"
    power = int(numpy.log10(abs(n)))
    if power >= 6:
        n = n / 1000000
        if n >= 10:
            n = str(int(n))
        else:
            n = "%.1f" % n
            if n.endswith("0"):
                n = n[:-2]
        suffix = "M"
    elif power >= 3:
        n = n / 1000
        if n >= 10:
            n = str(int(n))
        else:
            n = "%.1f" % n
            if n.endswith("0"):
                n = n[:-2]
        suffix = "K"
    else:
        n = str(n)
        suffix = ""
    return n + suffix


def main():
    args = parse_args()
    reader = read_input(args)
    header = reader.get_header()
    name = reader.get_name()

    burndown_warning = "Burndown stats were not collected. Re-run hercules with --burndown."
    burndown_files_warning = \
        "Burndown stats for files were not collected. Re-run hercules with " \
        "--burndown --burndown-files."
    burndown_people_warning = \
        "Burndown stats for people were not collected. Re-run hercules with " \
        "--burndown --burndown-people."
    couples_warning = "Coupling stats were not collected. Re-run hercules with --couples."
    shotness_warning = "Structural hotness stats were not collected. Re-run hercules with " \
                       "--shotness. Also check --languages - the output may be empty."
    sentiment_warning = "Sentiment stats were not collected. Re-run hercules with --sentiment."
    devs_warning = "Devs stats were not collected. Re-run hercules with --devs."

    def run_times():
        rt = reader.get_run_times()
        pandas = import_pandas()
        series = pandas.to_timedelta(pandas.Series(rt).sort_values(ascending=False), unit="s")
        df = pandas.concat([series, series / series.sum()], axis=1)
        df.columns = ["time", "ratio"]
        print(df)

    def project_burndown():
        try:
            full_header = header + reader.get_burndown_parameters()
        except KeyError:
            print("project: " + burndown_warning)
            return
        plot_burndown(args, "project",
                      *load_burndown(full_header, *reader.get_project_burndown(),
                                     resample=args.resample))

    def files_burndown():
        try:
            full_header = header + reader.get_burndown_parameters()
        except KeyError:
            print(burndown_warning)
            return
        try:
            plot_many_burndown(args, "file", full_header, reader.get_files_burndown())
        except KeyError:
            print("files: " + burndown_files_warning)

    def people_burndown():
        try:
            full_header = header + reader.get_burndown_parameters()
        except KeyError:
            print(burndown_warning)
            return
        try:
            plot_many_burndown(args, "person", full_header, reader.get_people_burndown())
        except KeyError:
            print("people: " + burndown_people_warning)

    def churn_matrix():
        try:
            plot_churn_matrix(args, name, *load_churn_matrix(
                *reader.get_people_interaction(), max_people=args.max_people))
        except KeyError:
            print("churn_matrix: " + burndown_people_warning)

    def ownership_burndown():
        try:
            full_header = header + reader.get_burndown_parameters()
        except KeyError:
            print(burndown_warning)
            return
        try:
            plot_ownership(args, name, *load_ownership(
                full_header, *reader.get_ownership_burndown(), max_people=args.max_people))
        except KeyError:
            print("ownership: " + burndown_people_warning)

    def couples_files():
        try:
            write_embeddings("files", args.output, not args.disable_projector,
                             *train_embeddings(*reader.get_files_coocc(),
                                               tmpdir=args.couples_tmp_dir))
        except KeyError:
            print(couples_warning)

    def couples_people():
        try:
            write_embeddings("people", args.output, not args.disable_projector,
                             *train_embeddings(*reader.get_people_coocc(),
                                               tmpdir=args.couples_tmp_dir))
        except KeyError:
            print(couples_warning)

    def couples_shotness():
        try:
            write_embeddings("shotness", args.output, not args.disable_projector,
                             *train_embeddings(*reader.get_shotness_coocc(),
                                               tmpdir=args.couples_tmp_dir))
        except KeyError:
            print(shotness_warning)

    def shotness():
        try:
            data = reader.get_shotness()
        except KeyError:
            print(shotness_warning)
            return
        show_shotness_stats(data)

    def sentiment():
        try:
            data = reader.get_sentiment()
        except KeyError:
            print(sentiment_warning)
            return
        show_sentiment_stats(args, reader.get_name(), args.resample, reader.get_header()[0], data)

    def devs():
        try:
            data = reader.get_devs()
        except KeyError:
            print(devs_warning)
            return
        show_devs(args, reader.get_name(), *reader.get_header(), *data)

    def devs_efforts():
        try:
            data = reader.get_devs()
        except KeyError:
            print(devs_warning)
            return
        show_devs_efforts(args, reader.get_name(), *reader.get_header(), *data,
                          max_people=args.max_people)

    def old_vs_new():
        try:
            data = reader.get_devs()
        except KeyError:
            print(devs_warning)
            return
        show_old_vs_new(args, reader.get_name(), *reader.get_header(), *data)

    def languages():
        try:
            data = reader.get_devs()
        except KeyError:
            print(devs_warning)
            return
        show_languages(args, reader.get_name(), *reader.get_header(), *data)

    def devs_parallel():
        try:
            ownership = reader.get_ownership_burndown()
        except KeyError:
            print(burndown_people_warning)
            return
        try:
            couples = reader.get_people_coocc()
        except KeyError:
            print(couples_warning)
            return
        try:
            devs = reader.get_devs()
        except KeyError:
            print(devs_warning)
            return
        show_devs_parallel(args, reader.get_name(), *reader.get_header(),
                           load_devs_parallel(ownership, couples, devs, args.max_people))

    modes = {
        "run-times": run_times,
        "burndown-project": project_burndown,
        "burndown-file": files_burndown,
        "burndown-person": people_burndown,
        "churn-matrix": churn_matrix,
        "ownership": ownership_burndown,
        "couples-files": couples_files,
        "couples-people": couples_people,
        "couples-shotness": couples_shotness,
        "shotness": shotness,
        "sentiment": sentiment,
        "devs": devs,
        "devs-efforts": devs_efforts,
        "old-vs-new": old_vs_new,
        "languages": languages,
        "devs-parallel": devs_parallel,
    }
    try:
        modes[args.mode]()
    except KeyError:
        assert args.mode == "all"
        project_burndown()
        files_burndown()
        people_burndown()
        churn_matrix()
        ownership_burndown()
        couples_files()
        couples_people()
        couples_shotness()
        shotness()
        sentiment()
        devs()
        devs_efforts()
        # devs_parallel()

    if web_server.running:
        secs = int(os.getenv("COUPLES_SERVER_TIME", "60"))
        print("Sleeping for %d seconds, safe to Ctrl-C" % secs)
        sys.stdout.flush()
        try:
            time.sleep(secs)
        except KeyboardInterrupt:
            pass
        web_server.stop()


if __name__ == "__main__":
    sys.exit(main())