| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957 | #!/usr/bin/env python3import argparsefrom collections import defaultdict, namedtuplefrom datetime import datetime, timedeltafrom importlib import import_moduleimport iofrom itertools import chainimport jsonimport osimport reimport shutilimport subprocessimport sysimport tempfileimport threadingimport timeimport warningstry:    from clint.textui import progressexcept ImportError:    print("Warning: clint is not installed, no fancy progressbars in the terminal for you.")    progress = Noneimport numpyimport yamlif sys.version_info[0] < 3:    # OK, ancients, I will support Python 2, but you owe me a beer    input = raw_input  # noqa: F821def list_matplotlib_styles():    script = "import sys; from matplotlib import pyplot; " \             "sys.stdout.write(repr(pyplot.style.available))"    styles = eval(subprocess.check_output([sys.executable, "-c", script]))    styles.remove("classic")    return ["default", "classic"] + stylesdef parse_args():    parser = argparse.ArgumentParser()    parser.add_argument("-o", "--output", default="",                        help="Path to the output file/directory (empty for display). "                             "If the extension is JSON, the data is saved instead of "                             "the real image.")    parser.add_argument("-i", "--input", default="-",                        help="Path to the input file (- for stdin).")    parser.add_argument("-f", "--input-format", default="auto", choices=["yaml", "pb", "auto"])    parser.add_argument("--font-size", default=12, type=int,                        help="Size of the labels and legend.")    parser.add_argument("--style", default="ggplot", choices=list_matplotlib_styles(),                        help="Plot style to use.")    parser.add_argument("--backend", help="Matplotlib backend to use.")    parser.add_argument("--background", choices=["black", "white"], default="white",                        help="Plot's general color scheme.")    parser.add_argument("--size", help="Axes' size in inches, for example \"12,9\"")    parser.add_argument("--relative", action="store_true",                        help="Occupy 100%% height for every measurement.")    parser.add_argument("--couples-tmp-dir", help="Temporary directory to work with couples.")    parser.add_argument("-m", "--mode",                        choices=["burndown-project", "burndown-file", "burndown-person",                                 "churn-matrix", "ownership", "couples-files", "couples-people",                                 "couples-shotness", "shotness", "sentiment", "devs",                                 "devs-efforts", "old-vs-new", "all", "run-times", "languages",                                 "devs-parallel"],                        help="What to plot.")    parser.add_argument(        "--resample", default="year",        help="The way to resample the time series. Possible values are: "             "\"month\", \"year\", \"no\", \"raw\" and pandas offset aliases ("             "http://pandas.pydata.org/pandas-docs/stable/timeseries.html"             "#offset-aliases).")    dateutil_url = "https://dateutil.readthedocs.io/en/stable/parser.html#dateutil.parser.parse"    parser.add_argument("--start-date",                        help="Start date of time-based plots. Any format is accepted which is "                             "supported by %s" % dateutil_url)    parser.add_argument("--end-date",                        help="End date of time-based plots. Any format is accepted which is "                             "supported by %s" % dateutil_url)    parser.add_argument("--disable-projector", action="store_true",                        help="Do not run Tensorflow Projector on couples.")    parser.add_argument("--max-people", default=20, type=int,                        help="Maximum number of developers in churn matrix and people plots.")    args = parser.parse_args()    return argsclass Reader(object):    def read(self, file):        raise NotImplementedError    def get_name(self):        raise NotImplementedError    def get_header(self):        raise NotImplementedError    def get_burndown_parameters(self):        raise NotImplementedError    def get_project_burndown(self):        raise NotImplementedError    def get_files_burndown(self):        raise NotImplementedError    def get_people_burndown(self):        raise NotImplementedError    def get_ownership_burndown(self):        raise NotImplementedError    def get_people_interaction(self):        raise NotImplementedError    def get_files_coocc(self):        raise NotImplementedError    def get_people_coocc(self):        raise NotImplementedError    def get_shotness_coocc(self):        raise NotImplementedError    def get_shotness(self):        raise NotImplementedError    def get_sentiment(self):        raise NotImplementedError    def get_devs(self):        raise NotImplementedErrorclass YamlReader(Reader):    def read(self, file):        yaml.reader.Reader.NON_PRINTABLE = re.compile(r"(?!x)x")        try:            loader = yaml.CLoader        except AttributeError:            print("Warning: failed to import yaml.CLoader, falling back to slow yaml.Loader")            loader = yaml.Loader        try:            if file != "-":                with open(file) as fin:                    data = yaml.load(fin, Loader=loader)            else:                data = yaml.load(sys.stdin, Loader=loader)        except (UnicodeEncodeError, yaml.reader.ReaderError) as e:            print("\nInvalid unicode in the input: %s\nPlease filter it through "                  "fix_yaml_unicode.py" % e)            sys.exit(1)        if data is None:            print("\nNo data has been read - has Hercules crashed?")            sys.exit(1)        self.data = data    def get_run_times(self):        return {}    def get_name(self):        return self.data["hercules"]["repository"]    def get_header(self):        header = self.data["hercules"]        return header["begin_unix_time"], header["end_unix_time"]    def get_burndown_parameters(self):        header = self.data["Burndown"]        return header["sampling"], header["granularity"]    def get_project_burndown(self):        return self.data["hercules"]["repository"], \            self._parse_burndown_matrix(self.data["Burndown"]["project"]).T    def get_files_burndown(self):        return [(p[0], self._parse_burndown_matrix(p[1]).T)                for p in self.data["Burndown"]["files"].items()]    def get_people_burndown(self):        return [(p[0], self._parse_burndown_matrix(p[1]).T)                for p in self.data["Burndown"]["people"].items()]    def get_ownership_burndown(self):        return self.data["Burndown"]["people_sequence"].copy(), \            {p[0]: self._parse_burndown_matrix(p[1])             for p in self.data["Burndown"]["people"].items()}    def get_people_interaction(self):        return self.data["Burndown"]["people_sequence"].copy(), \            self._parse_burndown_matrix(self.data["Burndown"]["people_interaction"])    def get_files_coocc(self):        coocc = self.data["Couples"]["files_coocc"]        return coocc["index"], self._parse_coocc_matrix(coocc["matrix"])    def get_people_coocc(self):        coocc = self.data["Couples"]["people_coocc"]        return coocc["index"], self._parse_coocc_matrix(coocc["matrix"])    def get_shotness_coocc(self):        shotness = self.data["Shotness"]        index = ["%s:%s" % (i["file"], i["name"]) for i in shotness]        indptr = numpy.zeros(len(shotness) + 1, dtype=numpy.int64)        indices = []        data = []        for i, record in enumerate(shotness):            pairs = [(int(k), v) for k, v in record["counters"].items()]            pairs.sort()            indptr[i + 1] = indptr[i] + len(pairs)            for k, v in pairs:                indices.append(k)                data.append(v)        indices = numpy.array(indices, dtype=numpy.int32)        data = numpy.array(data, dtype=numpy.int32)        from scipy.sparse import csr_matrix        return index, csr_matrix((data, indices, indptr), shape=(len(shotness),) * 2)    def get_shotness(self):        from munch import munchify        obj = munchify(self.data["Shotness"])        # turn strings into ints        for item in obj:            item.counters = {int(k): v for k, v in item.counters.items()}        if len(obj) == 0:            raise KeyError        return obj    def get_sentiment(self):        from munch import munchify        return munchify({int(key): {            "Comments": vals[2].split("|"),            "Commits": vals[1],            "Value": float(vals[0])        } for key, vals in self.data["Sentiment"].items()})    def get_devs(self):        people = self.data["Devs"]["people"]        days = {int(d): {int(dev): DevDay(*(int(x) for x in day[:-1]), day[-1])                         for dev, day in devs.items()}                for d, devs in self.data["Devs"]["ticks"].items()}        return people, days    def _parse_burndown_matrix(self, matrix):        return numpy.array([numpy.fromstring(line, dtype=int, sep=" ")                            for line in matrix.split("\n")])    def _parse_coocc_matrix(self, matrix):        from scipy.sparse import csr_matrix        data = []        indices = []        indptr = [0]        for row in matrix:            for k, v in sorted(row.items()):                data.append(v)                indices.append(k)            indptr.append(indptr[-1] + len(row))        return csr_matrix((data, indices, indptr), shape=(len(matrix),) * 2)class ProtobufReader(Reader):    def read(self, file):        try:            from labours.pb_pb2 import AnalysisResults        except ImportError as e:            print("\n\n>>> You need to generate python/hercules/pb/pb_pb2.py - run \"make\"\n",                  file=sys.stderr)            raise e from None        self.data = AnalysisResults()        if file != "-":            with open(file, "rb") as fin:                bytes = fin.read()        else:            bytes = sys.stdin.buffer.read()        if not bytes:            raise ValueError("empty input")        self.data.ParseFromString(bytes)        self.contents = {}        for key, val in self.data.contents.items():            try:                mod, name = PB_MESSAGES[key].rsplit(".", 1)            except KeyError:                sys.stderr.write("Warning: there is no registered PB decoder for %s\n" % key)                continue            cls = getattr(import_module(mod), name)            self.contents[key] = msg = cls()            msg.ParseFromString(val)    def get_run_times(self):        return {key: val for key, val in self.data.header.run_time_per_item.items()}    def get_name(self):        return self.data.header.repository    def get_header(self):        header = self.data.header        return header.begin_unix_time, header.end_unix_time    def get_burndown_parameters(self):        burndown = self.contents["Burndown"]        return burndown.sampling, burndown.granularity    def get_project_burndown(self):        return self._parse_burndown_matrix(self.contents["Burndown"].project)    def get_files_burndown(self):        return [self._parse_burndown_matrix(i) for i in self.contents["Burndown"].files]    def get_people_burndown(self):        return [self._parse_burndown_matrix(i) for i in self.contents["Burndown"].people]    def get_ownership_burndown(self):        people = self.get_people_burndown()        return [p[0] for p in people], {p[0]: p[1].T for p in people}    def get_people_interaction(self):        burndown = self.contents["Burndown"]        return [i.name for i in burndown.people], \            self._parse_sparse_matrix(burndown.people_interaction).toarray()    def get_files_coocc(self):        node = self.contents["Couples"].file_couples        return list(node.index), self._parse_sparse_matrix(node.matrix)    def get_people_coocc(self):        node = self.contents["Couples"].people_couples        return list(node.index), self._parse_sparse_matrix(node.matrix)    def get_shotness_coocc(self):        shotness = self.get_shotness()        index = ["%s:%s" % (i.file, i.name) for i in shotness]        indptr = numpy.zeros(len(shotness) + 1, dtype=numpy.int32)        indices = []        data = []        for i, record in enumerate(shotness):            pairs = list(record.counters.items())            pairs.sort()            indptr[i + 1] = indptr[i] + len(pairs)            for k, v in pairs:                indices.append(k)                data.append(v)        indices = numpy.array(indices, dtype=numpy.int32)        data = numpy.array(data, dtype=numpy.int32)        from scipy.sparse import csr_matrix        return index, csr_matrix((data, indices, indptr), shape=(len(shotness),) * 2)    def get_shotness(self):        records = self.contents["Shotness"].records        if len(records) == 0:            raise KeyError        return records    def get_sentiment(self):        byday = self.contents["Sentiment"].SentimentByDay        if len(byday) == 0:            raise KeyError        return byday    def get_devs(self):        people = list(self.contents["Devs"].dev_index)        days = {d: {dev: DevDay(stats.commits, stats.stats.added, stats.stats.removed,                                stats.stats.changed, {k: [v.added, v.removed, v.changed]                                                      for k, v in stats.languages.items()})                    for dev, stats in day.devs.items()}                for d, day in self.contents["Devs"].ticks.items()}        return people, days    def _parse_burndown_matrix(self, matrix):        dense = numpy.zeros((matrix.number_of_rows, matrix.number_of_columns), dtype=int)        for y, row in enumerate(matrix.rows):            for x, col in enumerate(row.columns):                dense[y, x] = col        return matrix.name, dense.T    def _parse_sparse_matrix(self, matrix):        from scipy.sparse import csr_matrix        return csr_matrix((list(matrix.data), list(matrix.indices), list(matrix.indptr)),                          shape=(matrix.number_of_rows, matrix.number_of_columns))READERS = {"yaml": YamlReader, "yml": YamlReader, "pb": ProtobufReader}PB_MESSAGES = {    "Burndown": "labours.pb_pb2.BurndownAnalysisResults",    "Couples": "labours.pb_pb2.CouplesAnalysisResults",    "Shotness": "labours.pb_pb2.ShotnessAnalysisResults",    "Devs": "labours.pb_pb2.DevsAnalysisResults",}def read_input(args):    sys.stdout.write("Reading the input... ")    sys.stdout.flush()    if args.input != "-":        if args.input_format == "auto":            try:                args.input_format = args.input.rsplit(".", 1)[1]            except IndexError:                try:                    with open(args.input) as f:                        f.read(1 << 16)                    args.input_format = "yaml"                except UnicodeDecodeError:                    args.input_format = "pb"    elif args.input_format == "auto":        args.input_format = "yaml"    reader = READERS[args.input_format]()    reader.read(args.input)    print("done")    return readerclass DevDay(namedtuple("DevDay", ("Commits", "Added", "Removed", "Changed", "Languages"))):    def add(self, dd):        langs = defaultdict(lambda: [0] * 3)        for key, val in self.Languages.items():            for i in range(3):                langs[key][i] += val[i]        for key, val in dd.Languages.items():            for i in range(3):                langs[key][i] += val[i]        return DevDay(Commits=self.Commits + dd.Commits,                      Added=self.Added + dd.Added,                      Removed=self.Removed + dd.Removed,                      Changed=self.Changed + dd.Changed,                      Languages=dict(langs))def fit_kaplan_meier(matrix):    from lifelines import KaplanMeierFitter    T = []    W = []    indexes = numpy.arange(matrix.shape[0], dtype=int)    entries = numpy.zeros(matrix.shape[0], int)    dead = set()    for i in range(1, matrix.shape[1]):        diff = matrix[:, i - 1] - matrix[:, i]        entries[diff < 0] = i        mask = diff > 0        deaths = diff[mask]        T.append(numpy.full(len(deaths), i) - entries[indexes[mask]])        W.append(deaths)        entered = entries > 0        entered[0] = True        dead = dead.union(set(numpy.where((matrix[:, i] == 0) & entered)[0]))    # add the survivors as censored    nnzind = entries != 0    nnzind[0] = True    nnzind[sorted(dead)] = False    T.append(numpy.full(nnzind.sum(), matrix.shape[1]) - entries[nnzind])    W.append(matrix[nnzind, -1])    T = numpy.concatenate(T)    E = numpy.ones(len(T), bool)    E[-nnzind.sum():] = 0    W = numpy.concatenate(W)    if T.size == 0:        return None    kmf = KaplanMeierFitter().fit(T, E, weights=W)    return kmfdef print_survival_function(kmf, sampling):    sf = kmf.survival_function_    sf.index = [timedelta(days=d) for d in sf.index * sampling]    sf.columns = ["Ratio of survived lines"]    try:        print(sf[len(sf) // 6::len(sf) // 6].append(sf.tail(1)))    except ValueError:        passdef interpolate_burndown_matrix(matrix, granularity, sampling):    daily = numpy.zeros(        (matrix.shape[0] * granularity, matrix.shape[1] * sampling),        dtype=numpy.float32)    """    ----------> samples, x    |    |    |    ⌄    bands, y    """    for y in range(matrix.shape[0]):        for x in range(matrix.shape[1]):            if y * granularity > (x + 1) * sampling:                # the future is zeros                continue            def decay(start_index: int, start_val: float):                if start_val == 0:                    return                k = matrix[y][x] / start_val  # <= 1                scale = (x + 1) * sampling - start_index                for i in range(y * granularity, (y + 1) * granularity):                    initial = daily[i][start_index - 1]                    for j in range(start_index, (x + 1) * sampling):                        daily[i][j] = initial * (                            1 + (k - 1) * (j - start_index + 1) / scale)            def grow(finish_index: int, finish_val: float):                initial = matrix[y][x - 1] if x > 0 else 0                start_index = x * sampling                if start_index < y * granularity:                    start_index = y * granularity                if finish_index == start_index:                    return                avg = (finish_val - initial) / (finish_index - start_index)                for j in range(x * sampling, finish_index):                    for i in range(start_index, j + 1):                        daily[i][j] = avg                # copy [x*g..y*s)                for j in range(x * sampling, finish_index):                    for i in range(y * granularity, x * sampling):                        daily[i][j] = daily[i][j - 1]            if (y + 1) * granularity >= (x + 1) * sampling:                # x*granularity <= (y+1)*sampling                # 1. x*granularity <= y*sampling                #    y*sampling..(y+1)sampling                #                #       x+1                #        /                #       /                #      / y+1  -|                #     /        |                #    / y      -|                #   /                #  / x                #                # 2. x*granularity > y*sampling                #    x*granularity..(y+1)sampling                #                #       x+1                #        /                #       /                #      / y+1  -|                #     /        |                #    / x      -|                #   /                #  / y                if y * granularity <= x * sampling:                    grow((x + 1) * sampling, matrix[y][x])                elif (x + 1) * sampling > y * granularity:                    grow((x + 1) * sampling, matrix[y][x])                    avg = matrix[y][x] / ((x + 1) * sampling - y * granularity)                    for j in range(y * granularity, (x + 1) * sampling):                        for i in range(y * granularity, j + 1):                            daily[i][j] = avg            elif (y + 1) * granularity >= x * sampling:                # y*sampling <= (x+1)*granularity < (y+1)sampling                # y*sampling..(x+1)*granularity                # (x+1)*granularity..(y+1)sampling                #        x+1                #         /\                #        /  \                #       /    \                #      /    y+1                #     /                #    y                v1 = matrix[y][x - 1]                v2 = matrix[y][x]                delta = (y + 1) * granularity - x * sampling                previous = 0                if x > 0 and (x - 1) * sampling >= y * granularity:                    # x*g <= (y-1)*s <= y*s <= (x+1)*g <= (y+1)*s                    #           |________|.......^                    if x > 1:                        previous = matrix[y][x - 2]                    scale = sampling                else:                    # (y-1)*s < x*g <= y*s <= (x+1)*g <= (y+1)*s                    #            |______|.......^                    scale = sampling if x == 0 else x * sampling - y * granularity                peak = v1 + (v1 - previous) / scale * delta                if v2 > peak:                    # we need to adjust the peak, it may not be less than the decayed value                    if x < matrix.shape[1] - 1:                        # y*s <= (x+1)*g <= (y+1)*s < (y+2)*s                        #           ^.........|_________|                        k = (v2 - matrix[y][x + 1]) / sampling  # > 0                        peak = matrix[y][x] + k * ((x + 1) * sampling - (y + 1) * granularity)                        # peak > v2 > v1                    else:                        peak = v2                        # not enough data to interpolate; this is at least not restricted                grow((y + 1) * granularity, peak)                decay((y + 1) * granularity, peak)            else:                # (x+1)*granularity < y*sampling                # y*sampling..(y+1)sampling                decay(x * sampling, matrix[y][x - 1])    return dailydef import_pandas():    import pandas    try:        from pandas.plotting import register_matplotlib_converters        register_matplotlib_converters()    except ImportError:        pass    return pandasdef load_burndown(header, name, matrix, resample, report_survival=True):    pandas = import_pandas()    start, last, sampling, granularity = header    assert sampling > 0    assert granularity > 0    start = datetime.fromtimestamp(start)    last = datetime.fromtimestamp(last)    if report_survival:        kmf = fit_kaplan_meier(matrix)        if kmf is not None:            print_survival_function(kmf, sampling)    finish = start + timedelta(days=matrix.shape[1] * sampling)    if resample not in ("no", "raw"):        print("resampling to %s, please wait..." % resample)        # Interpolate the day x day matrix.        # Each day brings equal weight in the granularity.        # Sampling's interpolation is linear.        daily = interpolate_burndown_matrix(matrix, granularity, sampling)        daily[(last - start).days:] = 0        # Resample the bands        aliases = {            "year": "A",            "month": "M"        }        resample = aliases.get(resample, resample)        periods = 0        date_granularity_sampling = [start]        while date_granularity_sampling[-1] < finish:            periods += 1            date_granularity_sampling = pandas.date_range(                start, periods=periods, freq=resample)        if date_granularity_sampling[0] > finish:            if resample == "A":                print("too loose resampling - by year, trying by month")                return load_burndown(header, name, matrix, "month", report_survival=False)            else:                raise ValueError("Too loose resampling: %s. Try finer." % resample)        date_range_sampling = pandas.date_range(            date_granularity_sampling[0],            periods=(finish - date_granularity_sampling[0]).days,            freq="1D")        # Fill the new square matrix        matrix = numpy.zeros(            (len(date_granularity_sampling), len(date_range_sampling)),            dtype=numpy.float32)        for i, gdt in enumerate(date_granularity_sampling):            istart = (date_granularity_sampling[i - 1] - start).days \                if i > 0 else 0            ifinish = (gdt - start).days            for j, sdt in enumerate(date_range_sampling):                if (sdt - start).days >= istart:                    break            matrix[i, j:] = \                daily[istart:ifinish, (sdt - start).days:].sum(axis=0)        # Hardcode some cases to improve labels' readability        if resample in ("year", "A"):            labels = [dt.year for dt in date_granularity_sampling]        elif resample in ("month", "M"):            labels = [dt.strftime("%Y %B") for dt in date_granularity_sampling]        else:            labels = [dt.date() for dt in date_granularity_sampling]    else:        labels = [            "%s - %s" % ((start + timedelta(days=i * granularity)).date(),                         (                         start + timedelta(days=(i + 1) * granularity)).date())            for i in range(matrix.shape[0])]        if len(labels) > 18:            warnings.warn("Too many labels - consider resampling.")        resample = "M"  # fake resampling type is checked while plotting        date_range_sampling = pandas.date_range(            start + timedelta(days=sampling), periods=matrix.shape[1],            freq="%dD" % sampling)    return name, matrix, date_range_sampling, labels, granularity, sampling, resampledef load_ownership(header, sequence, contents, max_people):    pandas = import_pandas()    start, last, sampling, _ = header    start = datetime.fromtimestamp(start)    last = datetime.fromtimestamp(last)    people = []    for name in sequence:        people.append(contents[name].sum(axis=1))    people = numpy.array(people)    date_range_sampling = pandas.date_range(        start + timedelta(days=sampling), periods=people[0].shape[0],        freq="%dD" % sampling)    if people.shape[0] > max_people:        order = numpy.argsort(-people.sum(axis=1))        chosen_people = people[order[:max_people + 1]]        chosen_people[max_people] = people[order[max_people:]].sum(axis=0)        people = chosen_people        sequence = [sequence[i] for i in order[:max_people]] + ["others"]        print("Warning: truncated people to the most owning %d" % max_people)    for i, name in enumerate(sequence):        if len(name) > 40:            sequence[i] = name[:37] + "..."    return sequence, people, date_range_sampling, lastdef load_churn_matrix(people, matrix, max_people):    matrix = matrix.astype(float)    if matrix.shape[0] > max_people:        order = numpy.argsort(-matrix[:, 0])        matrix = matrix[order[:max_people]][:, [0, 1] + list(2 + order[:max_people])]        people = [people[i] for i in order[:max_people]]        print("Warning: truncated people to most productive %d" % max_people)    zeros = matrix[:, 0] == 0    matrix[zeros, :] = 1    matrix /= matrix[:, 0][:, None]    matrix = -matrix[:, 1:]    matrix[zeros, :] = 0    for i, name in enumerate(people):        if len(name) > 40:            people[i] = name[:37] + "..."    return people, matrixdef import_pyplot(backend, style):    import matplotlib    if backend:        matplotlib.use(backend)    from matplotlib import pyplot    pyplot.style.use(style)    print("matplotlib: backend is", matplotlib.get_backend())    return matplotlib, pyplotdef apply_plot_style(figure, axes, legend, background, font_size, axes_size):    foreground = "black" if background == "white" else "white"    if axes_size is None:        axes_size = (16, 12)    else:        axes_size = tuple(float(p) for p in axes_size.split(","))    figure.set_size_inches(*axes_size)    for side in ("bottom", "top", "left", "right"):        axes.spines[side].set_color(foreground)    for axis in (axes.xaxis, axes.yaxis):        axis.label.update(dict(fontsize=font_size, color=foreground))    for axis in ("x", "y"):        getattr(axes, axis + "axis").get_offset_text().set_size(font_size)        axes.tick_params(axis=axis, colors=foreground, labelsize=font_size)    try:        axes.ticklabel_format(axis="y", style="sci", scilimits=(0, 3))    except AttributeError:        pass    figure.patch.set_facecolor(background)    axes.set_facecolor(background)    if legend is not None:        frame = legend.get_frame()        for setter in (frame.set_facecolor, frame.set_edgecolor):            setter(background)        for text in legend.get_texts():            text.set_color(foreground)def get_plot_path(base, name):    root, ext = os.path.splitext(base)    if not ext:        ext = ".png"    output = os.path.join(root, name + ext)    os.makedirs(os.path.dirname(output), exist_ok=True)    return outputdef deploy_plot(title, output, background, tight=True):    import matplotlib.pyplot as pyplot    if not output:        pyplot.gcf().canvas.set_window_title(title)        pyplot.show()    else:        if title:            pyplot.title(title, color="black" if background == "white" else "white")        if tight:            try:                pyplot.tight_layout()            except:  # noqa: E722                print("Warning: failed to set the tight layout")        pyplot.savefig(output, transparent=True)    pyplot.clf()def default_json(x):    if hasattr(x, "tolist"):        return x.tolist()    if hasattr(x, "isoformat"):        return x.isoformat()    return xdef parse_date(text, default):    if not text:        return default    from dateutil.parser import parse    return parse(text)def plot_burndown(args, target, name, matrix, date_range_sampling, labels, granularity,                  sampling, resample):    if args.output and args.output.endswith(".json"):        data = locals().copy()        del data["args"]        data["type"] = "burndown"        if args.mode == "project" and target == "project":            output = args.output        else:            if target == "project":                name = "project"            output = get_plot_path(args.output, name)        with open(output, "w") as fout:            json.dump(data, fout, sort_keys=True, default=default_json)        return    matplotlib, pyplot = import_pyplot(args.backend, args.style)    pyplot.stackplot(date_range_sampling, matrix, labels=labels)    if args.relative:        for i in range(matrix.shape[1]):            matrix[:, i] /= matrix[:, i].sum()        pyplot.ylim(0, 1)        legend_loc = 3    else:        legend_loc = 2    legend = pyplot.legend(loc=legend_loc, fontsize=args.font_size)    pyplot.ylabel("Lines of code")    pyplot.xlabel("Time")    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.background,                     args.font_size, args.size)    pyplot.xlim(parse_date(args.start_date, date_range_sampling[0]),                parse_date(args.end_date, date_range_sampling[-1]))    locator = pyplot.gca().xaxis.get_major_locator()    # set the optimal xticks locator    if "M" not in resample:        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())    locs = pyplot.gca().get_xticks().tolist()    if len(locs) >= 16:        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())        locs = pyplot.gca().get_xticks().tolist()        if len(locs) >= 16:            pyplot.gca().xaxis.set_major_locator(locator)    if locs[0] < pyplot.xlim()[0]:        del locs[0]    endindex = -1    if len(locs) >= 2 and pyplot.xlim()[1] - locs[-1] > (locs[-1] - locs[-2]) / 2:        locs.append(pyplot.xlim()[1])        endindex = len(locs) - 1    startindex = -1    if len(locs) >= 2 and locs[0] - pyplot.xlim()[0] > (locs[1] - locs[0]) / 2:        locs.append(pyplot.xlim()[0])        startindex = len(locs) - 1    pyplot.gca().set_xticks(locs)    # hacking time!    labels = pyplot.gca().get_xticklabels()    if startindex >= 0:        labels[startindex].set_text(date_range_sampling[0].date())        labels[startindex].set_text = lambda _: None        labels[startindex].set_rotation(30)        labels[startindex].set_ha("right")    if endindex >= 0:        labels[endindex].set_text(date_range_sampling[-1].date())        labels[endindex].set_text = lambda _: None        labels[endindex].set_rotation(30)        labels[endindex].set_ha("right")    title = "%s %d x %d (granularity %d, sampling %d)" % \        ((name,) + matrix.shape + (granularity, sampling))    output = args.output    if output:        if args.mode == "project" and target == "project":            output = args.output        else:            if target == "project":                name = "project"            output = get_plot_path(args.output, name)    deploy_plot(title, output, args.background)def plot_many_burndown(args, target, header, parts):    if not args.output:        print("Warning: output not set, showing %d plots." % len(parts))    itercnt = progress.bar(parts, expected_size=len(parts)) \        if progress is not None else parts    stdout = io.StringIO()    for name, matrix in itercnt:        backup = sys.stdout        sys.stdout = stdout        plot_burndown(args, target, *load_burndown(header, name, matrix, args.resample))        sys.stdout = backup    sys.stdout.write(stdout.getvalue())def plot_churn_matrix(args, repo, people, matrix):    if args.output and args.output.endswith(".json"):        data = locals().copy()        del data["args"]        data["type"] = "churn_matrix"        if args.mode == "all":            output = get_plot_path(args.output, "matrix")        else:            output = args.output        with open(output, "w") as fout:            json.dump(data, fout, sort_keys=True, default=default_json)        return    matplotlib, pyplot = import_pyplot(args.backend, args.style)    s = 4 + matrix.shape[1] * 0.3    fig = pyplot.figure(figsize=(s, s))    ax = fig.add_subplot(111)    ax.xaxis.set_label_position("top")    ax.matshow(matrix, cmap=pyplot.cm.OrRd)    ax.set_xticks(numpy.arange(0, matrix.shape[1]))    ax.set_yticks(numpy.arange(0, matrix.shape[0]))    ax.set_yticklabels(people, va="center")    ax.set_xticks(numpy.arange(0.5, matrix.shape[1] + 0.5), minor=True)    ax.set_xticklabels(["Unidentified"] + people, rotation=45, ha="left",                       va="bottom", rotation_mode="anchor")    ax.set_yticks(numpy.arange(0.5, matrix.shape[0] + 0.5), minor=True)    ax.grid(False)    ax.grid(which="minor")    apply_plot_style(fig, ax, None, args.background, args.font_size, args.size)    if not args.output:        pos1 = ax.get_position()        pos2 = (pos1.x0 + 0.15, pos1.y0 - 0.1, pos1.width * 0.9, pos1.height * 0.9)        ax.set_position(pos2)    if args.mode == "all" and args.output:        output = get_plot_path(args.output, "matrix")    else:        output = args.output    title = "%s %d developers overwrite" % (repo, matrix.shape[0])    if args.output:        # FIXME(vmarkovtsev): otherwise the title is screwed in savefig()        title = ""    deploy_plot(title, output, args.background)def plot_ownership(args, repo, names, people, date_range, last):    if args.output and args.output.endswith(".json"):        data = locals().copy()        del data["args"]        data["type"] = "ownership"        if args.mode == "all" and args.output:            output = get_plot_path(args.output, "people")        else:            output = args.output        with open(output, "w") as fout:            json.dump(data, fout, sort_keys=True, default=default_json)        return    matplotlib, pyplot = import_pyplot(args.backend, args.style)    polys = pyplot.stackplot(date_range, people, labels=names)    if names[-1] == "others":        polys[-1].set_hatch("/")    pyplot.xlim(parse_date(args.start_date, date_range[0]), parse_date(args.end_date, last))    if args.relative:        for i in range(people.shape[1]):            people[:, i] /= people[:, i].sum()        pyplot.ylim(0, 1)        legend_loc = 3    else:        legend_loc = 2    ncol = 1 if len(names) < 15 else 2    legend = pyplot.legend(loc=legend_loc, fontsize=args.font_size, ncol=ncol)    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.background,                     args.font_size, args.size)    if args.mode == "all" and args.output:        output = get_plot_path(args.output, "people")    else:        output = args.output    deploy_plot("%s code ownership through time" % repo, output, args.background)IDEAL_SHARD_SIZE = 4096def train_embeddings(index, matrix, tmpdir, shard_size=IDEAL_SHARD_SIZE):    try:        from . import swivel    except (SystemError, ImportError):        import swivel    import tensorflow as tf    assert matrix.shape[0] == matrix.shape[1]    assert len(index) <= matrix.shape[0]    outlier_threshold = numpy.percentile(matrix.data, 99)    matrix.data[matrix.data > outlier_threshold] = outlier_threshold    nshards = len(index) // shard_size    if nshards * shard_size < len(index):        nshards += 1        shard_size = len(index) // nshards        nshards = len(index) // shard_size    remainder = len(index) - nshards * shard_size    if remainder > 0:        lengths = matrix.indptr[1:] - matrix.indptr[:-1]        filtered = sorted(numpy.argsort(lengths)[remainder:])    else:        filtered = list(range(len(index)))    if len(filtered) < matrix.shape[0]:        print("Truncating the sparse matrix...")        matrix = matrix[filtered, :][:, filtered]    meta_index = []    for i, j in enumerate(filtered):        meta_index.append((index[j], matrix[i, i]))    index = [mi[0] for mi in meta_index]    with tempfile.TemporaryDirectory(prefix="hercules_labours_", dir=tmpdir or None) as tmproot:        print("Writing Swivel metadata...")        vocabulary = "\n".join(index)        with open(os.path.join(tmproot, "row_vocab.txt"), "w") as out:            out.write(vocabulary)        with open(os.path.join(tmproot, "col_vocab.txt"), "w") as out:            out.write(vocabulary)        del vocabulary        bool_sums = matrix.indptr[1:] - matrix.indptr[:-1]        bool_sums_str = "\n".join(map(str, bool_sums.tolist()))        with open(os.path.join(tmproot, "row_sums.txt"), "w") as out:            out.write(bool_sums_str)        with open(os.path.join(tmproot, "col_sums.txt"), "w") as out:            out.write(bool_sums_str)        del bool_sums_str        reorder = numpy.argsort(-bool_sums)        print("Writing Swivel shards...")        for row in range(nshards):            for col in range(nshards):                def _int64s(xs):                    return tf.train.Feature(                        int64_list=tf.train.Int64List(value=list(xs)))                def _floats(xs):                    return tf.train.Feature(                        float_list=tf.train.FloatList(value=list(xs)))                indices_row = reorder[row::nshards]                indices_col = reorder[col::nshards]                shard = matrix[indices_row][:, indices_col].tocoo()                example = tf.train.Example(features=tf.train.Features(feature={                    "global_row": _int64s(indices_row),                    "global_col": _int64s(indices_col),                    "sparse_local_row": _int64s(shard.row),                    "sparse_local_col": _int64s(shard.col),                    "sparse_value": _floats(shard.data)}))                with open(os.path.join(tmproot, "shard-%03d-%03d.pb" % (row, col)), "wb") as out:                    out.write(example.SerializeToString())        print("Training Swivel model...")        swivel.FLAGS.submatrix_rows = shard_size        swivel.FLAGS.submatrix_cols = shard_size        if len(meta_index) <= IDEAL_SHARD_SIZE / 16:            embedding_size = 50            num_epochs = 100000        elif len(meta_index) <= IDEAL_SHARD_SIZE:            embedding_size = 50            num_epochs = 50000        elif len(meta_index) <= IDEAL_SHARD_SIZE * 2:            embedding_size = 60            num_epochs = 10000        elif len(meta_index) <= IDEAL_SHARD_SIZE * 4:            embedding_size = 70            num_epochs = 8000        elif len(meta_index) <= IDEAL_SHARD_SIZE * 10:            embedding_size = 80            num_epochs = 5000        elif len(meta_index) <= IDEAL_SHARD_SIZE * 25:            embedding_size = 100            num_epochs = 1000        elif len(meta_index) <= IDEAL_SHARD_SIZE * 100:            embedding_size = 200            num_epochs = 600        else:            embedding_size = 300            num_epochs = 300        if os.getenv("CI"):            # Travis, AppVeyor etc. during the integration tests            num_epochs /= 10        swivel.FLAGS.embedding_size = embedding_size        swivel.FLAGS.input_base_path = tmproot        swivel.FLAGS.output_base_path = tmproot        swivel.FLAGS.loss_multiplier = 1.0 / shard_size        swivel.FLAGS.num_epochs = num_epochs        # Tensorflow 1.5 parses sys.argv unconditionally *applause*        argv_backup = sys.argv[1:]        del sys.argv[1:]        swivel.main(None)        sys.argv.extend(argv_backup)        print("Reading Swivel embeddings...")        embeddings = []        with open(os.path.join(tmproot, "row_embedding.tsv")) as frow:            with open(os.path.join(tmproot, "col_embedding.tsv")) as fcol:                for i, (lrow, lcol) in enumerate(zip(frow, fcol)):                    prow, pcol = (l.split("\t", 1) for l in (lrow, lcol))                    assert prow[0] == pcol[0]                    erow, ecol = \                        (numpy.fromstring(p[1], dtype=numpy.float32, sep="\t")                         for p in (prow, pcol))                    embeddings.append((erow + ecol) / 2)    return meta_index, embeddingsclass CORSWebServer(object):    def __init__(self):        self.thread = threading.Thread(target=self.serve)        self.server = None    def serve(self):        outer = self        try:            from http.server import HTTPServer, SimpleHTTPRequestHandler, test        except ImportError:  # Python 2            from BaseHTTPServer import HTTPServer, test            from SimpleHTTPServer import SimpleHTTPRequestHandler        class ClojureServer(HTTPServer):            def __init__(self, *args, **kwargs):                HTTPServer.__init__(self, *args, **kwargs)                outer.server = self        class CORSRequestHandler(SimpleHTTPRequestHandler):            def end_headers(self):                self.send_header("Access-Control-Allow-Origin", "*")                SimpleHTTPRequestHandler.end_headers(self)        test(CORSRequestHandler, ClojureServer)    def start(self):        self.thread.start()    def stop(self):        if self.running:            self.server.shutdown()            self.thread.join()    @property    def running(self):        return self.server is not Noneweb_server = CORSWebServer()def write_embeddings(name, output, run_server, index, embeddings):    print("Writing Tensorflow Projector files...")    if not output:        output = "couples"    if output.endswith(".json"):        output = os.path.join(output[:-5], "couples")        run_server = False    metaf = "%s_%s_meta.tsv" % (output, name)    with open(metaf, "w") as fout:        fout.write("name\tcommits\n")        for pair in index:            fout.write("%s\t%s\n" % pair)    print("Wrote", metaf)    dataf = "%s_%s_data.tsv" % (output, name)    with open(dataf, "w") as fout:        for vec in embeddings:            fout.write("\t".join(str(v) for v in vec))            fout.write("\n")    print("Wrote", dataf)    jsonf = "%s_%s.json" % (output, name)    with open(jsonf, "w") as fout:        fout.write("""{  "embeddings": [    {      "tensorName": "%s %s coupling",      "tensorShape": [%s, %s],      "tensorPath": "http://0.0.0.0:8000/%s",      "metadataPath": "http://0.0.0.0:8000/%s"    }  ]}""" % (output, name, len(embeddings), len(embeddings[0]), dataf, metaf))    print("Wrote %s" % jsonf)    if run_server and not web_server.running:        web_server.start()    url = "http://projector.tensorflow.org/?config=http://0.0.0.0:8000/" + jsonf    print(url)    if run_server:        if shutil.which("xdg-open") is not None:            os.system("xdg-open " + url)        else:            browser = os.getenv("BROWSER", "")            if browser:                os.system(browser + " " + url)            else:                print("\t" + url)def show_shotness_stats(data):    top = sorted(((r.counters[i], i) for i, r in enumerate(data)), reverse=True)    for count, i in top:        r = data[i]        print("%8d  %s:%s [%s]" % (count, r.file, r.name, r.internal_role))def show_sentiment_stats(args, name, resample, start_date, data):    from scipy.signal import convolve, slepian    matplotlib, pyplot = import_pyplot(args.backend, args.style)    start_date = datetime.fromtimestamp(start_date)    data = sorted(data.items())    mood = numpy.zeros(data[-1][0] + 1, dtype=numpy.float32)    timeline = numpy.array([start_date + timedelta(days=i) for i in range(mood.shape[0])])    for d, val in data:        mood[d] = (0.5 - val.Value) * 2    resolution = 32    window = slepian(len(timeline) // resolution, 0.5)    window /= window.sum()    mood_smooth = convolve(mood, window, "same")    pos = mood_smooth.copy()    pos[pos < 0] = 0    neg = mood_smooth.copy()    neg[neg >= 0] = 0    resolution = 4    window = numpy.ones(len(timeline) // resolution)    window /= window.sum()    avg = convolve(mood, window, "same")    pyplot.fill_between(timeline, pos, color="#8DB843", label="Positive")    pyplot.fill_between(timeline, neg, color="#E14C35", label="Negative")    pyplot.plot(timeline, avg, color="grey", label="Average", linewidth=5)    legend = pyplot.legend(loc=1, fontsize=args.font_size)    pyplot.ylabel("Comment sentiment")    pyplot.xlabel("Time")    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.background,                     args.font_size, args.size)    pyplot.xlim(parse_date(args.start_date, timeline[0]), parse_date(args.end_date, timeline[-1]))    locator = pyplot.gca().xaxis.get_major_locator()    # set the optimal xticks locator    if "M" not in resample:        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())    locs = pyplot.gca().get_xticks().tolist()    if len(locs) >= 16:        pyplot.gca().xaxis.set_major_locator(matplotlib.dates.YearLocator())        locs = pyplot.gca().get_xticks().tolist()        if len(locs) >= 16:            pyplot.gca().xaxis.set_major_locator(locator)    if locs[0] < pyplot.xlim()[0]:        del locs[0]    endindex = -1    if len(locs) >= 2 and pyplot.xlim()[1] - locs[-1] > (locs[-1] - locs[-2]) / 2:        locs.append(pyplot.xlim()[1])        endindex = len(locs) - 1    startindex = -1    if len(locs) >= 2 and locs[0] - pyplot.xlim()[0] > (locs[1] - locs[0]) / 2:        locs.append(pyplot.xlim()[0])        startindex = len(locs) - 1    pyplot.gca().set_xticks(locs)    # hacking time!    labels = pyplot.gca().get_xticklabels()    if startindex >= 0:        labels[startindex].set_text(timeline[0].date())        labels[startindex].set_text = lambda _: None        labels[startindex].set_rotation(30)        labels[startindex].set_ha("right")    if endindex >= 0:        labels[endindex].set_text(timeline[-1].date())        labels[endindex].set_text = lambda _: None        labels[endindex].set_rotation(30)        labels[endindex].set_ha("right")    overall_pos = sum(2 * (0.5 - d[1].Value) for d in data if d[1].Value < 0.5)    overall_neg = sum(2 * (d[1].Value - 0.5) for d in data if d[1].Value > 0.5)    title = "%s sentiment +%.1f -%.1f δ=%.1f" % (        name, overall_pos, overall_neg, overall_pos - overall_neg)    deploy_plot(title, args.output, args.background)def show_devs(args, name, start_date, end_date, people, days):    from scipy.signal import convolve, slepian    max_people = 50    if len(people) > max_people:        print("Picking top 100 developers by commit count")        # pick top N developers by commit count        commits = defaultdict(int)        for devs in days.values():            for dev, stats in devs.items():                commits[dev] += stats.Commits        commits = sorted(((v, k) for k, v in commits.items()), reverse=True)        chosen_people = {people[k] for _, k in commits[:max_people]}    else:        chosen_people = set(people)    dists, devseries, devstats, route = order_commits(chosen_people, days, people)    route_map = {v: i for i, v in enumerate(route)}    # determine clusters    clusters = hdbscan_cluster_routed_series(dists, route)    keys = list(devseries.keys())    route = [keys[node] for node in route]    print("Plotting")    # smooth time series    start_date = datetime.fromtimestamp(start_date)    start_date = datetime(start_date.year, start_date.month, start_date.day)    end_date = datetime.fromtimestamp(end_date)    end_date = datetime(end_date.year, end_date.month, end_date.day)    size = (end_date - start_date).days + 1    plot_x = [start_date + timedelta(days=i) for i in range(size)]    resolution = 64    window = slepian(size // resolution, 0.5)    final = numpy.zeros((len(devseries), size), dtype=numpy.float32)    for i, s in enumerate(devseries.values()):        arr = numpy.array(s).transpose()        full_history = numpy.zeros(size, dtype=numpy.float32)        mask = arr[0] < size        full_history[arr[0][mask]] = arr[1][mask]        final[route_map[i]] = convolve(full_history, window, "same")    matplotlib, pyplot = import_pyplot(args.backend, args.style)    pyplot.rcParams["figure.figsize"] = (32, 16)    prop_cycle = pyplot.rcParams["axes.prop_cycle"]    colors = prop_cycle.by_key()["color"]    fig, axes = pyplot.subplots(final.shape[0], 1)    backgrounds = ("#C4FFDB", "#FFD0CD") if args.background == "white" else ("#05401C", "#40110E")    max_cluster = numpy.max(clusters)    for ax, series, cluster, dev_i in zip(axes, final, clusters, route):        if cluster >= 0:            color = colors[cluster % len(colors)]            i = 1            while color == "#777777":                color = colors[(max_cluster + i) % len(colors)]                i += 1        else:            # outlier            color = "#777777"        ax.fill_between(plot_x, series, color=color)        ax.set_axis_off()        author = people[dev_i]        ax.text(0.03, 0.5, author[:36] + (author[36:] and "..."),                horizontalalignment="right", verticalalignment="center",                transform=ax.transAxes, fontsize=14,                color="black" if args.background == "white" else "white")        ds = devstats[dev_i]        stats = "%5d %8s %8s" % (ds[0], _format_number(ds[1] - ds[2]), _format_number(ds[3]))        ax.text(0.97, 0.5, stats,                horizontalalignment="left", verticalalignment="center",                transform=ax.transAxes, fontsize=14, family="monospace",                backgroundcolor=backgrounds[ds[1] <= ds[2]],                color="black" if args.background == "white" else "white")    axes[0].text(0.97, 1.75, " cmts    delta  changed",                 horizontalalignment="left", verticalalignment="center",                 transform=axes[0].transAxes, fontsize=14, family="monospace",                 color="black" if args.background == "white" else "white")    axes[-1].set_axis_on()    target_num_labels = 12    num_months = (end_date.year - start_date.year) * 12 + end_date.month - start_date.month    interval = int(numpy.ceil(num_months / target_num_labels))    if interval >= 8:        interval = int(numpy.ceil(num_months / (12 * target_num_labels)))        axes[-1].xaxis.set_major_locator(matplotlib.dates.YearLocator(base=max(1, interval // 12)))        axes[-1].xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y"))    else:        axes[-1].xaxis.set_major_locator(matplotlib.dates.MonthLocator(interval=interval))        axes[-1].xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y-%m"))    for tick in axes[-1].xaxis.get_major_ticks():        tick.label.set_fontsize(args.font_size)    axes[-1].spines["left"].set_visible(False)    axes[-1].spines["right"].set_visible(False)    axes[-1].spines["top"].set_visible(False)    axes[-1].get_yaxis().set_visible(False)    axes[-1].set_facecolor((1.0,) * 3 + (0.0,))    title = ("%s commits" % name) if not args.output else ""    deploy_plot(title, args.output, args.background)def order_commits(chosen_people, days, people):    from seriate import seriate    try:        from fastdtw import fastdtw    except ImportError as e:        print("Cannot import fastdtw: %s\nInstall it from https://github.com/slaypni/fastdtw" % e)        sys.exit(1)    # FIXME(vmarkovtsev): remove once https://github.com/slaypni/fastdtw/pull/28 is merged&released    try:        sys.modules["fastdtw.fastdtw"].__norm = lambda p: lambda a, b: numpy.linalg.norm(            numpy.atleast_1d(a) - numpy.atleast_1d(b), p)    except KeyError:        # the native extension does not have this bug        pass    devseries = defaultdict(list)    devstats = defaultdict(lambda: DevDay(0, 0, 0, 0, {}))    for day, devs in sorted(days.items()):        for dev, stats in devs.items():            if people[dev] in chosen_people:                devseries[dev].append((day, stats.Commits))                devstats[dev] = devstats[dev].add(stats)    print("Calculating the distance matrix")    # max-normalize the time series using a sliding window    series = list(devseries.values())    for i, s in enumerate(series):        arr = numpy.array(s).transpose().astype(numpy.float32)        commits = arr[1]        if len(commits) < 7:            commits /= commits.max()        else:            # 4 is sizeof(float32)            windows = numpy.lib.stride_tricks.as_strided(commits, [len(commits) - 6, 7], [4, 4])            commits = numpy.concatenate((                [windows[0, 0] / windows[0].max(),                 windows[0, 1] / windows[0].max(),                 windows[0, 2] / windows[0].max()],                windows[:, 3] / windows.max(axis=1),                [windows[-1, 4] / windows[-1].max(),                 windows[-1, 5] / windows[-1].max(),                 windows[-1, 6] / windows[-1].max()]            ))        arr[1] = commits * 7  # 7 is a pure heuristic here and is not related to the window size        series[i] = arr.transpose()    # calculate the distance matrix using dynamic time warping metric    dists = numpy.full((len(series),) * 2, -100500, dtype=numpy.float32)    for x, serx in enumerate(series):        dists[x, x] = 0        for y, sery in enumerate(series[x + 1:], start=x + 1):            min_day = int(min(serx[0][0], sery[0][0]))            max_day = int(max(serx[-1][0], sery[-1][0]))            arrx = numpy.zeros(max_day - min_day + 1, dtype=numpy.float32)            arry = numpy.zeros_like(arrx)            arrx[serx[:, 0].astype(int) - min_day] = serx[:, 1]            arry[sery[:, 0].astype(int) - min_day] = sery[:, 1]            # L1 norm            dist, _ = fastdtw(arrx, arry, radius=5, dist=1)            dists[x, y] = dists[y, x] = dist    print("Ordering the series")    route = seriate(dists)    return dists, devseries, devstats, routedef hdbscan_cluster_routed_series(dists, route):    try:        from hdbscan import HDBSCAN    except ImportError as e:        print("Cannot import ortools: %s\nInstall it from "              "https://developers.google.com/optimization/install/python/" % e)        sys.exit(1)    opt_dist_chain = numpy.cumsum(numpy.array(        [0] + [dists[route[i], route[i + 1]] for i in range(len(route) - 1)]))    clusters = HDBSCAN(min_cluster_size=2).fit_predict(opt_dist_chain[:, numpy.newaxis])    return clustersdef show_devs_efforts(args, name, start_date, end_date, people, days, max_people):    from scipy.signal import convolve, slepian    start_date = datetime.fromtimestamp(start_date)    start_date = datetime(start_date.year, start_date.month, start_date.day)    end_date = datetime.fromtimestamp(end_date)    end_date = datetime(end_date.year, end_date.month, end_date.day)    efforts_by_dev = defaultdict(int)    for day, devs in days.items():        for dev, stats in devs.items():            efforts_by_dev[dev] += stats.Added + stats.Removed + stats.Changed    if len(efforts_by_dev) > max_people:        chosen = {v for k, v in sorted(            ((v, k) for k, v in efforts_by_dev.items()), reverse=True)[:max_people]}        print("Warning: truncated people to the most active %d" % max_people)    else:        chosen = set(efforts_by_dev)    chosen_efforts = sorted(((efforts_by_dev[k], k) for k in chosen), reverse=True)    chosen_order = {k: i for i, (_, k) in enumerate(chosen_efforts)}    efforts = numpy.zeros((len(chosen) + 1, (end_date - start_date).days + 1), dtype=numpy.float32)    for day, devs in days.items():        if day < efforts.shape[1]:            for dev, stats in devs.items():                dev = chosen_order.get(dev, len(chosen_order))                efforts[dev][day] += stats.Added + stats.Removed + stats.Changed    efforts_cum = numpy.cumsum(efforts, axis=1)    window = slepian(10, 0.5)    window /= window.sum()    for e in (efforts, efforts_cum):        for i in range(e.shape[0]):            ending = e[i][-len(window) * 2:].copy()            e[i] = convolve(e[i], window, "same")            e[i][-len(ending):] = ending    matplotlib, pyplot = import_pyplot(args.backend, args.style)    plot_x = [start_date + timedelta(days=i) for i in range(efforts.shape[1])]    people = [people[k] for _, k in chosen_efforts] + ["others"]    for i, name in enumerate(people):        if len(name) > 40:            people[i] = name[:37] + "..."    polys = pyplot.stackplot(plot_x, efforts_cum, labels=people)    if len(polys) == max_people + 1:        polys[-1].set_hatch("/")    polys = pyplot.stackplot(plot_x, -efforts * efforts_cum.max() / efforts.max())    if len(polys) == max_people + 1:        polys[-1].set_hatch("/")    yticks = []    for tick in pyplot.gca().yaxis.iter_ticks():        if tick[1] >= 0:            yticks.append(tick[1])    pyplot.gca().yaxis.set_ticks(yticks)    legend = pyplot.legend(loc=2, ncol=2, fontsize=args.font_size)    apply_plot_style(pyplot.gcf(), pyplot.gca(), legend, args.background,                     args.font_size, args.size or "16,10")    deploy_plot("Efforts through time (changed lines of code)", args.output, args.background)def show_old_vs_new(args, name, start_date, end_date, people, days):    from scipy.signal import convolve, slepian    start_date = datetime.fromtimestamp(start_date)    start_date = datetime(start_date.year, start_date.month, start_date.day)    end_date = datetime.fromtimestamp(end_date)    end_date = datetime(end_date.year, end_date.month, end_date.day)    new_lines = numpy.zeros((end_date - start_date).days + 1)    old_lines = numpy.zeros_like(new_lines)    for day, devs in days.items():        for stats in devs.values():            new_lines[day] += stats.Added            old_lines[day] += stats.Removed + stats.Changed    resolution = 32    window = slepian(len(new_lines) // resolution, 0.5)    new_lines = convolve(new_lines, window, "same")    old_lines = convolve(old_lines, window, "same")    matplotlib, pyplot = import_pyplot(args.backend, args.style)    plot_x = [start_date + timedelta(days=i) for i in range(len(new_lines))]    pyplot.fill_between(plot_x, new_lines, color="#8DB843", label="Changed new lines")    pyplot.fill_between(plot_x, old_lines, color="#E14C35", label="Changed existing lines")    pyplot.legend(loc=2, fontsize=args.font_size)    for tick in chain(pyplot.gca().xaxis.get_major_ticks(), pyplot.gca().yaxis.get_major_ticks()):        tick.label.set_fontsize(args.font_size)    deploy_plot("Additions vs changes", args.output, args.background)def show_languages(args, name, start_date, end_date, people, days):    devlangs = defaultdict(lambda: defaultdict(lambda: numpy.zeros(3, dtype=int)))    for day, devs in days.items():        for dev, stats in devs.items():            for lang, vals in stats.Languages.items():                devlangs[dev][lang] += vals    devlangs = sorted(devlangs.items(), key=lambda p: -sum(x.sum() for x in p[1].values()))    for dev, ls in devlangs:        print()        print("#", people[dev])        ls = sorted(((vals.sum(), lang) for lang, vals in ls.items()), reverse=True)        for vals, lang in ls:            if lang:                print("%s: %d" % (lang, vals))class ParallelDevData:    def __init__(self):        self.commits_rank = -1        self.commits = -1        self.lines_rank = -1        self.lines = -1        self.ownership_rank = -1        self.ownership = -1        self.couples_index = -1        self.couples_cluster = -1        self.commit_coocc_index = -1        self.commit_coocc_cluster = -1    def __str__(self):        return str(self.__dict__)    def __repr__(self):        return str(self)def load_devs_parallel(ownership, couples, devs, max_people):    from seriate import seriate    try:        from hdbscan import HDBSCAN    except ImportError as e:        print("Cannot import ortools: %s\nInstall it from "              "https://developers.google.com/optimization/install/python/" % e)        sys.exit(1)    people, owned = ownership    _, cmatrix = couples    _, days = devs    print("calculating - commits")    commits = defaultdict(int)    for day, devs in days.items():        for dev, stats in devs.items():            commits[people[dev]] += stats.Commits    chosen = [k for v, k in sorted(((v, k) for k, v in commits.items()),                                   reverse=True)[:max_people]]    result = {k: ParallelDevData() for k in chosen}    for k, v in result.items():        v.commits_rank = chosen.index(k)        v.commits = commits[k]    print("calculating - lines")    lines = defaultdict(int)    for day, devs in days.items():        for dev, stats in devs.items():            lines[people[dev]] += stats.Added + stats.Removed + stats.Changed    lines_index = {k: i for i, (_, k) in enumerate(sorted(        ((v, k) for k, v in lines.items() if k in chosen), reverse=True))}    for k, v in result.items():        v.lines_rank = lines_index[k]        v.lines = lines[k]    print("calculating - ownership")    owned_index = {k: i for i, (_, k) in enumerate(sorted(        ((owned[k][-1].sum(), k) for k in chosen), reverse=True))}    for k, v in result.items():        v.ownership_rank = owned_index[k]        v.ownership = owned[k][-1].sum()    print("calculating - couples")    embeddings = numpy.genfromtxt(fname="couples_people_data.tsv", delimiter="\t")[        [people.index(k) for k in chosen]]    embeddings /= numpy.linalg.norm(embeddings, axis=1)[:, None]    cos = embeddings.dot(embeddings.T)    cos[cos > 1] = 1  # tiny precision faults    dists = numpy.arccos(cos)    clusters = HDBSCAN(min_cluster_size=2, metric="precomputed").fit_predict(dists)    for k, v in result.items():        v.couples_cluster = clusters[chosen.index(k)]    couples_order = seriate(dists)    roll_options = []    for i in range(len(couples_order)):        loss = 0        for k, v in result.items():            loss += abs(                v.ownership_rank - (couples_order.index(chosen.index(k)) + i) % len(chosen))        roll_options.append(loss)    best_roll = numpy.argmin(roll_options)    couples_order = list(numpy.roll(couples_order, best_roll))    for k, v in result.items():        v.couples_index = couples_order.index(chosen.index(k))    print("calculating - commit series")    dists, devseries, _, orig_route = order_commits(chosen, days, people)    keys = list(devseries.keys())    route = [keys[node] for node in orig_route]    for roll in range(len(route)):        loss = 0        for k, v in result.items():            i = route.index(people.index(k))            loss += abs(v.couples_index - ((i + roll) % len(route)))        roll_options[roll] = loss    best_roll = numpy.argmin(roll_options)    route = list(numpy.roll(route, best_roll))    orig_route = list(numpy.roll(orig_route, best_roll))    clusters = hdbscan_cluster_routed_series(dists, orig_route)    for k, v in result.items():        v.commit_coocc_index = route.index(people.index(k))        v.commit_coocc_cluster = clusters[v.commit_coocc_index]    return resultdef show_devs_parallel(args, name, start_date, end_date, devs):    matplotlib, pyplot = import_pyplot(args.backend, args.style)    from matplotlib.collections import LineCollection    def solve_equations(x1, y1, x2, y2):        xcube = (x1 - x2) ** 3        a = 2 * (y2 - y1) / xcube        b = 3 * (y1 - y2) * (x1 + x2) / xcube        c = 6 * (y2 - y1) * x1 * x2 / xcube        d = y1 - a * x1 ** 3 - b * x1 ** 2 - c * x1        return a, b, c, d    # biggest = {k: max(getattr(d, k) for d in devs.values())    #            for k in ("commits", "lines", "ownership")}    for k, dev in devs.items():        points = numpy.array([            (1, dev.commits_rank),            (2, dev.lines_rank),            (3, dev.ownership_rank),            (4, dev.couples_index),            (5, dev.commit_coocc_index)],            dtype=float)        points[:, 1] = points[:, 1] / len(devs)        splines = []        for i in range(len(points) - 1):            a, b, c, d = solve_equations(*points[i], *points[i + 1])            x = numpy.linspace(i + 1, i + 2, 100)            smooth_points = numpy.array(                [x, a * x ** 3 + b * x ** 2 + c * x + d]).T.reshape(-1, 1, 2)            splines.append(smooth_points)        points = numpy.concatenate(splines)        segments = numpy.concatenate([points[:-1], points[1:]], axis=1)        lc = LineCollection(segments)        lc.set_array(numpy.linspace(0, 0.1, segments.shape[0]))        pyplot.gca().add_collection(lc)    pyplot.xlim(0, 6)    pyplot.ylim(-0.1, 1.1)    deploy_plot("Developers", args.output, args.background)def _format_number(n):    if n == 0:        return "0"    power = int(numpy.log10(abs(n)))    if power >= 6:        n = n / 1000000        if n >= 10:            n = str(int(n))        else:            n = "%.1f" % n            if n.endswith("0"):                n = n[:-2]        suffix = "M"    elif power >= 3:        n = n / 1000        if n >= 10:            n = str(int(n))        else:            n = "%.1f" % n            if n.endswith("0"):                n = n[:-2]        suffix = "K"    else:        n = str(n)        suffix = ""    return n + suffixdef main():    args = parse_args()    reader = read_input(args)    header = reader.get_header()    name = reader.get_name()    burndown_warning = "Burndown stats were not collected. Re-run hercules with --burndown."    burndown_files_warning = \        "Burndown stats for files were not collected. Re-run hercules with " \        "--burndown --burndown-files."    burndown_people_warning = \        "Burndown stats for people were not collected. Re-run hercules with " \        "--burndown --burndown-people."    couples_warning = "Coupling stats were not collected. Re-run hercules with --couples."    shotness_warning = "Structural hotness stats were not collected. Re-run hercules with " \                       "--shotness. Also check --languages - the output may be empty."    sentiment_warning = "Sentiment stats were not collected. Re-run hercules with --sentiment."    devs_warning = "Devs stats were not collected. Re-run hercules with --devs."    def run_times():        rt = reader.get_run_times()        pandas = import_pandas()        series = pandas.to_timedelta(pandas.Series(rt).sort_values(ascending=False), unit="s")        df = pandas.concat([series, series / series.sum()], axis=1)        df.columns = ["time", "ratio"]        print(df)    def project_burndown():        try:            full_header = header + reader.get_burndown_parameters()        except KeyError:            print("project: " + burndown_warning)            return        plot_burndown(args, "project",                      *load_burndown(full_header, *reader.get_project_burndown(),                                     resample=args.resample))    def files_burndown():        try:            full_header = header + reader.get_burndown_parameters()        except KeyError:            print(burndown_warning)            return        try:            plot_many_burndown(args, "file", full_header, reader.get_files_burndown())        except KeyError:            print("files: " + burndown_files_warning)    def people_burndown():        try:            full_header = header + reader.get_burndown_parameters()        except KeyError:            print(burndown_warning)            return        try:            plot_many_burndown(args, "person", full_header, reader.get_people_burndown())        except KeyError:            print("people: " + burndown_people_warning)    def churn_matrix():        try:            plot_churn_matrix(args, name, *load_churn_matrix(                *reader.get_people_interaction(), max_people=args.max_people))        except KeyError:            print("churn_matrix: " + burndown_people_warning)    def ownership_burndown():        try:            full_header = header + reader.get_burndown_parameters()        except KeyError:            print(burndown_warning)            return        try:            plot_ownership(args, name, *load_ownership(                full_header, *reader.get_ownership_burndown(), max_people=args.max_people))        except KeyError:            print("ownership: " + burndown_people_warning)    def couples_files():        try:            write_embeddings("files", args.output, not args.disable_projector,                             *train_embeddings(*reader.get_files_coocc(),                                               tmpdir=args.couples_tmp_dir))        except KeyError:            print(couples_warning)    def couples_people():        try:            write_embeddings("people", args.output, not args.disable_projector,                             *train_embeddings(*reader.get_people_coocc(),                                               tmpdir=args.couples_tmp_dir))        except KeyError:            print(couples_warning)    def couples_shotness():        try:            write_embeddings("shotness", args.output, not args.disable_projector,                             *train_embeddings(*reader.get_shotness_coocc(),                                               tmpdir=args.couples_tmp_dir))        except KeyError:            print(shotness_warning)    def shotness():        try:            data = reader.get_shotness()        except KeyError:            print(shotness_warning)            return        show_shotness_stats(data)    def sentiment():        try:            data = reader.get_sentiment()        except KeyError:            print(sentiment_warning)            return        show_sentiment_stats(args, reader.get_name(), args.resample, reader.get_header()[0], data)    def devs():        try:            data = reader.get_devs()        except KeyError:            print(devs_warning)            return        show_devs(args, reader.get_name(), *reader.get_header(), *data)    def devs_efforts():        try:            data = reader.get_devs()        except KeyError:            print(devs_warning)            return        show_devs_efforts(args, reader.get_name(), *reader.get_header(), *data,                          max_people=args.max_people)    def old_vs_new():        try:            data = reader.get_devs()        except KeyError:            print(devs_warning)            return        show_old_vs_new(args, reader.get_name(), *reader.get_header(), *data)    def languages():        try:            data = reader.get_devs()        except KeyError:            print(devs_warning)            return        show_languages(args, reader.get_name(), *reader.get_header(), *data)    def devs_parallel():        try:            ownership = reader.get_ownership_burndown()        except KeyError:            print(burndown_people_warning)            return        try:            couples = reader.get_people_coocc()        except KeyError:            print(couples_warning)            return        try:            devs = reader.get_devs()        except KeyError:            print(devs_warning)            return        show_devs_parallel(args, reader.get_name(), *reader.get_header(),                           load_devs_parallel(ownership, couples, devs, args.max_people))    modes = {        "run-times": run_times,        "burndown-project": project_burndown,        "burndown-file": files_burndown,        "burndown-person": people_burndown,        "churn-matrix": churn_matrix,        "ownership": ownership_burndown,        "couples-files": couples_files,        "couples-people": couples_people,        "couples-shotness": couples_shotness,        "shotness": shotness,        "sentiment": sentiment,        "devs": devs,        "devs-efforts": devs_efforts,        "old-vs-new": old_vs_new,        "languages": languages,        "devs-parallel": devs_parallel,    }    try:        modes[args.mode]()    except KeyError:        assert args.mode == "all"        project_burndown()        files_burndown()        people_burndown()        churn_matrix()        ownership_burndown()        couples_files()        couples_people()        couples_shotness()        shotness()        sentiment()        devs()        devs_efforts()        # devs_parallel()    if web_server.running:        secs = int(os.getenv("COUPLES_SERVER_TIME", "60"))        print("Sleeping for %d seconds, safe to Ctrl-C" % secs)        sys.stdout.flush()        try:            time.sleep(secs)        except KeyboardInterrupt:            pass        web_server.stop()if __name__ == "__main__":    sys.exit(main())
 |