123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167 |
- from collections import defaultdict
- import sys
- from typing import Any, Dict, List, Tuple
- import numpy
- from scipy.sparse.csr import csr_matrix
- from labours.modes.devs import hdbscan_cluster_routed_series, order_commits
- from labours.objects import DevDay, ParallelDevData
- from labours.plotting import deploy_plot, import_pyplot
- def load_devs_parallel(
- ownership: Tuple[List[Any], Dict[Any, Any]],
- couples: Tuple[List[str], csr_matrix],
- devs: Tuple[List[str], Dict[int, Dict[int, DevDay]]],
- max_people: int,
- ):
- from seriate import seriate
- try:
- from hdbscan import HDBSCAN
- except ImportError as e:
- print(
- "Cannot import ortools: %s\nInstall it from "
- "https://developers.google.com/optimization/install/python/" % e
- )
- sys.exit(1)
- people, owned = ownership
- _, cmatrix = couples
- _, days = devs
- print("calculating - commits")
- commits = defaultdict(int)
- for day, devs in days.items():
- for dev, stats in devs.items():
- commits[people[dev]] += stats.Commits
- chosen = [
- k
- for v, k in sorted(((v, k) for k, v in commits.items()), reverse=True)[
- :max_people
- ]
- ]
- result = {k: ParallelDevData() for k in chosen}
- for k, v in result.items():
- v.commits_rank = chosen.index(k)
- v.commits = commits[k]
- print("calculating - lines")
- lines = defaultdict(int)
- for day, devs in days.items():
- for dev, stats in devs.items():
- lines[people[dev]] += stats.Added + stats.Removed + stats.Changed
- lines_index = {
- k: i
- for i, (_, k) in enumerate(
- sorted(((v, k) for k, v in lines.items() if k in chosen), reverse=True)
- )
- }
- for k, v in result.items():
- v.lines_rank = lines_index[k]
- v.lines = lines[k]
- print("calculating - ownership")
- owned_index = {
- k: i
- for i, (_, k) in enumerate(
- sorted(((owned[k][-1].sum(), k) for k in chosen), reverse=True)
- )
- }
- for k, v in result.items():
- v.ownership_rank = owned_index[k]
- v.ownership = owned[k][-1].sum()
- print("calculating - couples")
- embeddings = numpy.genfromtxt(fname="couples_people_data.tsv", delimiter="\t")[
- [people.index(k) for k in chosen]
- ]
- embeddings /= numpy.linalg.norm(embeddings, axis=1)[:, None]
- cos = embeddings.dot(embeddings.T)
- cos[cos > 1] = 1 # tiny precision faults
- dists = numpy.arccos(cos)
- clusters = HDBSCAN(min_cluster_size=2, metric="precomputed").fit_predict(dists)
- for k, v in result.items():
- v.couples_cluster = clusters[chosen.index(k)]
- couples_order = seriate(dists)
- roll_options = []
- for i in range(len(couples_order)):
- loss = 0
- for k, v in result.items():
- loss += abs(
- v.ownership_rank
- - (couples_order.index(chosen.index(k)) + i) % len(chosen)
- )
- roll_options.append(loss)
- best_roll = numpy.argmin(roll_options)
- couples_order = list(numpy.roll(couples_order, best_roll))
- for k, v in result.items():
- v.couples_index = couples_order.index(chosen.index(k))
- print("calculating - commit series")
- dists, devseries, _, orig_route = order_commits(chosen, days, people)
- keys = list(devseries.keys())
- route = [keys[node] for node in orig_route]
- for roll in range(len(route)):
- loss = 0
- for k, v in result.items():
- i = route.index(people.index(k))
- loss += abs(v.couples_index - ((i + roll) % len(route)))
- roll_options[roll] = loss
- best_roll = numpy.argmin(roll_options)
- route = list(numpy.roll(route, best_roll))
- orig_route = list(numpy.roll(orig_route, best_roll))
- clusters = hdbscan_cluster_routed_series(dists, orig_route)
- for k, v in result.items():
- v.commit_coocc_index = route.index(people.index(k))
- v.commit_coocc_cluster = clusters[v.commit_coocc_index]
- return result
- def show_devs_parallel(args, name, start_date, end_date, devs):
- matplotlib, pyplot = import_pyplot(args.backend, args.style)
- from matplotlib.collections import LineCollection
- def solve_equations(x1, y1, x2, y2):
- xcube = (x1 - x2) ** 3
- a = 2 * (y2 - y1) / xcube
- b = 3 * (y1 - y2) * (x1 + x2) / xcube
- c = 6 * (y2 - y1) * x1 * x2 / xcube
- d = y1 - a * x1 ** 3 - b * x1 ** 2 - c * x1
- return a, b, c, d
- # biggest = {k: max(getattr(d, k) for d in devs.values())
- # for k in ("commits", "lines", "ownership")}
- for k, dev in devs.items():
- points = numpy.array(
- [
- (1, dev.commits_rank),
- (2, dev.lines_rank),
- (3, dev.ownership_rank),
- (4, dev.couples_index),
- (5, dev.commit_coocc_index),
- ],
- dtype=float,
- )
- points[:, 1] = points[:, 1] / len(devs)
- splines = []
- for i in range(len(points) - 1):
- a, b, c, d = solve_equations(*points[i], *points[i + 1])
- x = numpy.linspace(i + 1, i + 2, 100)
- smooth_points = numpy.array(
- [x, a * x ** 3 + b * x ** 2 + c * x + d]
- ).T.reshape(-1, 1, 2)
- splines.append(smooth_points)
- points = numpy.concatenate(splines)
- segments = numpy.concatenate([points[:-1], points[1:]], axis=1)
- lc = LineCollection(segments)
- lc.set_array(numpy.linspace(0, 0.1, segments.shape[0]))
- pyplot.gca().add_collection(lc)
- pyplot.xlim(0, 6)
- pyplot.ylim(-0.1, 1.1)
- deploy_plot("Developers", args.output, args.background)
|