readers.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393
  1. from argparse import Namespace
  2. from importlib import import_module
  3. import re
  4. import sys
  5. from typing import Any, Dict, List, Tuple, TYPE_CHECKING
  6. import numpy
  7. import yaml
  8. from labours.objects import DevDay
  9. if TYPE_CHECKING:
  10. from scipy.sparse.csr import csr_matrix
  11. class Reader(object):
  12. def read(self, file):
  13. raise NotImplementedError
  14. def get_name(self):
  15. raise NotImplementedError
  16. def get_header(self):
  17. raise NotImplementedError
  18. def get_burndown_parameters(self):
  19. raise NotImplementedError
  20. def get_project_burndown(self):
  21. raise NotImplementedError
  22. def get_files_burndown(self):
  23. raise NotImplementedError
  24. def get_people_burndown(self):
  25. raise NotImplementedError
  26. def get_ownership_burndown(self):
  27. raise NotImplementedError
  28. def get_people_interaction(self):
  29. raise NotImplementedError
  30. def get_files_coocc(self):
  31. raise NotImplementedError
  32. def get_people_coocc(self):
  33. raise NotImplementedError
  34. def get_shotness_coocc(self):
  35. raise NotImplementedError
  36. def get_shotness(self):
  37. raise NotImplementedError
  38. def get_sentiment(self):
  39. raise NotImplementedError
  40. def get_devs(self):
  41. raise NotImplementedError
  42. class YamlReader(Reader):
  43. def read(self, file: str):
  44. yaml.reader.Reader.NON_PRINTABLE = re.compile(r"(?!x)x")
  45. try:
  46. loader = yaml.CLoader
  47. except AttributeError:
  48. print(
  49. "Warning: failed to import yaml.CLoader, falling back to slow yaml.Loader"
  50. )
  51. loader = yaml.Loader
  52. try:
  53. if file != "-":
  54. with open(file) as fin:
  55. data = yaml.load(fin, Loader=loader)
  56. else:
  57. data = yaml.load(sys.stdin, Loader=loader)
  58. except (UnicodeEncodeError, yaml.reader.ReaderError) as e:
  59. print(
  60. "\nInvalid unicode in the input: %s\nPlease filter it through "
  61. "fix_yaml_unicode.py" % e
  62. )
  63. sys.exit(1)
  64. if data is None:
  65. print("\nNo data has been read - has Hercules crashed?")
  66. sys.exit(1)
  67. self.data = data
  68. def get_run_times(self):
  69. return {}
  70. def get_name(self):
  71. return self.data["hercules"]["repository"]
  72. def get_header(self):
  73. header = self.data["hercules"]
  74. return header["begin_unix_time"], header["end_unix_time"]
  75. def get_burndown_parameters(self):
  76. header = self.data["Burndown"]
  77. return header["sampling"], header["granularity"], header["tick_size"]
  78. def get_project_burndown(self):
  79. return (
  80. self.data["hercules"]["repository"],
  81. self._parse_burndown_matrix(self.data["Burndown"]["project"]).T,
  82. )
  83. def get_files_burndown(self):
  84. return [
  85. (p[0], self._parse_burndown_matrix(p[1]).T)
  86. for p in self.data["Burndown"]["files"].items()
  87. ]
  88. def get_people_burndown(self):
  89. return [
  90. (p[0], self._parse_burndown_matrix(p[1]).T)
  91. for p in self.data["Burndown"]["people"].items()
  92. ]
  93. def get_ownership_burndown(self):
  94. return (
  95. self.data["Burndown"]["people_sequence"].copy(),
  96. {
  97. p[0]: self._parse_burndown_matrix(p[1])
  98. for p in self.data["Burndown"]["people"].items()
  99. },
  100. )
  101. def get_people_interaction(self):
  102. return (
  103. self.data["Burndown"]["people_sequence"].copy(),
  104. self._parse_burndown_matrix(self.data["Burndown"]["people_interaction"]),
  105. )
  106. def get_files_coocc(self):
  107. coocc = self.data["Couples"]["files_coocc"]
  108. return coocc["index"], self._parse_coocc_matrix(coocc["matrix"])
  109. def get_people_coocc(self):
  110. coocc = self.data["Couples"]["people_coocc"]
  111. return coocc["index"], self._parse_coocc_matrix(coocc["matrix"])
  112. def get_shotness_coocc(self):
  113. shotness = self.data["Shotness"]
  114. index = ["%s:%s" % (i["file"], i["name"]) for i in shotness]
  115. indptr = numpy.zeros(len(shotness) + 1, dtype=numpy.int64)
  116. indices = []
  117. data = []
  118. for i, record in enumerate(shotness):
  119. pairs = [(int(k), v) for k, v in record["counters"].items()]
  120. pairs.sort()
  121. indptr[i + 1] = indptr[i] + len(pairs)
  122. for k, v in pairs:
  123. indices.append(k)
  124. data.append(v)
  125. indices = numpy.array(indices, dtype=numpy.int32)
  126. data = numpy.array(data, dtype=numpy.int32)
  127. from scipy.sparse import csr_matrix
  128. return index, csr_matrix((data, indices, indptr), shape=(len(shotness),) * 2)
  129. def get_shotness(self):
  130. from munch import munchify
  131. obj = munchify(self.data["Shotness"])
  132. # turn strings into ints
  133. for item in obj:
  134. item.counters = {int(k): v for k, v in item.counters.items()}
  135. if len(obj) == 0:
  136. raise KeyError
  137. return obj
  138. def get_sentiment(self):
  139. from munch import munchify
  140. return munchify(
  141. {
  142. int(key): {
  143. "Comments": vals[2].split("|"),
  144. "Commits": vals[1],
  145. "Value": float(vals[0]),
  146. }
  147. for key, vals in self.data["Sentiment"].items()
  148. }
  149. )
  150. def get_devs(self):
  151. people = self.data["Devs"]["people"]
  152. days = {
  153. int(d): {
  154. int(dev): DevDay(*(int(x) for x in day[:-1]), day[-1])
  155. for dev, day in devs.items()
  156. }
  157. for d, devs in self.data["Devs"]["ticks"].items()
  158. }
  159. return people, days
  160. def _parse_burndown_matrix(self, matrix):
  161. return numpy.array(
  162. [numpy.fromstring(line, dtype=int, sep=" ") for line in matrix.split("\n")]
  163. )
  164. def _parse_coocc_matrix(self, matrix):
  165. from scipy.sparse import csr_matrix
  166. data = []
  167. indices = []
  168. indptr = [0]
  169. for row in matrix:
  170. for k, v in sorted(row.items()):
  171. data.append(v)
  172. indices.append(k)
  173. indptr.append(indptr[-1] + len(row))
  174. return csr_matrix((data, indices, indptr), shape=(len(matrix),) * 2)
  175. class ProtobufReader(Reader):
  176. def read(self, file: str) -> None:
  177. try:
  178. from labours.pb_pb2 import AnalysisResults
  179. except ImportError as e:
  180. print(
  181. "\n\n>>> You need to generate python/hercules/pb/pb_pb2.py - run \"make\"\n",
  182. file=sys.stderr,
  183. )
  184. raise e from None
  185. self.data = AnalysisResults()
  186. if file != "-":
  187. with open(file, "rb") as fin:
  188. bytes = fin.read()
  189. else:
  190. bytes = sys.stdin.buffer.read()
  191. if not bytes:
  192. raise ValueError("empty input")
  193. self.data.ParseFromString(bytes)
  194. self.contents = {}
  195. for key, val in self.data.contents.items():
  196. try:
  197. mod, name = PB_MESSAGES[key].rsplit(".", 1)
  198. except KeyError:
  199. sys.stderr.write(
  200. "Warning: there is no registered PB decoder for %s\n" % key
  201. )
  202. continue
  203. cls = getattr(import_module(mod), name)
  204. self.contents[key] = msg = cls()
  205. msg.ParseFromString(val)
  206. def get_run_times(self):
  207. return {key: val for key, val in self.data.header.run_time_per_item.items()}
  208. def get_name(self) -> str:
  209. return self.data.header.repository
  210. def get_header(self) -> Tuple[int, int]:
  211. header = self.data.header
  212. return header.begin_unix_time, header.end_unix_time
  213. def get_burndown_parameters(self) -> Tuple[int, int, float]:
  214. burndown = self.contents["Burndown"]
  215. return burndown.sampling, burndown.granularity, burndown.tick_size / 1000000000
  216. def get_project_burndown(self) -> Tuple[str, numpy.ndarray]:
  217. return self._parse_burndown_matrix(self.contents["Burndown"].project)
  218. def get_files_burndown(self):
  219. return [self._parse_burndown_matrix(i) for i in self.contents["Burndown"].files]
  220. def get_people_burndown(self) -> List[Any]:
  221. return [
  222. self._parse_burndown_matrix(i) for i in self.contents["Burndown"].people
  223. ]
  224. def get_ownership_burndown(self) -> Tuple[List[Any], Dict[Any, Any]]:
  225. people = self.get_people_burndown()
  226. return [p[0] for p in people], {p[0]: p[1].T for p in people}
  227. def get_people_interaction(self):
  228. burndown = self.contents["Burndown"]
  229. return (
  230. [i.name for i in burndown.people],
  231. self._parse_sparse_matrix(burndown.people_interaction).toarray(),
  232. )
  233. def get_files_coocc(self) -> Tuple[List[str], 'csr_matrix']:
  234. node = self.contents["Couples"].file_couples
  235. return list(node.index), self._parse_sparse_matrix(node.matrix)
  236. def get_people_coocc(self) -> Tuple[List[str], 'csr_matrix']:
  237. node = self.contents["Couples"].people_couples
  238. return list(node.index), self._parse_sparse_matrix(node.matrix)
  239. def get_shotness_coocc(self):
  240. shotness = self.get_shotness()
  241. index = ["%s:%s" % (i.file, i.name) for i in shotness]
  242. indptr = numpy.zeros(len(shotness) + 1, dtype=numpy.int32)
  243. indices = []
  244. data = []
  245. for i, record in enumerate(shotness):
  246. pairs = list(record.counters.items())
  247. pairs.sort()
  248. indptr[i + 1] = indptr[i] + len(pairs)
  249. for k, v in pairs:
  250. indices.append(k)
  251. data.append(v)
  252. indices = numpy.array(indices, dtype=numpy.int32)
  253. data = numpy.array(data, dtype=numpy.int32)
  254. from scipy.sparse import csr_matrix
  255. return index, csr_matrix((data, indices, indptr), shape=(len(shotness),) * 2)
  256. def get_shotness(self):
  257. records = self.contents["Shotness"].records
  258. if len(records) == 0:
  259. raise KeyError
  260. return records
  261. def get_sentiment(self):
  262. byday = self.contents["Sentiment"].SentimentByDay
  263. if len(byday) == 0:
  264. raise KeyError
  265. return byday
  266. def get_devs(self) -> Tuple[List[str], Dict[int, Dict[int, DevDay]]]:
  267. people = list(self.contents["Devs"].dev_index)
  268. days = {
  269. d: {
  270. dev: DevDay(
  271. stats.commits,
  272. stats.stats.added,
  273. stats.stats.removed,
  274. stats.stats.changed,
  275. {
  276. k: [v.added, v.removed, v.changed]
  277. for k, v in stats.languages.items()
  278. },
  279. )
  280. for dev, stats in day.devs.items()
  281. }
  282. for d, day in self.contents["Devs"].ticks.items()
  283. }
  284. return people, days
  285. def _parse_burndown_matrix(self, matrix):
  286. dense = numpy.zeros(
  287. (matrix.number_of_rows, matrix.number_of_columns), dtype=int
  288. )
  289. for y, row in enumerate(matrix.rows):
  290. for x, col in enumerate(row.columns):
  291. dense[y, x] = col
  292. return matrix.name, dense.T
  293. def _parse_sparse_matrix(self, matrix):
  294. from scipy.sparse import csr_matrix
  295. return csr_matrix(
  296. (list(matrix.data), list(matrix.indices), list(matrix.indptr)),
  297. shape=(matrix.number_of_rows, matrix.number_of_columns),
  298. )
# Maps an input format name to the Reader subclass that parses it.
# read_input looks the (possibly auto-detected) format up in this table.
READERS = {"yaml": YamlReader, "yml": YamlReader, "pb": ProtobufReader}
# Maps a key of the protobuf ``contents`` map to the dotted path of the
# message class that ProtobufReader.read imports to decode that payload.
# Keys missing from this table are skipped there with a warning.
# NOTE(review): there is no "Sentiment" entry although
# ProtobufReader.get_sentiment reads self.contents["Sentiment"] - confirm
# whether a decoder should be registered here.
PB_MESSAGES = {
    "Burndown": "labours.pb_pb2.BurndownAnalysisResults",
    "Couples": "labours.pb_pb2.CouplesAnalysisResults",
    "Shotness": "labours.pb_pb2.ShotnessAnalysisResults",
    "Devs": "labours.pb_pb2.DevsAnalysisResults",
}
  306. def read_input(args: Namespace) -> ProtobufReader:
  307. sys.stdout.write("Reading the input... ")
  308. sys.stdout.flush()
  309. if args.input != "-":
  310. if args.input_format == "auto":
  311. try:
  312. args.input_format = args.input.rsplit(".", 1)[1]
  313. except IndexError:
  314. try:
  315. with open(args.input) as f:
  316. f.read(1 << 16)
  317. args.input_format = "yaml"
  318. except UnicodeDecodeError:
  319. args.input_format = "pb"
  320. elif args.input_format == "auto":
  321. args.input_format = "yaml"
  322. reader = READERS[args.input_format]()
  323. reader.read(args.input)
  324. print("done")
  325. return reader