readers.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445
  1. from argparse import Namespace
  2. from importlib import import_module
  3. import io
  4. import re
  5. import sys
  6. from typing import Any, BinaryIO, Dict, List, Tuple, TYPE_CHECKING
  7. import numpy
  8. import yaml
  9. from labours.objects import DevDay
  10. if TYPE_CHECKING:
  11. from scipy.sparse.csr import csr_matrix
  12. class Reader(object):
  13. def read(self, fileobj: BinaryIO):
  14. raise NotImplementedError
  15. def get_name(self):
  16. raise NotImplementedError
  17. def get_header(self):
  18. raise NotImplementedError
  19. def get_burndown_parameters(self):
  20. raise NotImplementedError
  21. def get_project_burndown(self):
  22. raise NotImplementedError
  23. def get_files_burndown(self):
  24. raise NotImplementedError
  25. def get_people_burndown(self):
  26. raise NotImplementedError
  27. def get_ownership_burndown(self):
  28. raise NotImplementedError
  29. def get_people_interaction(self):
  30. raise NotImplementedError
  31. def get_files_coocc(self):
  32. raise NotImplementedError
  33. def get_people_coocc(self):
  34. raise NotImplementedError
  35. def get_shotness_coocc(self):
  36. raise NotImplementedError
  37. def get_shotness(self):
  38. raise NotImplementedError
  39. def get_sentiment(self):
  40. raise NotImplementedError
  41. def get_devs(self):
  42. raise NotImplementedError
  43. class YamlReader(Reader):
  44. def read(self, fileobj: BinaryIO):
  45. yaml.reader.Reader.NON_PRINTABLE = re.compile(r"(?!x)x")
  46. try:
  47. loader = yaml.CLoader
  48. except AttributeError:
  49. print(
  50. "Warning: failed to import yaml.CLoader, falling back to slow yaml.Loader"
  51. )
  52. loader = yaml.Loader
  53. try:
  54. wrapper = io.TextIOWrapper(fileobj, encoding="utf-8")
  55. data = yaml.load(wrapper, Loader=loader)
  56. except (UnicodeEncodeError, UnicodeDecodeError, yaml.reader.ReaderError) as e:
  57. print(
  58. "\nInvalid unicode in the input: %s\nPlease filter it through "
  59. "fix_yaml_unicode.py" % e
  60. )
  61. sys.exit(1)
  62. if data is None:
  63. print("\nNo data has been read - has Hercules crashed?")
  64. sys.exit(1)
  65. self.data = data
  66. def get_run_times(self):
  67. return {}
  68. def get_name(self):
  69. return self.data["hercules"]["repository"]
  70. def get_header(self):
  71. header = self.data["hercules"]
  72. return header["begin_unix_time"], header["end_unix_time"]
  73. def get_burndown_parameters(self):
  74. header = self.data["Burndown"]
  75. return header["sampling"], header["granularity"], header["tick_size"]
  76. def get_project_burndown(self):
  77. return (
  78. self.data["hercules"]["repository"],
  79. self._parse_burndown_matrix(self.data["Burndown"]["project"]).T,
  80. )
  81. def get_files_burndown(self):
  82. return [
  83. (p[0], self._parse_burndown_matrix(p[1]).T)
  84. for p in self.data["Burndown"]["files"].items()
  85. ]
  86. def get_people_burndown(self):
  87. return [
  88. (p[0], self._parse_burndown_matrix(p[1]).T)
  89. for p in self.data["Burndown"]["people"].items()
  90. ]
  91. def get_ownership_burndown(self):
  92. return (
  93. self.data["Burndown"]["people_sequence"].copy(),
  94. {
  95. p[0]: self._parse_burndown_matrix(p[1])
  96. for p in self.data["Burndown"]["people"].items()
  97. },
  98. )
  99. def get_people_interaction(self):
  100. return (
  101. self.data["Burndown"]["people_sequence"].copy(),
  102. self._parse_burndown_matrix(self.data["Burndown"]["people_interaction"]),
  103. )
  104. def get_files_coocc(self):
  105. coocc = self.data["Couples"]["files_coocc"]
  106. return coocc["index"], self._parse_coocc_matrix(coocc["matrix"])
  107. def get_people_coocc(self):
  108. coocc = self.data["Couples"]["people_coocc"]
  109. return coocc["index"], self._parse_coocc_matrix(coocc["matrix"])
  110. def get_shotness_coocc(self):
  111. shotness = self.data["Shotness"]
  112. index = ["%s:%s" % (i["file"], i["name"]) for i in shotness]
  113. indptr = numpy.zeros(len(shotness) + 1, dtype=numpy.int64)
  114. indices = []
  115. data = []
  116. for i, record in enumerate(shotness):
  117. pairs = [(int(k), v) for k, v in record["counters"].items()]
  118. pairs.sort()
  119. indptr[i + 1] = indptr[i] + len(pairs)
  120. for k, v in pairs:
  121. indices.append(k)
  122. data.append(v)
  123. indices = numpy.array(indices, dtype=numpy.int32)
  124. data = numpy.array(data, dtype=numpy.int32)
  125. from scipy.sparse import csr_matrix
  126. return index, csr_matrix((data, indices, indptr), shape=(len(shotness),) * 2)
  127. def get_shotness(self):
  128. from munch import munchify
  129. obj = munchify(self.data["Shotness"])
  130. # turn strings into ints
  131. for item in obj:
  132. item.counters = {int(k): v for k, v in item.counters.items()}
  133. if len(obj) == 0:
  134. raise KeyError
  135. return obj
  136. def get_sentiment(self):
  137. from munch import munchify
  138. return munchify(
  139. {
  140. int(key): {
  141. "Comments": vals[2].split("|"),
  142. "Commits": vals[1],
  143. "Value": float(vals[0]),
  144. }
  145. for key, vals in self.data["Sentiment"].items()
  146. }
  147. )
  148. def get_devs(self):
  149. people = self.data["Devs"]["people"]
  150. days = {
  151. int(d): {
  152. int(dev): DevDay(*(int(x) for x in day[:-1]), day[-1])
  153. for dev, day in devs.items()
  154. }
  155. for d, devs in self.data["Devs"]["ticks"].items()
  156. }
  157. return people, days
  158. def _parse_burndown_matrix(self, matrix):
  159. return numpy.array(
  160. [numpy.fromstring(line, dtype=int, sep=" ") for line in matrix.split("\n")]
  161. )
  162. def _parse_coocc_matrix(self, matrix):
  163. from scipy.sparse import csr_matrix
  164. data = []
  165. indices = []
  166. indptr = [0]
  167. for row in matrix:
  168. for k, v in sorted(row.items()):
  169. data.append(v)
  170. indices.append(k)
  171. indptr.append(indptr[-1] + len(row))
  172. return csr_matrix((data, indices, indptr), shape=(len(matrix),) * 2)
  173. class ProtobufReader(Reader):
  174. def read(self, fileobj: BinaryIO) -> None:
  175. try:
  176. from labours.pb_pb2 import AnalysisResults
  177. except ImportError as e:
  178. print(
  179. "\n\n>>> You need to generate python/hercules/pb/pb_pb2.py - run \"make\"\n",
  180. file=sys.stderr,
  181. )
  182. raise e from None
  183. self.data = AnalysisResults()
  184. all_bytes = fileobj.read()
  185. if not all_bytes:
  186. raise ValueError("empty input")
  187. self.data.ParseFromString(all_bytes)
  188. self.contents = {}
  189. for key, val in self.data.contents.items():
  190. try:
  191. mod, name = PB_MESSAGES[key].rsplit(".", 1)
  192. except KeyError:
  193. sys.stderr.write(
  194. "Warning: there is no registered PB decoder for %s\n" % key
  195. )
  196. continue
  197. cls = getattr(import_module(mod), name)
  198. self.contents[key] = msg = cls()
  199. msg.ParseFromString(val)
  200. def get_run_times(self):
  201. return {key: val for key, val in self.data.header.run_time_per_item.items()}
  202. def get_name(self) -> str:
  203. return self.data.header.repository
  204. def get_header(self) -> Tuple[int, int]:
  205. header = self.data.header
  206. return header.begin_unix_time, header.end_unix_time
  207. def get_burndown_parameters(self) -> Tuple[int, int, float]:
  208. burndown = self.contents["Burndown"]
  209. return burndown.sampling, burndown.granularity, burndown.tick_size / 1000000000
  210. def get_project_burndown(self) -> Tuple[str, numpy.ndarray]:
  211. return self._parse_burndown_matrix(self.contents["Burndown"].project)
  212. def get_files_burndown(self):
  213. return [self._parse_burndown_matrix(i) for i in self.contents["Burndown"].files]
  214. def get_people_burndown(self) -> List[Any]:
  215. return [
  216. self._parse_burndown_matrix(i) for i in self.contents["Burndown"].people
  217. ]
  218. def get_ownership_burndown(self) -> Tuple[List[Any], Dict[Any, Any]]:
  219. people = self.get_people_burndown()
  220. return [p[0] for p in people], {p[0]: p[1].T for p in people}
  221. def get_people_interaction(self):
  222. burndown = self.contents["Burndown"]
  223. return (
  224. [i.name for i in burndown.people],
  225. self._parse_sparse_matrix(burndown.people_interaction).toarray(),
  226. )
  227. def get_files_coocc(self) -> Tuple[List[str], 'csr_matrix']:
  228. node = self.contents["Couples"].file_couples
  229. return list(node.index), self._parse_sparse_matrix(node.matrix)
  230. def get_people_coocc(self) -> Tuple[List[str], 'csr_matrix']:
  231. node = self.contents["Couples"].people_couples
  232. return list(node.index), self._parse_sparse_matrix(node.matrix)
  233. def get_shotness_coocc(self):
  234. shotness = self.get_shotness()
  235. index = ["%s:%s" % (i.file, i.name) for i in shotness]
  236. indptr = numpy.zeros(len(shotness) + 1, dtype=numpy.int32)
  237. indices = []
  238. data = []
  239. for i, record in enumerate(shotness):
  240. pairs = list(record.counters.items())
  241. pairs.sort()
  242. indptr[i + 1] = indptr[i] + len(pairs)
  243. for k, v in pairs:
  244. indices.append(k)
  245. data.append(v)
  246. indices = numpy.array(indices, dtype=numpy.int32)
  247. data = numpy.array(data, dtype=numpy.int32)
  248. from scipy.sparse import csr_matrix
  249. return index, csr_matrix((data, indices, indptr), shape=(len(shotness),) * 2)
  250. def get_shotness(self):
  251. records = self.contents["Shotness"].records
  252. if len(records) == 0:
  253. raise KeyError
  254. return records
  255. def get_sentiment(self):
  256. byday = self.contents["Sentiment"].SentimentByDay
  257. if len(byday) == 0:
  258. raise KeyError
  259. return byday
  260. def get_devs(self) -> Tuple[List[str], Dict[int, Dict[int, DevDay]]]:
  261. people = list(self.contents["Devs"].dev_index)
  262. days = {
  263. d: {
  264. dev: DevDay(
  265. stats.commits,
  266. stats.stats.added,
  267. stats.stats.removed,
  268. stats.stats.changed,
  269. {
  270. k: [v.added, v.removed, v.changed]
  271. for k, v in stats.languages.items()
  272. },
  273. )
  274. for dev, stats in day.devs.items()
  275. }
  276. for d, day in self.contents["Devs"].ticks.items()
  277. }
  278. return people, days
  279. def _parse_burndown_matrix(self, matrix):
  280. dense = numpy.zeros(
  281. (matrix.number_of_rows, matrix.number_of_columns), dtype=int
  282. )
  283. for y, row in enumerate(matrix.rows):
  284. for x, col in enumerate(row.columns):
  285. dense[y, x] = col
  286. return matrix.name, dense.T
  287. def _parse_sparse_matrix(self, matrix):
  288. from scipy.sparse import csr_matrix
  289. return csr_matrix(
  290. (list(matrix.data), list(matrix.indices), list(matrix.indptr)),
  291. shape=(matrix.number_of_rows, matrix.number_of_columns),
  292. )
# Input format name (normally the file extension) -> Reader implementation.
READERS = {"yaml": YamlReader, "yml": YamlReader, "pb": ProtobufReader}
# Analysis section name -> dotted path of the protobuf message class;
# ProtobufReader.read() imports each class lazily by this path to decode
# the corresponding payload.
PB_MESSAGES = {
    "Burndown": "labours.pb_pb2.BurndownAnalysisResults",
    "Couples": "labours.pb_pb2.CouplesAnalysisResults",
    "Shotness": "labours.pb_pb2.ShotnessAnalysisResults",
    "Devs": "labours.pb_pb2.DevsAnalysisResults",
}
  300. def chain_streams(streams, buffer_size=io.DEFAULT_BUFFER_SIZE):
  301. """
  302. Chain an iterable of streams together into a single buffered stream.
  303. Source: https://stackoverflow.com/a/50770511
  304. Usage:
  305. f = chain_streams(open(f, "rb") for f in filenames)
  306. f.read()
  307. """
  308. class ChainStream(io.RawIOBase):
  309. def __init__(self):
  310. self.leftover = b""
  311. self.stream_iter = iter(streams)
  312. try:
  313. self.stream = next(self.stream_iter)
  314. except StopIteration:
  315. self.stream = None
  316. def readable(self):
  317. return True
  318. def _read_next_chunk(self, max_length):
  319. # Return 0 or more bytes from the current stream, first returning all
  320. # leftover bytes. If the stream is closed returns b''
  321. if self.leftover:
  322. return self.leftover
  323. elif self.stream is not None:
  324. return self.stream.read(max_length)
  325. else:
  326. return b""
  327. def readinto(self, b):
  328. buffer_length = len(b)
  329. chunk = self._read_next_chunk(buffer_length)
  330. while len(chunk) == 0:
  331. # move to next stream
  332. if self.stream is not None:
  333. self.stream.close()
  334. try:
  335. self.stream = next(self.stream_iter)
  336. chunk = self._read_next_chunk(buffer_length)
  337. except StopIteration:
  338. # No more streams to chain together
  339. self.stream = None
  340. return 0 # indicate EOF
  341. output, self.leftover = chunk[:buffer_length], chunk[buffer_length:]
  342. b[:len(output)] = output
  343. return len(output)
  344. return io.BufferedReader(ChainStream(), buffer_size=buffer_size)
  345. def read_input(args: Namespace) -> ProtobufReader:
  346. sys.stdout.write("Reading the input... ")
  347. sys.stdout.flush()
  348. if args.input != "-":
  349. stream = open(args.input, "rb")
  350. else:
  351. stream = sys.stdin.buffer
  352. try:
  353. if args.input_format == "auto":
  354. buffer = stream.read(1 << 16)
  355. try:
  356. buffer.decode("utf-8")
  357. args.input_format = "yaml"
  358. except UnicodeDecodeError:
  359. args.input_format = "pb"
  360. ins = chain_streams((io.BytesIO(buffer), stream), len(buffer))
  361. else:
  362. ins = stream
  363. reader = READERS[args.input_format]()
  364. reader.read(ins)
  365. finally:
  366. if args.input != "-":
  367. stream.close()
  368. print("done")
  369. return reader