readers.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338
  1. from argparse import Namespace
  2. from importlib import import_module
  3. import re
  4. import sys
  5. from typing import TYPE_CHECKING, Any, Dict, List, Tuple
  6. import numpy
  7. import yaml
  8. from labours.objects import DevDay
  9. if TYPE_CHECKING:
  10. from scipy.sparse.csr import csr_matrix
  11. class Reader(object):
  12. def read(self, file):
  13. raise NotImplementedError
  14. def get_name(self):
  15. raise NotImplementedError
  16. def get_header(self):
  17. raise NotImplementedError
  18. def get_burndown_parameters(self):
  19. raise NotImplementedError
  20. def get_project_burndown(self):
  21. raise NotImplementedError
  22. def get_files_burndown(self):
  23. raise NotImplementedError
  24. def get_people_burndown(self):
  25. raise NotImplementedError
  26. def get_ownership_burndown(self):
  27. raise NotImplementedError
  28. def get_people_interaction(self):
  29. raise NotImplementedError
  30. def get_files_coocc(self):
  31. raise NotImplementedError
  32. def get_people_coocc(self):
  33. raise NotImplementedError
  34. def get_shotness_coocc(self):
  35. raise NotImplementedError
  36. def get_shotness(self):
  37. raise NotImplementedError
  38. def get_sentiment(self):
  39. raise NotImplementedError
  40. def get_devs(self):
  41. raise NotImplementedError
  42. class YamlReader(Reader):
  43. def read(self, file: str):
  44. yaml.reader.Reader.NON_PRINTABLE = re.compile(r"(?!x)x")
  45. try:
  46. loader = yaml.CLoader
  47. except AttributeError:
  48. print("Warning: failed to import yaml.CLoader, falling back to slow yaml.Loader")
  49. loader = yaml.Loader
  50. try:
  51. if file != "-":
  52. with open(file) as fin:
  53. data = yaml.load(fin, Loader=loader)
  54. else:
  55. data = yaml.load(sys.stdin, Loader=loader)
  56. except (UnicodeEncodeError, yaml.reader.ReaderError) as e:
  57. print("\nInvalid unicode in the input: %s\nPlease filter it through "
  58. "fix_yaml_unicode.py" % e)
  59. sys.exit(1)
  60. if data is None:
  61. print("\nNo data has been read - has Hercules crashed?")
  62. sys.exit(1)
  63. self.data = data
  64. def get_run_times(self):
  65. return {}
  66. def get_name(self):
  67. return self.data["hercules"]["repository"]
  68. def get_header(self):
  69. header = self.data["hercules"]
  70. return header["begin_unix_time"], header["end_unix_time"]
  71. def get_burndown_parameters(self):
  72. header = self.data["Burndown"]
  73. return header["sampling"], header["granularity"], header["tick_size"]
  74. def get_project_burndown(self):
  75. return self.data["hercules"]["repository"], \
  76. self._parse_burndown_matrix(self.data["Burndown"]["project"]).T
  77. def get_files_burndown(self):
  78. return [(p[0], self._parse_burndown_matrix(p[1]).T)
  79. for p in self.data["Burndown"]["files"].items()]
  80. def get_people_burndown(self):
  81. return [(p[0], self._parse_burndown_matrix(p[1]).T)
  82. for p in self.data["Burndown"]["people"].items()]
  83. def get_ownership_burndown(self):
  84. return self.data["Burndown"]["people_sequence"].copy(), \
  85. {p[0]: self._parse_burndown_matrix(p[1])
  86. for p in self.data["Burndown"]["people"].items()}
  87. def get_people_interaction(self):
  88. return self.data["Burndown"]["people_sequence"].copy(), \
  89. self._parse_burndown_matrix(self.data["Burndown"]["people_interaction"])
  90. def get_files_coocc(self):
  91. coocc = self.data["Couples"]["files_coocc"]
  92. return coocc["index"], self._parse_coocc_matrix(coocc["matrix"])
  93. def get_people_coocc(self):
  94. coocc = self.data["Couples"]["people_coocc"]
  95. return coocc["index"], self._parse_coocc_matrix(coocc["matrix"])
  96. def get_shotness_coocc(self):
  97. shotness = self.data["Shotness"]
  98. index = ["%s:%s" % (i["file"], i["name"]) for i in shotness]
  99. indptr = numpy.zeros(len(shotness) + 1, dtype=numpy.int64)
  100. indices = []
  101. data = []
  102. for i, record in enumerate(shotness):
  103. pairs = [(int(k), v) for k, v in record["counters"].items()]
  104. pairs.sort()
  105. indptr[i + 1] = indptr[i] + len(pairs)
  106. for k, v in pairs:
  107. indices.append(k)
  108. data.append(v)
  109. indices = numpy.array(indices, dtype=numpy.int32)
  110. data = numpy.array(data, dtype=numpy.int32)
  111. from scipy.sparse import csr_matrix
  112. return index, csr_matrix((data, indices, indptr), shape=(len(shotness),) * 2)
  113. def get_shotness(self):
  114. from munch import munchify
  115. obj = munchify(self.data["Shotness"])
  116. # turn strings into ints
  117. for item in obj:
  118. item.counters = {int(k): v for k, v in item.counters.items()}
  119. if len(obj) == 0:
  120. raise KeyError
  121. return obj
  122. def get_sentiment(self):
  123. from munch import munchify
  124. return munchify({int(key): {
  125. "Comments": vals[2].split("|"),
  126. "Commits": vals[1],
  127. "Value": float(vals[0])
  128. } for key, vals in self.data["Sentiment"].items()})
  129. def get_devs(self):
  130. people = self.data["Devs"]["people"]
  131. days = {int(d): {int(dev): DevDay(*(int(x) for x in day[:-1]), day[-1])
  132. for dev, day in devs.items()}
  133. for d, devs in self.data["Devs"]["ticks"].items()}
  134. return people, days
  135. def _parse_burndown_matrix(self, matrix):
  136. return numpy.array([numpy.fromstring(line, dtype=int, sep=" ")
  137. for line in matrix.split("\n")])
  138. def _parse_coocc_matrix(self, matrix):
  139. from scipy.sparse import csr_matrix
  140. data = []
  141. indices = []
  142. indptr = [0]
  143. for row in matrix:
  144. for k, v in sorted(row.items()):
  145. data.append(v)
  146. indices.append(k)
  147. indptr.append(indptr[-1] + len(row))
  148. return csr_matrix((data, indices, indptr), shape=(len(matrix),) * 2)
  149. class ProtobufReader(Reader):
  150. def read(self, file: str) -> None:
  151. try:
  152. from labours.pb_pb2 import AnalysisResults
  153. except ImportError as e:
  154. print("\n\n>>> You need to generate python/hercules/pb/pb_pb2.py - run \"make\"\n",
  155. file=sys.stderr)
  156. raise e from None
  157. self.data = AnalysisResults()
  158. if file != "-":
  159. with open(file, "rb") as fin:
  160. bytes = fin.read()
  161. else:
  162. bytes = sys.stdin.buffer.read()
  163. if not bytes:
  164. raise ValueError("empty input")
  165. self.data.ParseFromString(bytes)
  166. self.contents = {}
  167. for key, val in self.data.contents.items():
  168. try:
  169. mod, name = PB_MESSAGES[key].rsplit(".", 1)
  170. except KeyError:
  171. sys.stderr.write("Warning: there is no registered PB decoder for %s\n" % key)
  172. continue
  173. cls = getattr(import_module(mod), name)
  174. self.contents[key] = msg = cls()
  175. msg.ParseFromString(val)
  176. def get_run_times(self):
  177. return {key: val for key, val in self.data.header.run_time_per_item.items()}
  178. def get_name(self) -> str:
  179. return self.data.header.repository
  180. def get_header(self) -> Tuple[int, int]:
  181. header = self.data.header
  182. return header.begin_unix_time, header.end_unix_time
  183. def get_burndown_parameters(self) -> Tuple[int, int, float]:
  184. burndown = self.contents["Burndown"]
  185. return burndown.sampling, burndown.granularity, burndown.tick_size / 1000000000
  186. def get_project_burndown(self) -> Tuple[str, numpy.ndarray]:
  187. return self._parse_burndown_matrix(self.contents["Burndown"].project)
  188. def get_files_burndown(self):
  189. return [self._parse_burndown_matrix(i) for i in self.contents["Burndown"].files]
  190. def get_people_burndown(self) -> List[Any]:
  191. return [self._parse_burndown_matrix(i) for i in self.contents["Burndown"].people]
  192. def get_ownership_burndown(self) -> Tuple[List[Any], Dict[Any, Any]]:
  193. people = self.get_people_burndown()
  194. return [p[0] for p in people], {p[0]: p[1].T for p in people}
  195. def get_people_interaction(self):
  196. burndown = self.contents["Burndown"]
  197. return [i.name for i in burndown.people], \
  198. self._parse_sparse_matrix(burndown.people_interaction).toarray()
  199. def get_files_coocc(self) -> Tuple[List[str], 'csr_matrix']:
  200. node = self.contents["Couples"].file_couples
  201. return list(node.index), self._parse_sparse_matrix(node.matrix)
  202. def get_people_coocc(self) -> Tuple[List[str], 'csr_matrix']:
  203. node = self.contents["Couples"].people_couples
  204. return list(node.index), self._parse_sparse_matrix(node.matrix)
  205. def get_shotness_coocc(self):
  206. shotness = self.get_shotness()
  207. index = ["%s:%s" % (i.file, i.name) for i in shotness]
  208. indptr = numpy.zeros(len(shotness) + 1, dtype=numpy.int32)
  209. indices = []
  210. data = []
  211. for i, record in enumerate(shotness):
  212. pairs = list(record.counters.items())
  213. pairs.sort()
  214. indptr[i + 1] = indptr[i] + len(pairs)
  215. for k, v in pairs:
  216. indices.append(k)
  217. data.append(v)
  218. indices = numpy.array(indices, dtype=numpy.int32)
  219. data = numpy.array(data, dtype=numpy.int32)
  220. from scipy.sparse import csr_matrix
  221. return index, csr_matrix((data, indices, indptr), shape=(len(shotness),) * 2)
  222. def get_shotness(self):
  223. records = self.contents["Shotness"].records
  224. if len(records) == 0:
  225. raise KeyError
  226. return records
  227. def get_sentiment(self):
  228. byday = self.contents["Sentiment"].SentimentByDay
  229. if len(byday) == 0:
  230. raise KeyError
  231. return byday
  232. def get_devs(self) -> Tuple[List[str], Dict[int, Dict[int, DevDay]]]:
  233. people = list(self.contents["Devs"].dev_index)
  234. days = {d: {dev: DevDay(stats.commits, stats.stats.added, stats.stats.removed,
  235. stats.stats.changed, {k: [v.added, v.removed, v.changed]
  236. for k, v in stats.languages.items()})
  237. for dev, stats in day.devs.items()}
  238. for d, day in self.contents["Devs"].ticks.items()}
  239. return people, days
  240. def _parse_burndown_matrix(self, matrix):
  241. dense = numpy.zeros((matrix.number_of_rows, matrix.number_of_columns), dtype=int)
  242. for y, row in enumerate(matrix.rows):
  243. for x, col in enumerate(row.columns):
  244. dense[y, x] = col
  245. return matrix.name, dense.T
  246. def _parse_sparse_matrix(self, matrix):
  247. from scipy.sparse import csr_matrix
  248. return csr_matrix((list(matrix.data), list(matrix.indices), list(matrix.indptr)),
  249. shape=(matrix.number_of_rows, matrix.number_of_columns))
  250. READERS = {"yaml": YamlReader, "yml": YamlReader, "pb": ProtobufReader}
  251. PB_MESSAGES = {
  252. "Burndown": "labours.pb_pb2.BurndownAnalysisResults",
  253. "Couples": "labours.pb_pb2.CouplesAnalysisResults",
  254. "Shotness": "labours.pb_pb2.ShotnessAnalysisResults",
  255. "Devs": "labours.pb_pb2.DevsAnalysisResults",
  256. }
  257. def read_input(args: Namespace) -> ProtobufReader:
  258. sys.stdout.write("Reading the input... ")
  259. sys.stdout.flush()
  260. if args.input != "-":
  261. if args.input_format == "auto":
  262. try:
  263. args.input_format = args.input.rsplit(".", 1)[1]
  264. except IndexError:
  265. try:
  266. with open(args.input) as f:
  267. f.read(1 << 16)
  268. args.input_format = "yaml"
  269. except UnicodeDecodeError:
  270. args.input_format = "pb"
  271. elif args.input_format == "auto":
  272. args.input_format = "yaml"
  273. reader = READERS[args.input_format]()
  274. reader.read(args.input)
  275. print("done")
  276. return reader