# plot_churn.py

import argparse
from datetime import datetime, timedelta
import os
import re
import sys

from matplotlib import pyplot
import matplotlib.dates as mdates
import numpy
import pandas
import yaml


  11. def parse_args():
  12. parser = argparse.ArgumentParser()
  13. parser.add_argument("input", help="Path to the input YAML file. \"-\" means stdin.")
  14. parser.add_argument("-o", "--output", help="Output directory. If empty, display the plots.")
  15. parser.add_argument("-f", "--format", choices=("png", "svg"), default="png",
  16. help="Output format")
  17. parser.add_argument("--tick-days", type=int, default=7, help="Ticks interval in days.")
  18. args = parser.parse_args()
  19. return args
  20. def parse_input(file):
  21. yaml.reader.Reader.NON_PRINTABLE = re.compile(r"(?!x)x")
  22. try:
  23. loader = yaml.CLoader
  24. except AttributeError:
  25. print("Warning: failed to import yaml.CLoader, falling back to slow yaml.Loader")
  26. loader = yaml.Loader
  27. try:
  28. if file != "-":
  29. with open(file) as fin:
  30. return yaml.load(fin, Loader=loader)
  31. else:
  32. return yaml.load(sys.stdin, Loader=loader)
  33. except (UnicodeEncodeError, yaml.reader.ReaderError) as e:
  34. print("\nInvalid unicode in the input: %s\nPlease filter it through "
  35. "fix_yaml_unicode.py" % e)
  36. sys.exit(1)
  37. def plot_churn(name, data, url, beginTime, endTime, output, fmt, tick_interval):
  38. days, adds, dels = data["days"], data["additions"], data["removals"]
  39. dates = [beginTime + timedelta(days=d) for d in days]
  40. df = pandas.DataFrame(data=list(zip(adds, dels)),
  41. index=dates,
  42. columns=("additions", "removals"))
  43. df["removals"] = -df["removals"]
  44. df = df.reindex(pandas.date_range(beginTime, endTime, freq="D"))
  45. effective = df["additions"] + df["removals"]
  46. effective = effective.cumsum()
  47. effective.fillna(method="ffill", inplace=True)
  48. scale = numpy.maximum(df.max(), -df.min()).max()
  49. effective = effective / effective.max() * scale
  50. pyplot.figure(figsize=(16, 9))
  51. for spine in pyplot.gca().spines.values():
  52. spine.set_visible(False)
  53. pyplot.gca().xaxis.set_major_locator(mdates.DayLocator(interval=tick_interval))
  54. pyplot.gca().xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d"))
  55. pyplot.tick_params(top="off", bottom="off", left="off", right="off",
  56. labelleft="off", labelbottom="on")
  57. pyplot.bar(df.index, df["additions"], label="additions")
  58. pyplot.bar(df.index, df["removals"], label="removals")
  59. pyplot.plot(df.index, effective, "black", label="effective")
  60. pyplot.xticks(rotation="vertical")
  61. pyplot.legend(loc=2, fontsize=18)
  62. pyplot.title("%s churn plot, %s" % (name, url), fontsize=24)
  63. if not output:
  64. pyplot.show()
  65. else:
  66. os.makedirs(output, exist_ok=True)
  67. pyplot.savefig(os.path.join(output, name.replace("/", "_") + "." + fmt),
  68. bbox_inches="tight", transparent=True)
  69. def main():
  70. args = parse_args()
  71. data = parse_input(args.input)
  72. beginTime, endTime = (datetime.fromtimestamp(data["hercules"][t])
  73. for t in ("begin_unix_time", "end_unix_time"))
  74. for key, val in data["ChurnAnalysis"].items():
  75. plot_churn(key, val, data["hercules"]["repository"], beginTime, endTime,
  76. args.output, args.format, args.tick_days)
  77. if __name__ == "__main__":
  78. sys.exit(main())