visuals.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487
  1. import pandas as pd
  2. import numpy as np
  3. import statsmodels.api as sm
  4. from sklearn.linear_model import LinearRegression
  5. from sklearn.metrics import mean_squared_error
  6. from scipy import stats
  7. import plotly.graph_objs as go
  8. import cufflinks
  9. cufflinks.go_offline()
  10. def make_hist(df, x, category=None):
  11. """
  12. Make an interactive histogram, optionally segmented by `category`
  13. :param df: dataframe of data
  14. :param x: string of column to use for plotting
  15. :param category: string representing column to segment by
  16. :return figure: a plotly histogram to show with iplot or plot
  17. """
  18. if category is not None:
  19. data = []
  20. for name, group in df.groupby(category):
  21. data.append(go.Histogram(dict(x=group[x], name=name)))
  22. else:
  23. data = [go.Histogram(dict(x=df[x]))]
  24. layout = go.Layout(
  25. yaxis=dict(title="Count"),
  26. xaxis=dict(title=x.replace("_", " ").title()),
  27. title=f"{x.replace('_', ' ').title()} Distribution by {category.replace('_', ' ').title()}"
  28. if category
  29. else f"{x.replace('_', ' ').title()} Distribution",
  30. )
  31. figure = go.Figure(data=data, layout=layout)
  32. return figure
  33. def make_cum_plot(df, y, category=None, ranges=False):
  34. """
  35. Make an interactive cumulative plot, optionally segmented by `category`
  36. :param df: dataframe of data, must have a `published_date` column
  37. :param y: string of column to use for plotting or list of two strings for double y axis
  38. :param category: string representing column to segment by
  39. :param ranges: boolean for whether to add range slider and range selector
  40. :return figure: a plotly plot to show with iplot or plot
  41. """
  42. if category is not None:
  43. data = []
  44. for i, (name, group) in enumerate(df.groupby(category)):
  45. group.sort_values("published_date", inplace=True)
  46. data.append(
  47. go.Scatter(
  48. x=group["published_date"],
  49. y=group[y].cumsum(),
  50. mode="lines+markers",
  51. text=group["title"],
  52. name=name,
  53. marker=dict(size=10, opacity=0.8, symbol=i + 2),
  54. )
  55. )
  56. else:
  57. df.sort_values("published_date", inplace=True)
  58. if len(y) == 2:
  59. data = [
  60. go.Scatter(
  61. x=df["published_date"],
  62. y=df[y[0]].cumsum(),
  63. name=y[0].title(),
  64. mode="lines+markers",
  65. text=df["title"],
  66. marker=dict(
  67. size=10,
  68. color="blue",
  69. opacity=0.6,
  70. line=dict(color="black"),
  71. ),
  72. ),
  73. go.Scatter(
  74. x=df["published_date"],
  75. y=df[y[1]].cumsum(),
  76. yaxis="y2",
  77. name=y[1].title(),
  78. mode="lines+markers",
  79. text=df["title"],
  80. marker=dict(
  81. size=10,
  82. color="red",
  83. opacity=0.6,
  84. line=dict(color="black"),
  85. ),
  86. ),
  87. ]
  88. else:
  89. data = [
  90. go.Scatter(
  91. x=df["published_date"],
  92. y=df[y].cumsum(),
  93. mode="lines+markers",
  94. text=df["title"],
  95. marker=dict(
  96. size=12,
  97. color="blue",
  98. opacity=0.6,
  99. line=dict(color="black"),
  100. ),
  101. )
  102. ]
  103. if len(y) == 2:
  104. layout = go.Layout(
  105. xaxis=dict(title="Published Date", type="date"),
  106. yaxis=dict(title=y[0].replace("_", " ").title(), color="blue"),
  107. yaxis2=dict(
  108. title=y[1].replace("_", " ").title(),
  109. color="red",
  110. overlaying="y",
  111. side="right",
  112. ),
  113. font=dict(size=14),
  114. title=f"Cumulative {y[0].title()} and {y[1].title()}",
  115. )
  116. else:
  117. layout = go.Layout(
  118. xaxis=dict(title="Published Date", type="date"),
  119. yaxis=dict(title=y.replace("_", " ").title()),
  120. font=dict(size=14),
  121. title=f"Cumulative {y.replace('_', ' ').title()} by {category.replace('_', ' ').title()}"
  122. if category is not None
  123. else f"Cumulative {y.replace('_', ' ').title()}",
  124. )
  125. # Add a rangeselector and rangeslider for a data xaxis
  126. if ranges:
  127. rangeselector = dict(
  128. buttons=list(
  129. [
  130. dict(count=1, label="1m", step="month", stepmode="backward"),
  131. dict(count=6, label="6m", step="month", stepmode="backward"),
  132. dict(count=1, label="1y", step="year", stepmode="backward"),
  133. dict(step="all"),
  134. ]
  135. )
  136. )
  137. rangeslider = dict(visible=True)
  138. layout["xaxis"]["rangeselector"] = rangeselector
  139. layout["xaxis"]["rangeslider"] = rangeslider
  140. layout["width"] = 1000
  141. layout["height"] = 600
  142. figure = go.Figure(data=data, layout=layout)
  143. return figure
  144. def make_scatter_plot(
  145. df,
  146. x,
  147. y,
  148. fits=None,
  149. xlog=False,
  150. ylog=False,
  151. category=None,
  152. scale=None,
  153. sizeref=2,
  154. annotations=None,
  155. ranges=False,
  156. title_override=None,
  157. ):
  158. """
  159. Make an interactive scatterplot, optionally segmented by `category`
  160. :param df: dataframe of data
  161. :param x: string of column to use for xaxis
  162. :param y: string of column to use for yaxis
  163. :param fits: list of strings of fits
  164. :param xlog: boolean for making a log xaxis
  165. :param ylog boolean for making a log yaxis
  166. :param category: string representing categorical column to segment by, this must be a categorical
  167. :param scale: string representing numerical column to size and color markers by, this must be numerical data
  168. :param sizeref: float or integer for setting the size of markers according to the scale, only used if scale is set
  169. :param annotations: text to display on the plot (dictionary)
  170. :param ranges: boolean for whether to add a range slider and selector
  171. :param title_override: String to override the title
  172. :return figure: a plotly plot to show with iplot or plot
  173. """
  174. if category is not None:
  175. title = f"{y.replace('_', ' ').title()} vs {x.replace('_', ' ').title()} by {category.replace('_', ' ').title()}"
  176. data = []
  177. for i, (name, group) in enumerate(df.groupby(category)):
  178. data.append(
  179. go.Scatter(
  180. x=group[x],
  181. y=group[y],
  182. mode="markers",
  183. text=group["title"],
  184. name=name,
  185. marker=dict(size=8, symbol=i + 2),
  186. )
  187. )
  188. else:
  189. if scale is not None:
  190. title = f"{y.replace('_', ' ').title()} vs {x.replace('_', ' ').title()} Scaled by {scale.title()}"
  191. data = [
  192. go.Scatter(
  193. x=df[x],
  194. y=df[y],
  195. mode="markers",
  196. text=df["title"],
  197. marker=dict(
  198. size=df[scale],
  199. line=dict(color="black", width=0.5),
  200. sizemode="area",
  201. sizeref=sizeref,
  202. opacity=0.8,
  203. colorscale="Viridis",
  204. color=df[scale],
  205. showscale=True,
  206. sizemin=2,
  207. ),
  208. )
  209. ]
  210. else:
  211. df.sort_values(x, inplace=True)
  212. title = f"{y.replace('_', ' ').title()} vs {x.replace('_', ' ').title()}"
  213. data = [
  214. go.Scatter(
  215. x=df[x],
  216. y=df[y],
  217. mode="markers",
  218. text=df["title"],
  219. marker=dict(
  220. size=12, color="blue", opacity=0.8, line=dict(color="black")
  221. ),
  222. name="observations",
  223. )
  224. ]
  225. if fits is not None:
  226. for fit in fits:
  227. data.append(
  228. go.Scatter(
  229. x=df[x],
  230. y=df[fit],
  231. text=df["title"],
  232. mode="lines+markers",
  233. marker=dict(size=8, opacity=0.6),
  234. line=dict(dash="dash"),
  235. name=fit,
  236. )
  237. )
  238. title += " with Fit"
  239. layout = go.Layout(
  240. annotations=annotations,
  241. xaxis=dict(
  242. title=x.replace("_", " ").title() + (" (log scale)" if xlog else ""),
  243. type="log" if xlog else None,
  244. ),
  245. yaxis=dict(
  246. title=y.replace("_", " ").title() + (" (log scale)" if ylog else ""),
  247. type="log" if ylog else None,
  248. ),
  249. font=dict(size=14),
  250. title=title if title_override is None else title_override,
  251. )
  252. # Add a rangeselector and rangeslider for a data xaxis
  253. if ranges:
  254. rangeselector = dict(
  255. buttons=list(
  256. [
  257. dict(count=1, label="1m", step="month", stepmode="backward"),
  258. dict(count=6, label="6m", step="month", stepmode="backward"),
  259. dict(count=1, label="1y", step="year", stepmode="backward"),
  260. dict(step="all"),
  261. ]
  262. )
  263. )
  264. rangeslider = dict(visible=True)
  265. layout["xaxis"]["rangeselector"] = rangeselector
  266. layout["xaxis"]["rangeslider"] = rangeslider
  267. layout["width"] = 1000
  268. layout["height"] = 600
  269. figure = go.Figure(data=data, layout=layout)
  270. return figure
  271. def make_linear_regression(df, x, y, intercept_0):
  272. """
  273. Create a linear regression, either with the intercept set to 0 or
  274. the intercept allowed to be fitted
  275. :param df: dataframe with data
  276. :param x: string or list of stringsfor the name of the column with x data
  277. :param y: string for the name of the column with y data
  278. :param intercept_0: boolean indicating whether to set the intercept to 0
  279. """
  280. if isinstance(x, list):
  281. lin_model = LinearRegression()
  282. lin_model.fit(df[x], df[y])
  283. slopes, intercept, = (
  284. lin_model.coef_,
  285. lin_model.intercept_,
  286. )
  287. df["predicted"] = lin_model.predict(df[x])
  288. r2 = lin_model.score(df[x], df[y])
  289. rmse = np.sqrt(mean_squared_error(y_true=df[y], y_pred=df["predicted"]))
  290. equation = f'{y.replace("_", " ")} ='
  291. names = ["r2", "rmse", "intercept"]
  292. values = [r2, rmse, intercept]
  293. for i, (p, s) in enumerate(zip(x, slopes)):
  294. if (i + 1) % 3 == 0:
  295. equation += f'<br>{s:.2f} * {p.replace("_", " ")} +'
  296. else:
  297. equation += f' {s:.2f} * {p.replace("_", " ")} +'
  298. names.append(p)
  299. values.append(s)
  300. equation += f" {intercept:.2f}"
  301. annotations = [
  302. dict(
  303. x=0.4 * df.index.max(),
  304. y=0.9 * df[y].max(),
  305. showarrow=False,
  306. text=equation,
  307. font=dict(size=10),
  308. )
  309. ]
  310. df["index"] = list(df.index)
  311. figure = make_scatter_plot(
  312. df, x="index", y=y, fits=["predicted"], annotations=annotations
  313. )
  314. summary = pd.DataFrame({"name": names, "value": values})
  315. else:
  316. if intercept_0:
  317. lin_reg = sm.OLS(df[y], df[x]).fit()
  318. df["fit_values"] = lin_reg.fittedvalues
  319. summary = lin_reg.summary()
  320. slope = float(lin_reg.params)
  321. equation = f"${y.replace('_', ' ')} = {slope:.2f} * {x.replace('_', ' ')}$"
  322. else:
  323. lin_reg = stats.linregress(df[x], df[y])
  324. intercept, slope = lin_reg.intercept, lin_reg.slope
  325. params = ["pvalue", "rvalue", "slope", "intercept"]
  326. values = []
  327. for p in params:
  328. values.append(getattr(lin_reg, p))
  329. summary = pd.DataFrame({"param": params, "value": values})
  330. df["fit_values"] = df[x] * slope + intercept
  331. equation = f"${y.replace('_', ' ')} = {slope:.2f} * {x.replace('_', ' ')} + {intercept:.2f}$"
  332. annotations = [
  333. dict(
  334. x=0.75 * df[x].max(),
  335. y=0.9 * df[y].max(),
  336. showarrow=False,
  337. text=equation,
  338. font=dict(size=32),
  339. )
  340. ]
  341. figure = make_scatter_plot(
  342. df, x=x, y=y, fits=["fit_values"], annotations=annotations
  343. )
  344. return figure, summary
  345. def make_poly_fits(df, x, y, degree=6):
  346. """
  347. Generate fits and make interactive plot with fits
  348. :param df: dataframe with data
  349. :param x: string representing x data column
  350. :param y: string representing y data column
  351. :param degree: integer degree of fits to go up to
  352. :return fit_stats: dataframe with information about fits
  353. :return figure: interactive plotly figure that can be shown with iplot or plot
  354. """
  355. # Don't want to alter original data frame
  356. df = df.copy()
  357. fit_list = []
  358. rmse = []
  359. fit_params = []
  360. # Make each fit
  361. for i in range(1, degree + 1):
  362. fit_name = f"fit degree = {i}"
  363. fit_list.append(fit_name)
  364. z, res, *rest = np.polyfit(df[x], df[y], i, full=True)
  365. fit_params.append(z)
  366. df.loc[:, fit_name] = np.poly1d(z)(df[x])
  367. rmse.append(np.sqrt(res[0]))
  368. fit_stats = pd.DataFrame({"fit": fit_list, "rmse": rmse, "params": fit_params})
  369. figure = make_scatter_plot(df, x=x, y=y, fits=fit_list)
  370. return figure, fit_stats
  371. def make_extrapolation(df, y, years, degree=4):
  372. """
  373. Extrapolate `y` into the future `years` with `degree` polynomial fit
  374. :param df: dataframe of data
  375. :param y: string of column to extrapolate
  376. :param years: number of years to extrapolate into the future
  377. :param degree: integer degree of polynomial fit
  378. :return figure: plotly figure for display using iplot or plot
  379. :return future_df: extrapolated numbers into the future
  380. """
  381. df = df.copy()
  382. x = "days_since_start"
  383. df["days_since_start"] = (
  384. (df["published_date"] - df["published_date"].min()).dt.total_seconds()
  385. / (3600 * 24)
  386. ).astype(int)
  387. cumy = f"cum_{y}"
  388. df[cumy] = df.sort_values(x)[y].cumsum()
  389. figure, summary = make_poly_fits(df, x, cumy, degree=degree)
  390. min_date = df["published_date"].min()
  391. max_date = df["published_date"].max()
  392. date_range = pd.date_range(
  393. start=min_date, end=max_date + pd.Timedelta(days=int(years * 365))
  394. )
  395. future_df = pd.DataFrame({"date": date_range})
  396. future_df[x] = (
  397. (future_df["date"] - future_df["date"].min()).dt.total_seconds() / (3600 * 24)
  398. ).astype(int)
  399. newcumy = f"cumulative_{y}"
  400. future_df = future_df.merge(df[[x, cumy]], on=x, how="left").rename(
  401. columns={cumy: newcumy}
  402. )
  403. z = np.poly1d(summary.iloc[-1]["params"])
  404. pred_name = f"predicted_{y}"
  405. future_df[pred_name] = z(future_df[x])
  406. future_df["title"] = ""
  407. last_date = future_df.loc[future_df["date"].idxmax()]
  408. prediction_text = f"On {last_date['date'].date()} the {y} will be {float(last_date[pred_name]):,.0f}."
  409. annotations = [
  410. dict(
  411. x=future_df["date"].quantile(0.4),
  412. y=0.8 * future_df[pred_name].max(),
  413. text=prediction_text,
  414. showarrow=False,
  415. font=dict(size=16),
  416. )
  417. ]
  418. title_override = f'{y.replace("_", " ").title()} with Extrapolation {years} Years into the Future'
  419. figure = make_scatter_plot(
  420. future_df,
  421. "date",
  422. newcumy,
  423. fits=[pred_name],
  424. annotations=annotations,
  425. ranges=True,
  426. title_override=title_override,
  427. )
  428. return figure, future_df