# visuals.py
#
# Interactive plotly figures (histograms, cumulative plots, scatter plots)
# together with polynomial fits, linear regressions, and extrapolations.
  1. # Data science imports
  2. import pandas as pd
  3. import numpy as np
  4. import statsmodels.api as sm
  5. from sklearn.linear_model import LinearRegression
  6. from sklearn.metrics import mean_squared_error
  7. from scipy import stats
  8. # Interactive plotting
  9. import plotly.graph_objs as go
  10. import cufflinks
  11. cufflinks.go_offline()
  12. def make_update_menu(base_title, article_annotations=None, response_annotations=None):
  13. """
  14. Make an updatemenu for interative plot
  15. :param base_title: string for title of plot
  16. :return updatemenus: a updatemenus object for adding to a layout
  17. """
  18. updatemenus = list(
  19. [
  20. dict(
  21. buttons=list(
  22. [
  23. dict(
  24. label="both",
  25. method="update",
  26. args=[
  27. dict(visible=[True, True]),
  28. dict(
  29. title=base_title,
  30. annotations=[
  31. article_annotations,
  32. response_annotations,
  33. ],
  34. ),
  35. ],
  36. ),
  37. dict(
  38. label="articles",
  39. method="update",
  40. args=[
  41. dict(visible=[True, False]),
  42. dict(
  43. title="Article " + base_title,
  44. annotations=[article_annotations],
  45. ),
  46. ],
  47. ),
  48. dict(
  49. label="responses",
  50. method="update",
  51. args=[
  52. dict(visible=[False, True]),
  53. dict(
  54. title="Response " + base_title,
  55. annotations=[response_annotations],
  56. ),
  57. ],
  58. ),
  59. ]
  60. )
  61. )
  62. ]
  63. )
  64. return updatemenus
  65. def make_hist(df, x, category=None):
  66. """
  67. Make an interactive histogram, optionally segmented by `category`
  68. :param df: dataframe of data
  69. :param x: string of column to use for plotting
  70. :param category: string representing column to segment by
  71. :return figure: a plotly histogram to show with iplot or plot
  72. """
  73. if category is not None:
  74. data = []
  75. for name, group in df.groupby(category):
  76. data.append(go.Histogram(dict(x=group[x], name=name)))
  77. else:
  78. data = [go.Histogram(dict(x=df[x]))]
  79. layout = go.Layout(
  80. yaxis=dict(title="Count"),
  81. xaxis=dict(title=x.replace('_', ' ').title()),
  82. title=f"{x.replace('_', ' ').title()} Distribution by {category.replace('_', ' ').title()}"
  83. if category
  84. else f"{x.replace('_', ' ').title()} Distribution",
  85. )
  86. figure = go.Figure(data=data, layout=layout)
  87. return figure
  88. def make_cum_plot(df, y, category=None, ranges=False):
  89. """
  90. Make an interactive cumulative plot, optionally segmented by `category`
  91. :param df: dataframe of data, must have a `published_date` column
  92. :param y: string of column to use for plotting or list of two strings for double y axis
  93. :param category: string representing column to segment by
  94. :param ranges: boolean for whether to add range slider and range selector
  95. :return figure: a plotly plot to show with iplot or plot
  96. """
  97. if category is not None:
  98. data = []
  99. for i, (name, group) in enumerate(df.groupby(category)):
  100. group.sort_values("published_date", inplace=True)
  101. data.append(
  102. go.Scatter(
  103. x=group["published_date"],
  104. y=group[y].cumsum(),
  105. mode="lines+markers",
  106. text=group["title"],
  107. name=name,
  108. marker=dict(size=10, opacity=0.8,
  109. symbol=i + 2),
  110. )
  111. )
  112. else:
  113. df.sort_values("published_date", inplace=True)
  114. if len(y) == 2:
  115. data = [
  116. go.Scatter(
  117. x=df["published_date"],
  118. y=df[y[0]].cumsum(),
  119. name=y[0].title(),
  120. mode="lines+markers",
  121. text=df["title"],
  122. marker=dict(size=10, color='blue', opacity=0.6, line=dict(color='black'),
  123. )),
  124. go.Scatter(
  125. x=df["published_date"],
  126. y=df[y[1]].cumsum(),
  127. yaxis='y2',
  128. name=y[1].title(),
  129. mode="lines+markers",
  130. text=df["title"],
  131. marker=dict(size=10, color='red', opacity=0.6, line=dict(color='black'),
  132. )),
  133. ]
  134. else:
  135. data = [
  136. go.Scatter(
  137. x=df["published_date"],
  138. y=df[y].cumsum(),
  139. mode="lines+markers",
  140. text=df["title"],
  141. marker=dict(size=12, color='blue', opacity=0.6, line=dict(color='black'),
  142. ),
  143. )
  144. ]
  145. if len(y) == 2:
  146. layout = go.Layout(
  147. xaxis=dict(title="Published Date", type="date"),
  148. yaxis=dict(title=y[0].title(), color='blue'),
  149. yaxis2=dict(title=y[1].title(), color='red',
  150. overlaying='y', side='right'),
  151. font=dict(size=14),
  152. title=f"Cumulative {y[0].title()} and {y[1].title()}",
  153. )
  154. else:
  155. layout = go.Layout(
  156. xaxis=dict(title="Published Date", type="date"),
  157. yaxis=dict(title=y.replace('_', ' ').title()),
  158. font=dict(size=14),
  159. title=f"Cumulative {y.replace('_', ' ').title()} by {category.replace('_', ' ').title()}"
  160. if category is not None
  161. else f"Cumulative {y.replace('_', ' ').title()}",
  162. )
  163. # Add a rangeselector and rangeslider for a data xaxis
  164. if ranges:
  165. rangeselector = dict(
  166. buttons=list(
  167. [
  168. dict(count=1, label="1m", step="month", stepmode="backward"),
  169. dict(count=6, label="6m", step="month", stepmode="backward"),
  170. dict(count=1, label="1y", step="year", stepmode="backward"),
  171. dict(step="all"),
  172. ]
  173. )
  174. )
  175. rangeslider = dict(visible=True)
  176. layout["xaxis"]["rangeselector"] = rangeselector
  177. layout["xaxis"]["rangeslider"] = rangeslider
  178. layout['width'] = 1000
  179. layout['height'] = 600
  180. figure = go.Figure(data=data, layout=layout)
  181. return figure
  182. def make_scatter_plot(df, x, y, fits=None, xlog=False, ylog=False, category=None, scale=None, sizeref=2, annotations=None, ranges=False, title_override=None):
  183. """
  184. Make an interactive scatterplot, optionally segmented by `category`
  185. :param df: dataframe of data
  186. :param x: string of column to use for xaxis
  187. :param y: string of column to use for yaxis
  188. :param fits: list of strings of fits
  189. :param xlog: boolean for making a log xaxis
  190. :param ylog boolean for making a log yaxis
  191. :param category: string representing categorical column to segment by, this must be a categorical
  192. :param scale: string representing numerical column to size and color markers by, this must be numerical data
  193. :param sizeref: float or integer for setting the size of markers according to the scale, only used if scale is set
  194. :param annotations: text to display on the plot (dictionary)
  195. :param ranges: boolean for whether to add a range slider and selector
  196. :param title_override: String to override the title
  197. :return figure: a plotly plot to show with iplot or plot
  198. """
  199. if category is not None:
  200. title = f"{y.replace('_', ' ').title()} vs {x.replace('_', ' ').title()} by {category.replace('_', ' ').title()}"
  201. data = []
  202. for i, (name, group) in enumerate(df.groupby(category)):
  203. data.append(go.Scatter(x=group[x],
  204. y=group[y],
  205. mode='markers',
  206. text=group['title'],
  207. name=name,
  208. marker=dict(size=8, symbol=i + 2)))
  209. else:
  210. if scale is not None:
  211. title = f"{y.replace('_', ' ').title()} vs {x.replace('_', ' ').title()} Scaled by {scale.title()}"
  212. data = [go.Scatter(x=df[x],
  213. y=df[y],
  214. mode='markers',
  215. text=df['title'], marker=dict(size=df[scale],
  216. line=dict(color='black', width=0.5), sizemode='area', sizeref=sizeref, opacity=0.8,
  217. colorscale='Viridis', color=df[scale], showscale=True, sizemin=2))]
  218. else:
  219. df.sort_values(x, inplace=True)
  220. title = f"{y.replace('_', ' ').title()} vs {x.replace('_', ' ').title()}"
  221. data = [go.Scatter(x=df[x],
  222. y=df[y],
  223. mode='markers',
  224. text=df['title'], marker=dict(
  225. size=12, color='blue', opacity=0.8, line=dict(color='black')),
  226. name='observations')]
  227. if fits is not None:
  228. for fit in fits:
  229. data.append(go.Scatter(x=df[x], y=df[fit], text=df['title'],
  230. mode='lines+markers', marker=dict
  231. (size=8, opacity=0.6),
  232. line=dict(dash='dash'), name=fit))
  233. title += ' with Fit'
  234. layout = go.Layout(annotations=annotations,
  235. xaxis=dict(title=x.replace('_', ' ').title() + (' (log scale)' if xlog else ''),
  236. type='log' if xlog else None),
  237. yaxis=dict(title=y.replace('_', ' ').title() + (' (log scale)' if ylog else ''),
  238. type='log' if ylog else None),
  239. font=dict(size=14),
  240. title=title if title_override is None else title_override,
  241. )
  242. # Add a rangeselector and rangeslider for a data xaxis
  243. if ranges:
  244. rangeselector = dict(
  245. buttons=list(
  246. [
  247. dict(count=1, label="1m", step="month", stepmode="backward"),
  248. dict(count=6, label="6m", step="month", stepmode="backward"),
  249. dict(count=1, label="1y", step="year", stepmode="backward"),
  250. dict(step="all"),
  251. ]
  252. )
  253. )
  254. rangeslider = dict(visible=True)
  255. layout["xaxis"]["rangeselector"] = rangeselector
  256. layout["xaxis"]["rangeslider"] = rangeslider
  257. layout['width'] = 1000
  258. layout['height'] = 600
  259. figure = go.Figure(data=data, layout=layout)
  260. return figure
  261. def make_poly_fits(df, x, y, degree=6):
  262. """
  263. Generate fits and make interactive plot with fits
  264. :param df: dataframe with data
  265. :param x: string representing x data column
  266. :param y: string representing y data column
  267. :param degree: integer degree of fits to go up to
  268. :return fit_stats: dataframe with information about fits
  269. :return figure: interactive plotly figure that can be shown with iplot or plot
  270. """
  271. # Don't want to alter original data frame
  272. df = df.copy()
  273. fit_list = []
  274. rmse = []
  275. fit_params = []
  276. # Make each fit
  277. for i in range(1, degree + 1):
  278. fit_name = f'fit degree = {i}'
  279. fit_list.append(fit_name)
  280. z, res, *rest = np.polyfit(df[x], df[y], i, full=True)
  281. fit_params.append(z)
  282. df.loc[:, fit_name] = np.poly1d(z)(df[x])
  283. rmse.append(np.sqrt(res[0]))
  284. fit_stats = pd.DataFrame(
  285. {'fit': fit_list, 'rmse': rmse, 'params': fit_params})
  286. figure = make_scatter_plot(df, x=x, y=y, fits=fit_list)
  287. return figure, fit_stats
  288. def make_linear_regression(df, x, y, intercept_0):
  289. """
  290. Create a linear regression, either with the intercept set to 0 or
  291. the intercept allowed to be fitted
  292. :param df: dataframe with data
  293. :param x: string or list of stringsfor the name of the column with x data
  294. :param y: string for the name of the column with y data
  295. :param intercept_0: boolean indicating whether to set the intercept to 0
  296. """
  297. if isinstance(x, list):
  298. lin_model = LinearRegression()
  299. lin_model.fit(df[x], df[y])
  300. slopes, intercept, = lin_model.coef_, lin_model.intercept_
  301. df['predicted'] = lin_model.predict(df[x])
  302. r2 = lin_model.score(df[x], df[y])
  303. rmse = np.sqrt(mean_squared_error(
  304. y_true=df[y], y_pred=df['predicted']))
  305. equation = f'{y.replace("_", " ")} ='
  306. names = ['r2', 'rmse', 'intercept']
  307. values = [r2, rmse, intercept]
  308. for i, (p, s) in enumerate(zip(x, slopes)):
  309. if (i + 1) % 3 == 0:
  310. equation += f'<br>{s:.2f} * {p.replace("_", " ")} +'
  311. else:
  312. equation += f' {s:.2f} * {p.replace("_", " ")} +'
  313. names.append(p)
  314. values.append(s)
  315. equation += f' {intercept:.2f}'
  316. annotations = [dict(x=0.4 * df.index.max(), y=0.9 * df[y].max(), showarrow=False,
  317. text=equation,
  318. font=dict(size=10))]
  319. df['index'] = list(df.index)
  320. figure = make_scatter_plot(df, x='index', y=y, fits=[
  321. 'predicted'], annotations=annotations)
  322. summary = pd.DataFrame({'name': names, 'value': values})
  323. else:
  324. if intercept_0:
  325. lin_reg = sm.OLS(df[y], df[x]).fit()
  326. df['fit_values'] = lin_reg.fittedvalues
  327. summary = lin_reg.summary()
  328. slope = float(lin_reg.params)
  329. equation = f"${y} = {slope:.2f} * {x.replace('_', ' ')}$"
  330. else:
  331. lin_reg = stats.linregress(df[x], df[y])
  332. intercept, slope = lin_reg.intercept, lin_reg.slope
  333. params = ['pvalue', 'rvalue', 'slope', 'intercept']
  334. values = []
  335. for p in params:
  336. values.append(getattr(lin_reg, p))
  337. summary = pd.DataFrame({'param': params, 'value': values})
  338. df['fit_values'] = df[x] * slope + intercept
  339. equation = f"${y} = {slope:.2f} * {x.replace('_', ' ')} + {intercept:.2f}$"
  340. annotations = [dict(x=0.75 * df[x].max(), y=0.9 * df[y].max(), showarrow=False,
  341. text=equation,
  342. font=dict(size=32))]
  343. figure = make_scatter_plot(
  344. df, x=x, y=y, fits=['fit_values'], annotations=annotations)
  345. return figure, summary
  346. def make_extrapolation(df, y, years, degree=4):
  347. """
  348. Extrapolate `y` into the future `years` with `degree` polynomial fit
  349. :param df: dataframe of data
  350. :param y: string of column to extrapolate
  351. :param years: number of years to extrapolate into the future
  352. :param degree: integer degree of polynomial fit
  353. :return figure: plotly figure for display using iplot or plot
  354. :return future_df: extrapolated numbers into the future
  355. """
  356. df = df.copy()
  357. x = 'days_since_start'
  358. df['days_since_start'] = (
  359. (df['published_date'] - df['published_date'].min()).
  360. dt.total_seconds() / (3600 * 24)).astype(int)
  361. cumy = f'cum_{y}'
  362. df[cumy] = df.sort_values(x)[y].cumsum()
  363. figure, summary = make_poly_fits(df, x, cumy, degree=degree)
  364. min_date = df['published_date'].min()
  365. max_date = df['published_date'].max()
  366. date_range = pd.date_range(start=min_date,
  367. end=max_date + pd.Timedelta(days=int(years * 365)))
  368. future_df = pd.DataFrame({'date': date_range})
  369. future_df[x] = (
  370. (future_df['date'] - future_df['date'].min()).
  371. dt.total_seconds() / (3600 * 24)).astype(int)
  372. newcumy = f'cumulative_{y}'
  373. future_df = future_df.merge(df[[x, cumy]], on=x, how='left').\
  374. rename(columns={cumy: newcumy})
  375. z = np.poly1d(summary.iloc[-1]['params'])
  376. pred_name = f'predicted_{y}'
  377. future_df[pred_name] = z(future_df[x])
  378. future_df['title'] = ''
  379. last_date = future_df.loc[future_df['date'].idxmax()]
  380. prediction_text = (
  381. f"On {last_date['date'].date()} the {y} will be {float(last_date[pred_name]):,.0f}.")
  382. annotations = [dict(x=future_df['date'].quantile(0.4),
  383. y=0.8 * future_df[pred_name].max(), text=prediction_text, showarrow=False,
  384. font=dict(size=16))]
  385. title_override = f'{y.replace("_", " ").title()} with Extrapolation {years} Years into the Future'
  386. figure = make_scatter_plot(future_df, 'date', newcumy, fits=[
  387. pred_name], annotations=annotations, ranges=True, title_override=title_override)
  388. return figure, future_df