visuals.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500
  1. # Data science imports
  2. from multiprocessing import Pool
  3. import requests
  4. import re
  5. from bs4 import BeautifulSoup
  6. from itertools import chain
  7. from collections import Counter, defaultdict
  8. from timeit import default_timer as timer
  9. import pandas as pd
  10. import numpy as np
  11. import statsmodels.api as sm
  12. from scipy import stats
  13. # Interactive plotting
  14. import plotly.plotly as py
  15. import plotly.graph_objs as go
  16. from plotly.offline import iplot
  17. import cufflinks
  18. cufflinks.go_offline()
  19. def make_update_menu(base_title, article_annotations=None, response_annotations=None):
  20. """
  21. Make an updatemenu for interative plot
  22. :param base_title: string for title of plot
  23. :return updatemenus: a updatemenus object for adding to a layout
  24. """
  25. updatemenus = list(
  26. [
  27. dict(
  28. buttons=list(
  29. [
  30. dict(
  31. label="both",
  32. method="update",
  33. args=[
  34. dict(visible=[True, True]),
  35. dict(
  36. title=base_title,
  37. annotations=[
  38. article_annotations,
  39. response_annotations,
  40. ],
  41. ),
  42. ],
  43. ),
  44. dict(
  45. label="articles",
  46. method="update",
  47. args=[
  48. dict(visible=[True, False]),
  49. dict(
  50. title="Article " + base_title,
  51. annotations=[article_annotations],
  52. ),
  53. ],
  54. ),
  55. dict(
  56. label="responses",
  57. method="update",
  58. args=[
  59. dict(visible=[False, True]),
  60. dict(
  61. title="Response " + base_title,
  62. annotations=[response_annotations],
  63. ),
  64. ],
  65. ),
  66. ]
  67. )
  68. )
  69. ]
  70. )
  71. return updatemenus
  72. def make_hist(df, x, category=None):
  73. """
  74. Make an interactive histogram, optionally segmented by `category`
  75. :param df: dataframe of data
  76. :param x: string of column to use for plotting
  77. :param category: string representing column to segment by
  78. :return figure: a plotly histogram to show with iplot or plot
  79. """
  80. if category is not None:
  81. data = []
  82. for name, group in df.groupby(category):
  83. data.append(go.Histogram(dict(x=group[x], name=name)))
  84. else:
  85. data = [go.Histogram(dict(x=df[x]))]
  86. layout = go.Layout(
  87. yaxis=dict(title="Count"),
  88. xaxis=dict(title=x.replace('_', ' ').title()),
  89. title=f"{x.replace('_', ' ').title()} Distribution by {category.replace('_', ' ').title()}"
  90. if category
  91. else f"{x.replace('_', ' ').title()} Distribution",
  92. )
  93. figure = go.Figure(data=data, layout=layout)
  94. return figure
  95. def make_cum_plot(df, y, category=None):
  96. """
  97. Make an interactive cumulative plot, optionally segmented by `category`
  98. :param df: dataframe of data, must have a `published_date` column
  99. :param y: string of column to use for plotting or list of two strings for double y axis
  100. :param category: string representing column to segment by
  101. :return figure: a plotly plot to show with iplot or plot
  102. """
  103. if category is not None:
  104. data = []
  105. for i, (name, group) in enumerate(df.groupby(category)):
  106. group.sort_values("published_date", inplace=True)
  107. data.append(
  108. go.Scatter(
  109. x=group["published_date"],
  110. y=group[y].cumsum(),
  111. mode="lines+markers",
  112. text=group["title"],
  113. name=name,
  114. marker=dict(size=10, opacity=0.8,
  115. symbol=i + 2),
  116. )
  117. )
  118. else:
  119. df.sort_values("published_date", inplace=True)
  120. if len(y) == 2:
  121. data = [
  122. go.Scatter(
  123. x=df["published_date"],
  124. y=df[y[0]].cumsum(),
  125. name=y[0].title(),
  126. mode="lines+markers",
  127. text=df["title"],
  128. marker=dict(size=10, color='blue', opacity=0.6, line=dict(color='black'),
  129. )),
  130. go.Scatter(
  131. x=df["published_date"],
  132. y=df[y[1]].cumsum(),
  133. yaxis='y2',
  134. name=y[1].title(),
  135. mode="lines+markers",
  136. text=df["title"],
  137. marker=dict(size=10, color='red', opacity=0.6, line=dict(color='black'),
  138. )),
  139. ]
  140. else:
  141. data = [
  142. go.Scatter(
  143. x=df["published_date"],
  144. y=df[y].cumsum(),
  145. mode="lines+markers",
  146. text=df["title"],
  147. marker=dict(size=12, color='blue', opacity=0.6, line=dict(color='black'),
  148. ),
  149. )
  150. ]
  151. if len(y) == 2:
  152. layout = go.Layout(
  153. xaxis=dict(title="Published Date", type="date"),
  154. yaxis=dict(title=y[0].title(), color='blue'),
  155. yaxis2=dict(title=y[1].title(), color='red',
  156. overlaying='y', side='right'),
  157. font=dict(size=14),
  158. title=f"Cumulative {y[0].title()} and {y[1].title()}",
  159. )
  160. else:
  161. layout = go.Layout(
  162. xaxis=dict(title="Published Date", type="date"),
  163. yaxis=dict(title=y.replace('_', ' ').title()),
  164. font=dict(size=14),
  165. title=f"Cumulative {y.replace('_', ' ').title()} by {category.replace('_', ' ').title()}"
  166. if category is not None
  167. else f"Cumulative {y.replace('_', ' ').title()}",
  168. )
  169. figure = go.Figure(data=data, layout=layout)
  170. return figure
  171. def make_scatter_plot(df, x, y, fits=None, xlog=False, ylog=False, category=None, scale=None, sizeref=2, annotations=None):
  172. """
  173. Make an interactive scatterplot, optionally segmented by `category`
  174. :param df: dataframe of data
  175. :param x: string of column to use for xaxis
  176. :param y: string of column to use for yaxis
  177. :param fits: list of strings of fits
  178. :param xlog: boolean for making a log xaxis
  179. :param ylog boolean for making a log yaxis
  180. :param category: string representing categorical column to segment by, this must be a categorical
  181. :param scale: string representing numerical column to size and color markers by, this must be numerical data
  182. :param sizeref: float or integer for setting the size of markers according to the scale, only used if scale is set
  183. :param annotations: text to display on the plot (dictionary)
  184. :return figure: a plotly plot to show with iplot or plot
  185. """
  186. if category is not None:
  187. title = f"{y.replace('_', ' ').title()} vs {x.replace('_', ' ').title()} by {category.replace('_', ' ').title()}"
  188. data = []
  189. for i, (name, group) in enumerate(df.groupby(category)):
  190. data.append(go.Scatter(x=group[x],
  191. y=group[y],
  192. mode='markers',
  193. text=group['title'],
  194. name=name,
  195. marker=dict(size=8, symbol=i + 2)))
  196. else:
  197. if scale is not None:
  198. title = f"{y.replace('_', ' ').title()} vs {x.replace('_', ' ').title()} Scaled by {scale.title()}"
  199. data = [go.Scatter(x=df[x],
  200. y=df[y],
  201. mode='markers',
  202. text=df['title'], marker=dict(size=df[scale],
  203. line=dict(color='black', width=0.5), sizemode='area', sizeref=sizeref, opacity=0.8,
  204. colorscale='Viridis', color=df[scale], showscale=True, sizemin=2))]
  205. else:
  206. df.sort_values(x, inplace=True)
  207. title = f"{y.replace('_', ' ').title()} vs {x.replace('_', ' ').title()}"
  208. data = [go.Scatter(x=df[x],
  209. y=df[y],
  210. mode='markers',
  211. text=df['title'], marker=dict(
  212. size=12, color='blue', opacity=0.8, line=dict(color='black')),
  213. name='observations')]
  214. if fits is not None:
  215. for fit in fits:
  216. data.append(go.Scatter(x=df[x], y=df[fit],
  217. mode='lines+markers', marker=dict(size=8, opacity=0.6),
  218. line=dict(dash='dash'), name=fit))
  219. title += ' with Fit'
  220. layout = go.Layout(annotations=annotations,
  221. xaxis=dict(title=x.replace('_', ' ').title() + (' (log scale)' if xlog else ''),
  222. type='log' if xlog else None),
  223. yaxis=dict(title=y.replace('_', ' ').title() + (' (log scale)' if ylog else ''),
  224. type='log' if ylog else None),
  225. font=dict(size=14),
  226. title=title,
  227. )
  228. figure = go.Figure(data=data, layout=layout)
  229. return figure
  230. def make_poly_fits(df, x, y, degree=6):
  231. """
  232. Generate fits and make interactive plot with fits
  233. :param df: dataframe with data
  234. :param x: string representing x data column
  235. :param y: string representing y data column
  236. :param degree: integer degree of fits to go up to
  237. :return fit_stats: dataframe with information about fits
  238. :return figure: interactive plotly figure that can be shown with iplot or plot
  239. """
  240. # Don't want to alter original data frame
  241. df = df.copy()
  242. fit_list = []
  243. rmse = []
  244. fit_params = []
  245. # Make each fit
  246. for i in range(1, degree + 1):
  247. fit_name = f'fit degree = {i}'
  248. fit_list.append(fit_name)
  249. z, res, *rest = np.polyfit(df[x], df[y], i, full=True)
  250. fit_params.append(z)
  251. df.loc[:, fit_name] = np.poly1d(z)(df[x])
  252. rmse.append(np.sqrt(res[0]))
  253. fit_stats = pd.DataFrame(
  254. {'fit': fit_list, 'rmse': rmse, 'params': fit_params})
  255. figure = make_scatter_plot(df, x=x, y=y, fits=fit_list)
  256. return fit_stats, figure
  257. def make_linear_regression(df, x, y, intercept_0):
  258. """
  259. Create a linear regression, either with the intercept set to 0 or
  260. the intercept allowed to be fitted
  261. :param df: dataframe with data
  262. :param x: string for the name of the column with x data
  263. :param y: string for the name of the column with y data
  264. :param intercept_0: boolean indicating whether to set the intercept to 0
  265. """
  266. if intercept_0:
  267. lin_reg = sm.OLS(df[y], df[x]).fit()
  268. df['fit_values'] = lin_reg.fittedvalues
  269. summary = lin_reg.summary()
  270. slope = float(lin_reg.params)
  271. equation = f"${y} = {slope:.2f} * {x.replace('_', ' ')}$"
  272. else:
  273. lin_reg = stats.linregress(df[x], df[y])
  274. intercept, slope = lin_reg.intercept, lin_reg.slope
  275. params = ['pvalue', 'rvalue', 'slope', 'intercept']
  276. values = []
  277. for p in params:
  278. values.append(getattr(lin_reg, p))
  279. summary = pd.DataFrame({'param': params, 'value': values})
  280. df['fit_values'] = df[x] * slope + intercept
  281. equation = f"${y} = {slope:.2f} * {x.replace('_', ' ')} + {intercept:.2f}$"
  282. annotations = [dict(x=0.75 * df[x].max(), y=0.9 * df[y].max(), showarrow=False,
  283. text=equation,
  284. font=dict(size=32))]
  285. figure = make_scatter_plot(
  286. df, x=x, y=y, fits=['fit_values'], annotations=annotations)
  287. return figure, summary
  288. def make_iplot(data, x, y, base_title, time=False, eq_pos=(0.75, 0.25)):
  289. """
  290. Make an interactive plot. Adds a dropdown to separate articles from responses
  291. if there are responses in the data. If there is only articles (or only responses)
  292. adds a linear regression line.
  293. :param data: dataframe of entry data
  294. :param x: string for xaxis of plot
  295. :param y: sring for yaxis of plot
  296. :param base_title: string for title of plot
  297. :param time: boolean for whether the xaxis is a plot
  298. :param eq_pos: position of equation for linear regression
  299. :return figure: an interactive plotly object for display
  300. """
  301. # Extract the relevant data
  302. responses = data[data["response"] == "response"].copy()
  303. articles = data[data["response"] == "article"].copy()
  304. if not responses.empty:
  305. # Create scatterplot data, articles must be first for menu selection
  306. plot_data = [
  307. go.Scatter(
  308. x=articles[x],
  309. y=articles[y],
  310. mode="markers",
  311. name="articles",
  312. text=articles["title"],
  313. marker=dict(color="blue", size=12),
  314. ),
  315. go.Scatter(
  316. x=responses[x],
  317. y=responses[y],
  318. mode="markers",
  319. name="responses",
  320. marker=dict(color="green", size=12),
  321. ),
  322. ]
  323. if not time:
  324. annotations = {}
  325. for df, name in zip([articles, responses], ["articles", "responses"]):
  326. regression = stats.linregress(x=df[x], y=df[y])
  327. slope = regression.slope
  328. intercept = regression.intercept
  329. rvalue = regression.rvalue
  330. xi = np.array(range(int(df[x].min()), int(df[x].max())))
  331. line = xi * slope + intercept
  332. trace = go.Scatter(
  333. x=xi,
  334. y=line,
  335. mode="lines",
  336. marker=dict(color="blue" if name ==
  337. "articles" else "green"),
  338. line=dict(width=4, dash="longdash"),
  339. name=f"{name} linear fit",
  340. )
  341. annotations[name] = dict(
  342. x=max(xi) * eq_pos[0],
  343. y=df[y].max() * eq_pos[1],
  344. showarrow=False,
  345. text=f"$R^2 = {rvalue:.2f}; Y = {slope:.2f}X + {intercept:.2f}$",
  346. font=dict(size=16, color="blue" if name ==
  347. "articles" else "green"),
  348. )
  349. plot_data.append(trace)
  350. # Make a layout with update menus
  351. layout = go.Layout(
  352. annotations=list(annotations.values()),
  353. height=600,
  354. width=900,
  355. title=base_title,
  356. xaxis=dict(
  357. title=x.replace('_', ' ').title(), tickfont=dict(size=14), titlefont=dict(size=16)
  358. ),
  359. yaxis=dict(
  360. title=y.replace('_', ' ').title(), tickfont=dict(size=14), titlefont=dict(size=16)
  361. ),
  362. updatemenus=make_update_menu(
  363. base_title, annotations["articles"], annotations["responses"]
  364. ),
  365. )
  366. # If there are only articles
  367. else:
  368. plot_data = [
  369. go.Scatter(
  370. x=data[x],
  371. y=data[y],
  372. mode="markers",
  373. name="observations",
  374. text=data["title"],
  375. marker=dict(color="blue", size=12),
  376. )
  377. ]
  378. regression = stats.linregress(x=data[x], y=data[y])
  379. slope = regression.slope
  380. intercept = regression.intercept
  381. rvalue = regression.rvalue
  382. xi = np.array(range(int(data[x].min()), int(data[x].max())))
  383. line = xi * slope + intercept
  384. trace = go.Scatter(
  385. x=xi,
  386. y=line,
  387. mode="lines",
  388. marker=dict(color="red"),
  389. line=dict(width=4, dash="longdash"),
  390. name="linear fit",
  391. )
  392. annotations = [
  393. dict(
  394. x=max(xi) * eq_pos[0],
  395. y=data[y].max() * eq_pos[1],
  396. showarrow=False,
  397. text=f"$R^2 = {rvalue:.2f}; Y = {slope:.2f}X + {intercept:.2f}$",
  398. font=dict(size=16),
  399. )
  400. ]
  401. plot_data.append(trace)
  402. layout = go.Layout(
  403. annotations=annotations,
  404. height=600,
  405. width=900,
  406. title=base_title,
  407. xaxis=dict(
  408. title=x.replace('_', ' ').title(), tickfont=dict(size=14), titlefont=dict(size=16)
  409. ),
  410. yaxis=dict(
  411. title=y.replace('_', ' ').title(), tickfont=dict(size=14), titlefont=dict(size=16)
  412. ),
  413. )
  414. # Add a rangeselector and rangeslider for a data xaxis
  415. if time:
  416. rangeselector = dict(
  417. buttons=list(
  418. [
  419. dict(count=1, label="1m", step="month", stepmode="backward"),
  420. dict(count=6, label="6m", step="month", stepmode="backward"),
  421. dict(count=1, label="YTD", step="year", stepmode="todate"),
  422. dict(count=1, label="1y", step="year", stepmode="backward"),
  423. dict(step="all"),
  424. ]
  425. )
  426. )
  427. rangeslider = dict(visible=True)
  428. layout["xaxis"]["rangeselector"] = rangeselector
  429. layout["xaxis"]["rangeslider"] = rangeslider
  430. figure = go.Figure(data=plot_data, layout=layout)
  431. return figure
  432. # Return the figure
  433. figure = go.Figure(data=plot_data, layout=layout)
  434. return figure