visuals.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458
  1. # Data science imports
  2. from multiprocessing import Pool
  3. import requests
  4. import re
  5. from bs4 import BeautifulSoup
  6. from itertools import chain
  7. from collections import Counter, defaultdict
  8. from timeit import default_timer as timer
  9. import pandas as pd
  10. import numpy as np
  11. from scipy import stats
  12. # Interactive plotting
  13. import plotly.plotly as py
  14. import plotly.graph_objs as go
  15. from plotly.offline import iplot
  16. import cufflinks
  17. cufflinks.go_offline()
  18. def make_update_menu(base_title, article_annotations=None, response_annotations=None):
  19. """
  20. Make an updatemenu for interative plot
  21. :param base_title: string for title of plot
  22. :return updatemenus: a updatemenus object for adding to a layout
  23. """
  24. updatemenus = list(
  25. [
  26. dict(
  27. buttons=list(
  28. [
  29. dict(
  30. label="both",
  31. method="update",
  32. args=[
  33. dict(visible=[True, True]),
  34. dict(
  35. title=base_title,
  36. annotations=[
  37. article_annotations,
  38. response_annotations,
  39. ],
  40. ),
  41. ],
  42. ),
  43. dict(
  44. label="articles",
  45. method="update",
  46. args=[
  47. dict(visible=[True, False]),
  48. dict(
  49. title="Article " + base_title,
  50. annotations=[article_annotations],
  51. ),
  52. ],
  53. ),
  54. dict(
  55. label="responses",
  56. method="update",
  57. args=[
  58. dict(visible=[False, True]),
  59. dict(
  60. title="Response " + base_title,
  61. annotations=[response_annotations],
  62. ),
  63. ],
  64. ),
  65. ]
  66. )
  67. )
  68. ]
  69. )
  70. return updatemenus
  71. def make_hist(df, x, category=None):
  72. """
  73. Make an interactive histogram, optionally segmented by `category`
  74. :param df: dataframe of data
  75. :param x: string of column to use for plotting
  76. :param category: string representing column to segment by
  77. :return figure: a plotly histogram to show with iplot or plot
  78. """
  79. if category is not None:
  80. data = []
  81. for name, group in df.groupby(category):
  82. data.append(go.Histogram(dict(x=group[x], name=name)))
  83. else:
  84. data = [go.Histogram(dict(x=df[x]))]
  85. layout = go.Layout(
  86. yaxis=dict(title="Count"),
  87. xaxis=dict(title=x.title()),
  88. title=f"{x.title()} Distribution by {category.title()}"
  89. if category
  90. else f"{x.title()} Distribution",
  91. )
  92. figure = go.Figure(data=data, layout=layout)
  93. return figure
  94. def make_cum_plot(df, y, category=None):
  95. """
  96. Make an interactive cumulative plot, optionally segmented by `category`
  97. :param df: dataframe of data, must have a `published_date` column
  98. :param y: string of column to use for plotting or list of two strings for double y axis
  99. :param category: string representing column to segment by
  100. :return figure: a plotly plot to show with iplot or plot
  101. """
  102. if category is not None:
  103. data = []
  104. for i, (name, group) in enumerate(df.groupby(category)):
  105. group.sort_values("published_date", inplace=True)
  106. data.append(
  107. go.Scatter(
  108. x=group["published_date"],
  109. y=group[y].cumsum(),
  110. mode="lines+markers",
  111. text=group["title"],
  112. name=name,
  113. marker=dict(size=8, symbol=i + 302),
  114. )
  115. )
  116. else:
  117. df.sort_values("published_date", inplace=True)
  118. if len(y) == 2:
  119. data = [
  120. go.Scatter(
  121. x=df["published_date"],
  122. y=df[y[0]].cumsum(),
  123. name=y[0].title(),
  124. mode="lines+markers",
  125. text=df["title"],
  126. marker=dict(size=8, color='blue'),
  127. ),
  128. go.Scatter(
  129. x=df["published_date"],
  130. y=df[y[1]].cumsum(),
  131. yaxis='y2',
  132. name=y[1].title(),
  133. mode="lines+markers",
  134. text=df["title"],
  135. marker=dict(size=8, color='red'),
  136. ),
  137. ]
  138. else:
  139. data = [
  140. go.Scatter(
  141. x=df["published_date"],
  142. y=df[y].cumsum(),
  143. mode="lines+markers",
  144. text=df["title"],
  145. marker=dict(size=10),
  146. )
  147. ]
  148. if len(y) == 2:
  149. layout = go.Layout(
  150. xaxis=dict(title="Published Date", type="date"),
  151. yaxis=dict(title=y[0].title(), color='blue'),
  152. yaxis2=dict(title=y[1].title(), color='red',
  153. overlaying='y', side='right'),
  154. font=dict(size=14),
  155. title=f"Cumulative {y[0].title()} and {y[1].title()}",
  156. )
  157. else:
  158. layout = go.Layout(
  159. xaxis=dict(title="Published Date", type="date"),
  160. yaxis=dict(title=y.title()),
  161. font=dict(size=14),
  162. title=f"Cumulative {y.title()} by {category.title()}"
  163. if category is not None
  164. else f"Cumulative {y.title()}",
  165. )
  166. figure = go.Figure(data=data, layout=layout)
  167. return figure
  168. def make_scatter_plot(df, x, y, fits=None, xlog=False, ylog=False, category=None, scale=None, sizeref=2):
  169. """
  170. Make an interactive scatterplot, optionally segmented by `category`
  171. :param df: dataframe of data
  172. :param x: string of column to use for xaxis
  173. :param y: string of column to use for yaxis
  174. :param fits: list of strings of fits
  175. :param xlog: boolean for making a log xaxis
  176. :param ylog boolean for making a log yaxis
  177. :param category: string representing categorical column to segment by, this must be a categorical
  178. :param scale: string representing numerical column to size and color markers by, this must be numerical data
  179. :param sizeref: float or integer for setting the size of markers according to the scale, only used if scale is set
  180. :return figure: a plotly plot to show with iplot or plot
  181. """
  182. if category is not None:
  183. title = f"{y.title()} vs {x.title()} by {category.title()}"
  184. data = []
  185. for i, (name, group) in enumerate(df.groupby(category)):
  186. data.append(go.Scatter(x=group[x],
  187. y=group[y],
  188. mode='markers',
  189. text=group['title'],
  190. name=name,
  191. marker=dict(size=8, symbol=i + 2)))
  192. else:
  193. if scale is not None:
  194. title = f"{y.title()} vs {x.title()} by {scale.title()}"
  195. data = [go.Scatter(x=df[x],
  196. y=df[y],
  197. mode='markers',
  198. text=df['title'], marker=dict(size=df[scale], sizemode='area', sizeref=sizeref,
  199. colorscale='Viridis', color=df[scale], showscale=True, sizemin=2))]
  200. else:
  201. df.sort_values(x, inplace=True)
  202. title = f"{y.title()} vs {x.title()}"
  203. data = [go.Scatter(x=df[x],
  204. y=df[y],
  205. mode='markers',
  206. text=df['title'], marker=dict(
  207. size=10, color='blue'),
  208. name='observations')]
  209. if fits is not None:
  210. for fit in fits:
  211. data.append(go.Scatter(x=df[x], y=df[fit],
  212. mode='lines+markers', marker=dict(size=8),
  213. line=dict(dash='dash'), name=fit))
  214. title += ' with Fit'
  215. layout = go.Layout(
  216. xaxis=dict(title=x.title() + (' (log scale)' if xlog else ''),
  217. type='log' if xlog else None),
  218. yaxis=dict(title=y.title() + (' (log scale)' if ylog else ''),
  219. type='log' if ylog else None),
  220. font=dict(size=14),
  221. title=title,
  222. )
  223. figure = go.Figure(data=data, layout=layout)
  224. return figure
  225. def make_fits(df, x, y, degree=6):
  226. """
  227. Generate fits and make interactive plot with fits
  228. :param df: dataframe with data
  229. :param x: string representing x data column
  230. :param y: string representing y data column
  231. :param degree: integer degree of fits to go up to
  232. :return fit_stats: dataframe with information about fits
  233. :return figure: interactive plotly figure that can be shown with iplot or plot
  234. """
  235. # Don't want to alter original data frame
  236. df = df.copy()
  237. fit_list = []
  238. rmse = []
  239. fit_params = []
  240. # Make each fit
  241. for i in range(1, degree + 1):
  242. fit_name = f'fit degree = {i}'
  243. fit_list.append(fit_name)
  244. z = np.polyfit(df[x], df[y], i)
  245. fit_params.append(z)
  246. df.loc[:, fit_name] = np.poly1d(z)(df[x])
  247. rmse.append(np.sqrt(np.mean(np.square(df[fit_name] - df[x]))))
  248. fit_stats = pd.DataFrame(
  249. {'fit': fit_list, 'rmse': rmse, 'params': fit_params})
  250. figure = make_scatter_plot(df, x=x, y=y, fits=fit_list)
  251. return fit_stats, figure
  252. def make_iplot(data, x, y, base_title, time=False, eq_pos=(0.75, 0.25)):
  253. """
  254. Make an interactive plot. Adds a dropdown to separate articles from responses
  255. if there are responses in the data. If there is only articles (or only responses)
  256. adds a linear regression line.
  257. :param data: dataframe of entry data
  258. :param x: string for xaxis of plot
  259. :param y: sring for yaxis of plot
  260. :param base_title: string for title of plot
  261. :param time: boolean for whether the xaxis is a plot
  262. :param eq_pos: position of equation for linear regression
  263. :return figure: an interactive plotly object for display
  264. """
  265. # Extract the relevant data
  266. responses = data[data["response"] == "response"].copy()
  267. articles = data[data["response"] == "article"].copy()
  268. if not responses.empty:
  269. # Create scatterplot data, articles must be first for menu selection
  270. plot_data = [
  271. go.Scatter(
  272. x=articles[x],
  273. y=articles[y],
  274. mode="markers",
  275. name="articles",
  276. text=articles["title"],
  277. marker=dict(color="blue", size=12),
  278. ),
  279. go.Scatter(
  280. x=responses[x],
  281. y=responses[y],
  282. mode="markers",
  283. name="responses",
  284. marker=dict(color="green", size=12),
  285. ),
  286. ]
  287. if not time:
  288. annotations = {}
  289. for df, name in zip([articles, responses], ["articles", "responses"]):
  290. regression = stats.linregress(x=df[x], y=df[y])
  291. slope = regression.slope
  292. intercept = regression.intercept
  293. rvalue = regression.rvalue
  294. xi = np.array(range(int(df[x].min()), int(df[x].max())))
  295. line = xi * slope + intercept
  296. trace = go.Scatter(
  297. x=xi,
  298. y=line,
  299. mode="lines",
  300. marker=dict(color="blue" if name ==
  301. "articles" else "green"),
  302. line=dict(width=4, dash="longdash"),
  303. name=f"{name} linear fit",
  304. )
  305. annotations[name] = dict(
  306. x=max(xi) * eq_pos[0],
  307. y=df[y].max() * eq_pos[1],
  308. showarrow=False,
  309. text=f"$R^2 = {rvalue:.2f}; Y = {slope:.2f}X + {intercept:.2f}$",
  310. font=dict(size=16, color="blue" if name ==
  311. "articles" else "green"),
  312. )
  313. plot_data.append(trace)
  314. # Make a layout with update menus
  315. layout = go.Layout(
  316. annotations=list(annotations.values()),
  317. height=600,
  318. width=900,
  319. title=base_title,
  320. xaxis=dict(
  321. title=x.title(), tickfont=dict(size=14), titlefont=dict(size=16)
  322. ),
  323. yaxis=dict(
  324. title=y.title(), tickfont=dict(size=14), titlefont=dict(size=16)
  325. ),
  326. updatemenus=make_update_menu(
  327. base_title, annotations["articles"], annotations["responses"]
  328. ),
  329. )
  330. # If there are only articles
  331. else:
  332. plot_data = [
  333. go.Scatter(
  334. x=data[x],
  335. y=data[y],
  336. mode="markers",
  337. name="observations",
  338. text=data["title"],
  339. marker=dict(color="blue", size=12),
  340. )
  341. ]
  342. regression = stats.linregress(x=data[x], y=data[y])
  343. slope = regression.slope
  344. intercept = regression.intercept
  345. rvalue = regression.rvalue
  346. xi = np.array(range(int(data[x].min()), int(data[x].max())))
  347. line = xi * slope + intercept
  348. trace = go.Scatter(
  349. x=xi,
  350. y=line,
  351. mode="lines",
  352. marker=dict(color="red"),
  353. line=dict(width=4, dash="longdash"),
  354. name="linear fit",
  355. )
  356. annotations = [
  357. dict(
  358. x=max(xi) * eq_pos[0],
  359. y=data[y].max() * eq_pos[1],
  360. showarrow=False,
  361. text=f"$R^2 = {rvalue:.2f}; Y = {slope:.2f}X + {intercept:.2f}$",
  362. font=dict(size=16),
  363. )
  364. ]
  365. plot_data.append(trace)
  366. layout = go.Layout(
  367. annotations=annotations,
  368. height=600,
  369. width=900,
  370. title=base_title,
  371. xaxis=dict(
  372. title=x.title(), tickfont=dict(size=14), titlefont=dict(size=16)
  373. ),
  374. yaxis=dict(
  375. title=y.title(), tickfont=dict(size=14), titlefont=dict(size=16)
  376. ),
  377. )
  378. # Add a rangeselector and rangeslider for a data xaxis
  379. if time:
  380. rangeselector = dict(
  381. buttons=list(
  382. [
  383. dict(count=1, label="1m", step="month", stepmode="backward"),
  384. dict(count=6, label="6m", step="month", stepmode="backward"),
  385. dict(count=1, label="YTD", step="year", stepmode="todate"),
  386. dict(count=1, label="1y", step="year", stepmode="backward"),
  387. dict(step="all"),
  388. ]
  389. )
  390. )
  391. rangeslider = dict(visible=True)
  392. layout["xaxis"]["rangeselector"] = rangeselector
  393. layout["xaxis"]["rangeslider"] = rangeslider
  394. figure = go.Figure(data=plot_data, layout=layout)
  395. return figure
  396. # Return the figure
  397. figure = go.Figure(data=plot_data, layout=layout)
  398. return figure