# stocker.py
  1. # Quandl for financial analysis, pandas and numpy for data manipulation
  2. # fbprophet for additive models, #pytrends for Google trend data
  3. import quandl
  4. import pandas as pd
  5. import numpy as np
  6. import fbprophet
  7. import pytrends
  8. from pytrends.request import TrendReq
  9. # matplotlib pyplot for plotting
  10. import matplotlib.pyplot as plt
  11. import matplotlib
  12. # Class for analyzing and (attempting) to predict future prices
  13. # Contains a number of visualizations and analysis methods
  14. class Stocker():
  15. # Initialization requires a ticker symbol
  16. def __init__(self, ticker, exchange='WIKI'):
  17. # Enforce capitalization
  18. ticker = ticker.upper()
  19. # Symbol is used for labeling plots
  20. self.symbol = ticker
  21. # Use Personal Api Key
  22. # quandl.ApiConfig.api_key = 'YourKeyHere'
  23. # Retrieval the financial data
  24. try:
  25. stock = quandl.get('%s/%s' % (exchange, ticker))
  26. except Exception as e:
  27. print('Error Retrieving Data.')
  28. print(e)
  29. return
  30. # Set the index to a column called Date
  31. stock = stock.reset_index(level=0)
  32. # Columns required for prophet
  33. stock['ds'] = stock['Date']
  34. if ('Adj. Close' not in stock.columns):
  35. stock['Adj. Close'] = stock['Close']
  36. stock['Adj. Open'] = stock['Open']
  37. stock['y'] = stock['Adj. Close']
  38. stock['Daily Change'] = stock['Adj. Close'] - stock['Adj. Open']
  39. # Data assigned as class attribute
  40. self.stock = stock.copy()
  41. # Minimum and maximum date in range
  42. self.min_date = min(stock['Date'])
  43. self.max_date = max(stock['Date'])
  44. # Find max and min prices and dates on which they occurred
  45. self.max_price = np.max(self.stock['y'])
  46. self.min_price = np.min(self.stock['y'])
  47. self.min_price_date = self.stock[self.stock['y'] == self.min_price]['Date']
  48. self.min_price_date = self.min_price_date[self.min_price_date.index[0]]
  49. self.max_price_date = self.stock[self.stock['y'] == self.max_price]['Date']
  50. self.max_price_date = self.max_price_date[self.max_price_date.index[0]]
  51. # The starting price (starting with the opening price)
  52. self.starting_price = float(self.stock.ix[0, 'Adj. Open'])
  53. # The most recent price
  54. self.most_recent_price = float(self.stock.ix[len(self.stock) - 1, 'y'])
  55. # Whether or not to round dates
  56. self.round_dates = True
  57. # Number of years of data to train on
  58. self.training_years = 3
  59. # Prophet parameters
  60. # Default prior from library
  61. self.changepoint_prior_scale = 0.05
  62. self.weekly_seasonality = False
  63. self.daily_seasonality = False
  64. self.monthly_seasonality = True
  65. self.yearly_seasonality = True
  66. self.changepoints = None
  67. print('{} Stocker Initialized. Data covers {} to {}.'.format(self.symbol,
  68. self.min_date.date(),
  69. self.max_date.date()))
  70. """
  71. Make sure start and end dates are in the range and can be
  72. converted to pandas datetimes. Returns dates in the correct format
  73. """
  74. def handle_dates(self, start_date, end_date):
  75. # Default start and end date are the beginning and end of data
  76. if start_date is None:
  77. start_date = self.min_date
  78. if end_date is None:
  79. end_date = self.max_date
  80. try:
  81. # Convert to pandas datetime for indexing dataframe
  82. start_date = pd.to_datetime(start_date)
  83. end_date = pd.to_datetime(end_date)
  84. except Exception as e:
  85. print('Enter valid pandas date format.')
  86. print(e)
  87. return
  88. valid_start = False
  89. valid_end = False
  90. # User will continue to enter dates until valid dates are met
  91. while (not valid_start) & (not valid_end):
  92. valid_end = True
  93. valid_start = True
  94. if end_date.date() < start_date.date():
  95. print('End Date must be later than start date.')
  96. start_date = pd.to_datetime(input('Enter a new start date: '))
  97. end_date= pd.to_datetime(input('Enter a new end date: '))
  98. valid_end = False
  99. valid_start = False
  100. else:
  101. if end_date.date() > self.max_date.date():
  102. print('End Date exceeds data range')
  103. end_date= pd.to_datetime(input('Enter a new end date: '))
  104. valid_end = False
  105. if start_date.date() < self.min_date.date():
  106. print('Start Date is before date range')
  107. start_date = pd.to_datetime(input('Enter a new start date: '))
  108. valid_start = False
  109. return start_date, end_date
  110. """
  111. Return the dataframe trimmed to the specified range.
  112. """
  113. def make_df(self, start_date, end_date, df=None):
  114. # Default is to use the object stock data
  115. if not df:
  116. df = self.stock.copy()
  117. start_date, end_date = self.handle_dates(start_date, end_date)
  118. # keep track of whether the start and end dates are in the data
  119. start_in = True
  120. end_in = True
  121. # If user wants to round dates (default behavior)
  122. if self.round_dates:
  123. # Record if start and end date are in df
  124. if (start_date not in list(df['Date'])):
  125. start_in = False
  126. if (end_date not in list(df['Date'])):
  127. end_in = False
  128. # If both are not in dataframe, round both
  129. if (not end_in) & (not start_in):
  130. trim_df = df[(df['Date'] >= start_date.date()) &
  131. (df['Date'] <= end_date.date())]
  132. else:
  133. # If both are in dataframe, round neither
  134. if (end_in) & (start_in):
  135. trim_df = df[(df['Date'] >= start_date.date()) &
  136. (df['Date'] <= end_date.date())]
  137. else:
  138. # If only start is missing, round start
  139. if (not start_in):
  140. trim_df = df[(df['Date'] > start_date.date()) &
  141. (df['Date'] <= end_date.date())]
  142. # If only end is imssing round end
  143. elif (not end_in):
  144. trim_df = df[(df['Date'] >= start_date.date()) &
  145. (df['Date'] < end_date.date())]
  146. else:
  147. valid_start = False
  148. valid_end = False
  149. while (not valid_start) & (not valid_end):
  150. start_date, end_date = self.handle_dates(start_date, end_date)
  151. # No round dates, if either data not in, print message and return
  152. if (start_date in list(df['Date'])):
  153. valid_start = True
  154. if (end_date in list(df['Date'])):
  155. valid_end = True
  156. # Check to make sure dates are in the data
  157. if (start_date not in list(df['Date'])):
  158. print('Start Date not in data (either out of range or not a trading day.)')
  159. start_date = pd.to_datetime(input(prompt='Enter a new start date: '))
  160. elif (end_date not in list(df['Date'])):
  161. print('End Date not in data (either out of range or not a trading day.)')
  162. end_date = pd.to_datetime(input(prompt='Enter a new end date: ') )
  163. # Dates are not rounded
  164. trim_df = df[(df['Date'] >= start_date.date()) &
  165. (df['Date'] <= end_date.date())]
  166. return trim_df
  167. # Basic Historical Plots and Basic Statistics
  168. def plot_stock(self, start_date=None, end_date=None, stats=['Adj. Close'], plot_type='basic'):
  169. self.reset_plot()
  170. if start_date is None:
  171. start_date = self.min_date
  172. if end_date is None:
  173. end_date = self.max_date
  174. stock_plot = self.make_df(start_date, end_date)
  175. colors = ['r', 'b', 'g', 'y', 'c', 'm']
  176. for i, stat in enumerate(stats):
  177. stat_min = min(stock_plot[stat])
  178. stat_max = max(stock_plot[stat])
  179. stat_avg = np.mean(stock_plot[stat])
  180. date_stat_min = stock_plot[stock_plot[stat] == stat_min]['Date']
  181. date_stat_min = date_stat_min[date_stat_min.index[0]].date()
  182. date_stat_max = stock_plot[stock_plot[stat] == stat_max]['Date']
  183. date_stat_max = date_stat_max[date_stat_max.index[0]].date()
  184. print('Maximum {} = {:.2f} on {}.'.format(stat, stat_max, date_stat_max))
  185. print('Minimum {} = {:.2f} on {}.'.format(stat, stat_min, date_stat_min))
  186. print('Current {} = {:.2f} on {}.\n'.format(stat, self.stock.ix[len(self.stock) - 1, stat], self.max_date.date()))
  187. # Percentage y-axis
  188. if plot_type == 'pct':
  189. # Simple Plot
  190. plt.style.use('fivethirtyeight');
  191. if stat == 'Daily Change':
  192. plt.plot(stock_plot['Date'], 100 * stock_plot[stat],
  193. color = colors[i], linewidth = 2.4, alpha = 0.9,
  194. label = stat)
  195. else:
  196. plt.plot(stock_plot['Date'], 100 * (stock_plot[stat] - stat_avg) / stat_avg,
  197. color = colors[i], linewidth = 2.4, alpha = 0.9,
  198. label = stat)
  199. plt.xlabel('Date'); plt.ylabel('Change Relative to Average (%)'); plt.title('%s Stock History' % self.symbol);
  200. plt.legend(prop={'size':10})
  201. plt.grid(color = 'k', alpha = 0.4);
  202. # Stat y-axis
  203. elif plot_type == 'basic':
  204. plt.style.use('fivethirtyeight');
  205. plt.plot(stock_plot['Date'], stock_plot[stat], color = colors[i], linewidth = 3, label = stat, alpha = 0.8)
  206. plt.xlabel('Date'); plt.ylabel('US $'); plt.title('%s Stock History' % self.symbol);
  207. plt.legend(prop={'size':10})
  208. plt.grid(color = 'k', alpha = 0.4);
  209. plt.show();
  210. # Reset the plotting parameters to clear style formatting
  211. # Not sure if this should be a static method
  212. @staticmethod
  213. def reset_plot():
  214. # Restore default parameters
  215. matplotlib.rcParams.update(matplotlib.rcParamsDefault)
  216. # Adjust a few parameters to liking
  217. matplotlib.rcParams['figure.figsize'] = (8, 5)
  218. matplotlib.rcParams['axes.labelsize'] = 10
  219. matplotlib.rcParams['xtick.labelsize'] = 8
  220. matplotlib.rcParams['ytick.labelsize'] = 8
  221. matplotlib.rcParams['axes.titlesize'] = 14
  222. matplotlib.rcParams['text.color'] = 'k'
  223. # Method to linearly interpolate prices on the weekends
  224. def resample(self, dataframe):
  225. # Change the index and resample at daily level
  226. dataframe = dataframe.set_index('ds')
  227. dataframe = dataframe.resample('D')
  228. # Reset the index and interpolate nan values
  229. dataframe = dataframe.reset_index(level=0)
  230. dataframe = dataframe.interpolate()
  231. return dataframe
  232. # Remove weekends from a dataframe
  233. def remove_weekends(self, dataframe):
  234. # Reset index to use ix
  235. dataframe = dataframe.reset_index(drop=True)
  236. weekends = []
  237. # Find all of the weekends
  238. for i, date in enumerate(dataframe['ds']):
  239. if (date.weekday()) == 5 | (date.weekday() == 6):
  240. weekends.append(i)
  241. # Drop the weekends
  242. dataframe = dataframe.drop(weekends, axis=0)
  243. return dataframe
  244. # Calculate and plot profit from buying and holding shares for specified date range
  245. def buy_and_hold(self, start_date=None, end_date=None, nshares=1):
  246. self.reset_plot()
  247. start_date, end_date = self.handle_dates(start_date, end_date)
  248. # Find starting and ending price of stock
  249. start_price = float(self.stock[self.stock['Date'] == start_date]['Adj. Open'])
  250. end_price = float(self.stock[self.stock['Date'] == end_date]['Adj. Close'])
  251. # Make a profit dataframe and calculate profit column
  252. profits = self.make_df(start_date, end_date)
  253. profits['hold_profit'] = nshares * (profits['Adj. Close'] - start_price)
  254. # Total profit
  255. total_hold_profit = nshares * (end_price - start_price)
  256. print('{} Total buy and hold profit from {} to {} for {} shares = ${:.2f}'.format
  257. (self.symbol, start_date.date(), end_date.date(), nshares, total_hold_profit))
  258. # Plot the total profits
  259. plt.style.use('dark_background')
  260. # Location for number of profit
  261. text_location = (end_date - pd.DateOffset(months = 1)).date()
  262. # Plot the profits over time
  263. plt.plot(profits['Date'], profits['hold_profit'], 'b', linewidth = 3)
  264. plt.ylabel('Profit ($)'); plt.xlabel('Date'); plt.title('Buy and Hold Profits for {} {} to {}'.format(
  265. self.symbol, start_date.date(), end_date.date()))
  266. # Display final value on graph
  267. plt.text(x = text_location,
  268. y = total_hold_profit + (total_hold_profit / 40),
  269. s = '$%d' % total_hold_profit,
  270. color = 'g' if total_hold_profit > 0 else 'r',
  271. size = 14)
  272. plt.grid(alpha=0.2)
  273. plt.show();
  274. # Create a prophet model without training
  275. def create_model(self):
  276. # Make the model
  277. model = fbprophet.Prophet(daily_seasonality=self.daily_seasonality,
  278. weekly_seasonality=self.weekly_seasonality,
  279. yearly_seasonality=self.yearly_seasonality,
  280. changepoint_prior_scale=self.changepoint_prior_scale,
  281. changepoints=self.changepoints)
  282. if self.monthly_seasonality:
  283. # Add monthly seasonality
  284. model.add_seasonality(name = 'monthly', period = 30.5, fourier_order = 5)
  285. return model
  286. # Graph the effects of altering the changepoint prior scale (cps)
  287. def changepoint_prior_analysis(self, changepoint_priors=[0.001, 0.05, 0.1, 0.2], colors=['b', 'r', 'grey', 'gold']):
  288. # Training and plotting with specified years of data
  289. train = self.stock[(self.stock['Date'] > (max(self.stock['Date']) - pd.DateOffset(years=self.training_years)).date())]
  290. # Iterate through all the changepoints and make models
  291. for i, prior in enumerate(changepoint_priors):
  292. # Select the changepoint
  293. self.changepoint_prior_scale = prior
  294. # Create and train a model with the specified cps
  295. model = self.create_model()
  296. model.fit(train)
  297. future = model.make_future_dataframe(periods=180, freq='D')
  298. # Make a dataframe to hold predictions
  299. if i == 0:
  300. predictions = future.copy()
  301. future = model.predict(future)
  302. # Fill in prediction dataframe
  303. predictions['%.3f_yhat_upper' % prior] = future['yhat_upper']
  304. predictions['%.3f_yhat_lower' % prior] = future['yhat_lower']
  305. predictions['%.3f_yhat' % prior] = future['yhat']
  306. # Remove the weekends
  307. predictions = self.remove_weekends(predictions)
  308. # Plot set-up
  309. self.reset_plot()
  310. plt.style.use('fivethirtyeight')
  311. fig, ax = plt.subplots(1, 1)
  312. # Actual observations
  313. ax.plot(train['ds'], train['y'], 'ko', ms = 4, label = 'Observations')
  314. color_dict = {prior: color for prior, color in zip(changepoint_priors, colors)}
  315. # Plot each of the changepoint predictions
  316. for prior in changepoint_priors:
  317. # Plot the predictions themselves
  318. ax.plot(predictions['ds'], predictions['%.3f_yhat' % prior], linewidth = 1.2,
  319. color = color_dict[prior], label = '%.3f prior scale' % prior)
  320. # Plot the uncertainty interval
  321. ax.fill_between(predictions['ds'].dt.to_pydatetime(), predictions['%.3f_yhat_upper' % prior],
  322. predictions['%.3f_yhat_lower' % prior], facecolor = color_dict[prior],
  323. alpha = 0.3, edgecolor = 'k', linewidth = 0.6)
  324. # Plot labels
  325. plt.legend(loc = 2, prop={'size': 10})
  326. plt.xlabel('Date'); plt.ylabel('Stock Price ($)'); plt.title('Effect of Changepoint Prior Scale');
  327. plt.show()
  328. # Basic prophet model for specified number of days
  329. def create_prophet_model(self, days=0, resample=False):
  330. self.reset_plot()
  331. model = self.create_model()
  332. # Fit on the stock history for self.training_years number of years
  333. stock_history = self.stock[self.stock['Date'] > (self.max_date - pd.DateOffset(years = self.training_years)).date()]
  334. if resample:
  335. stock_history = self.resample(stock_history)
  336. model.fit(stock_history)
  337. # Make and predict for next year with future dataframe
  338. future = model.make_future_dataframe(periods = days, freq='D')
  339. future = model.predict(future)
  340. if days > 0:
  341. # Print the predicted price
  342. print('Predicted Price on {} = ${:.2f}'.format(
  343. future.ix[len(future) - 1, 'ds'].date(), future.ix[len(future) - 1, 'yhat']))
  344. title = '%s Historical and Predicted Stock Price' % self.symbol
  345. else:
  346. title = '%s Historical and Modeled Stock Price' % self.symbol
  347. # Set up the plot
  348. fig, ax = plt.subplots(1, 1)
  349. # Plot the actual values
  350. ax.plot(stock_history['ds'], stock_history['y'], 'ko-', linewidth = 1.4, alpha = 0.8, ms = 1.8, label = 'Observations')
  351. # Plot the predicted values
  352. ax.plot(future['ds'], future['yhat'], 'forestgreen',linewidth = 2.4, label = 'Modeled');
  353. # Plot the uncertainty interval as ribbon
  354. ax.fill_between(future['ds'].dt.to_pydatetime(), future['yhat_upper'], future['yhat_lower'], alpha = 0.3,
  355. facecolor = 'g', edgecolor = 'k', linewidth = 1.4, label = 'Confidence Interval')
  356. # Plot formatting
  357. plt.legend(loc = 2, prop={'size': 10}); plt.xlabel('Date'); plt.ylabel('Price $');
  358. plt.grid(linewidth=0.6, alpha = 0.6)
  359. plt.title(title);
  360. plt.show()
  361. return model, future
  362. # Evaluate prediction model for one year
  363. def evaluate_prediction(self, start_date=None, end_date=None, nshares = None):
  364. # Default start date is one year before end of data
  365. # Default end date is end date of data
  366. if start_date is None:
  367. start_date = self.max_date - pd.DateOffset(years=1)
  368. if end_date is None:
  369. end_date = self.max_date
  370. start_date, end_date = self.handle_dates(start_date, end_date)
  371. # Training data starts self.training_years years before start date and goes up to start date
  372. train = self.stock[(self.stock['Date'] < start_date.date()) &
  373. (self.stock['Date'] > (start_date - pd.DateOffset(years=self.training_years)).date())]
  374. # Testing data is specified in the range
  375. test = self.stock[(self.stock['Date'] >= start_date.date()) & (self.stock['Date'] <= end_date.date())]
  376. # Create and train the model
  377. model = self.create_model()
  378. model.fit(train)
  379. # Make a future dataframe and predictions
  380. future = model.make_future_dataframe(periods = 365, freq='D')
  381. future = model.predict(future)
  382. # Merge predictions with the known values
  383. test = pd.merge(test, future, on = 'ds', how = 'inner')
  384. train = pd.merge(train, future, on = 'ds', how = 'inner')
  385. # Calculate the differences between consecutive measurements
  386. test['pred_diff'] = test['yhat'].diff()
  387. test['real_diff'] = test['y'].diff()
  388. # Correct is when we predicted the correct direction
  389. test['correct'] = (np.sign(test['pred_diff']) == np.sign(test['real_diff'])) * 1
  390. # Accuracy when we predict increase and decrease
  391. increase_accuracy = 100 * np.mean(test[test['pred_diff'] > 0]['correct'])
  392. decrease_accuracy = 100 * np.mean(test[test['pred_diff'] < 0]['correct'])
  393. # Calculate mean absolute error
  394. test_errors = abs(test['y'] - test['yhat'])
  395. test_mean_error = np.mean(test_errors)
  396. train_errors = abs(train['y'] - train['yhat'])
  397. train_mean_error = np.mean(train_errors)
  398. # Calculate percentage of time actual value within prediction range
  399. test['in_range'] = False
  400. for i in test.index:
  401. if (test.ix[i, 'y'] < test.ix[i, 'yhat_upper']) & (test.ix[i, 'y'] > test.ix[i, 'yhat_lower']):
  402. test.ix[i, 'in_range'] = True
  403. in_range_accuracy = 100 * np.mean(test['in_range'])
  404. if not nshares:
  405. # Date range of predictions
  406. print('\nPrediction Range: {} to {}.'.format(start_date.date(),
  407. end_date.date()))
  408. # Final prediction vs actual value
  409. print('\nPredicted price on {} = ${:.2f}.'.format(max(future['ds']).date(), future.ix[len(future) - 1, 'yhat']))
  410. print('Actual price on {} = ${:.2f}.\n'.format(max(test['ds']).date(), test.ix[len(test) - 1, 'y']))
  411. print('Average Absolute Error on Training Data = ${:.2f}.'.format(train_mean_error))
  412. print('Average Absolute Error on Testing Data = ${:.2f}.\n'.format(test_mean_error))
  413. # Direction accuracy
  414. print('When the model predicted an increase, the price increased {:.2f}% of the time.'.format(increase_accuracy))
  415. print('When the model predicted a decrease, the price decreased {:.2f}% of the time.\n'.format(decrease_accuracy))
  416. print('The actual value was within the {:d}% confidence interval {:.2f}% of the time.'.format(int(100 * model.interval_width), in_range_accuracy))
  417. # Reset the plot
  418. self.reset_plot()
  419. # Set up the plot
  420. fig, ax = plt.subplots(1, 1)
  421. # Plot the actual values
  422. ax.plot(train['ds'], train['y'], 'ko-', linewidth = 1.4, alpha = 0.8, ms = 1.8, label = 'Observations')
  423. ax.plot(test['ds'], test['y'], 'ko-', linewidth = 1.4, alpha = 0.8, ms = 1.8, label = 'Observations')
  424. # Plot the predicted values
  425. ax.plot(future['ds'], future['yhat'], 'navy', linewidth = 2.4, label = 'Predicted');
  426. # Plot the uncertainty interval as ribbon
  427. ax.fill_between(future['ds'].dt.to_pydatetime(), future['yhat_upper'], future['yhat_lower'], alpha = 0.6,
  428. facecolor = 'gold', edgecolor = 'k', linewidth = 1.4, label = 'Confidence Interval')
  429. # Put a vertical line at the start of predictions
  430. plt.vlines(x=min(test['ds']).date(), ymin=min(future['yhat_lower']), ymax=max(future['yhat_upper']), colors = 'r',
  431. linestyles='dashed', label = 'Prediction Start')
  432. # Plot formatting
  433. plt.legend(loc = 2, prop={'size': 8}); plt.xlabel('Date'); plt.ylabel('Price $');
  434. plt.grid(linewidth=0.6, alpha = 0.6)
  435. plt.title('{} Model Evaluation from {} to {}.'.format(self.symbol,
  436. start_date.date(), end_date.date()));
  437. plt.show();
  438. # If a number of shares is specified, play the game
  439. elif nshares:
  440. # Only playing the stocks when we predict the stock will increase
  441. test_pred_increase = test[test['pred_diff'] > 0]
  442. test_pred_increase.reset_index(inplace=True)
  443. prediction_profit = []
  444. # Iterate through all the predictions and calculate profit from playing
  445. for i, correct in enumerate(test_pred_increase['correct']):
  446. # If we predicted up and the price goes up, we gain the difference
  447. if correct == 1:
  448. prediction_profit.append(nshares * test_pred_increase.ix[i, 'real_diff'])
  449. # If we predicted up and the price goes down, we lose the difference
  450. else:
  451. prediction_profit.append(nshares * test_pred_increase.ix[i, 'real_diff'])
  452. test_pred_increase['pred_profit'] = prediction_profit
  453. # Put the profit into the test dataframe
  454. test = pd.merge(test, test_pred_increase[['ds', 'pred_profit']], on = 'ds', how = 'left')
  455. test.ix[0, 'pred_profit'] = 0
  456. # Profit for either method at all dates
  457. test['pred_profit'] = test['pred_profit'].cumsum().ffill()
  458. test['hold_profit'] = nshares * (test['y'] - float(test.ix[0, 'y']))
  459. # Display information
  460. print('You played the stock market in {} from {} to {} with {} shares.\n'.format(
  461. self.symbol, start_date.date(), end_date.date(), nshares))
  462. print('When the model predicted an increase, the price increased {:.2f}% of the time.'.format(increase_accuracy))
  463. print('When the model predicted a decrease, the price decreased {:.2f}% of the time.\n'.format(decrease_accuracy))
  464. # Display some friendly information about the perils of playing the stock market
  465. print('The total profit using the Prophet model = ${:.2f}.'.format(np.sum(prediction_profit)))
  466. print('The Buy and Hold strategy profit = ${:.2f}.'.format(float(test.ix[len(test) - 1, 'hold_profit'])))
  467. print('\nThanks for playing the stock market!\n')
  468. # Plot the predicted and actual profits over time
  469. self.reset_plot()
  470. # Final profit and final smart used for locating text
  471. final_profit = test.ix[len(test) - 1, 'pred_profit']
  472. final_smart = test.ix[len(test) - 1, 'hold_profit']
  473. # text location
  474. last_date = test.ix[len(test) - 1, 'ds']
  475. text_location = (last_date - pd.DateOffset(months = 1)).date()
  476. plt.style.use('dark_background')
  477. # Plot smart profits
  478. plt.plot(test['ds'], test['hold_profit'], 'b',
  479. linewidth = 1.8, label = 'Buy and Hold Strategy')
  480. # Plot prediction profits
  481. plt.plot(test['ds'], test['pred_profit'],
  482. color = 'g' if final_profit > 0 else 'r',
  483. linewidth = 1.8, label = 'Prediction Strategy')
  484. # Display final values on graph
  485. plt.text(x = text_location,
  486. y = final_profit + (final_profit / 40),
  487. s = '$%d' % final_profit,
  488. color = 'g' if final_profit > 0 else 'r',
  489. size = 18)
  490. plt.text(x = text_location,
  491. y = final_smart + (final_smart / 40),
  492. s = '$%d' % final_smart,
  493. color = 'g' if final_smart > 0 else 'r',
  494. size = 18);
  495. # Plot formatting
  496. plt.ylabel('Profit (US $)'); plt.xlabel('Date');
  497. plt.title('Predicted versus Buy and Hold Profits');
  498. plt.legend(loc = 2, prop={'size': 10});
  499. plt.grid(alpha=0.2);
  500. plt.show()
  501. def retrieve_google_trends(self, search, date_range):
  502. # Set up the trend fetching object
  503. pytrends = TrendReq(hl='en-US', tz=360)
  504. kw_list = [search]
  505. try:
  506. # Create the search object
  507. pytrends.build_payload(kw_list, cat=0, timeframe=date_range[0], geo='', gprop='news')
  508. # Retrieve the interest over time
  509. trends = pytrends.interest_over_time()
  510. related_queries = pytrends.related_queries()
  511. except Exception as e:
  512. print('\nGoogle Search Trend retrieval failed.')
  513. print(e)
  514. return
  515. return trends, related_queries
  516. def changepoint_date_analysis(self, search=None):
  517. self.reset_plot()
  518. model = self.create_model()
  519. # Use past self.training_years years of data
  520. train = self.stock[self.stock['Date'] > (self.max_date - pd.DateOffset(years = self.training_years)).date()]
  521. model.fit(train)
  522. # Predictions of the training data (no future periods)
  523. future = model.make_future_dataframe(periods=0, freq='D')
  524. future = model.predict(future)
  525. train = pd.merge(train, future[['ds', 'yhat']], on = 'ds', how = 'inner')
  526. changepoints = model.changepoints
  527. train = train.reset_index(drop=True)
  528. # Create dataframe of only changepoints
  529. change_indices = []
  530. for changepoint in (changepoints):
  531. change_indices.append(train[train['ds'] == changepoint.date()].index[0])
  532. c_data = train.ix[change_indices, :]
  533. deltas = model.params['delta'][0]
  534. c_data['delta'] = deltas
  535. c_data['abs_delta'] = abs(c_data['delta'])
  536. # Sort the values by maximum change
  537. c_data = c_data.sort_values(by='abs_delta', ascending=False)
  538. # Limit to 10 largest changepoints
  539. c_data = c_data[:10]
  540. # Separate into negative and positive changepoints
  541. cpos_data = c_data[c_data['delta'] > 0]
  542. cneg_data = c_data[c_data['delta'] < 0]
  543. # Changepoints and data
  544. if not search:
  545. print('\nChangepoints sorted by slope rate of change (2nd derivative):\n')
  546. print(c_data.ix[:, ['Date', 'Adj. Close', 'delta']][:5])
  547. # Line plot showing actual values, estimated values, and changepoints
  548. self.reset_plot()
  549. # Set up line plot
  550. plt.plot(train['ds'], train['y'], 'ko', ms = 4, label = 'Stock Price')
  551. plt.plot(future['ds'], future['yhat'], color = 'navy', linewidth = 2.0, label = 'Modeled')
  552. # Changepoints as vertical lines
  553. plt.vlines(cpos_data['ds'].dt.to_pydatetime(), ymin = min(train['y']), ymax = max(train['y']),
  554. linestyles='dashed', color = 'r',
  555. linewidth= 1.2, label='Negative Changepoints')
  556. plt.vlines(cneg_data['ds'].dt.to_pydatetime(), ymin = min(train['y']), ymax = max(train['y']),
  557. linestyles='dashed', color = 'darkgreen',
  558. linewidth= 1.2, label='Positive Changepoints')
  559. plt.legend(prop={'size':10});
  560. plt.xlabel('Date'); plt.ylabel('Price ($)'); plt.title('Stock Price with Changepoints')
  561. plt.show()
  562. # Search for search term in google news
  563. # Show related queries, rising related queries
  564. # Graph changepoints, search frequency, stock price
  565. if search:
  566. date_range = ['%s %s' % (str(min(train['Date']).date()), str(max(train['Date']).date()))]
  567. # Get the Google Trends for specified terms and join to training dataframe
  568. trends, related_queries = self.retrieve_google_trends(search, date_range)
  569. if (trends is None) or (related_queries is None):
  570. print('No search trends found for %s' % search)
  571. return
  572. print('\n Top Related Queries: \n')
  573. print(related_queries[search]['top'].head())
  574. print('\n Rising Related Queries: \n')
  575. print(related_queries[search]['rising'].head())
  576. # Upsample the data for joining with training data
  577. trends = trends.resample('D')
  578. trends = trends.reset_index(level=0)
  579. trends = trends.rename(columns={'date': 'ds', search: 'freq'})
  580. # Interpolate the frequency
  581. trends['freq'] = trends['freq'].interpolate()
  582. # Merge with the training data
  583. train = pd.merge(train, trends, on = 'ds', how = 'inner')
  584. # Normalize values
  585. train['y_norm'] = train['y'] / max(train['y'])
  586. train['freq_norm'] = train['freq'] / max(train['freq'])
  587. self.reset_plot()
  588. # Plot the normalized stock price and normalize search frequency
  589. plt.plot(train['ds'], train['y_norm'], 'k-', label = 'Stock Price')
  590. plt.plot(train['ds'], train['freq_norm'], color='goldenrod', label = 'Search Frequency')
  591. # Changepoints as vertical lines
  592. plt.vlines(cpos_data['ds'].dt.to_pydatetime(), ymin = 0, ymax = 1,
  593. linestyles='dashed', color = 'r',
  594. linewidth= 1.2, label='Negative Changepoints')
  595. plt.vlines(cneg_data['ds'].dt.to_pydatetime(), ymin = 0, ymax = 1,
  596. linestyles='dashed', color = 'darkgreen',
  597. linewidth= 1.2, label='Positive Changepoints')
  598. # Plot formatting
  599. plt.legend(prop={'size': 10})
  600. plt.xlabel('Date'); plt.ylabel('Normalized Values'); plt.title('%s Stock Price and Search Frequency for %s' % (self.symbol, search))
  601. plt.show()
  602. # Predict the future price for a given range of days
  def predict_future(self, days=30):
      """Forecast the price for the next ``days`` days, then print and plot it.

      A fresh model from ``self.create_model()`` is fit on the most recent
      ``self.training_years`` years of ``self.stock``. The forecast is cut
      down to dates on or after the last date in ``self.stock``, weekends are
      removed with ``self.remove_weekends``, and every remaining day is
      labelled as an increase or a decrease relative to the previous
      remaining row. The first row only serves as the baseline for the
      difference and is dropped.

      Args:
          days: Number of daily periods to forecast past the end of the
              training data. Weekends are removed afterwards, so fewer rows
              are reported.

      Returns:
          None. The predicted increases and decreases are printed and a
          matplotlib figure is shown.
      """
      # Compare the datetime column against Timestamps rather than
      # ``datetime.date`` objects: recent pandas versions refuse to compare
      # datetime64 data with plain dates. Normalizing to midnight keeps the
      # same cut-off the ``.date()`` call produced.
      last_date = pd.Timestamp(max(self.stock['Date'])).normalize()
      train_start = last_date - pd.DateOffset(years=self.training_years)

      # Use past self.training_years years for training
      train = self.stock[self.stock['Date'] > train_start]

      model = self.create_model()
      model.fit(train)

      # Future dataframe with specified number of days to predict
      future = model.make_future_dataframe(periods=days, freq='D')
      future = model.predict(future)

      # Only concerned with future dates
      future = future[future['ds'] >= last_date]

      # Remove the weekends; copy so the column assignments below act on an
      # independent frame instead of a slice of the prediction frame
      future = self.remove_weekends(future).copy()

      # Calculate whether increase or not
      future['diff'] = future['yhat'].diff()
      future = future.dropna()

      # 1 marks a predicted increase, 0 a predicted decrease (or no change)
      future['direction'] = (future['diff'] > 0) * 1

      # Rename the columns for presentation
      future = future.rename(columns={'ds': 'Date', 'yhat': 'estimate', 'diff': 'change',
                                      'yhat_upper': 'upper', 'yhat_lower': 'lower'})

      future_increase = future[future['direction'] == 1]
      future_decrease = future[future['direction'] == 0]

      # Print out the dates
      report_columns = ['Date', 'estimate', 'change', 'upper', 'lower']
      print('\nPredicted Increase: \n')
      print(future_increase[report_columns])

      print('\nPredicted Decrease: \n')
      print(future_decrease[report_columns])

      self.reset_plot()

      # Set up plot
      plt.style.use('fivethirtyeight')
      matplotlib.rcParams['axes.labelsize'] = 10
      matplotlib.rcParams['xtick.labelsize'] = 8
      matplotlib.rcParams['ytick.labelsize'] = 8
      matplotlib.rcParams['axes.titlesize'] = 12

      # Plot the predictions and indicate if increase or decrease
      fig, ax = plt.subplots(1, 1, figsize=(8, 6))

      # Plot the estimates
      ax.plot(future_increase['Date'], future_increase['estimate'], 'g^', ms=12, label='Pred. Increase')
      ax.plot(future_decrease['Date'], future_decrease['estimate'], 'rv', ms=12, label='Pred. Decrease')

      # Error bars must span lower..upper. ``yerr`` is measured from the
      # estimate, so pass the distance below and the distance above
      # separately; passing ``upper - lower`` would draw bars twice as wide
      # as the interval. Clip at zero because matplotlib rejects negative
      # error values.
      lower_err = (future['estimate'] - future['lower']).clip(lower=0)
      upper_err = (future['upper'] - future['estimate']).clip(lower=0)
      ax.errorbar(future['Date'].dt.to_pydatetime(), future['estimate'],
                  yerr=[lower_err.values, upper_err.values],
                  capthick=1.4, color='k', linewidth=2,
                  ecolor='darkblue', capsize=4, elinewidth=1, label='Pred with Range')

      # Plot formatting
      plt.legend(loc=2, prop={'size': 10})
      # Numeric rotation: a numeric string such as '45' is not accepted by
      # newer matplotlib versions
      plt.xticks(rotation=45)
      plt.ylabel('Predicted Stock Price (US $)')
      plt.xlabel('Date')
      plt.title('Predictions for %s' % self.symbol)
      plt.show()
  def changepoint_prior_validation(self, start_date=None, end_date=None,
                                   changepoint_priors=[0.001, 0.05, 0.1, 0.2]):
      """Compare changepoint prior scales on a held-out validation range.

      For every value in ``changepoint_priors`` a model is created with
      ``self.create_model()``, fit on the ``self.training_years`` years that
      precede ``start_date``, and used to predict through the validation
      range. The mean absolute error and the mean width of the
      ``yhat_lower``..``yhat_upper`` interval are recorded for the training
      and the validation rows, printed as a table, and plotted against the
      prior scale.

      Side effect: ``self.changepoint_prior_scale`` is overwritten with each
      prior in turn and is left at the last value evaluated.

      Args:
          start_date: First day of the validation range. Defaults to two
              years before ``self.max_date``.
          end_date: Last day of the validation range. Defaults to one year
              before ``self.max_date``.
          changepoint_priors: Prior scales to evaluate. The default list is
              only read, never mutated.

      Returns:
          None. The results table is printed and two figures are shown.

      Raises:
          ValueError: If ``self.stock`` has no rows inside the validation
              range.
      """
      # Default start date is two years before end of data
      # Default end date is one year before end of data
      if start_date is None:
          start_date = self.max_date - pd.DateOffset(years=2)
      if end_date is None:
          end_date = self.max_date - pd.DateOffset(years=1)

      # Convert to pandas datetime for indexing dataframe
      start_date = pd.to_datetime(start_date)
      end_date = pd.to_datetime(end_date)

      start_date, end_date = self.handle_dates(start_date, end_date)

      # Work with midnight Timestamps instead of ``datetime.date`` objects:
      # recent pandas versions refuse to compare datetime64 data with plain
      # dates. Normalizing gives the same cut-offs as ``.date()`` did.
      start_day = pd.Timestamp(start_date).normalize()
      end_day = pd.Timestamp(end_date).normalize()
      train_start = start_day - pd.DateOffset(years=self.training_years)

      stock_dates = self.stock['Date']

      # Select self.training_years number of years
      train = self.stock[(stock_dates > train_start) & (stock_dates < start_day)]

      # Testing data is specified by range
      test = self.stock[(stock_dates >= start_day) & (stock_dates <= end_day)]

      if test.empty:
          raise ValueError('No data between {} and {} to validate on.'.format(
              start_day.date(), end_day.date()))

      first_test_day = min(test['Date']).date()
      last_test_day = max(test['Date']).date()
      eval_days = (last_test_day - first_test_day).days

      print('\nValidation Range {} to {}.\n'.format(first_test_day, last_test_day))

      forecast_columns = ['ds', 'yhat', 'yhat_upper', 'yhat_lower']
      result_columns = ['cps', 'train_err', 'train_range', 'test_err', 'test_range']

      # Collect one row per prior and build the frame once at the end. This
      # avoids the removed ``DataFrame.ix`` indexer and avoids writing floats
      # into integer-typed columns cell by cell.
      rows = []

      # Iterate through all the changepoints and make models
      for prior in changepoint_priors:
          # Select the changepoint
          self.changepoint_prior_scale = prior

          # Create and train a model with the specified cps
          model = self.create_model()
          model.fit(train)
          future = model.make_future_dataframe(periods=eval_days, freq='D')
          future = model.predict(future)

          # Training results and metrics
          train_results = pd.merge(train, future[forecast_columns], on='ds', how='inner')
          # Testing results and metrics
          test_results = pd.merge(test, future[forecast_columns], on='ds', how='inner')

          rows.append({
              'cps': prior,
              'train_err': np.mean(abs(train_results['y'] - train_results['yhat'])),
              'train_range': np.mean(abs(train_results['yhat_upper'] - train_results['yhat_lower'])),
              'test_err': np.mean(abs(test_results['y'] - test_results['yhat'])),
              'test_range': np.mean(abs(test_results['yhat_upper'] - test_results['yhat_lower'])),
          })

      results = pd.DataFrame(rows, columns=result_columns)
      print(results)

      # One figure for the average errors, one for the average uncertainty;
      # both share the same layout, so describe them as data.
      figures = [
          ('train_err', 'test_err', 'Train Error', 'Test Error',
           'Avg. Absolute Error ($)', 'Training and Testing Curves as Function of CPS'),
          ('train_range', 'test_range', 'Train Range', 'Test Range',
           'Avg. Uncertainty ($)', 'Uncertainty in Estimate as Function of CPS'),
      ]

      for train_col, test_col, train_label, test_label, y_label, title in figures:
          self.reset_plot()

          plt.plot(results['cps'], results[train_col], 'bo-', ms=8, label=train_label)
          plt.plot(results['cps'], results[test_col], 'r*-', ms=8, label=test_label)
          plt.xlabel('Changepoint Prior Scale')
          plt.ylabel(y_label)
          plt.title(title)
          plt.grid(color='k', alpha=0.3)
          # Put a tick at every evaluated prior and label it with its value
          plt.xticks(results['cps'], results['cps'])
          plt.legend(prop={'size': 10})
          plt.show()