# pandas_scikitLearn.py (2.1 KB)

  1. import pickle
  2. import pandas as pd
  3. import quandl
  4. import matplotlib.pyplot as plt
  5. from matplotlib import style
  6. import numpy as np
  7. from statistics import mean
  8. from sklearn import svm
  9. from sklearn.preprocessing import scale, MinMaxScaler, MaxAbsScaler
  10. from sklearn.linear_model import LogisticRegression
  11. from sklearn.model_selection import train_test_split
  12. style.use('seaborn-dark-palette')
  13. ax1 = plt.subplot(2,1,1)
  14. ax2 = plt.subplot(2,1,2, sharex=ax1)
  15. def create_labels(cur_hpi, fut_hpi):
  16. if fut_hpi > cur_hpi:
  17. return 1
  18. else:
  19. return 0
  20. def moving_average(values):
  21. return mean(values)
  22. benchmark = pd.read_pickle('us_pct.pickle') # us overall housing price index percentage change
  23. HPI = pd.read_pickle('HPI_complete.pickle') # all of the state data, thirty year mortgage, unemployment rate, GDP, SP500
  24. HPI = HPI.join(benchmark['United States'])
  25. # all in percentage change since the start of the data (1975-01-01)
  26. HPI.dropna(inplace=True)
  27. housing_pct = HPI.pct_change()
  28. housing_pct.replace([np.inf, -np.inf], np.nan, inplace=True)
  29. housing_pct['US_HPI_future'] = housing_pct['United States'].shift(-1)
  30. housing_pct.dropna(inplace=True)
  31. housing_pct['label'] = list(map(create_labels, housing_pct['United States'], housing_pct['US_HPI_future']))
  32. # housing_pct['ma_apply_example'] = housing_pct['M30'].rolling(window=10).apply(moving_average)
  33. # print(housing_pct.tail())
  34. X = np.array(housing_pct.drop(['label', 'US_HPI_future'], 1))
  35. y = np.array(housing_pct['label'])
  36. X = scale(X)
  37. X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
  38. # clf = svm.SVC(kernel='linear')
  39. # clflog = LogisticRegression(C=50.0, dual=False, penalty="l1")
  40. clflog_accuracy = []
  41. clfsvm_accuracy = []
  42. for i in range(10):
  43. clflog = LogisticRegression(C=49.0, dual=False, penalty="l1")
  44. clflog.fit(X_train, y_train)
  45. clflog_accuracy.append(clflog.score(x_test,y_test))
  46. clfsvm = svm.SVC(kernel='linear')
  47. clfsvm.fit(X_train, y_train)
  48. clfsvm_accuracy.append(clfsvm.score(x_test,y_test))
  49. print('Accuracy of logistic regression = %0.4f' % (mean(clflog_accuracy) * 100))
  50. print('Accuracy of support vector machine = %0.4f' % (mean(clfsvm_accuracy) * 100))