# time_features_utils.py
  1. import pandas as pd
  2. import numpy as np
  3. from tqdm import tqdm_notebook
  4. def cyclical_encoding(series, period):
  5. features = pd.concat(
  6. [np.sin((2 * np.pi * series / period)), np.cos((2 * np.pi * series / period))],
  7. axis=1,
  8. )
  9. features.columns = [f"sin_{series.name}", f"cos_{series.name}"]
  10. return features
  11. def create_time_features(
  12. fld,
  13. keep_frac_only=False,
  14. include_additional=False,
  15. cyc_encode=False,
  16. timezone=None,
  17. ):
  18. """
  19. Create features out of a series of datetimes.
  20. Time zones are converted to local time if specified.
  21. :param fld: series of datetimes
  22. :param keep_frac_only: maintain only fractional times
  23. :param include_additional: whether to include additional attributes
  24. :param cyc_encode: whether to cyclically encode time and date features
  25. :param timezone: string for the time zone. if passed, times are converted to local
  26. :return df: dataframe with added date and time columns
  27. """
  28. # Convert to a series (in case of index)
  29. fld = pd.to_datetime(pd.Series(fld))
  30. # Create a dataframe with index as original times
  31. df = fld.to_frame().drop(columns=[fld.name])
  32. # Used for naming the columns
  33. prefix = fld.name
  34. prefix += "_"
  35. # Convert to local time and then remove time zone information
  36. if timezone:
  37. fld = fld.dt.tz_convert(timezone).dt.tz_localize(None)
  38. df["local"] = fld
  39. # Basic attributes
  40. attr = [
  41. "second",
  42. "minute",
  43. "hour",
  44. "year",
  45. "month",
  46. "week",
  47. "day",
  48. "dayofweek",
  49. "dayofyear",
  50. ]
  51. if include_additional:
  52. # Additional attributes to extract
  53. attr = attr + [
  54. "is_month_end",
  55. "is_month_start",
  56. "is_quarter_end",
  57. "is_quarter_start",
  58. "is_year_end",
  59. "is_year_start",
  60. "days_in_month",
  61. ]
  62. # iterate through each attribute and add it to the dataframe
  63. for n in attr:
  64. df[prefix + n] = getattr(fld.dt, n)
  65. # Add fractional time of day converting to hours
  66. df[prefix + "fracday"] = (
  67. df[prefix + "hour"]
  68. + df[prefix + "minute"] / 60
  69. + df[prefix + "second"] / 60 / 60
  70. ) / 24
  71. # Add fractional time of week
  72. df[prefix + "fracweek"] = (df[prefix + "dayofweek"] + df[prefix + "fracday"]) / 7
  73. # Add fractional time of month
  74. df[prefix + "fracmonth"] = ((df[prefix + "day"] - 1) + df[prefix + "fracday"]) / (
  75. fld.dt.days_in_month
  76. ) # Use fld days_in_month in case this is not
  77. # one of the attributes specified
  78. # Calculate days in year (accounting for leap year rules)
  79. days_in_year = np.where(
  80. (df[prefix + "year"] % 4 == 0)
  81. & ((df[prefix + "year"] % 100 != 0) | (df[prefix + "year"] % 400 == 0)),
  82. 366,
  83. 365,
  84. )
  85. # Add fractional time of year
  86. df[prefix + "fracyear"] = (
  87. (df[prefix + "dayofyear"] - 1) + df[prefix + "fracday"]
  88. ) / days_in_year
  89. if cyc_encode:
  90. df = pd.concat([df, cyclical_encoding(df[prefix + "hour"], 24)], axis=1)
  91. df = pd.concat([df, cyclical_encoding(df[prefix + "dayofweek"], 6)], axis=1)
  92. df = pd.concat([df, cyclical_encoding(df[prefix + "day"], 31)], axis=1)
  93. df = pd.concat([df, cyclical_encoding(df[prefix + "month"], 12)], axis=1)
  94. df = pd.concat(
  95. [df] + [cyclical_encoding(df[c], 1) for c in df if "frac" in c], axis=1
  96. )
  97. if keep_frac_only:
  98. df = df.drop(
  99. [
  100. prefix + c
  101. for c in [
  102. "second",
  103. "minute",
  104. "hour",
  105. "year",
  106. "month",
  107. "week",
  108. "day",
  109. "dayofweek",
  110. "dayofyear",
  111. ]
  112. ],
  113. axis=1,
  114. )
  115. df = df.set_index(fld).sort_index()
  116. return df
  117. def monthly_validation(data, model, track=False):
  118. train_stops = np.unique(data.index[data.index.is_month_end].date)
  119. X = data.copy()
  120. y = X.pop("energy")
  121. weighted_score = 0
  122. total_possible = 0
  123. train_points = []
  124. test_points = []
  125. scores = []
  126. for date in train_stops:
  127. y_train, y_test = y[:date], y[date:]
  128. X_train, X_test = X[:date], X[date:]
  129. model.fit(X_train, y_train)
  130. y_hat = model.predict(X_test)
  131. test_start, test_end = X_test.index.min().date(), X_test.index.max().date()
  132. n_days = (test_end - test_start).days
  133. score = 100 - mape(y_test, y_hat)
  134. if track:
  135. print(
  136. f"Accuracy: {score:.2f}% testing from {test_start} to {test_end} ({n_days} days)."
  137. )
  138. weighted_score += score * len(X_test)
  139. total_possible += 100 * len(X_test)
  140. train_points.append(len(X_train))
  141. test_points.append(len(X_test))
  142. scores.append(score)
  143. model.fit(X, y)
  144. importance_df = None
  145. if hasattr(model, "feature_importances_"):
  146. importance_df = pd.DataFrame(
  147. dict(features=X.columns, importance=model.feature_importances_)
  148. )
  149. final_score = weighted_score / total_possible
  150. results_df = pd.DataFrame(
  151. dict(train_points=train_points, test_points=test_points, score=scores)
  152. )
  153. return dict(results=results_df, importances=importance_df, score=final_score)
  154. def mape(y_true, y_pred):
  155. return 100 * np.mean(np.abs((y_pred - y_true) / y_true))
  156. def data_reading(filename):
  157. data = pd.read_csv(filename, parse_dates=["timestamp"])
  158. data = data.dropna(subset=["energy"])
  159. freq_counts = data["timestamp"].diff(1).value_counts()
  160. freq = round(freq_counts.idxmax().total_seconds() / 60)
  161. data = data.set_index("timestamp").sort_index()
  162. return data, freq, len(data)
  163. def data_testing(filename, model):
  164. building_id = filename.split("_")[-1].split(".csv")[0]
  165. data, freq, dpoints = data_reading(filename)
  166. results = test_time_features(data, model)
  167. results["freq"] = freq
  168. results["dpoints"] = dpoints
  169. results["building_id"] = building_id
  170. return results
  171. def test_time_features(data, model):
  172. data = pd.concat([data, create_time_features(data.index, cyc_encode=True)], axis=1)
  173. scores = []
  174. methods = []
  175. y = data.pop("energy")
  176. normal_features = [
  177. "timestamp_" + t for t in ["hour", "dayofweek", "month", "dayofyear", "year"]
  178. ]
  179. normal_cyc_features = [
  180. "sin_" + t
  181. for t in normal_features
  182. if t not in ["timestamp_dayofyear", "timestamp_year"]
  183. ] + [
  184. "cos_" + t
  185. for t in normal_features
  186. if t not in ["timestamp_dayofyear", "timestamp_year"]
  187. ]
  188. frac_features = [
  189. "timestamp_" + t for t in ["fracday", "fracweek", "fracmonth", "fracyear"]
  190. ]
  191. frac_cyc_features = ["sin_" + t for t in frac_features] + [
  192. "cos_" + t for t in frac_features
  193. ]
  194. data_normal = data[normal_features].copy()
  195. data_normal_cyc = data[normal_cyc_features].copy()
  196. data_frac = data[frac_features].copy()
  197. data_frac_cyc = data[frac_cyc_features].copy()
  198. results = {}
  199. dataset_names = ["normal", "normal_cyc", "frac", "frac_cyc"]
  200. for dataset, name in zip(
  201. [data_normal, data_normal_cyc, data_frac, data_frac_cyc], dataset_names
  202. ):
  203. to_drop = dataset.columns[
  204. (dataset.nunique() == 1) | (dataset.nunique() == len(dataset))
  205. ]
  206. dataset = dataset.drop(columns=to_drop)
  207. dataset["energy"] = y.copy()
  208. try:
  209. data_results = monthly_validation(dataset, model)
  210. scores.append(data_results["score"])
  211. methods.append(name)
  212. except Exception as e:
  213. print(e, name)
  214. results = pd.DataFrame(dict(score=scores, method=methods))
  215. return results