# utils.py — data-download helpers for the bike-sharing tutorial
import os
import urllib
import urllib.request
from zipfile import ZipFile

import cudf
import pandas as pd
import tqdm
  7. pbar = None
  8. def show_progress(block_num, block_size, total_size):
  9. global pbar
  10. if pbar is None:
  11. pbar = tqdm.tqdm(total=total_size / 1024, unit='kB')
  12. downloaded = block_num * block_size
  13. if downloaded < total_size:
  14. pbar.update(block_size / 1024)
  15. else:
  16. pbar.close()
  17. pbar = None
  18. def fetch_bike_dataset(years, data_dir="data"):
  19. """ Dowload bike dataset for a given year and return the list of files.
  20. """
  21. base_url = "https://s3.amazonaws.com/capitalbikeshare-data/"
  22. files = []
  23. for year in years:
  24. filename = str(year) + "-capitalbikeshare-tripdata.zip"
  25. filepath = os.path.join(data_dir, filename)
  26. if not os.path.isfile(filepath):
  27. urllib.request.urlretrieve(base_url+filename, filepath, reporthook=show_progress)
  28. with ZipFile(filepath) as myzip:
  29. files += [os.path.join(data_dir, name) for name in myzip.namelist()]
  30. myzip.extractall(data_dir)
  31. print("Files extracted: "+ str(files))
  32. return files
  33. def fetch_weather_dataset(data_dir='data'):
  34. base_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00275/'
  35. fn = 'Bike-Sharing-Dataset.zip'
  36. if not os.path.isdir(data_dir):
  37. os.makedirs(data_dir)
  38. filepath = os.path.join(data_dir, fn)
  39. if not os.path.isfile(filepath):
  40. print(f'Downloading {base_url+fn} to {filepath}')
  41. urllib.request.urlretrieve(base_url+fn, filepath)
  42. files = []
  43. with ZipFile(filepath) as myzip:
  44. files = [os.path.join(data_dir, name) for name in myzip.namelist()]
  45. myzip.extractall(data_dir)
  46. # Extract weather features from the dataset
  47. # Note this weather dataset is already preprocessed.
  48. # We reverse the steps to provide a more interesting exercise.
  49. weather = cudf.read_csv(files[2], parse_dates=[1])
  50. out = cudf.DataFrame();
  51. out['Hour'] = weather['dteday'] + cudf.Series(pd.to_timedelta(weather['hr'].to_pandas(), unit='h'))
  52. out['Temperature'] = weather['temp'] * 47.0 -8
  53. out['Relative Temperature'] = weather['atemp'] * 66.0 - 16
  54. out['Rel. humidity'] = (weather['hum'] * 100).astype('int')
  55. out['Wind'] = weather['windspeed'] * 67
  56. # Spell out weather categories
  57. # - 1: Clear, Few clouds, Partly cloudy, Partly cloudy
  58. #- 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
  59. #- 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
  60. # - 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
  61. out['Weather'] = 'Clear or Partly cloudy'
  62. out['Weather'][weather['weathersit']==2] = 'Mist or Cloudy'
  63. out['Weather'][weather['weathersit']==3] = 'Light Rain or Snow, Thunderstorm'
  64. out['Weather'][weather['weathersit']==4] = 'Heavy Rain, Snow + Fog, Ice'
  65. filepath = os.path.join(data_dir, 'weather2011-2012.csv')
  66. out.to_csv(filepath, index=False)
  67. print("Weather file saved at ", filepath)
  68. return filepath
  69. def read_bike_data_pandas(files):
  70. # Reads a list of files and concatenates them
  71. tables = []
  72. for filename in files:
  73. tmp_df = pd.read_csv(filename, usecols=[1], parse_dates=['Start date'])
  74. tables.append(tmp_df)
  75. merged_df = pd.concat(tables, ignore_index=True)