
Formatted with black

WillKoehrsen committed 4 years ago (commit b72edeebe2)
33 changed files with 2371 additions and 1661 deletions
  1. datashader-work/formatting_data.py (+ 23 - 23)
  2. medium/bargraphs.py (+ 35 - 28)
  3. medium/images/data_for_fitting.py (+ 12 - 5)
  4. medium/retrieval.py (+ 100 - 76)
  5. medium/view_extraction.py (+ 19 - 14)
  6. medium/visuals.py (+ 198 - 102)
  7. sentdex_data_analysis/HPI_tpot_pipeline.py (+ 15 - 7)
  8. sentdex_data_analysis/pandas_IO.py (+ 18 - 12)
  9. sentdex_data_analysis/pandas_TPOT.py (+ 19 - 11)
  10. sentdex_data_analysis/pandas_additionalEconomic.py (+ 42 - 29)
  11. sentdex_data_analysis/pandas_basics.py (+ 13 - 10)
  12. sentdex_data_analysis/pandas_building_dataset.py (+ 4 - 6)
  13. sentdex_data_analysis/pandas_cocantenating_appending.py (+ 27 - 16)
  14. sentdex_data_analysis/pandas_comparisonOperators.py (+ 22 - 22)
  15. sentdex_data_analysis/pandas_handlingNan.py (+ 24 - 23)
  16. sentdex_data_analysis/pandas_indexing.py (+ 42 - 33)
  17. sentdex_data_analysis/pandas_intro.py (+ 6 - 5)
  18. sentdex_data_analysis/pandas_joiningData.py (+ 17 - 16)
  19. sentdex_data_analysis/pandas_joining_merging.py (+ 21 - 14)
  20. sentdex_data_analysis/pandas_mappingFunctions.py (+ 26 - 15)
  21. sentdex_data_analysis/pandas_percentChange_correlation.py (+ 35 - 33)
  22. sentdex_data_analysis/pandas_pickling.py (+ 24 - 21)
  23. sentdex_data_analysis/pandas_pickling_sentdex.py (+ 4 - 4)
  24. sentdex_data_analysis/pandas_resampling.py (+ 43 - 39)
  25. sentdex_data_analysis/pandas_rollingStatistics.py (+ 29 - 27)
  26. sentdex_data_analysis/pandas_scikitLearn.py (+ 32 - 23)
  27. sentdex_data_analysis/tpot_basic.py (+ 5 - 4)
  28. slack_interaction/utils.py (+ 47 - 43)
  29. stocker/stocker.py (+ 831 - 526)
  30. time_features/time_features_utils.py (+ 77 - 56)
  31. web_automation/canvas_upload.py (+ 99 - 96)
  32. weighter/run_weighter.py (+ 38 - 41)
  33. weighter/weighter.py (+ 424 - 281)
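Every file below shows the same mechanical change: black rewrites single-quoted strings to double quotes and reflows statements that exceed its line-length limit, without changing behavior. A commit like this is typically produced by running black over the whole repository (for example, `black .` from the repo root). As a minimal sketch of the same transformation through black's Python API (assuming black is installed; `format_str` and `Mode` are part of its public API), applied to one of the unformatted lines from this commit:

import black

# A line as it appears before this commit (single quotes)
src = "df['flag'] = df['flag'].astype('category')\n"

# black normalizes quotes and reflows long lines; Mode() uses the
# defaults (88-character limit) that the reformatted files reflect
print(black.format_str(src, mode=black.Mode()))
# -> df["flag"] = df["flag"].astype("category")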

+ 23 - 23
datashader-work/formatting_data.py

@@ -1,6 +1,3 @@
-
-
-
 import geoviews as gv
 import geoviews.feature as gf
 import xarray as xr
@@ -9,31 +6,34 @@ from cartopy import crs
 import pandas as pd
 import numpy as np
 
-gv.extension('bokeh', 'matplotlib')
+gv.extension("bokeh", "matplotlib")
 
-xr_ensemble = xr.open_dataset('Data-Analysis/datashader-work/geoviews-examples/data/ensemble.nc').load()
+xr_ensemble = xr.open_dataset(
+    "Data-Analysis/datashader-work/geoviews-examples/data/ensemble.nc"
+).load()
 
 from sqlalchemy import create_engine
-engine = create_engine('postgres://localhost:5432/global_fishing_watch')
-engine.table_names()
-df = pd.read_sql("""SELECT * FROM fishing_effort LIMIT 10000""",
-                engine, parse_dates=['date'])
 
-df['flag'] = df['flag'].astype('category')
-df['geartype'] = df['geartype'].astype('category')
-df['lat'] = df['lat_bin'] / 100
-df['lon'] = df['lon_bin'] / 100
+engine = create_engine("postgres://localhost:5432/global_fishing_watch")
+engine.table_names()
+df = pd.read_sql(
+    """SELECT * FROM fishing_effort LIMIT 10000""", engine, parse_dates=["date"]
+)
+
+df["flag"] = df["flag"].astype("category")
+df["geartype"] = df["geartype"].astype("category")
+df["lat"] = df["lat_bin"] / 100
+df["lon"] = df["lon_bin"] / 100
 df.info()
 
 
-
 def format_df(df, n=10_000):
-    ...:     df = df.iloc[:n]
-    ...:     df = df.drop_duplicates(subset=['lat', 'lon', 'date'])
-    ...:     df = df.sort_values(['lat', 'lon', 'date'])
-    ...:     index = pd.MultiIndex.from_arrays([df['lat'], df['lon'], df['date']])
-    ...:     df.index = index
-    ...:     latitudes = df.index.levels[0]
-    ...:     longitudes = df.index.levels[1]
-    ...:     times = df.index.levels[2]
-    ...:     return latitudes, longitudes, times, df
+    df = df.iloc[:n]
+    df = df.drop_duplicates(subset=["lat", "lon", "date"])
+    df = df.sort_values(["lat", "lon", "date"])
+    index = pd.MultiIndex.from_arrays([df["lat"], df["lon"], df["date"]])
+    df.index = index
+    latitudes = df.index.levels[0]
+    longitudes = df.index.levels[1]
+    times = df.index.levels[2]
+    return latitudes, longitudes, times, df

+ 35 - 28
medium/bargraphs.py

@@ -14,7 +14,7 @@ def get_all_pages(driver, xpath, months, suffix):
     # Initially starting at today
     latest_date_in_graph = datetime.now().date()
 
-    print('Starting on ', latest_date_in_graph)
+    print("Starting on ", latest_date_in_graph)
 
     views = []
     dates = []
@@ -25,15 +25,15 @@ def get_all_pages(driver, xpath, months, suffix):
         graph_dates = []
         # Extract the bar graph
         bargraph = BeautifulSoup(driver.page_source).find_all(
-            attrs={'class': 'bargraph'})[0]
+            attrs={"class": "bargraph"}
+        )[0]
 
         # Get all the bars in the bargraph
-        bardata = bargraph.find_all(attrs={'class': 'bargraph-bar'})
+        bardata = bargraph.find_all(attrs={"class": "bargraph-bar"})
         # Sort the bar data by x position (which will be date order) with most recent first
-        bardata = sorted(bardata, key=lambda x: float(
-            x.get('x')), reverse=True)
-        bardata = [bar.get('data-tooltip') for bar in bardata]
-        latest_day = int(bardata[0].split('\xa0')[-1])
+        bardata = sorted(bardata, key=lambda x: float(x.get("x")), reverse=True)
+        bardata = [bar.get("data-tooltip") for bar in bardata]
+        latest_day = int(bardata[0].split("\xa0")[-1])
 
         # Some months are not overlapping
         if latest_day != latest_date_in_graph.day:
@@ -41,7 +41,7 @@ def get_all_pages(driver, xpath, months, suffix):
 
         # Iterate through the bars which now are sorted in reverse date order (newest to oldest)
         for i, data in enumerate(bardata):
-            graph_views.append(float(data.split(' ')[0].replace(',', '')))
+            graph_views.append(float(data.split(" ")[0].replace(",", "")))
             graph_dates.append(latest_date_in_graph - timedelta(days=i))
 
         views.extend(graph_views)
@@ -55,17 +55,22 @@ def get_all_pages(driver, xpath, months, suffix):
         # Go to the previous graph
         driver.find_element_by_xpath(xpath).click()
         time.sleep(2)
-        print(f'{100 * m /(months):.0f}% complete.', end='\r')
-
-    results = pd.DataFrame({'date': pd.to_datetime(
-        dates), suffix: views}).groupby('date').sum()
-    results = results.loc[results[results['views'] != 0.0].index.min():, ]
-    print('First views on ', str(results.index.min().date()))
+        print(f"{100 * m /(months):.0f}% complete.", end="\r")
+
+    results = (
+        pd.DataFrame({"date": pd.to_datetime(dates), suffix: views})
+        .groupby("date")
+        .sum()
+    )
+    results = results.loc[
+        results[results["views"] != 0.0].index.min() :,
+    ]
+    print("First views on ", str(results.index.min().date()))
 
     # Save using the date as the file name
-    fname = f'data/{str(datetime.now().date())}_{suffix}'
+    fname = f"data/{str(datetime.now().date())}_{suffix}"
     results.to_parquet(fname)
-    print('Stats saved to ', fname)
+    print("Stats saved to ", fname)
 
     return results
 
@@ -75,29 +80,31 @@ if __name__ == "__main__":
     driver = webdriver.Chrome(ChromeDriverManager().install())
     driver.get("https://medium.com/me/stats")
     # Wait for user to log in
-    input('Waiting for you to log in. Press enter when ready: ')
+    input("Waiting for you to log in. Press enter when ready: ")
 
     # Find earliest date
     earliest_article_date = parser.parse(
-        input('Enter earliest article date as string: ')).date()
-    days = (datetime.now().date()
-            - earliest_article_date).total_seconds() / (60 * 60 * 24)
+        input("Enter earliest article date as string: ")
+    ).date()
+    days = (datetime.now().date() - earliest_article_date).total_seconds() / (
+        60 * 60 * 24
+    )
     months = math.ceil(days / 30)
 
     # Get the xpath from user
-    xpath = input('Paste xpath with no quotation marks: ')
+    xpath = input("Paste xpath with no quotation marks: ")
     # Gather the results
-    results = get_all_pages(driver, xpath, months, suffix='views')
-    print('Refresh page and click on reads')
+    results = get_all_pages(driver, xpath, months, suffix="views")
+    print("Refresh page and click on reads")
 
     # Get the xpath from user
-    xpath = input('Paste xpath with no quotation marks: ')
+    xpath = input("Paste xpath with no quotation marks: ")
     # Gather the results
-    results = get_all_pages(driver, xpath, months, suffix='reads')
+    results = get_all_pages(driver, xpath, months, suffix="reads")
 
 
-    print('Refresh page and click on fans')
+    print("Refresh page and click on fans")
     # Get the xpath from user
-    xpath = input('Paste xpath with no quotation marks: ')
+    xpath = input("Paste xpath with no quotation marks: ")
     # Gather the results
-    results = get_all_pages(driver, xpath, months, suffix='fans')
+    results = get_all_pages(driver, xpath, months, suffix="fans")
     print("Complete. All results saved in data directory.")
     print("Complete. All results saved in data directory.")

+ 12 - 5
medium/images/data_for_fitting.py

@@ -10,10 +10,16 @@ def data_for_fitting(*, building_id, date):
         previous_bday = pd.to_datetime(date) - BDay(1)
 
         # If a holiday, this will return None
-        lease_start = (db().execute(building_daily_stats.select().where(
-            building_daily_stats.c.building_id == building_id).where(
-                building_daily_stats.c.date == previous_bday)).fetchone().
-                       lease_obligations_start_at)
+        lease_start = (
+            db()
+            .execute(
+                building_daily_stats.select()
+                .where(building_daily_stats.c.building_id == building_id)
+                .where(building_daily_stats.c.date == previous_bday)
+            )
+            .fetchone()
+            .lease_obligations_start_at
+        )
 
         date = previous_bday
 
@@ -21,4 +27,5 @@ def data_for_fitting(*, building_id, date):
     return load_sensor_values(
         building_id=building_id,
         start_time=lease_start,
-        end_time=lease_start + timedelta(hours=8))
+        end_time=lease_start + timedelta(hours=8),
+    )

+ 100 - 76
medium/retrieval.py

@@ -9,7 +9,7 @@ import pandas as pd
 from datetime import datetime
 
 
-def get_table_rows(fname='stats.html'):
+def get_table_rows(fname="stats.html"):
     """
     """
     Extract the table rows from the statistics
     Extract the table rows from the statistics
 
 
@@ -18,20 +18,23 @@ def get_table_rows(fname='stats.html'):
     :return table_rows: list of BeautifulSoup objects to be passed to `process_in_parallel`
     """
 
-    soup = BeautifulSoup(
-        open(f'data/{fname}', 'r', encoding='utf8'), features='lxml')
-    table_rows = soup.find_all(
-        attrs={'class': "sortableTable-row js-statsTableRow"})
-    print(f'Found {len(table_rows)} entries in table.')
+    soup = BeautifulSoup(open(f"data/{fname}", "r", encoding="utf8"), features="lxml")
+    table_rows = soup.find_all(attrs={"class": "sortableTable-row js-statsTableRow"})
+    print(f"Found {len(table_rows)} entries in table.")
     return table_rows
 
 
 def convert_timestamp(ts: int, tz: str):
     """Convert a unix timestamp to a date timestamp"""
-    return pd.to_datetime(ts, origin='unix', unit='ms').tz_localize('UTC').tz_convert(tz).tz_localize(None)
+    return (
+        pd.to_datetime(ts, origin="unix", unit="ms")
+        .tz_localize("UTC")
+        .tz_convert(tz)
+        .tz_localize(None)
+    )
 
 
-def process_entry(entry, parallel=True, tz='America/Chicago'):
+def process_entry(entry, parallel=True, tz="America/Chicago"):
     """
     """
     Extract data from one entry in table
     Extract data from one entry in table
 
 
@@ -44,106 +47,127 @@ def process_entry(entry, parallel=True, tz='America/Chicago'):
     """
     """
     # Convert to soup when running in parallel
     # Convert to soup when running in parallel
     if parallel:
     if parallel:
-        entry = BeautifulSoup(entry, features='lxml').body.tr
+        entry = BeautifulSoup(entry, features="lxml").body.tr
 
     entry_dict = {}
     # Extract information
-    for value, key in zip(entry.find_all(attrs={'class': 'sortableTable-value'}),
-                          ['published_date', 'views', 'reads', 'ratio', 'fans']):
-        entry_dict[key] = float(
-            value.text) if key == 'ratio' else int(value.text)
+    for value, key in zip(
+        entry.find_all(attrs={"class": "sortableTable-value"}),
+        ["published_date", "views", "reads", "ratio", "fans"],
+    ):
+        entry_dict[key] = float(value.text) if key == "ratio" else int(value.text)
 
-    entry_dict['read_time'] = int(entry.find_all(attrs={'class': 'readingTime'})[
-                                  0].get('title').split(' ')[0])
+    entry_dict["read_time"] = int(
+        entry.find_all(attrs={"class": "readingTime"})[0].get("title").split(" ")[0]
+    )
 
     # Unlisted vs published
-    entry_dict['type'] = 'unlisted' if len(
-        entry.find_all(text=' Unlisted')) > 0 else 'published'
+    entry_dict["type"] = (
+        "unlisted" if len(entry.find_all(text=" Unlisted")) > 0 else "published"
+    )
 
     # Publication
-    publication = entry.find_all(attrs={'class': 'sortableTable-text'})
-    if 'In' in publication[0].text:
-        entry_dict['publication'] = publication[0].text.split('In ')[
-            1].split('View')[0]
+    publication = entry.find_all(attrs={"class": "sortableTable-text"})
+    if "In" in publication[0].text:
+        entry_dict["publication"] = publication[0].text.split("In ")[1].split("View")[0]
     else:
-        entry_dict['publication'] = 'None'
+        entry_dict["publication"] = "None"
 
     # Convert datetimes
-    entry_dict['published_date'] = convert_timestamp(
-        entry_dict['published_date'], tz=tz)
-    entry_dict['started_date'] = convert_timestamp(
-        entry.get('data-timestamp'), tz=tz)
+    entry_dict["published_date"] = convert_timestamp(
+        entry_dict["published_date"], tz=tz
+    )
+    entry_dict["started_date"] = convert_timestamp(entry.get("data-timestamp"), tz=tz)
 
     # Get the link
-    link = entry.find_all(text='View story',
-                               attrs={'class': 'sortableTable-link'})[0].get('href')
-    entry_dict['link'] = link
+    link = entry.find_all(text="View story", attrs={"class": "sortableTable-link"})[
+        0
+    ].get("href")
+    entry_dict["link"] = link
     # Retrieve the article and create a soup
     entry = requests.get(link).content
-    entry_soup = BeautifulSoup(entry, features='lxml')
+    entry_soup = BeautifulSoup(entry, features="lxml")
 
     # Get the title
     try:
         title = entry_soup.h1.text
     except:
-        title = 'response'
+        title = "response"
 
     title_word_count = len(re.findall(r"[\w']+|[.,!?;]", title))
 
     # Main text entries
-    entry_text = [p.text for p in entry_soup.find_all(
-        ['h1', 'h2', 'h3', 'p', 'blockquote'])]
+    entry_text = [
+        p.text for p in entry_soup.find_all(["h1", "h2", "h3", "p", "blockquote"])
+    ]
 
     # Make sure to catch everything
-    entry_text.extend(s.text for s in entry_soup.find_all(
-        attrs={'class': 'graf graf--li graf-after--li'}))
-    entry_text.extend(s.text for s in entry_soup.find_all(
-        attrs={'class': 'graf graf--li graf-after--p'}))
-    entry_text.extend(s.text for s in entry_soup.find_all(
-        attrs={'class': 'graf graf--li graf-after--blockquote'}))
-    entry_text.extend(s.text for s in entry_soup.find_all(
-        attrs={'class': 'graf graf--li graf-after--pullquote'}))
-
-    entry_text = ' '.join(entry_text)
+    entry_text.extend(
+        s.text
+        for s in entry_soup.find_all(attrs={"class": "graf graf--li graf-after--li"})
+    )
+    entry_text.extend(
+        s.text
+        for s in entry_soup.find_all(attrs={"class": "graf graf--li graf-after--p"})
+    )
+    entry_text.extend(
+        s.text
+        for s in entry_soup.find_all(
+            attrs={"class": "graf graf--li graf-after--blockquote"}
+        )
+    )
+    entry_text.extend(
+        s.text
+        for s in entry_soup.find_all(
+            attrs={"class": "graf graf--li graf-after--pullquote"}
+        )
+    )
+
+    entry_text = " ".join(entry_text)
 
     # Word count
     word_count = len(re.findall(r"[\w']+|[.,!?;]", entry_text))
 
     # Number of claps
     clap_pattern = re.compile(
-        '^[0-9]{1,} claps|^[0-9]{1,}.[0-9]{1,}K claps|^[0-9]{1,}K claps')
+        "^[0-9]{1,} claps|^[0-9]{1,}.[0-9]{1,}K claps|^[0-9]{1,}K claps"
+    )
     claps = entry_soup.find_all(text=clap_pattern)
 
     if len(claps) > 0:
-        if 'K' in claps[0]:
-            clap_number = int(1e3 * float(claps[0].split('K')[0]))
+        if "K" in claps[0]:
+            clap_number = int(1e3 * float(claps[0].split("K")[0]))
         else:
-            clap_number = int(claps[0].split(' ')[0])
+            clap_number = int(claps[0].split(" ")[0])
     else:
         clap_number = 0
 
     # Post tags
-    tags = entry_soup.find_all(
-        attrs={'class': 'tags tags--postTags tags--borderless'})
-    tags = [li.text for li in tags[0].find_all('li')]
+    tags = entry_soup.find_all(attrs={"class": "tags tags--postTags tags--borderless"})
+    tags = [li.text for li in tags[0].find_all("li")]
 
     # Responses to entry
-    responses = entry_soup.find_all(attrs={'class': 'button button--chromeless u-baseColor--buttonNormal u-marginRight12',
-                                           'data-action': 'scroll-to-responses'})
+    responses = entry_soup.find_all(
+        attrs={
+            "class": "button button--chromeless u-baseColor--buttonNormal u-marginRight12",
+            "data-action": "scroll-to-responses",
+        }
+    )
     num_responses = int(responses[0].text) if len(responses) > 0 else 0
 
     # Store in dictionary
-    entry_dict['title'] = title
-    entry_dict['title_word_count'] = title_word_count
-    entry_dict['text'] = entry_text
-    entry_dict['word_count'] = word_count
-    entry_dict['claps'] = clap_number
-    entry_dict['tags'] = tags
-    entry_dict['num_responses'] = num_responses
+    entry_dict["title"] = title
+    entry_dict["title_word_count"] = title_word_count
+    entry_dict["text"] = entry_text
+    entry_dict["word_count"] = word_count
+    entry_dict["claps"] = clap_number
+    entry_dict["tags"] = tags
+    entry_dict["num_responses"] = num_responses
 
     # Time since publication
-    entry_dict['days_since_publication'] = (
-        datetime.now() - entry_dict['published_date']).total_seconds() / (3600 * 24)
+    entry_dict["days_since_publication"] = (
+        datetime.now() - entry_dict["published_date"]
+    ).total_seconds() / (3600 * 24)
 
     return entry_dict
 
@@ -173,44 +197,44 @@ def process_in_parallel(table_rows, processes=20):
     start = timer()
     for i, r in enumerate(pool.imap_unordered(process_entry, table_rows_str)):
         # Report progress
-        print(f'{100 * i / len(table_rows_str):.2f}% complete.', end='\r')
+        print(f"{100 * i / len(table_rows_str):.2f}% complete.", end="\r")
         results.append(r)
     pool.close()
     pool.join()
     end = timer()
-    print(
-        f'Processed {len(table_rows_str)} articles in {end-start:.2f} seconds.')
+    print(f"Processed {len(table_rows_str)} articles in {end-start:.2f} seconds.")
 
     # Convert to dataframe
     df = pd.DataFrame(results)
     # Rename ratio
-    df.rename(columns={'ratio': 'read_ratio'}, inplace=True)
+    df.rename(columns={"ratio": "read_ratio"}, inplace=True)
     # Add extra columns with more data
-    df['claps_per_word'] = df['claps'] / df['word_count']
-    df['editing_days'] = ((df['published_date'] - df['started_date']
-                           ).dt.total_seconds() / (60 * 60 * 24)).astype(int)
+    df["claps_per_word"] = df["claps"] / df["word_count"]
+    df["editing_days"] = (
+        (df["published_date"] - df["started_date"]).dt.total_seconds() / (60 * 60 * 24)
+    ).astype(int)
 
     # Rounding
-    df['published_date'] = df['published_date'].dt.round('min')
-    df['started_date'] = df['started_date'].dt.round('min')
-    df['read_ratio'] = df['read_ratio'].round(2)
+    df["published_date"] = df["published_date"].dt.round("min")
+    df["started_date"] = df["started_date"].dt.round("min")
+    df["read_ratio"] = df["read_ratio"].round(2)
 
     # 5 most common tags (might want to include more tags)
     n = 5
-    all_tags = list(chain(*df['tags'].tolist()))
+    all_tags = list(chain(*df["tags"].tolist()))
     tag_counts = Counter(all_tags)
     tags = tag_counts.most_common(n)
 
     # Adding columns with indication of tag
     for tag, count in tags:
-        flag = [1 if tag in tags else 0 for tags in df['tags']]
-        df.loc[:, f'<tag>{tag}'] = flag
+        flag = [1 if tag in tags else 0 for tags in df["tags"]]
+        df.loc[:, f"<tag>{tag}"] = flag
 
-    df.sort_values('published_date', inplace=True)
+    df.sort_values("published_date", inplace=True)
     return df
 
 
-def get_data(fname='stats.html', processes=20):
+def get_data(fname="stats.html", processes=20):
     """
     """
     Retrieve medium article statistics
     Retrieve medium article statistics
 
 

+ 19 - 14
medium/view_extraction.py

@@ -6,33 +6,38 @@ import pandas as pd
 
 
 def process_bargraph(bargraph):
-    bardata = [bar.get('data-tooltip')
-               for bar in bargraph.find_all(attrs={'class': 'bargraph-bar'})]
+    bardata = [
+        bar.get("data-tooltip")
+        for bar in bargraph.find_all(attrs={"class": "bargraph-bar"})
+    ]
     print(len(bardata))
     return
     # Sort by xposition
-    bardata = sorted(bardata, key=lambda x: float(x.get('x')))
-    views = [float(s.split(' ')[0].replace(',', '')) for s in bardata]
-    dates = [s.split(' ')[-1].split('\xa0')[0] + ' '
-             + s.split(' ')[-1].split('\xa0')[1] for s in bardata]
+    bardata = sorted(bardata, key=lambda x: float(x.get("x")))
+    views = [float(s.split(" ")[0].replace(",", "")) for s in bardata]
+    dates = [
+        s.split(" ")[-1].split("\xa0")[0] + " " + s.split(" ")[-1].split("\xa0")[1]
+        for s in bardata
+    ]
     year = str((datetime.now() - pd.Timedelta(days=i * 30)).year)
-    dates = [parser.parse(d + ' ' + year) for d in dates]
+    dates = [parser.parse(d + " " + year) for d in dates]
     return views, dates
 
 
-files = os.listdir('html_pages')
+files = os.listdir("html_pages")
 
 v = []
 d = []
 
 for fid in files:
-    i = int(fid.split('.')[0].split('p')[1])
-    graph = BeautifulSoup(
-        open(f'html_pages/{fid}', 'r')).find_all(attrs={'class': 'bargraph'})[0]
+    i = int(fid.split(".")[0].split("p")[1])
+    graph = BeautifulSoup(open(f"html_pages/{fid}", "r")).find_all(
+        attrs={"class": "bargraph"}
+    )[0]
     r = process_bargraph(graph, i)
     v.extend(r[0])
     d.extend(r[1])
-    results = pd.DataFrame({'date': d, 'views': v})
+    results = pd.DataFrame({"date": d, "views": v})
 
-results['date'] = pd.to_datetime(results['date'])
-results.to_parquet('medium_views_time')
+results["date"] = pd.to_datetime(results["date"])
+results.to_parquet("medium_views_time")

+ 198 - 102
medium/visuals.py

@@ -9,6 +9,7 @@ from scipy import stats
 
 import plotly.graph_objs as go
 import cufflinks
+
 cufflinks.go_offline()
 
 
@@ -31,7 +32,7 @@ def make_hist(df, x, category=None):
 
     layout = go.Layout(
         yaxis=dict(title="Count"),
-        xaxis=dict(title=x.replace('_', ' ').title()),
+        xaxis=dict(title=x.replace("_", " ").title()),
         title=f"{x.replace('_', ' ').title()} Distribution by {category.replace('_', ' ').title()}"
         if category
         else f"{x.replace('_', ' ').title()} Distribution",
@@ -63,8 +64,7 @@ def make_cum_plot(df, y, category=None, ranges=False):
                     mode="lines+markers",
                     mode="lines+markers",
                     text=group["title"],
                     text=group["title"],
                     name=name,
                     name=name,
-                    marker=dict(size=10, opacity=0.8,
-                                symbol=i + 2),
+                    marker=dict(size=10, opacity=0.8, symbol=i + 2),
                 )
             )
     else:
@@ -77,17 +77,27 @@ def make_cum_plot(df, y, category=None, ranges=False):
                     name=y[0].title(),
                     mode="lines+markers",
                     text=df["title"],
-                    marker=dict(size=10, color='blue', opacity=0.6, line=dict(color='black'),
-                                )),
+                    marker=dict(
+                        size=10,
+                        color="blue",
+                        opacity=0.6,
+                        line=dict(color="black"),
+                    ),
+                ),
                 go.Scatter(
                     x=df["published_date"],
                     y=df[y[1]].cumsum(),
-                    yaxis='y2',
+                    yaxis="y2",
                     name=y[1].title(),
                     mode="lines+markers",
                     text=df["title"],
-                    marker=dict(size=10, color='red', opacity=0.6, line=dict(color='black'),
-                                )),
+                    marker=dict(
+                        size=10,
+                        color="red",
+                        opacity=0.6,
+                        line=dict(color="black"),
+                    ),
+                ),
             ]
         else:
             data = [
@@ -96,23 +106,31 @@ def make_cum_plot(df, y, category=None, ranges=False):
                     y=df[y].cumsum(),
                     mode="lines+markers",
                     text=df["title"],
-                    marker=dict(size=12, color='blue', opacity=0.6, line=dict(color='black'),
-                                ),
+                    marker=dict(
+                        size=12,
+                        color="blue",
+                        opacity=0.6,
+                        line=dict(color="black"),
+                    ),
                 )
             ]
     if len(y) == 2:
         layout = go.Layout(
             xaxis=dict(title="Published Date", type="date"),
-            yaxis=dict(title=y[0].replace('_', ' ').title(), color='blue'),
-            yaxis2=dict(title=y[1].replace('_', ' ').title(), color='red',
-                        overlaying='y', side='right'),
+            yaxis=dict(title=y[0].replace("_", " ").title(), color="blue"),
+            yaxis2=dict(
+                title=y[1].replace("_", " ").title(),
+                color="red",
+                overlaying="y",
+                side="right",
+            ),
             font=dict(size=14),
             title=f"Cumulative {y[0].title()} and {y[1].title()}",
         )
     else:
         layout = go.Layout(
             xaxis=dict(title="Published Date", type="date"),
-            yaxis=dict(title=y.replace('_', ' ').title()),
+            yaxis=dict(title=y.replace("_", " ").title()),
             font=dict(size=14),
             title=f"Cumulative {y.replace('_', ' ').title()} by {category.replace('_', ' ').title()}"
             if category is not None
@@ -134,14 +152,27 @@ def make_cum_plot(df, y, category=None, ranges=False):
         rangeslider = dict(visible=True)
         layout["xaxis"]["rangeselector"] = rangeselector
         layout["xaxis"]["rangeslider"] = rangeslider
-        layout['width'] = 1000
-        layout['height'] = 600
+        layout["width"] = 1000
+        layout["height"] = 600
 
     figure = go.Figure(data=data, layout=layout)
     return figure
 
 
-def make_scatter_plot(df, x, y, fits=None, xlog=False, ylog=False, category=None, scale=None, sizeref=2, annotations=None, ranges=False, title_override=None):
+def make_scatter_plot(
+    df,
+    x,
+    y,
+    fits=None,
+    xlog=False,
+    ylog=False,
+    category=None,
+    scale=None,
+    sizeref=2,
+    annotations=None,
+    ranges=False,
+    title_override=None,
+):
     """
     """
     Make an interactive scatterplot, optionally segmented by `category`
     Make an interactive scatterplot, optionally segmented by `category`
 
 
@@ -164,48 +195,83 @@ def make_scatter_plot(df, x, y, fits=None, xlog=False, ylog=False, category=None
         title = f"{y.replace('_', ' ').title()} vs {x.replace('_', ' ').title()} by {category.replace('_', ' ').title()}"
         data = []
         for i, (name, group) in enumerate(df.groupby(category)):
-            data.append(go.Scatter(x=group[x],
-                                   y=group[y],
-                                   mode='markers',
-                                   text=group['title'],
-                                   name=name,
-                                   marker=dict(size=8, symbol=i + 2)))
+            data.append(
+                go.Scatter(
+                    x=group[x],
+                    y=group[y],
+                    mode="markers",
+                    text=group["title"],
+                    name=name,
+                    marker=dict(size=8, symbol=i + 2),
+                )
+            )
 
     else:
         if scale is not None:
             title = f"{y.replace('_', ' ').title()} vs {x.replace('_', ' ').title()} Scaled by {scale.title()}"
-            data = [go.Scatter(x=df[x],
-                               y=df[y],
-                               mode='markers',
-                               text=df['title'], marker=dict(size=df[scale],
-                                                             line=dict(color='black', width=0.5), sizemode='area', sizeref=sizeref, opacity=0.8,
-                                                             colorscale='Viridis', color=df[scale], showscale=True, sizemin=2))]
+            data = [
+                go.Scatter(
+                    x=df[x],
+                    y=df[y],
+                    mode="markers",
+                    text=df["title"],
+                    marker=dict(
+                        size=df[scale],
+                        line=dict(color="black", width=0.5),
+                        sizemode="area",
+                        sizeref=sizeref,
+                        opacity=0.8,
+                        colorscale="Viridis",
+                        color=df[scale],
+                        showscale=True,
+                        sizemin=2,
+                    ),
+                )
+            ]
         else:
 
             df.sort_values(x, inplace=True)
             title = f"{y.replace('_', ' ').title()} vs {x.replace('_', ' ').title()}"
-            data = [go.Scatter(x=df[x],
-                               y=df[y],
-                               mode='markers',
-                               text=df['title'], marker=dict(
-                size=12, color='blue', opacity=0.8, line=dict(color='black')),
-                name='observations')]
+            data = [
+                go.Scatter(
+                    x=df[x],
+                    y=df[y],
+                    mode="markers",
+                    text=df["title"],
+                    marker=dict(
+                        size=12, color="blue", opacity=0.8, line=dict(color="black")
+                    ),
+                    name="observations",
+                )
+            ]
             if fits is not None:
                 for fit in fits:
-                    data.append(go.Scatter(x=df[x], y=df[fit], text=df['title'],
-                                           mode='lines+markers', marker=dict
-                                           (size=8, opacity=0.6),
-                                           line=dict(dash='dash'), name=fit))
-
-                title += ' with Fit'
-    layout = go.Layout(annotations=annotations,
-                       xaxis=dict(title=x.replace('_', ' ').title() + (' (log scale)' if xlog else ''),
-                                  type='log' if xlog else None),
-                       yaxis=dict(title=y.replace('_', ' ').title() + (' (log scale)' if ylog else ''),
-                                  type='log' if ylog else None),
-                       font=dict(size=14),
-                       title=title if title_override is None else title_override,
-                       )
+                    data.append(
+                        go.Scatter(
+                            x=df[x],
+                            y=df[fit],
+                            text=df["title"],
+                            mode="lines+markers",
+                            marker=dict(size=8, opacity=0.6),
+                            line=dict(dash="dash"),
+                            name=fit,
+                        )
+                    )
+
+                title += " with Fit"
+    layout = go.Layout(
+        annotations=annotations,
+        xaxis=dict(
+            title=x.replace("_", " ").title() + (" (log scale)" if xlog else ""),
+            type="log" if xlog else None,
+        ),
+        yaxis=dict(
+            title=y.replace("_", " ").title() + (" (log scale)" if ylog else ""),
+            type="log" if ylog else None,
+        ),
+        font=dict(size=14),
+        title=title if title_override is None else title_override,
+    )
 
     # Add a rangeselector and rangeslider for a data xaxis
     if ranges:
@@ -222,8 +288,8 @@ def make_scatter_plot(df, x, y, fits=None, xlog=False, ylog=False, category=None
         rangeslider = dict(visible=True)
         layout["xaxis"]["rangeselector"] = rangeselector
         layout["xaxis"]["rangeslider"] = rangeslider
-        layout['width'] = 1000
-        layout['height'] = 600
+        layout["width"] = 1000
+        layout["height"] = 600
 
     figure = go.Figure(data=data, layout=layout)
     return figure
@@ -243,14 +309,16 @@ def make_linear_regression(df, x, y, intercept_0):
         lin_model = LinearRegression()
         lin_model.fit(df[x], df[y])
 
-        slopes, intercept, = lin_model.coef_, lin_model.intercept_
-        df['predicted'] = lin_model.predict(df[x])
+        slopes, intercept, = (
+            lin_model.coef_,
+            lin_model.intercept_,
+        )
+        df["predicted"] = lin_model.predict(df[x])
         r2 = lin_model.score(df[x], df[y])
-        rmse = np.sqrt(mean_squared_error(
-            y_true=df[y], y_pred=df['predicted']))
+        rmse = np.sqrt(mean_squared_error(y_true=df[y], y_pred=df["predicted"]))
         equation = f'{y.replace("_", " ")} ='
 
-        names = ['r2', 'rmse', 'intercept']
+        names = ["r2", "rmse", "intercept"]
         values = [r2, rmse, intercept]
         for i, (p, s) in enumerate(zip(x, slopes)):
             if (i + 1) % 3 == 0:
@@ -260,19 +328,26 @@ def make_linear_regression(df, x, y, intercept_0):
             names.append(p)
             values.append(s)
 
-        equation += f' {intercept:.2f}'
-        annotations = [dict(x=0.4 * df.index.max(), y=0.9 * df[y].max(), showarrow=False,
-                            text=equation,
-                            font=dict(size=10))]
+        equation += f" {intercept:.2f}"
+        annotations = [
+            dict(
+                x=0.4 * df.index.max(),
+                y=0.9 * df[y].max(),
+                showarrow=False,
+                text=equation,
+                font=dict(size=10),
+            )
+        ]
 
-        df['index'] = list(df.index)
-        figure = make_scatter_plot(df, x='index', y=y, fits=[
-                                   'predicted'], annotations=annotations)
-        summary = pd.DataFrame({'name': names, 'value': values})
+        df["index"] = list(df.index)
+        figure = make_scatter_plot(
+            df, x="index", y=y, fits=["predicted"], annotations=annotations
+        )
+        summary = pd.DataFrame({"name": names, "value": values})
     else:
         if intercept_0:
             lin_reg = sm.OLS(df[y], df[x]).fit()
-            df['fit_values'] = lin_reg.fittedvalues
+            df["fit_values"] = lin_reg.fittedvalues
             summary = lin_reg.summary()
             slope = float(lin_reg.params)
             equation = f"${y.replace('_', ' ')} = {slope:.2f} * {x.replace('_', ' ')}$"
@@ -280,19 +355,26 @@ def make_linear_regression(df, x, y, intercept_0):
         else:
             lin_reg = stats.linregress(df[x], df[y])
             intercept, slope = lin_reg.intercept, lin_reg.slope
-            params = ['pvalue', 'rvalue', 'slope', 'intercept']
+            params = ["pvalue", "rvalue", "slope", "intercept"]
             values = []
             for p in params:
                 values.append(getattr(lin_reg, p))
-            summary = pd.DataFrame({'param': params, 'value': values})
-            df['fit_values'] = df[x] * slope + intercept
+            summary = pd.DataFrame({"param": params, "value": values})
+            df["fit_values"] = df[x] * slope + intercept
             equation = f"${y.replace('_', ' ')} = {slope:.2f} * {x.replace('_', ' ')} + {intercept:.2f}$"
 
-        annotations = [dict(x=0.75 * df[x].max(), y=0.9 * df[y].max(), showarrow=False,
-                            text=equation,
-                            font=dict(size=32))]
+        annotations = [
+            dict(
+                x=0.75 * df[x].max(),
+                y=0.9 * df[y].max(),
+                showarrow=False,
+                text=equation,
+                font=dict(size=32),
+            )
+        ]
         figure = make_scatter_plot(
-            df, x=x, y=y, fits=['fit_values'], annotations=annotations)
+            df, x=x, y=y, fits=["fit_values"], annotations=annotations
+        )
     return figure, summary
 
 
@@ -317,15 +399,14 @@ def make_poly_fits(df, x, y, degree=6):
 
     # Make each fit
     for i in range(1, degree + 1):
-        fit_name = f'fit degree = {i}'
+        fit_name = f"fit degree = {i}"
         fit_list.append(fit_name)
         z, res, *rest = np.polyfit(df[x], df[y], i, full=True)
         fit_params.append(z)
         df.loc[:, fit_name] = np.poly1d(z)(df[x])
         rmse.append(np.sqrt(res[0]))
 
-    fit_stats = pd.DataFrame(
-        {'fit': fit_list, 'rmse': rmse, 'params': fit_params})
+    fit_stats = pd.DataFrame({"fit": fit_list, "rmse": rmse, "params": fit_params})
     figure = make_scatter_plot(df, x=x, y=y, fits=fit_list)
     return figure, fit_stats
 
@@ -344,47 +425,62 @@ def make_extrapolation(df, y, years, degree=4):
     """
     """
 
 
     df = df.copy()
     df = df.copy()
-    x = 'days_since_start'
-    df['days_since_start'] = (
-        (df['published_date'] - df['published_date'].min()).
-        dt.total_seconds() / (3600 * 24)).astype(int)
+    x = "days_since_start"
+    df["days_since_start"] = (
+        (df["published_date"] - df["published_date"].min()).dt.total_seconds()
+        / (3600 * 24)
+    ).astype(int)
 
-    cumy = f'cum_{y}'
+    cumy = f"cum_{y}"
     df[cumy] = df.sort_values(x)[y].cumsum()
 
     figure, summary = make_poly_fits(df, x, cumy, degree=degree)
 
-    min_date = df['published_date'].min()
-    max_date = df['published_date'].max()
+    min_date = df["published_date"].min()
+    max_date = df["published_date"].max()
 
-    date_range = pd.date_range(start=min_date,
-                               end=max_date + pd.Timedelta(days=int(years * 365)))
+    date_range = pd.date_range(
+        start=min_date, end=max_date + pd.Timedelta(days=int(years * 365))
+    )
 
-    future_df = pd.DataFrame({'date': date_range})
+    future_df = pd.DataFrame({"date": date_range})
 
     future_df[x] = (
-        (future_df['date'] - future_df['date'].min()).
-        dt.total_seconds() / (3600 * 24)).astype(int)
+        (future_df["date"] - future_df["date"].min()).dt.total_seconds() / (3600 * 24)
+    ).astype(int)
 
-    newcumy = f'cumulative_{y}'
+    newcumy = f"cumulative_{y}"
 
-    future_df = future_df.merge(df[[x, cumy]], on=x, how='left').\
-        rename(columns={cumy: newcumy})
+    future_df = future_df.merge(df[[x, cumy]], on=x, how="left").rename(
+        columns={cumy: newcumy}
+    )
 
-    z = np.poly1d(summary.iloc[-1]['params'])
-    pred_name = f'predicted_{y}'
+    z = np.poly1d(summary.iloc[-1]["params"])
+    pred_name = f"predicted_{y}"
     future_df[pred_name] = z(future_df[x])
-    future_df['title'] = ''
-
-    last_date = future_df.loc[future_df['date'].idxmax()]
-    prediction_text = (
-        f"On {last_date['date'].date()} the {y} will be {float(last_date[pred_name]):,.0f}.")
-    annotations = [dict(x=future_df['date'].quantile(0.4),
-                        y=0.8 * future_df[pred_name].max(), text=prediction_text, showarrow=False,
-                        font=dict(size=16))]
+    future_df["title"] = ""
+
+    last_date = future_df.loc[future_df["date"].idxmax()]
+    prediction_text = f"On {last_date['date'].date()} the {y} will be {float(last_date[pred_name]):,.0f}."
+    annotations = [
+        dict(
+            x=future_df["date"].quantile(0.4),
+            y=0.8 * future_df[pred_name].max(),
+            text=prediction_text,
+            showarrow=False,
+            font=dict(size=16),
+        )
+    ]
 
     title_override = f'{y.replace("_", " ").title()} with Extrapolation {years} Years into the Future'
 
-    figure = make_scatter_plot(future_df, 'date', newcumy, fits=[
-                               pred_name], annotations=annotations, ranges=True, title_override=title_override)
+    figure = make_scatter_plot(
+        future_df,
+        "date",
+        newcumy,
+        fits=[pred_name],
+        annotations=annotations,
+        ranges=True,
+        title_override=title_override,
+    )
     return figure, future_df

+ 15 - 7
sentdex_data_analysis/HPI_tpot_pipeline.py

@@ -7,15 +7,23 @@ from sklearn.pipeline import make_pipeline, make_union
 from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler, MinMaxScaler
 
 # NOTE: Make sure that the class is labeled 'class' in the data file
-tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
-features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1)
-training_features, testing_features, training_classes, testing_classes = \
-    train_test_split(features, tpot_data['class'], random_state=42)
+tpot_data = np.recfromcsv(
+    "PATH/TO/DATA/FILE", delimiter="COLUMN_SEPARATOR", dtype=np.float64
+)
+features = np.delete(
+    tpot_data.view(np.float64).reshape(tpot_data.size, -1),
+    tpot_data.dtype.names.index("class"),
+    axis=1,
+)
+(
+    training_features,
+    testing_features,
+    training_classes,
+    testing_classes,
+) = train_test_split(features, tpot_data["class"], random_state=42)
 
 exported_pipeline = make_pipeline(
-    MaxAbsScaler(),
-    MinMaxScaler(),
-    LogisticRegression(C=49.0, dual=True, penalty="l2")
+    MaxAbsScaler(), MinMaxScaler(), LogisticRegression(C=49.0, dual=True, penalty="l2")
 )
 
 exported_pipeline.fit(training_features, training_classes)

+ 18 - 12
sentdex_data_analysis/pandas_IO.py

@@ -1,34 +1,40 @@
-import pandas as pd 
+import pandas as pd
 
-df = pd.read_csv('ZILL-Z77006_C.csv') # reading in file
-df.set_index('Date', inplace = True) # setting index to date column
+df = pd.read_csv("ZILL-Z77006_C.csv")  # reading in file
+df.set_index("Date", inplace=True)  # setting index to date column
 
 print(df.head())
 
 # df.to_csv('ZILLOW_44106.csv')
 
-df = pd.read_csv('ZILLOW_44106.csv', index_col=0) # reading in file and setting index to the first column
+df = pd.read_csv(
+    "ZILLOW_44106.csv", index_col=0
+)  # reading in file and setting index to the first column
 
 print(df.head())
 
-df.columns = ['Cleveland_HPI'] # House Price Index # renaming the columns
+df.columns = ["Cleveland_HPI"]  # House Price Index # renaming the columns
 
 # print(df.head())
 
 # df.to_csv('ZILLOW_44106_Rev3.csv', header = False)
 
 # reading in data, renaming columns, and setting index as first column
-df = pd.read_csv('ZILLOW_44106_Rev3.csv', names=['Date', 'Cleveland_HPI'], index_col=0)
+df = pd.read_csv("ZILLOW_44106_Rev3.csv", names=["Date", "Cleveland_HPI"], index_col=0)
 
 # print(df.head())
 
-df.to_html('example.html')  # to HTML (viewable in a web browser)
+df.to_html("example.html")  # to HTML (viewable in a web browser)
 
-df = pd.read_csv('ZILLOW_44106_Rev3.csv', names=['Date', 'Cleveland_HPI']) # reading in data and setting headers of columns
+df = pd.read_csv(
+    "ZILLOW_44106_Rev3.csv", names=["Date", "Cleveland_HPI"]
+)  # reading in data and setting headers of columns
 print(df.head())
 
-df.rename(columns={'Cleveland_HPI': 'Cleveland_44106_HPI'}, inplace = True) # renaming a column
-df.rename(columns={'Cleveland_44106_HPI' : 'Cleveland_HPI'}, inplace=True)
-df.set_index('Date', inplace = True)
+df.rename(
+    columns={"Cleveland_HPI": "Cleveland_44106_HPI"}, inplace=True
+)  # renaming a column
+df.rename(columns={"Cleveland_44106_HPI": "Cleveland_HPI"}, inplace=True)
+df.set_index("Date", inplace=True)
 
-print(df.head())
+print(df.head())

+ 19 - 11
sentdex_data_analysis/pandas_TPOT.py

@@ -1,11 +1,15 @@
-import pandas as pd 
-import numpy as np 
+import pandas as pd
+import numpy as np
 from tpot import TPOTClassifier
 from sklearn.model_selection import train_test_split
 
-benchmark = pd.read_pickle('us_pct.pickle')  # us overall housing price index percentage change
-HPI = pd.read_pickle('HPI_complete.pickle') # all of the state data, thirty year mortgage, unemployment rate, GDP, SP500
-HPI = HPI.join(benchmark['United States'])
+benchmark = pd.read_pickle(
+    "us_pct.pickle"
+)  # us overall housing price index percentage change
+HPI = pd.read_pickle(
+    "HPI_complete.pickle"
+)  # all of the state data, thirty year mortgage, unemployment rate, GDP, SP500
+HPI = HPI.join(benchmark["United States"])
 # all in percentage change since the start of the data (1975-01-01)
 
 HPI.dropna(inplace=True)
@@ -13,25 +17,29 @@ HPI.dropna(inplace=True)
 housing_pct = HPI.pct_change()
 housing_pct.replace([np.inf, -np.inf], np.nan, inplace=True)
 
-housing_pct['US_HPI_future'] = housing_pct['United States'].shift(-1)
+housing_pct["US_HPI_future"] = housing_pct["United States"].shift(-1)
 housing_pct.dropna(inplace=True)
 
+
 def create_labels(cur_hpi, fut_hpi):
     if fut_hpi > cur_hpi:
         return 1
     else:
         return 0
 
-housing_pct['label'] = list(map(create_labels, housing_pct['United States'], housing_pct['US_HPI_future']))
+
+housing_pct["label"] = list(
+    map(create_labels, housing_pct["United States"], housing_pct["US_HPI_future"])
+)
 
 # housing_pct['ma_apply_example'] = housing_pct['M30'].rolling(window=10).apply(moving_average)
 # print(housing_pct.tail())
-X = np.array(housing_pct.drop(['label', 'US_HPI_future'], 1))
-y = np.array(housing_pct['label'])
+X = np.array(housing_pct.drop(["label", "US_HPI_future"], 1))
+y = np.array(housing_pct["label"])
 
-X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
 
 tpot = TPOTClassifier(generations=10, population_size=20, verbosity=2)
 tpot.fit(X_train, y_train)
 print(tpot.score(X_test, y_test))
-tpot.export('HPI_tpot_pipeline.py')
+tpot.export("HPI_tpot_pipeline.py")

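The create_labels/map pattern above evaluates one row at a time; a vectorized sketch of the same labeling (my alternative, not part of the commit):

import numpy as np
# 1 when next month's US HPI change beats this month's, else 0
housing_pct["label"] = np.where(
    housing_pct["US_HPI_future"] > housing_pct["United States"], 1, 0
)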
+ 42 - 29
sentdex_data_analysis/pandas_additionalEconomic.py

@@ -1,70 +1,83 @@
 import pickle
-import pandas as pd 
-import quandl 
-import matplotlib.pyplot as plt 
+import pandas as pd
+import quandl
+import matplotlib.pyplot as plt
 from matplotlib import style
 
-style.use('seaborn')
+style.use("seaborn")
+
+quandl.ApiConfig.api_key = "rFsSehe51RLzREtYhLfo"
 
-quandl.ApiConfig.api_key = 'rFsSehe51RLzREtYhLfo'
 
 def mortgage_30yr():
-	df = quandl.get('FMAC/MORTG', trim_start="1975-01-01")
-	df['Value'] = (df['Value'] - df['Value'][0]) / df['Value'][0] * 100
-	df = df.resample('M').mean()
-	df.rename(columns={'Value': 'M30'}, inplace=True)
-	df = df['M30']
-	return df 
+    df = quandl.get("FMAC/MORTG", trim_start="1975-01-01")
+    df["Value"] = (df["Value"] - df["Value"][0]) / df["Value"][0] * 100
+    df = df.resample("M").mean()
+    df.rename(columns={"Value": "M30"}, inplace=True)
+    df = df["M30"]
+    return df
+
 
 def sp500_data():
     df = quandl.get("YAHOO/INDEX_GSPC", trim_start="1975-01-01")
-    df["Adjusted Close"] = (df["Adjusted Close"]-df["Adjusted Close"][0]) / df["Adjusted Close"][0] * 100.0
-    df=df.resample('M').mean()
-    df.rename(columns={'Adjusted Close':'sp500'}, inplace=True)
-    df = df['sp500']
+    df["Adjusted Close"] = (
+        (df["Adjusted Close"] - df["Adjusted Close"][0])
+        / df["Adjusted Close"][0]
+        * 100.0
+    )
+    df = df.resample("M").mean()
+    df.rename(columns={"Adjusted Close": "sp500"}, inplace=True)
+    df = df["sp500"]
     return df
 
+
 def gdp_data():
     df = quandl.get("BCB/4385", trim_start="1975-01-01")
-    df["Value"] = (df["Value"]-df["Value"][0]) / df["Value"][0] * 100.0
-    df=df.resample('M').mean()
-    df.rename(columns={'Value':'GDP'}, inplace=True)
-    df = df['GDP'] # DataFrame to Series
+    df["Value"] = (df["Value"] - df["Value"][0]) / df["Value"][0] * 100.0
+    df = df.resample("M").mean()
+    df.rename(columns={"Value": "GDP"}, inplace=True)
+    df = df["GDP"]  # DataFrame to Series
    return df
 
+
 def us_unemployment():
     df = quandl.get("ECPI/JOB_G", trim_start="1975-01-01")
-    df["Unemployment Rate"] = (df["Unemployment Rate"]-df["Unemployment Rate"][0]) / df["Unemployment Rate"][0] * 100.0
-    df=df.resample('1D').mean()
-    df=df.resample('M').mean()
+    df["Unemployment Rate"] = (
+        (df["Unemployment Rate"] - df["Unemployment Rate"][0])
+        / df["Unemployment Rate"][0]
+        * 100.0
+    )
+    df = df.resample("1D").mean()
+    df = df.resample("M").mean()
     return df
 
+
 # m30 = mortgage_30yr() # Series
 # sp500 = sp500_data() # Series
 # gdp = gdp_data() # Series
 # unemployment = us_unemployment() # DataFrame
 # HPI = HPI_data.join([m30, unemployment, gdp, sp500])
 
-ax1 = plt.subplot(2,1,1)
-ax2 = plt.subplot(2,1,2, sharex=ax1)
+ax1 = plt.subplot(2, 1, 1)
+ax2 = plt.subplot(2, 1, 2, sharex=ax1)
 
 # initial_state_data()
 
-pickle_in = open('fifty_states_pct.pickle' , 'rb')
+pickle_in = open("fifty_states_pct.pickle", "rb")
 HPI_data = pickle.load(pickle_in)
 
 # HPI_Benchmark()
 
-pickle_in = open('us_pct.pickle','rb')
+pickle_in = open("us_pct.pickle", "rb")
 benchmark = pickle.load(pickle_in)
 
-pickle_in = open('HPI_complete.pickle', 'rb')
+pickle_in = open("HPI_complete.pickle", "rb")
 HPI = pickle.load(pickle_in)
 HPI.dropna(inplace=True)
 print(HPI.head())
 
 
-state_HPI_M30 = HPI_data.join(HPI['M30'])
+state_HPI_M30 = HPI_data.join(HPI["M30"])
 
 
-# print(state_HPI_M30.corr().describe()['M30'])
+# print(state_HPI_M30.corr().describe()['M30'])

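Each series above is rebased to percent change from its first observation, (v - v0) / v0 * 100, so the 1975 starting point reads 0 and later values are comparable across otherwise incompatible units. A quick check with illustrative numbers:

values = [9.0, 8.1, 7.2]
rebased = [(v - values[0]) / values[0] * 100 for v in values]
print(rebased)  # [0.0, -10.0, -20.0]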
+ 13 - 10
sentdex_data_analysis/pandas_basics.py

@@ -1,25 +1,28 @@
-import pandas as pd 
-import matplotlib.pyplot as plt 
+import pandas as pd
+import matplotlib.pyplot as plt
 from matplotlib import style
-style.use('ggplot')
 
-web_stats = {'Day': [1,2,3,4,5,6],
-			 'Visitors': [54, 65, 76, 76, 34, 34],
-			 'Bounce_Rate': [54, 23, 32, 54, 54, 32]}
+style.use("ggplot")
+
+web_stats = {
+    "Day": [1, 2, 3, 4, 5, 6],
+    "Visitors": [54, 65, 76, 76, 34, 34],
+    "Bounce_Rate": [54, 23, 32, 54, 54, 32],
+}
 
 df = pd.DataFrame(web_stats)
 
 # print(df.head())
 
-df.set_index('Day', inplace = True)
+df.set_index("Day", inplace=True)
 
- # print(df.index)
+# print(df.index)
 
 df.Visitors.plot()
 
 # plt.show()
 
-print(df[['Visitors','Bounce_Rate']])
+print(df[["Visitors", "Bounce_Rate"]])
 
 ex_list = df.Visitors.tolist()
-print(ex_list)
+print(ex_list)

+ 4 - 6
sentdex_data_analysis/pandas_building_dataset.py

@@ -1,13 +1,11 @@
 import quandl
-import pandas as pd 
+import pandas as pd
 
-api_key = 'rFsSehe51RLzREtYhLfo'
+api_key = "rFsSehe51RLzREtYhLfo"
 
 # df = quandl.get('FMAC/HPI_AK', authtoken = api_key)
 
-fifty_states = pd.read_html('https://simple.wikipedia.org/wiki/List_of_U.S._states')
+fifty_states = pd.read_html("https://simple.wikipedia.org/wiki/List_of_U.S._states")
 
 for abbv in fifty_states[0][0][1:]:
-	print('FMAC/HPI_' + str(abbv))
-
-
+    print("FMAC/HPI_" + str(abbv))

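pd.read_html returns a list of DataFrames, one per HTML table found on the page, so fifty_states[0][0][1:] means: first table, the column labeled 0, every row after the header. The same lookup with clearer intermediate names (hypothetical names, identical logic):

tables = pd.read_html("https://simple.wikipedia.org/wiki/List_of_U.S._states")
states_table = tables[0]              # first table on the page
abbreviations = states_table[0][1:]   # column 0, skipping the header row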
+ 27 - 16
sentdex_data_analysis/pandas_cocantenating_appending.py

@@ -1,19 +1,31 @@
 import pandas as pd
 
-df1 = pd.DataFrame({'HPI':[80,85,88,85],
-                    'Int_rate':[2, 3, 2, 2],
-                    'US_GDP_Thousands':[50, 55, 65, 55]},
-                   index = [2001, 2002, 2003, 2004])
-
-df2 = pd.DataFrame({'HPI':[80,85,88,85],
-                    'Int_rate':[2, 3, 2, 2],
-                    'US_GDP_Thousands':[50, 55, 65, 55]},
-                   index = [2005, 2006, 2007, 2008])
-
-df3 = pd.DataFrame({'HPI':[80,85,88,85],
-                    'Int_rate':[2, 3, 2, 2],
-                    'Low_tier_HPI':[50, 52, 50, 53]},
-                   index = [2001, 2002, 2003, 2004])
+df1 = pd.DataFrame(
+    {
+        "HPI": [80, 85, 88, 85],
+        "Int_rate": [2, 3, 2, 2],
+        "US_GDP_Thousands": [50, 55, 65, 55],
+    },
+    index=[2001, 2002, 2003, 2004],
+)
+
+df2 = pd.DataFrame(
+    {
+        "HPI": [80, 85, 88, 85],
+        "Int_rate": [2, 3, 2, 2],
+        "US_GDP_Thousands": [50, 55, 65, 55],
+    },
+    index=[2005, 2006, 2007, 2008],
+)
+
+df3 = pd.DataFrame(
+    {
+        "HPI": [80, 85, 88, 85],
+        "Int_rate": [2, 3, 2, 2],
+        "Low_tier_HPI": [50, 52, 50, 53],
+    },
+    index=[2001, 2002, 2003, 2004],
+)
 
 # df1.set_index('HPI', inplace=True)
 concat = pd.concat([df1, df3])
@@ -23,6 +35,5 @@ df4 = df1.append(df3)
 print(concat)
 print(df4)
 
-s = pd.Series([[80, 2, 50],[80, 54, 56], [56, 43, 23]])
+s = pd.Series([[80, 2, 50], [80, 54, 56], [56, 43, 23]])
 print(s)
-

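Since df1 and df3 share index years 2001-2004, both concat and append simply stack the rows and keep the duplicate index labels; nothing is aligned or merged. To renumber instead (my note, not part of the commit):

concat = pd.concat([df1, df3], ignore_index=True)  # fresh 0..n-1 index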
+ 22 - 22
sentdex_data_analysis/pandas_comparisonOperators.py

@@ -1,57 +1,57 @@
 import pickle
-import pandas as pd 
-import quandl 
-import matplotlib.pyplot as plt 
+import pandas as pd
+import quandl
+import matplotlib.pyplot as plt
 from matplotlib import style
 
-style.use('seaborn')
+style.use("seaborn")
 
-bridge_height = {'meters':[10.26, 10.31, 10.27, 10.22, 10.23, 6212.42, 10.28, 10.25, 10.31]}
+bridge_height = {
+    "meters": [10.26, 10.31, 10.27, 10.22, 10.23, 6212.42, 10.28, 10.25, 10.31]
+}
 df = pd.DataFrame(bridge_height)
 
-df['std'] = df['meters'].rolling(window=2).std()
+df["std"] = df["meters"].rolling(window=2).std()
 
-df_std = df.describe()['meters']['std']
-df_mean = df.describe()['meters']['mean']
+df_std = df.describe()["meters"]["std"]
+df_mean = df.describe()["meters"]["mean"]
 
 # df = df[df['std'] < df_std] # sentdex methods
-df = df[df['meters'] < (df_mean + df_std)] # my methods
+df = df[df["meters"] < (df_mean + df_std)]  # my methods
 print(df)
 
-df['meters'].plot()
+df["meters"].plot()
 plt.show()
 
-ax1 = plt.subplot(2,1,1)
-ax2 = plt.subplot(2,1,2, sharex=ax1)
+ax1 = plt.subplot(2, 1, 1)
+ax2 = plt.subplot(2, 1, 2, sharex=ax1)
 
 # initial_state_data()
 
-pickle_in = open('fifty_states_pct.pickle' , 'rb')
+pickle_in = open("fifty_states_pct.pickle", "rb")
 HPI_data = pickle.load(pickle_in)
 
 # HPI_Benchmark()
 
-pickle_in = open('us_pct.pickle','rb')
+pickle_in = open("us_pct.pickle", "rb")
 benchmark = pickle.load(pickle_in)
 
 # rolling statistics
-HPI_data['TX12MA'] = HPI_data['TX'].rolling(window=12, center=False).mean()
-HPI_data['TX12STD']= HPI_data['TX'].rolling(window=12, center=False).std() 
+HPI_data["TX12MA"] = HPI_data["TX"].rolling(window=12, center=False).mean()
+HPI_data["TX12STD"] = HPI_data["TX"].rolling(window=12, center=False).std()
 # standard deviation is a measure of the volatility of the price
 
 HPI_data.dropna(inplace=True)
 
-TK_AK_12corr = HPI_data['TX'].rolling(window=12).corr(HPI_data['AK'])
+TK_AK_12corr = HPI_data["TX"].rolling(window=12).corr(HPI_data["AK"])
 
-HPI_data['TX'].plot(ax=ax1, label = 'TX HPI')
-HPI_data['AK'].plot(ax=ax1, label = 'AK HPI')
+HPI_data["TX"].plot(ax=ax1, label="TX HPI")
+HPI_data["AK"].plot(ax=ax1, label="AK HPI")
 ax1.legend(loc=4)
 
-TK_AK_12corr.plot(ax=ax2, label= 'TK AK 12 month correlation')
+TK_AK_12corr.plot(ax=ax2, label="TK AK 12 month correlation")
 ax2.legend(loc=4)
 
 # HPI_data[['TX12MA','TX']].plot(ax=ax1)
 # HPI_data['TX12STD'].plot(ax=ax2)
 # plt.show()
-
-

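The meters < mean + std filter works because the one bad reading (6212.42) drags the mean up to roughly 699 and inflates the sample standard deviation to roughly 2067, so the threshold lands near 2767: every real bridge height passes and only the outlier fails. Checking the arithmetic:

import numpy as np
meters = [10.26, 10.31, 10.27, 10.22, 10.23, 6212.42, 10.28, 10.25, 10.31]
print(np.mean(meters))         # ~699.4
print(np.std(meters, ddof=1))  # ~2067, dominated by the outlier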
+ 24 - 23
sentdex_data_analysis/pandas_handlingNan.py

@@ -1,23 +1,25 @@
 import pickle
-import pandas as pd 
-import quandl 
-import matplotlib.pyplot as plt 
+import pandas as pd
+import quandl
+import matplotlib.pyplot as plt
 from matplotlib import style
 
-style.use('seaborn')
+style.use("seaborn")
+
+api_key = "rFsSehe51RLzREtYhLfo"
 
-api_key = 'rFsSehe51RLzREtYhLfo'
 
 def state_list():
-    fifty_states = pd.read_html('https://simple.wikipedia.org/wiki/List_of_U.S._states')
+    fifty_states = pd.read_html("https://simple.wikipedia.org/wiki/List_of_U.S._states")
     return fifty_states[0][0][1:]
 
+
 def initial_state_data():
     states = state_list()
     main_df = pd.DataFrame()
 
     for abbv in states:
-        query = 'FMAC/HPI_' + str(abbv)
+        query = "FMAC/HPI_" + str(abbv)
         df = quandl.get(query, authtoken=api_key)
         df.columns = [str(abbv)]
         df[abbv] = (df[abbv] - df[abbv][0]) / df[abbv][0] * 100.0
@@ -26,29 +28,31 @@ def initial_state_data():
         else:
             main_df = main_df.join(df)
 
-    pickle_out = open('fifty_states_pct.pickle', 'wb')
+    pickle_out = open("fifty_states_pct.pickle", "wb")
     pickle.dump(main_df, pickle_out)
     pickle_out.close()
 
+
 def HPI_Benchmark():
-    df = quandl.get('FMAC/HPI_USA' , authtoken=api_key)
-    df['United States'] = (df['Value'] - df['Value'][0]) / df['Value'][0] * 100.0
-    
-    pickle_out = open('us_pct.pickle', 'wb')
+    df = quandl.get("FMAC/HPI_USA", authtoken=api_key)
+    df["United States"] = (df["Value"] - df["Value"][0]) / df["Value"][0] * 100.0
+
+    pickle_out = open("us_pct.pickle", "wb")
     pickle.dump(df, pickle_out)
     pickle_out.close()
 
+
 # fig = plt.figure()
-ax1 = plt.subplot(1,1,1)
+ax1 = plt.subplot(1, 1, 1)
 
 # initial_state_data()
 
-pickle_in = open('fifty_states_pct.pickle' , 'rb')
+pickle_in = open("fifty_states_pct.pickle", "rb")
 HPI_data = pickle.load(pickle_in)
 
 # HPI_Benchmark()
 
-pickle_in = open('us_pct.pickle','rb')
+pickle_in = open("us_pct.pickle", "rb")
 benchmark = pickle.load(pickle_in)
 
 # HPI_data = HPI_data.pct_change()
@@ -57,21 +61,18 @@ benchmark = pickle.load(pickle_in)
 # benchmark['United States'].plot(ax=ax1, color='k', linewidth=10)
 # plt.legend().remove()
 
-TX1yr = HPI_data['TX'].resample('A').mean()
-HPI_data['TX1yr'] = TX1yr
+TX1yr = HPI_data["TX"].resample("A").mean()
+HPI_data["TX1yr"] = TX1yr
 # print(HPI_data[['TX1yr','TX']])
 print(HPI_data.isnull().values.sum())
 
-HPI_data.fillna(method='bfill', inplace=True)
+HPI_data.fillna(method="bfill", inplace=True)
 # HPI_data.dropna(inplace=True)
 
 # print(HPI_data[['TX1yr','TX']])
 
-HPI_data[['TX1yr', 'TX']].plot(ax=ax1)
+HPI_data[["TX1yr", "TX"]].plot(ax=ax1)
 # plt.show()
 
-print(HPI_data['TX'].hasnans)
+print(HPI_data["TX"].hasnans)
 print(HPI_data.isnull().values.sum())
-
-
-

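Backfilling matters here because the annually resampled TX1yr column only holds values at year-end dates; bfill copies each year-end value backwards over the preceding months instead of throwing those rows away. In newer pandas releases the method= argument is deprecated in favor of the dedicated method (hedged equivalent):

HPI_data["TX1yr"] = HPI_data["TX1yr"].bfill()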
+ 42 - 33
sentdex_data_analysis/pandas_indexing.py

@@ -1,62 +1,71 @@
 import pickle
-import pandas as pd 
-import quandl 
-import matplotlib.pyplot as plt 
+import pandas as pd
+import quandl
+import matplotlib.pyplot as plt
 from matplotlib import style
 
-style.use('seaborn')
+style.use("seaborn")
+
+api_key = "rFsSehe51RLzREtYhLfo"
 
-api_key = 'rFsSehe51RLzREtYhLfo'
 
 def state_list():
-	fifty_states = pd.read_html('https://simple.wikipedia.org/wiki/List_of_U.S._states')
-	return fifty_states[0][0][1:]
+    fifty_states = pd.read_html("https://simple.wikipedia.org/wiki/List_of_U.S._states")
+    return fifty_states[0][0][1:]
+
 
 def initial_state_data():
-	states = state_list()
-	main_df = pd.DataFrame()
-
-	for abbv in states:
-		query = 'FMAC/HPI_' + str(abbv)
-		df = quandl.get(query, authtoken=api_key)
-		df.columns = [str(abbv)]
-		df[abbv] = (df[abbv] - df[abbv][0]) / df[abbv][0] * 100.0
-		if main_df.empty:
-			main_df = df
-		else:
-			main_df = main_df.join(df)
-
-	pickle_out = open('fifty_states_pct.pickle', 'wb')
-	pickle.dump(main_df, pickle_out)
-	pickle_out.close()
+    states = state_list()
+    main_df = pd.DataFrame()
+
+    for abbv in states:
+        query = "FMAC/HPI_" + str(abbv)
+        df = quandl.get(query, authtoken=api_key)
+        df.columns = [str(abbv)]
+        df[abbv] = (df[abbv] - df[abbv][0]) / df[abbv][0] * 100.0
+        if main_df.empty:
+            main_df = df
+        else:
+            main_df = main_df.join(df)
+
+    pickle_out = open("fifty_states_pct.pickle", "wb")
+    pickle.dump(main_df, pickle_out)
+    pickle_out.close()
+
 
 # initial_state_data()
 
+
 def HPI_Benchmark():
-	df = quandl.get('FMAC/HPI_USA' , authtoken=api_key)
-	df['United States'] = (df['Value'] - df['Value'][0]) / df['Value'][0] * 100.0
-	return df
+    df = quandl.get("FMAC/HPI_USA", authtoken=api_key)
+    df["United States"] = (df["Value"] - df["Value"][0]) / df["Value"][0] * 100.0
+    return df
+
 
 fig = plt.figure()
-ax1 = plt.subplot2grid((1,1), (0,0))
+ax1 = plt.subplot2grid((1, 1), (0, 0))
 
-pickle_in = open('fifty_states_pct.pickle' , 'rb')
+pickle_in = open("fifty_states_pct.pickle", "rb")
 HPI_data = pickle.load(pickle_in)
 benchmark = HPI_Benchmark()
 # HPI_data = HPI_data.pct_change()
 
 HPI_data.plot(ax=ax1)
-benchmark['United States'].plot(ax=ax1, color='k', linewidth=10)
+benchmark["United States"].plot(ax=ax1, color="k", linewidth=10)
 plt.legend().remove()
 
 HPI_complete_data = HPI_data
-HPI_complete_data['United States'] = benchmark['United States']
+HPI_complete_data["United States"] = benchmark["United States"]
 # print(HPI_complete_data.head())
 HPI_State_Correlation = HPI_data.corr()
 
 HPI_complete_correlation = HPI_complete_data.corr()
-HPI_US_correlation = HPI_complete_correlation['United States']
+HPI_US_correlation = HPI_complete_correlation["United States"]
 HPI_US_correlation_sorted = HPI_US_correlation.sort_values(ascending=True)
-print(HPI_US_correlation_sorted[HPI_US_correlation_sorted == HPI_US_correlation_sorted[-2]].index)
+print(
+    HPI_US_correlation_sorted[
+        HPI_US_correlation_sorted == HPI_US_correlation_sorted[-2]
+    ].index
+)
 plt.show()
-# print(HPI_data[['IL','WI']].corr())
+# print(HPI_data[['IL','WI']].corr())

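The boolean trick with HPI_US_correlation_sorted[-2] grabs the second-highest correlation because the highest is always the United States column correlated with itself (exactly 1.0). A sketch of the same lookup stated directly (my phrasing, not the commit's):

most_correlated = (
    HPI_complete_correlation["United States"].drop("United States").idxmax()
)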
+ 6 - 5
sentdex_data_analysis/pandas_intro.py

@@ -1,9 +1,10 @@
-import pandas as pd 
+import pandas as pd
 import datetime
 from pandas_datareader import data
-import matplotlib.pyplot as plt 
+import matplotlib.pyplot as plt
 from matplotlib import style
-style.use('seaborn-dark')
+
+style.use("seaborn-dark")
 
 start = datetime.datetime(2010, 1, 1)
 end = datetime.datetime(2016, 12, 31)
@@ -12,6 +13,6 @@ df = data.DataReader("GM", "yahoo", start, end)
 
 print(df.head())
 
-df['Adj Close'].plot()
+df["Adj Close"].plot()
 
-plt.show()
+plt.show()

+ 17 - 16
sentdex_data_analysis/pandas_joiningData.py

@@ -1,32 +1,33 @@
 import pickle
-import pandas as pd 
-import quandl 
-import matplotlib.pyplot as plt 
+import pandas as pd
+import quandl
+import matplotlib.pyplot as plt
 from matplotlib import style
 
-style.use('seaborn')
+style.use("seaborn")
+
+quandl.ApiConfig.api_key = "rFsSehe51RLzREtYhLfo"
 
-quandl.ApiConfig.api_key = 'rFsSehe51RLzREtYhLfo'
 
 def mortgage_30yr():
-	df = quandl.get('FMAC/MORTG')
-	df = df[df.index > "1974-12-01"]
-	df = (df['Value'] - df['Value'][0]) / df['Value'][0] * 100
-	df = df.resample('M').mean()
-	return df 
+    df = quandl.get("FMAC/MORTG")
+    df = df[df.index > "1974-12-01"]
+    df = (df["Value"] - df["Value"][0]) / df["Value"][0] * 100
+    df = df.resample("M").mean()
+    return df
 
 
-ax1 = plt.subplot(2,1,1)
-ax2 = plt.subplot(2,1,2, sharex=ax1)
+ax1 = plt.subplot(2, 1, 1)
+ax2 = plt.subplot(2, 1, 2, sharex=ax1)
 
 # initial_state_data()
 
-pickle_in = open('fifty_states_pct.pickle' , 'rb')
+pickle_in = open("fifty_states_pct.pickle", "rb")
 HPI_data = pickle.load(pickle_in)
 
 # HPI_Benchmark()
 
-pickle_in = open('us_pct.pickle','rb')
+pickle_in = open("us_pct.pickle", "rb")
 benchmark = pickle.load(pickle_in)
 
 
@@ -35,6 +36,6 @@ m30 = mortgage_30yr()
 HPI_Bench = benchmark
 
 state_HPI_M30 = HPI_data.join(m30)
-state_HPI_M30.rename({'Value' : 'M30'}, inplace=True)
+state_HPI_M30.rename({"Value": "M30"}, inplace=True)
 
-print(state_HPI_M30.corr().describe()['Value'])
+print(state_HPI_M30.corr().describe()["Value"])

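One thing black cannot fix: rename({"Value": "M30"}, inplace=True) targets index labels by default, so the column keeps its old name — which is why the next line still asks for "Value". A sketch of the intended column rename (my correction, not part of this commit):

state_HPI_M30.rename(columns={"Value": "M30"}, inplace=True)
print(state_HPI_M30.corr().describe()["M30"])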
+ 21 - 14
sentdex_data_analysis/pandas_joining_merging.py

@@ -1,21 +1,29 @@
 import pandas as pd
 
-df1 = pd.DataFrame({'HPI':[80,86,88,85],
-                    'Int_rate':[2, 3, 2, 2],
-                    'US_GDP_Thousands':[50, 55, 65, 55],
-                   'Year' : [2001, 2002, 2003, 2005]})
-
-'''
+df1 = pd.DataFrame(
+    {
+        "HPI": [80, 86, 88, 85],
+        "Int_rate": [2, 3, 2, 2],
+        "US_GDP_Thousands": [50, 55, 65, 55],
+        "Year": [2001, 2002, 2003, 2005],
+    }
+)
+
+"""
 df2 = pd.DataFrame({'HPI':[80,85,88,85],
                     'Int_rate':[5, 3, 2, 2],
                     'US_GDP_Thousands':[50, 55, 65, 55]},
                    index = [2005, 2006, 2007, 2008])
-'''
+"""
 
-df3 = pd.DataFrame({'HPI':[95, 86, 88, 90],
-                    'Unemployment':[7, 8, 9, 6],
-                    'Low_tier_HPI':[50, 52, 50, 53],
-                   'Year' : [2000, 2002, 2003, 2004]})
+df3 = pd.DataFrame(
+    {
+        "HPI": [95, 86, 88, 90],
+        "Unemployment": [7, 8, 9, 6],
+        "Low_tier_HPI": [50, 52, 50, 53],
+        "Year": [2000, 2002, 2003, 2004],
+    }
+)
 
 
 # print(pd.merge(df1, df3, on=['HPI']))
@@ -26,6 +34,5 @@ df3 = pd.DataFrame({'HPI':[95, 86, 88, 90],
 # df3.set_index('Year', inplace=True)
 
 
-
-merged = pd.merge(df1, df3, on='Year', how='outer')
-print(merged)
+merged = pd.merge(df1, df3, on="Year", how="outer")
+print(merged)

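With how="outer" the merge keeps the union of Year keys, 2000 through 2005, filling NaN wherever a year appears in only one frame; df1 and df3 overlap only on 2002 and 2003. The other how options shrink that set:

inner = pd.merge(df1, df3, on="Year", how="inner")  # only 2002 and 2003 survive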
+ 26 - 15
sentdex_data_analysis/pandas_mappingFunctions.py

@@ -1,15 +1,16 @@
 import pickle
-import pandas as pd 
-import quandl 
-import matplotlib.pyplot as plt 
+import pandas as pd
+import quandl
+import matplotlib.pyplot as plt
 from matplotlib import style
-import numpy as np 
-from statistics import mean 
+import numpy as np
+from statistics import mean
 
-style.use('seaborn-dark-palette')
+style.use("seaborn-dark-palette")
+
+ax1 = plt.subplot(2, 1, 1)
+ax2 = plt.subplot(2, 1, 2, sharex=ax1)
 
-ax1 = plt.subplot(2,1,1)
-ax2 = plt.subplot(2,1,2, sharex=ax1)
 
 def create_labels(cur_hpi, fut_hpi):
     if fut_hpi > cur_hpi:
@@ -17,12 +18,18 @@ def create_labels(cur_hpi, fut_hpi):
     else:
         return 0
 
+
 def moving_average(values):
     return mean(values)
 
-benchmark = pd.read_pickle('us_pct.pickle')  # us overall housing price index percentage change
-HPI = pd.read_pickle('HPI_complete.pickle') # all of the state data, thirty year mortgage, unemployment rate, GDP, SP500
-HPI = HPI.join(benchmark['United States'])
+
+benchmark = pd.read_pickle(
+    "us_pct.pickle"
+)  # us overall housing price index percentage change
+HPI = pd.read_pickle(
+    "HPI_complete.pickle"
+)  # all of the state data, thirty year mortgage, unemployment rate, GDP, SP500
+HPI = HPI.join(benchmark["United States"])
 # all in percentage change since the start of the data (1975-01-01)
 
 HPI.dropna(inplace=True)
@@ -30,14 +37,18 @@ HPI.dropna(inplace=True)
 housing_pct = HPI.pct_change()
 housing_pct.replace([np.inf, -np.inf], np.nan, inplace=True)
 
-housing_pct['US_HPI_future'] = housing_pct['United States'].shift(-1)
+housing_pct["US_HPI_future"] = housing_pct["United States"].shift(-1)
 housing_pct.dropna(inplace=True)
 
-housing_pct['label'] = list(map(create_labels, housing_pct['United States'], housing_pct['US_HPI_future']))
+housing_pct["label"] = list(
+    map(create_labels, housing_pct["United States"], housing_pct["US_HPI_future"])
+)
 
 # housing_pct['ma_apply_example'] = pd.rolling_apply(housing_pct['M30'], 10, moving_average)
-housing_pct['ma_apply_example'] = housing_pct['M30'].rolling(window=10).apply(moving_average)
+housing_pct["ma_apply_example"] = (
+    housing_pct["M30"].rolling(window=10).apply(moving_average)
+)
 print(housing_pct.tail())
 
 # state_HPI_M30 = HPI_data.join(HPI['M30']) # fifty states plus mortgage data
-# print(state_HPI_M30.corr().describe().tail())
+# print(state_HPI_M30.corr().describe().tail())

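rolling(window=10).apply(moving_average) calls back into Python once per window, which is flexible but slow; for a plain mean the built-in aggregation is equivalent and vectorized:

housing_pct["ma_builtin"] = housing_pct["M30"].rolling(window=10).mean()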
+ 35 - 33
sentdex_data_analysis/pandas_percentChange_correlation.py

@@ -1,63 +1,65 @@
 import pickle
-import pandas as pd 
-import quandl 
-import matplotlib.pyplot as plt 
+import pandas as pd
+import quandl
+import matplotlib.pyplot as plt
 from matplotlib import style
 
-style.use('seaborn')
+style.use("seaborn")
+
+api_key = "rFsSehe51RLzREtYhLfo"
 
-api_key = 'rFsSehe51RLzREtYhLfo'
 
 def state_list():
-	fifty_states = pd.read_html('https://simple.wikipedia.org/wiki/List_of_U.S._states')
-	return fifty_states[0][0][1:]
+    fifty_states = pd.read_html("https://simple.wikipedia.org/wiki/List_of_U.S._states")
+    return fifty_states[0][0][1:]
 
-def initial_state_data():
-	states = state_list()
-	main_df = pd.DataFrame()
 
-	for abbv in states:
-		query = 'FMAC/HPI_' + str(abbv)
-		df = quandl.get(query, authtoken=api_key)
-		df.columns = [str(abbv)]
-		df[abbv] = (df[abbv] - df[abbv][0]) / df[abbv][0] * 100.0
-		if main_df.empty:
-			main_df = df
-		else:
-			main_df = main_df.join(df)
+def initial_state_data():
+    states = state_list()
+    main_df = pd.DataFrame()
 
-	print(main_df.head())
+    for abbv in states:
+        query = "FMAC/HPI_" + str(abbv)
+        df = quandl.get(query, authtoken=api_key)
+        df.columns = [str(abbv)]
+        df[abbv] = (df[abbv] - df[abbv][0]) / df[abbv][0] * 100.0
+        if main_df.empty:
+            main_df = df
+        else:
+            main_df = main_df.join(df)
 
-	pickle_out = open('fifty_states_pct.pickle', 'wb')
-	pickle.dump(main_df, pickle_out)
-	pickle_out.close()
+    print(main_df.head())
 
+    pickle_out = open("fifty_states_pct.pickle", "wb")
+    pickle.dump(main_df, pickle_out)
+    pickle_out.close()
+
 
 
 def HPI_Benchmark():
-	df = quandl.get('FMAC/HPI_USA' , authtoken=api_key)
-	df['United States'] = (df['Value'] - df['Value'][0]) / df['Value'][0] * 100.0
-	
-	pickle_out = open('us_pct.pickle', 'wb')
-	pickle.dump(df, pickle_out)
-	pickle_out.close()
+    df = quandl.get("FMAC/HPI_USA", authtoken=api_key)
+    df["United States"] = (df["Value"] - df["Value"][0]) / df["Value"][0] * 100.0
+
+    pickle_out = open("us_pct.pickle", "wb")
+    pickle.dump(df, pickle_out)
+    pickle_out.close()
+
 
 fig = plt.figure()
-ax1 = plt.subplot2grid((1,1), (0,0))
+ax1 = plt.subplot2grid((1, 1), (0, 0))
 
 # initial_state_data()
-pickle_in = open('fifty_states_pct.pickle' , 'rb')
+pickle_in = open("fifty_states_pct.pickle", "rb")
 HPI_data = pickle.load(pickle_in)
 
 
 # HPI_Benchmark()
-pickle_in = open('us_pct.pickle' , 'rb')
+pickle_in = open("us_pct.pickle", "rb")
 benchmark = pickle.load(pickle_in)
 
 # HPI_data = HPI_data.pct_change()
 
 HPI_data.plot(ax=ax1)
-benchmark['United States'].plot(ax=ax1, color='k', linewidth=10)
+benchmark["United States"].plot(ax=ax1, color="k", linewidth=10)
 plt.legend().remove()
 
 HPI_State_Correlation = HPI_data.corr()

+ 24 - 21
sentdex_data_analysis/pandas_pickling.py

@@ -1,35 +1,38 @@
 import pickle
 import pickle
-import pandas as pd 
-import quandl 
+import pandas as pd
+import quandl
+
+api_key = "rFsSehe51RLzREtYhLfo"
 
 
-api_key = 'rFsSehe51RLzREtYhLfo'
 
 
 def state_list():
 def state_list():
-	fifty_states = pd.read_html('https://simple.wikipedia.org/wiki/List_of_U.S._states')
-	return fifty_states[0][0][1:]
+    fifty_states = pd.read_html("https://simple.wikipedia.org/wiki/List_of_U.S._states")
+    return fifty_states[0][0][1:]
+
 
 
 def initial_state_data():
 def initial_state_data():
-	states = state_list()
-	main_df = pd.DataFrame()
+    states = state_list()
+    main_df = pd.DataFrame()
+
+    for abbv in states:
+        query = "FMAC/HPI_" + str(abbv)
+        df = quandl.get(query, authtoken=api_key)
+        df.columns = [str(abbv)]
+        if main_df.empty:
+            main_df = df
+        else:
+            main_df = main_df.join(df)
 
 
-	for abbv in states:
-		query = 'FMAC/HPI_' + str(abbv)
-		df = quandl.get(query, authtoken=api_key)
-		df.columns = [str(abbv)]
-		if main_df.empty:
-			main_df = df
-		else:
-			main_df = main_df.join(df)
+    print(main_df.head())
 
 
-	print(main_df.head())
+    pickle_out = open("fifty_states.pickle", "wb")
+    pickle.dump(main_df, pickle_out)
+    pickle_out.close()
 
 
-	pickle_out = open('fifty_states.pickle', 'wb')
-	pickle.dump(main_df, pickle_out)
-	pickle_out.close()
 
 
 # initial_state_data()
 # initial_state_data()
 
 
-pickle_in = open('fifty_states.pickle' , 'rb')
+pickle_in = open("fifty_states.pickle", "rb")
 HPI_data = pickle.load(pickle_in)
 HPI_data = pickle.load(pickle_in)
 
 
-print(HPI_data)
+print(HPI_data)

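The open/pickle.dump/close dance has a pandas shorthand, already used by the TPOT scripts above; an equivalent sketch:

main_df.to_pickle("fifty_states.pickle")          # save
HPI_data = pd.read_pickle("fifty_states.pickle")  # load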
+ 4 - 4
sentdex_data_analysis/pandas_pickling_sentdex.py

@@ -2,16 +2,16 @@ import quandl
 import pandas as pd
 
 # Not necessary, I just do this so I do not show my API key.
-api_key = 'rFsSehe51RLzREtYhLfo'
-fiddy_states = pd.read_html('https://simple.wikipedia.org/wiki/List_of_U.S._states')
+api_key = "rFsSehe51RLzREtYhLfo"
+fiddy_states = pd.read_html("https://simple.wikipedia.org/wiki/List_of_U.S._states")
 
 main_df = pd.DataFrame()
 
 for abbv in fiddy_states[0][0][1:]:
-    query = "FMAC/HPI_"+str(abbv)
+    query = "FMAC/HPI_" + str(abbv)
     df = quandl.get(query, authtoken=api_key)
 
     if main_df.empty:
         main_df = df
     else:
-        main_df = main_df.join(df)
+        main_df = main_df.join(df)

+ 43 - 39
sentdex_data_analysis/pandas_resampling.py

@@ -1,54 +1,58 @@
 import pickle
-import pandas as pd 
-import quandl 
-import matplotlib.pyplot as plt 
+import pandas as pd
+import quandl
+import matplotlib.pyplot as plt
 from matplotlib import style
 
-style.use('seaborn')
+style.use("seaborn")
+
+api_key = "rFsSehe51RLzREtYhLfo"
 
-api_key = 'rFsSehe51RLzREtYhLfo'
 
 def state_list():
-	fifty_states = pd.read_html('https://simple.wikipedia.org/wiki/List_of_U.S._states')
-	return fifty_states[0][0][1:]
+    fifty_states = pd.read_html("https://simple.wikipedia.org/wiki/List_of_U.S._states")
+    return fifty_states[0][0][1:]
+
 
 def initial_state_data():
-	states = state_list()
-	main_df = pd.DataFrame()
-
-	for abbv in states:
-		query = 'FMAC/HPI_' + str(abbv)
-		df = quandl.get(query, authtoken=api_key)
-		df.columns = [str(abbv)]
-		df[abbv] = (df[abbv] - df[abbv][0]) / df[abbv][0] * 100.0
-		if main_df.empty:
-			main_df = df
-		else:
-			main_df = main_df.join(df)
-
-	pickle_out = open('fifty_states_pct.pickle', 'wb')
-	pickle.dump(main_df, pickle_out)
-	pickle_out.close()
+    states = state_list()
+    main_df = pd.DataFrame()
+
+    for abbv in states:
+        query = "FMAC/HPI_" + str(abbv)
+        df = quandl.get(query, authtoken=api_key)
+        df.columns = [str(abbv)]
+        df[abbv] = (df[abbv] - df[abbv][0]) / df[abbv][0] * 100.0
+        if main_df.empty:
+            main_df = df
+        else:
+            main_df = main_df.join(df)
+
+    pickle_out = open("fifty_states_pct.pickle", "wb")
+    pickle.dump(main_df, pickle_out)
+    pickle_out.close()
+
 
 def HPI_Benchmark():
-	df = quandl.get('FMAC/HPI_USA' , authtoken=api_key)
-	df['United States'] = (df['Value'] - df['Value'][0]) / df['Value'][0] * 100.0
-	
-	pickle_out = open('us_pct.pickle', 'wb')
-	pickle.dump(df, pickle_out)
-	pickle_out.close()
+    df = quandl.get("FMAC/HPI_USA", authtoken=api_key)
+    df["United States"] = (df["Value"] - df["Value"][0]) / df["Value"][0] * 100.0
+
+    pickle_out = open("us_pct.pickle", "wb")
+    pickle.dump(df, pickle_out)
+    pickle_out.close()
+
 
 # fig = plt.figure()
-ax1 = plt.subplot(1,1,1)
+ax1 = plt.subplot(1, 1, 1)
 
 # initial_state_data()
 
-pickle_in = open('fifty_states_pct.pickle' , 'rb')
+pickle_in = open("fifty_states_pct.pickle", "rb")
 HPI_data = pickle.load(pickle_in)
 
 # HPI_Benchmark()
 
-pickle_in = open('us_pct.pickle','rb')
+pickle_in = open("us_pct.pickle", "rb")
 benchmark = pickle.load(pickle_in)
 
 # HPI_data = HPI_data.pct_change()
@@ -58,13 +62,13 @@ benchmark = pickle.load(pickle_in)
 # plt.legend().remove()
 
 HPI_complete_data = HPI_data
-HPI_complete_data['United States'] = benchmark['United States']
-US1YR = benchmark['United States'].resample('A').mean() # new method of resampling
-HPI1YR = HPI_data.resample('A').mean() # can change rate of sampling and method of sampling 
+HPI_complete_data["United States"] = benchmark["United States"]
+US1YR = benchmark["United States"].resample("A").mean()  # new method of resampling
+HPI1YR = HPI_data.resample(
+    "A"
+).mean()  # can change rate of sampling and method of sampling
 
 US1YR.plot(ax=ax1)
-benchmark['United States'].plot(ax=ax1)
-plt.legend(['Yearly sampled', 'Monthly sampled']) # original data is sampled monthly
+benchmark["United States"].plot(ax=ax1)
+plt.legend(["Yearly sampled", "Monthly sampled"])  # original data is sampled monthly
 plt.show()
-
-

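resample pairs a frequency rule with an aggregation: "A" buckets the monthly series by calendar year end ("M" is month end, "D" daily) and .mean() averages the twelve values in each bucket. Any reducer drops in the same way (illustrative):

yearly_peak = benchmark["United States"].resample("A").max()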
+ 29 - 27
sentdex_data_analysis/pandas_rollingStatistics.py

@@ -1,23 +1,25 @@
 import pickle
-import pandas as pd 
-import quandl 
-import matplotlib.pyplot as plt 
+import pandas as pd
+import quandl
+import matplotlib.pyplot as plt
 from matplotlib import style
 
-style.use('seaborn')
+style.use("seaborn")
+
+api_key = "rFsSehe51RLzREtYhLfo"
 
-api_key = 'rFsSehe51RLzREtYhLfo'
 
 def state_list():
-    fifty_states = pd.read_html('https://simple.wikipedia.org/wiki/List_of_U.S._states')
+    fifty_states = pd.read_html("https://simple.wikipedia.org/wiki/List_of_U.S._states")
     return fifty_states[0][0][1:]
 
+
 def initial_state_data():
     states = state_list()
     main_df = pd.DataFrame()
 
     for abbv in states:
-        query = 'FMAC/HPI_' + str(abbv)
+        query = "FMAC/HPI_" + str(abbv)
         df = quandl.get(query, authtoken=api_key)
         df.columns = [str(abbv)]
         df[abbv] = (df[abbv] - df[abbv][0]) / df[abbv][0] * 100.0
@@ -26,29 +28,31 @@ def initial_state_data():
         else:
             main_df = main_df.join(df)
 
-    pickle_out = open('fifty_states_pct.pickle', 'wb')
+    pickle_out = open("fifty_states_pct.pickle", "wb")
     pickle.dump(main_df, pickle_out)
     pickle_out.close()
 
+
 def HPI_Benchmark():
-    df = quandl.get('FMAC/HPI_USA' , authtoken=api_key)
-    df['United States'] = (df['Value'] - df['Value'][0]) / df['Value'][0] * 100.0
-    
-    pickle_out = open('us_pct.pickle', 'wb')
+    df = quandl.get("FMAC/HPI_USA", authtoken=api_key)
+    df["United States"] = (df["Value"] - df["Value"][0]) / df["Value"][0] * 100.0
+
+    pickle_out = open("us_pct.pickle", "wb")
     pickle.dump(df, pickle_out)
     pickle_out.close()
 
-ax1 = plt.subplot(2,1,1)
-ax2 = plt.subplot(2,1,2, sharex=ax1)
+
+ax1 = plt.subplot(2, 1, 1)
+ax2 = plt.subplot(2, 1, 2, sharex=ax1)
 
 # initial_state_data()
 
-pickle_in = open('fifty_states_pct.pickle' , 'rb')
+pickle_in = open("fifty_states_pct.pickle", "rb")
 HPI_data = pickle.load(pickle_in)
 
 # HPI_Benchmark()
 
-pickle_in = open('us_pct.pickle','rb')
+pickle_in = open("us_pct.pickle", "rb")
 benchmark = pickle.load(pickle_in)
 
 # HPI_data = HPI_data.pct_change()
@@ -57,12 +61,12 @@ benchmark = pickle.load(pickle_in)
 # benchmark['United States'].plot(ax=ax1, color='k', linewidth=10)
 # plt.legend().remove()
 
-TX1yr = HPI_data['TX'].resample('A').mean()
-HPI_data['TX1yr'] = TX1yr
+TX1yr = HPI_data["TX"].resample("A").mean()
+HPI_data["TX1yr"] = TX1yr
 # print(HPI_data[['TX1yr','TX']])
 print(HPI_data.isnull().values.sum())
 
-HPI_data.fillna(method='bfill', inplace=True)
+HPI_data.fillna(method="bfill", inplace=True)
 # HPI_data.dropna(inplace=True)
 print(HPI_data.isnull().values.sum())
 
@@ -74,23 +78,21 @@ print(HPI_data.isnull().values.sum())
 # print(HPI_data['TX'].hasnans)
 
 # rolling statistics
-HPI_data['TX12MA'] = HPI_data['TX'].rolling(window=12, center=False).mean()
-HPI_data['TX12STD']= HPI_data['TX'].rolling(window=12, center=False).std() 
+HPI_data["TX12MA"] = HPI_data["TX"].rolling(window=12, center=False).mean()
+HPI_data["TX12STD"] = HPI_data["TX"].rolling(window=12, center=False).std()
 # standard deviation is a measure of the volatility of the price
 HPI_data.dropna(inplace=True)
 
-TK_AK_12corr = HPI_data['TX'].rolling(window=12).corr(HPI_data['AK'])
+TK_AK_12corr = HPI_data["TX"].rolling(window=12).corr(HPI_data["AK"])
 
-HPI_data['TX'].plot(ax=ax1, label = 'TX HPI')
-HPI_data['AK'].plot(ax=ax1, label = 'AK HPI')
+HPI_data["TX"].plot(ax=ax1, label="TX HPI")
+HPI_data["AK"].plot(ax=ax1, label="AK HPI")
 ax1.legend(loc=4)
 
-TK_AK_12corr.plot(ax=ax2, label= 'TK AK 12 month correlation')
+TK_AK_12corr.plot(ax=ax2, label="TK AK 12 month correlation")
 ax2.legend(loc=4)
 
 # HPI_data[['TX12MA','TX']].plot(ax=ax1)
 # HPI_data['TX12STD'].plot(ax=ax2)
 # print(HPI_data.head())
 plt.show()
-
-

+ 32 - 23
sentdex_data_analysis/pandas_scikitLearn.py

@@ -1,19 +1,20 @@
 import pickle
-import pandas as pd 
-import quandl 
-import matplotlib.pyplot as plt 
+import pandas as pd
+import quandl
+import matplotlib.pyplot as plt
 from matplotlib import style
-import numpy as np 
-from statistics import mean 
+import numpy as np
+from statistics import mean
 from sklearn import svm
 from sklearn.preprocessing import scale, MinMaxScaler, MaxAbsScaler
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import train_test_split
 
-style.use('seaborn-dark-palette')
+style.use("seaborn-dark-palette")
+
+ax1 = plt.subplot(2, 1, 1)
+ax2 = plt.subplot(2, 1, 2, sharex=ax1)
 
-ax1 = plt.subplot(2,1,1)
-ax2 = plt.subplot(2,1,2, sharex=ax1)
 
 def create_labels(cur_hpi, fut_hpi):
     if fut_hpi > cur_hpi:
@@ -21,12 +22,18 @@ def create_labels(cur_hpi, fut_hpi):
     else:
         return 0
 
+
 def moving_average(values):
     return mean(values)
 
-benchmark = pd.read_pickle('us_pct.pickle')  # us overall housing price index percentage change
-HPI = pd.read_pickle('HPI_complete.pickle') # all of the state data, thirty year mortgage, unemployment rate, GDP, SP500
-HPI = HPI.join(benchmark['United States'])
+
+benchmark = pd.read_pickle(
+    "us_pct.pickle"
+)  # us overall housing price index percentage change
+HPI = pd.read_pickle(
+    "HPI_complete.pickle"
+)  # all of the state data, thirty year mortgage, unemployment rate, GDP, SP500
+HPI = HPI.join(benchmark["United States"])
 # all in percentage change since the start of the data (1975-01-01)
 
 HPI.dropna(inplace=True)
@@ -34,15 +41,17 @@ HPI.dropna(inplace=True)
 housing_pct = HPI.pct_change()
 housing_pct.replace([np.inf, -np.inf], np.nan, inplace=True)
 
-housing_pct['US_HPI_future'] = housing_pct['United States'].shift(-1)
+housing_pct["US_HPI_future"] = housing_pct["United States"].shift(-1)
 housing_pct.dropna(inplace=True)
 
-housing_pct['label'] = list(map(create_labels, housing_pct['United States'], housing_pct['US_HPI_future']))
+housing_pct["label"] = list(
+    map(create_labels, housing_pct["United States"], housing_pct["US_HPI_future"])
+)
 
 # housing_pct['ma_apply_example'] = housing_pct['M30'].rolling(window=10).apply(moving_average)
 # print(housing_pct.tail())
-X = np.array(housing_pct.drop(['label', 'US_HPI_future'], 1))
-y = np.array(housing_pct['label'])
+X = np.array(housing_pct.drop(["label", "US_HPI_future"], 1))
+y = np.array(housing_pct["label"])
 
 X = scale(X)
 
@@ -54,13 +63,13 @@ clflog_accuracy = []
 clfsvm_accuracy = []
 
 for i in range(10):
-	clflog = LogisticRegression(C=49.0, dual=False, penalty="l1")
-	clflog.fit(X_train, y_train)
-	clflog_accuracy.append(clflog.score(x_test,y_test))
+    clflog = LogisticRegression(C=49.0, dual=False, penalty="l1")
+    clflog.fit(X_train, y_train)
+    clflog_accuracy.append(clflog.score(x_test, y_test))
 
-	clfsvm = svm.SVC(kernel='linear')
-	clfsvm.fit(X_train, y_train)
-	clfsvm_accuracy.append(clfsvm.score(x_test,y_test))
+    clfsvm = svm.SVC(kernel="linear")
+    clfsvm.fit(X_train, y_train)
+    clfsvm_accuracy.append(clfsvm.score(x_test, y_test))
 
-print('Accuracy of logistic regression = %0.4f' % (mean(clflog_accuracy) * 100))
-print('Accuracy of support vector machine = %0.4f' % (mean(clfsvm_accuracy) * 100))
+print("Accuracy of logistic regression = %0.4f" % (mean(clflog_accuracy) * 100))
+print("Accuracy of support vector machine = %0.4f" % (mean(clfsvm_accuracy) * 100))

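Worth flagging while reformatting: the loop fits on X_train but scores on lowercase x_test. If the train_test_split call (outside the hunks shown) used the X_train/X_test naming seen elsewhere in this repo, x_test is undefined and the script fails with a NameError. A hedged corrected sketch of the loop body (my fix, not part of the commit; recent scikit-learn also requires solver="liblinear" for the l1 penalty):

clflog = LogisticRegression(C=49.0, penalty="l1", solver="liblinear")
clflog.fit(X_train, y_train)
clflog_accuracy.append(clflog.score(X_test, y_test))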
+ 5 - 4
sentdex_data_analysis/tpot_basic.py

@@ -4,11 +4,12 @@ from sklearn.model_selection import train_test_split
 
 digits = load_digits()
 
-X_train, X_test, y_train,  y_test = train_test_split(digits.data, digits.target,
-													train_size = 0.75, test_size = 0.25)
+X_train, X_test, y_train, y_test = train_test_split(
+    digits.data, digits.target, train_size=0.75, test_size=0.25
+)
 
-tpot = TPOTClassifier(generations = 5, population_size = 20, verbosity = 2)
+tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
 
 tpot.fit(X_train, y_train)
 print(tpot.score(X_test, y_test))
-tpot.export('tpot_mnist_pipeline.py')
+tpot.export("tpot_mnist_pipeline.py")

+ 47 - 43
slack_interaction/utils.py

@@ -9,52 +9,54 @@ import matplotlib.pyplot as plt
 
 def plot_history(history):
     """Plot Results of Keras training"""
-    plt.style.use('fivethirtyeight')
-    epochs = list(range(1, len(history['loss']) + 1))
-    plt.figure(figsize = (18, 6))
-    
+    plt.style.use("fivethirtyeight")
+    epochs = list(range(1, len(history["loss"]) + 1))
+    plt.figure(figsize=(18, 6))
+
     # Losses
     plt.subplot(1, 2, 1)
-    plt.plot(epochs, history['loss'], '-o', ms = 10, label = "Training Loss")
-    plt.plot(epochs, history['val_loss'], '-*',  ms = 10, label = "Validation Loss")
-    plt.legend(); 
-    plt.xlabel('Epoch'); plt.ylabel('Loss')
-    plt.title('Losses');
-    
+    plt.plot(epochs, history["loss"], "-o", ms=10, label="Training Loss")
+    plt.plot(epochs, history["val_loss"], "-*", ms=10, label="Validation Loss")
+    plt.legend()
+    plt.xlabel("Epoch")
+    plt.ylabel("Loss")
+    plt.title("Losses")
+
     # Accuracy
     plt.subplot(1, 2, 2)
-    plt.plot(epochs, history['acc'], '-o', ms = 10, label = 'Training Acc')
-    plt.plot(epochs, history['val_acc'], '-*',  ms = 10, label = "Validation Acc")
+    plt.plot(epochs, history["acc"], "-o", ms=10, label="Training Acc")
+    plt.plot(epochs, history["val_acc"], "-*", ms=10, label="Validation Acc")
     plt.legend()
-    plt.xlabel('Epoch'); plt.ylabel('Acc')
-    plt.title('Accuracy');
-    
-    plt.suptitle('Training Curves', y= 1.05)
+    plt.xlabel("Epoch")
+    plt.ylabel("Acc")
+    plt.title("Accuracy")
+
+    plt.suptitle("Training Curves", y=1.05)
 
 
 def get_options(slack):
-    command_dict = {'functions': {},
-                    'attributes': {}}
+    command_dict = {"functions": {}, "attributes": {}}
 
     # Modules
     for d in dir(slack):
-        if not d.startswith('_'):
-            command_dict['functions'][d] = []
-            command_dict['attributes'][d] = []
+        if not d.startswith("_"):
+            command_dict["functions"][d] = []
+            command_dict["attributes"][d] = []
             # Iterate through methods and attributes
             for dd in dir(getattr(slack, d)):
-                if not dd.startswith('_'):
+                if not dd.startswith("_"):
                     # List of methods and attributes
                     l = dir(getattr(getattr(slack, d), dd))
                     # Method (function)
-                    if '__call__' in l:
-                        command_dict['functions'][d].append(dd)
+                    if "__call__" in l:
+                        command_dict["functions"][d].append(dd)
                     # Attributes
                     else:
-                        command_dict['attributes'][d].append(dd)
-                        
+                        command_dict["attributes"][d].append(dd)
+
     return command_dict
 
+
 def get_data_and_model():
     batch_size = 128
     num_classes = 10
@@ -66,7 +68,7 @@ def get_data_and_model():
     # the data, split between train and test sets
     (x_train, y_train), (x_test, y_test) = mnist.load_data()
 
-    if K.image_data_format() == 'channels_first':
+    if K.image_data_format() == "channels_first":
         x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
         x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
         input_shape = (1, img_rows, img_cols)
@@ -75,32 +77,34 @@ def get_data_and_model():
         x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
         input_shape = (img_rows, img_cols, 1)
 
-    x_train = x_train.astype('float32')
-    x_test = x_test.astype('float32')
+    x_train = x_train.astype("float32")
+    x_test = x_test.astype("float32")
     x_train /= 255
     x_test /= 255
-    print('x_train shape:', x_train.shape)
-    print(x_train.shape[0], 'train samples')
-    print(x_test.shape[0], 'test samples')
+    print("x_train shape:", x_train.shape)
+    print(x_train.shape[0], "train samples")
+    print(x_test.shape[0], "test samples")
 
     # convert class vectors to binary class matrices
     y_train = keras.utils.to_categorical(y_train, num_classes)
     y_test = keras.utils.to_categorical(y_test, num_classes)
 
     model = Sequential()
-    model.add(Conv2D(32, kernel_size=(3, 3),
-                     activation='relu',
-                     input_shape=input_shape))
-    model.add(Conv2D(64, (3, 3), activation='relu'))
+    model.add(
+        Conv2D(32, kernel_size=(3, 3), activation="relu", input_shape=input_shape)
+    )
+    model.add(Conv2D(64, (3, 3), activation="relu"))
     model.add(MaxPooling2D(pool_size=(2, 2)))
     model.add(Dropout(0.25))
     model.add(Flatten())
-    model.add(Dense(128, activation='relu'))
+    model.add(Dense(128, activation="relu"))
    model.add(Dropout(0.5))
-    model.add(Dense(num_classes, activation='softmax'))
+    model.add(Dense(num_classes, activation="softmax"))
+
+    model.compile(
+        loss=keras.losses.categorical_crossentropy,
+        optimizer=keras.optimizers.Adadelta(),
+        metrics=["accuracy"],
+    )
 
-    model.compile(loss=keras.losses.categorical_crossentropy,
-                  optimizer=keras.optimizers.Adadelta(),
-                  metrics=['accuracy'])
-    
-    return x_train, x_test, y_train, y_test, model
+    return x_train, x_test, y_train, y_test, model

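get_options decides method-versus-attribute by probing for "__call__" in dir(...); the callable() builtin expresses the same test directly (my suggestion, same behavior):

member = getattr(getattr(slack, d), dd)
key = "functions" if callable(member) else "attributes"
command_dict[key][d].append(dd)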
File diff suppressed because it is too large
+ 831 - 526
stocker/stocker.py


+ 77 - 56
time_features/time_features_utils.py

@@ -5,14 +5,20 @@ from tqdm import tqdm_notebook
 
 
 def cyclical_encoding(series, period):
-    features = pd.concat([np.sin((2 * np.pi * series / period)),
-                          np.cos((2 * np.pi * series / period))], axis=1)
-    features.columns = [f'sin_{series.name}', f'cos_{series.name}']
+    features = pd.concat(
+        [np.sin((2 * np.pi * series / period)), np.cos((2 * np.pi * series / period))],
+        axis=1,
+    )
+    features.columns = [f"sin_{series.name}", f"cos_{series.name}"]
     return features
 
 
 def create_time_features(
-    fld, keep_frac_only=False, include_additional=False, cyc_encode=False, timezone=None,
+    fld,
+    keep_frac_only=False,
+    include_additional=False,
+    cyc_encode=False,
+    timezone=None,
 ):
     """
     Create features out of a series of datetimes.
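cyclical_encoding maps a periodic value t onto the unit circle as (sin(2*pi*t/period), cos(2*pi*t/period)), so hour 23 lands next to hour 0 instead of 23 steps away. A quick check (illustrative):

hours = pd.Series([0, 23], name="hour")
print(cyclical_encoding(hours, 24).round(2))
#    sin_hour  cos_hour
# 0      0.00      1.00
# 1     -0.26      0.97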
@@ -42,8 +48,17 @@ def create_time_features(
         df["local"] = fld
 
     # Basic attributes
-    attr = ["second", "minute", "hour", "year", "month",
-            "week", "day", "dayofweek", "dayofyear"]
+    attr = [
+        "second",
+        "minute",
+        "hour",
+        "year",
+        "month",
+        "week",
+        "day",
+        "dayofweek",
+        "dayofyear",
+    ]
 
     if include_additional:
         # Additional attributes to extract
@@ -69,14 +84,10 @@ def create_time_features(
     ) / 24
 
     # Add fractional time of week
-    df[prefix + "fracweek"] = (
-        df[prefix + "dayofweek"] + df[prefix + "fracday"]
-    ) / 7
+    df[prefix + "fracweek"] = (df[prefix + "dayofweek"] + df[prefix + "fracday"]) / 7
 
     # Add fractional time of month
-    df[prefix + "fracmonth"] = (
-        (df[prefix + "day"] - 1) + df[prefix + "fracday"]
-    ) / (
+    df[prefix + "fracmonth"] = ((df[prefix + "day"] - 1) + df[prefix + "fracday"]) / (
         fld.dt.days_in_month
     )  # Use fld days_in_month in case this is not
     # one of the attributes specified
@@ -84,7 +95,7 @@ def create_time_features(
     # Calculate days in year (accounting for leap year rules)
     days_in_year = np.where(
         (df[prefix + "year"] % 4 == 0)
-        & ( ( df[prefix + "year"] % 100 != 0) | (df[prefix + "year"] % 400 == 0)),
+        & ((df[prefix + "year"] % 100 != 0) | (df[prefix + "year"] % 400 == 0)),
         366,
         365,
     )
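That np.where condition is the full Gregorian rule — divisible by 4, except centuries unless divisible by 400 — applied element-wise. Worked out on a few years:

years = np.array([1900, 2000, 2016, 2019])
print(np.where((years % 4 == 0) & ((years % 100 != 0) | (years % 400 == 0)), 366, 365))
# [365 366 366 365]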
@@ -95,15 +106,13 @@ def create_time_features(
     ) / days_in_year
     ) / days_in_year
 
 
     if cyc_encode:
     if cyc_encode:
-        df = pd.concat([df, cyclical_encoding(
-            df[prefix + 'hour'], 24)], axis=1)
-        df = pd.concat([df, cyclical_encoding(
-            df[prefix + 'dayofweek'], 6)], axis=1)
-        df = pd.concat([df, cyclical_encoding(df[prefix + 'day'], 31)], axis=1)
-        df = pd.concat([df, cyclical_encoding(
-            df[prefix + 'month'], 12)], axis=1)
-        df = pd.concat([df] + [cyclical_encoding(df[c], 1)
-                               for c in df if 'frac' in c], axis=1)
+        df = pd.concat([df, cyclical_encoding(df[prefix + "hour"], 24)], axis=1)
+        df = pd.concat([df, cyclical_encoding(df[prefix + "dayofweek"], 6)], axis=1)
+        df = pd.concat([df, cyclical_encoding(df[prefix + "day"], 31)], axis=1)
+        df = pd.concat([df, cyclical_encoding(df[prefix + "month"], 12)], axis=1)
+        df = pd.concat(
+            [df] + [cyclical_encoding(df[c], 1) for c in df if "frac" in c], axis=1
+        )
 
 
     if keep_frac_only:
     if keep_frac_only:
         df = df.drop(
         df = df.drop(
@@ -133,7 +142,7 @@ def monthly_validation(data, model, track=False):
     train_stops = np.unique(data.index[data.index.is_month_end].date)
     train_stops = np.unique(data.index[data.index.is_month_end].date)
 
 
     X = data.copy()
     X = data.copy()
-    y = X.pop('energy')
+    y = X.pop("energy")
     weighted_score = 0
     weighted_score = 0
     total_possible = 0
     total_possible = 0
     train_points = []
     train_points = []
@@ -153,7 +162,8 @@ def monthly_validation(data, model, track=False):
 
 
         if track:
         if track:
             print(
             print(
-                f'Accuracy: {score:.2f}% testing from {test_start} to {test_end} ({n_days} days).')
+                f"Accuracy: {score:.2f}% testing from {test_start} to {test_end} ({n_days} days)."
+            )
         weighted_score += score * len(X_test)
         weighted_score += score * len(X_test)
         total_possible += 100 * len(X_test)
         total_possible += 100 * len(X_test)
         train_points.append(len(X_train))
         train_points.append(len(X_train))
@@ -163,12 +173,14 @@ def monthly_validation(data, model, track=False):
     model.fit(X, y)
     model.fit(X, y)
 
 
     importance_df = None
     importance_df = None
-    if hasattr(model, 'feature_importances_'):
+    if hasattr(model, "feature_importances_"):
         importance_df = pd.DataFrame(
         importance_df = pd.DataFrame(
-            dict(features=X.columns, importance=model.feature_importances_))
+            dict(features=X.columns, importance=model.feature_importances_)
+        )
     final_score = weighted_score / total_possible
     final_score = weighted_score / total_possible
     results_df = pd.DataFrame(
     results_df = pd.DataFrame(
-        dict(train_points=train_points, test_points=test_points, score=scores))
+        dict(train_points=train_points, test_points=test_points, score=scores)
+    )
     return dict(results=results_df, importances=importance_df, score=final_score)
     return dict(results=results_df, importances=importance_df, score=final_score)
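The returned score is an accuracy weighted by test-set size. For instance, a 90% month with 300 test points and an 80% month with 100 test points combine as:

weighted_score = 90 * 300 + 80 * 100  # 35000
total_possible = 100 * (300 + 100)    # 40000
final_score = weighted_score / total_possible  # 0.875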
 
 
 
 
@@ -177,21 +189,21 @@ def mape(y_true, y_pred):
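The body of mape sits outside this hunk. The conventional definition (an assumption here, not shown in the diff) would be:

import numpy as np

def mape(y_true, y_pred):
    # mean absolute percentage error; assumes y_true contains no zeros
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100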
 
 
 
 
 def data_reading(filename):
 def data_reading(filename):
-    data = pd.read_csv(filename, parse_dates=['timestamp'])
-    data = data.dropna(subset=['energy'])
-    freq_counts = data['timestamp'].diff(1).value_counts()
+    data = pd.read_csv(filename, parse_dates=["timestamp"])
+    data = data.dropna(subset=["energy"])
+    freq_counts = data["timestamp"].diff(1).value_counts()
     freq = round(freq_counts.idxmax().total_seconds() / 60)
     freq = round(freq_counts.idxmax().total_seconds() / 60)
-    data = data.set_index('timestamp').sort_index()
+    data = data.set_index("timestamp").sort_index()
     return data, freq, len(data)
     return data, freq, len(data)
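data_reading infers the sampling interval as the most common gap between consecutive timestamps; the same idiom on fabricated 15-minute data:

import pandas as pd

stamps = pd.Series(pd.date_range("2018-01-01", periods=5, freq="15T"))
gaps = stamps.diff(1).value_counts()
freq = round(gaps.idxmax().total_seconds() / 60)  # 15 (minutes)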
 
 
 
 
 def data_testing(filename, model):
 def data_testing(filename, model):
-    building_id = filename.split('_')[-1].split('.csv')[0]
+    building_id = filename.split("_")[-1].split(".csv")[0]
     data, freq, dpoints = data_reading(filename)
     data, freq, dpoints = data_reading(filename)
     results = test_time_features(data, model)
     results = test_time_features(data, model)
-    results['freq'] = freq
-    results['dpoints'] = dpoints
-    results['building_id'] = building_id
+    results["freq"] = freq
+    results["dpoints"] = dpoints
+    results["building_id"] = building_id
     return results
     return results
 
 
 
 
@@ -202,17 +214,27 @@ def test_time_features(data, model):
     scores = []
     scores = []
     methods = []
     methods = []
 
 
-    y = data.pop('energy')
-
-    normal_features = ['timestamp_' + t for t in ['hour',
-                                                  'dayofweek', 'month', 'dayofyear', 'year']]
-    normal_cyc_features = ['sin_' + t for t in normal_features if t not in ['timestamp_dayofyear', 'timestamp_year']
-                           ] + ['cos_' + t for t in normal_features if t not in ['timestamp_dayofyear', 'timestamp_year']]
-
-    frac_features = ['timestamp_' +
-                     t for t in ['fracday', 'fracweek', 'fracmonth', 'fracyear']]
-    frac_cyc_features = ['sin_' + t for t in frac_features] + \
-        ['cos_' + t for t in frac_features]
+    y = data.pop("energy")
+
+    normal_features = [
+        "timestamp_" + t for t in ["hour", "dayofweek", "month", "dayofyear", "year"]
+    ]
+    normal_cyc_features = [
+        "sin_" + t
+        for t in normal_features
+        if t not in ["timestamp_dayofyear", "timestamp_year"]
+    ] + [
+        "cos_" + t
+        for t in normal_features
+        if t not in ["timestamp_dayofyear", "timestamp_year"]
+    ]
+
+    frac_features = [
+        "timestamp_" + t for t in ["fracday", "fracweek", "fracmonth", "fracyear"]
+    ]
+    frac_cyc_features = ["sin_" + t for t in frac_features] + [
+        "cos_" + t for t in frac_features
+    ]
 
 
     data_normal = data[normal_features].copy()
     data_normal = data[normal_features].copy()
     data_normal_cyc = data[normal_cyc_features].copy()
     data_normal_cyc = data[normal_cyc_features].copy()
@@ -220,22 +242,21 @@ def test_time_features(data, model):
     data_frac_cyc = data[frac_cyc_features].copy()
     data_frac_cyc = data[frac_cyc_features].copy()
 
 
     results = {}
     results = {}
-    dataset_names = ['normal', 'normal_cyc', 'frac', 'frac_cyc']
+    dataset_names = ["normal", "normal_cyc", "frac", "frac_cyc"]
 
 
-    for dataset, name in zip([data_normal,
-                              data_normal_cyc,
-                              data_frac,
-                              data_frac_cyc],
-                             dataset_names):
+    for dataset, name in zip(
+        [data_normal, data_normal_cyc, data_frac, data_frac_cyc], dataset_names
+    ):
 
 
-        to_drop = dataset.columns[(dataset.nunique() == 1)
-                                  | (dataset.nunique() == len(dataset))]
+        to_drop = dataset.columns[
+            (dataset.nunique() == 1) | (dataset.nunique() == len(dataset))
+        ]
 
 
         dataset = dataset.drop(columns=to_drop)
         dataset = dataset.drop(columns=to_drop)
-        dataset['energy'] = y.copy()
+        dataset["energy"] = y.copy()
         try:
         try:
             data_results = monthly_validation(dataset, model)
             data_results = monthly_validation(dataset, model)
-            scores.append(data_results['score'])
+            scores.append(data_results["score"])
             methods.append(name)
             methods.append(name)
         except Exception as e:
         except Exception as e:
             print(e, name)
             print(e, name)
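The nunique filter above drops columns a model cannot learn from: constants and columns unique on every row (for example, a year column within a single year of data). A minimal illustration:

import pandas as pd

df = pd.DataFrame(
    {"year": [2018] * 4, "dayofyear": [1, 2, 3, 4], "hour": [0, 6, 12, 6]}
)
to_drop = df.columns[(df.nunique() == 1) | (df.nunique() == len(df))]
df = df.drop(columns=to_drop)  # only 'hour' survives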

+ 99 - 96
web_automation/canvas_upload.py

@@ -1,4 +1,3 @@
-
 # selenium for web driving
 # selenium for web driving
 import selenium
 import selenium
 from selenium import webdriver
 from selenium import webdriver
@@ -14,130 +13,134 @@ import os
 
 
 
 
 def submit_assignment(file_tup):
 def submit_assignment(file_tup):
-	# Using Chrome to access web
-	driver = webdriver.Chrome()
-
-	time.sleep(5)
+    # Using Chrome to access web
+    driver = webdriver.Chrome()
 
 
-	# Open the website
-	driver.get('https://canvas.case.edu')
+    time.sleep(5)
+
+    # Open the website
+    driver.get("https://canvas.case.edu")
+
+    # Password for Canvas
+    with open("C:/Users/Will Koehrsen/Desktop/cp.txt", "r") as f:
+        cp = f.read()
 
 
-	# Password for Canvas
-	with open('C:/Users/Will Koehrsen/Desktop/cp.txt', 'r') as f:
-	    cp = f.read()
+    # Locate id and password
+    id_box = driver.find_element_by_name("username")
+    pass_box = driver.find_element_by_name("password")
 
 
+    # Send login information
+    id_box.send_keys("wjk68")
+    pass_box.send_keys(cp)
 
 
-	# Locate id and password
-	id_box = driver.find_element_by_name('username')
-	pass_box = driver.find_element_by_name('password')
+    # Click login
+    login_button = driver.find_element_by_name("submit")
+    login_button.click()
 
 
-	# Send login information
-	id_box.send_keys('wjk68')
-	pass_box.send_keys(cp)
+    # Find and click on list of courses
+    courses_button = driver.find_element_by_id("global_nav_courses_link")
+    courses_button.click()
 
 
-	# Click login
-	login_button = driver.find_element_by_name('submit')
-	login_button.click()
+    # Wait for the page to load
+    time.sleep(2)
 
 
-	# Find and click on list of courses
-	courses_button = driver.find_element_by_id('global_nav_courses_link')
-	courses_button.click()
+    # Get the name of the folder
+    folder = file_tup[0]
 
 
+    # Class to select depends on folder
+    if folder == "DSCI451":
+        class_select = driver.find_element_by_link_text(
+            "Applied Data Science Research (100/5047)"
+        )
+    elif folder == "DCSI453":
+        class_select = driver.find_element_by_link_text(
+            "Data Science: Statistical Learning, Modeling and Prediction (100/5046)"
+        )
+    elif folder == "EECS491":
+        class_select = driver.find_element_by_link_text(
+            "Artificial Intelligence: Probabilistic Graphical Models (100/10039)"
+        )
+    elif folder == "EECS531":
+        class_select = driver.find_element_by_link_text("Computer Vision (100/10040)")
 
 
-	# Wait for the page to load
-	time.sleep(2)
+    # Click on the specific class
+    class_select.click()
 
 
-	# Get the name of the folder
-	folder = file_tup[0]
-	    
-	# Class to select depends on folder
-	if folder == 'DSCI451':
-	    class_select = driver.find_element_by_link_text('Applied Data Science Research (100/5047)')
-	elif folder == 'DCSI453':
-	    class_select = driver.find_element_by_link_text('Data Science: Statistical Learning, Modeling and Prediction (100/5046)')
-	elif folder == 'EECS491':
-	    class_select = driver.find_element_by_link_text('Artificial Intelligence: Probabilistic Graphical Models (100/10039)')
-	elif folder == 'EECS531':
-	    class_select = driver.find_element_by_link_text('Computer Vision (100/10040)')
+    assignment_button = driver.find_element_by_link_text("Assignments")
+    assignment_button.click()
 
 
-	# Click on the specific class
-	class_select.click()
+    # Wait for the page to load
+    time.sleep(2)
 
 
-	assignment_button = driver.find_element_by_link_text('Assignments')
-	assignment_button.click()
+    # Locate the specific assignment
+    file_name = file_tup[1]
+    file_locator = file_name.split(".")[0]
 
 
-	# Wait for the page to load 
-	time.sleep(2)
+    specific_assignment = driver.find_element_by_link_text(file_locator)
+    specific_assignment.click()
 
 
-	# Locate the specific assignment
-	file_name = file_tup[1]
-	file_locator = file_name.split('.')[0]
-	 
-	specific_assigment = driver.find_element_by_link_text(file_locator)
-	specific_assigment.click()
+    # Click on the button to submit an assignment
+    try:
+        submit_assignment_button = driver.find_element_by_link_text("Submit Assignment")
+    # If assignment has already been submitted
+    except:
+        print("Assignment already submitted, re-submitting")
+        submit_assignment_button = driver.find_element_by_link_text(
+            "Re-submit Assignment"
+        )
 
 
-	# Click on the button to submit an assignment
-	try:
-	    submit_assignment_button = driver.find_element_by_link_text('Submit Assignment')
-	# If assignment has already been submitted
-	except:
-	    print('Assignment already submitted, re-submitting')
-	    submit_assignment_button = driver.find_element_by_link_text('Re-submit Assignment')
+    submit_assignment_button.click()
 
 
-	submit_assignment_button.click()
+    # Wait for the page to load
+    time.sleep(2)
 
 
-	# Wait for the page to load
-	time.sleep(2)
+    # Choose file button
+    choose_file = driver.find_element_by_name("attachments[0][uploaded_data]")
 
 
-	# Choose file button
-	choose_file = driver.find_element_by_name('attachments[0][uploaded_data]')
+    # Send the name of the file to the button
+    file_location = os.path.join(submission_dir, folder, file_name)
+    choose_file.send_keys(file_location)
 
 
-	# Send the name of the file to the button
-	file_location = os.path.join(submission_dir, folder, file_name)
-	choose_file.send_keys(file_location)
+    submit_assignment = driver.find_element_by_id("submit_file_button")
+    submit_assignment.click()
 
 
-	submit_assignment = driver.find_element_by_id('submit_file_button')
-	submit_assignment.click()
+    # Wait for the page
+    time.sleep(2)
 
 
-	# Wait for the page
-	time.sleep(2)
+    # Move the file to the submitted folder
+    submitted_dir = "C:/Users/Will Koehrsen/Desktop/submitted_assignments"
+    submitted_dir = os.path.join(submitted_dir, folder)
+    submitted_file_name = "Submitted " + file_name
 
 
-	# Move the file to the submitted folder
-	submitted_dir = 'C:/Users/Will Koehrsen/Desktop/submitted_assignments'
-	submitted_dir = os.path.join(submitted_dir, folder)
-	submitted_file_name = 'Submitted ' + file_name
+    submitted_file_location = os.path.join(submitted_dir, submitted_file_name)
+    # os.rename(file_location, submitted_file_location)
 
 
-	submitted_file_location = os.path.join(submitted_dir, submitted_file_name)
-	# os.rename(file_location, submitted_file_location)
+    print(
+        "{} Assignment for Class {} successfully submitted at {}.".format(
+            file_name, folder, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        )
+    )
 
 
-	print('{} Assignment for Class {} successfully submitted at {}.'.format(
-		file_name, folder, datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
+    print("Submitted assignment available at {}.".format(submitted_file_location))
 
 
-	print('Submitted assignment available at {}.'.format(submitted_file_location))
+    return
 
 
-	return
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
 
 
-	# Build tuple of (folder, file) to turn in
-	submission_dir = 'C:/Users/Will Koehrsen/Desktop/completed_assignments'
-	dir_list = list(os.listdir(submission_dir))
-
-	for directory in dir_list:
-	    file_list = list(os.listdir(os.path.join(submission_dir, directory)))
-	    if len(file_list) != 0:
-	        file_tup = (directory, file_list[0])
-
-	if len(file_tup) == 0:
-		print('No files to submit')
-
-	else:
-		print('Assignment "{}" for "{}" found.'.format(file_tup[1], file_tup[0]))
-		input('Press enter to proceed: ')
-		submit_assignment(file_tup)
-
-
-
+    # Build tuple of (folder, file) to turn in
+    submission_dir = "C:/Users/Will Koehrsen/Desktop/completed_assignments"
+    dir_list = list(os.listdir(submission_dir))
 
 
+    # Initialize so the check below works when no assignments are found
+    file_tup = ()
+
+    for directory in dir_list:
+        file_list = list(os.listdir(os.path.join(submission_dir, directory)))
+        if len(file_list) != 0:
+            file_tup = (directory, file_list[0])
 
 
+    if len(file_tup) == 0:
+        print("No files to submit")
 
 
+    else:
+        print('Assignment "{}" for "{}" found.'.format(file_tup[1], file_tup[0]))
+        input("Press enter to proceed: ")
+        submit_assignment(file_tup)
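The fixed time.sleep calls above are fragile on slow page loads. An explicit wait (a sketch, not part of this commit; driver is the webdriver created in submit_assignment) blocks only as long as needed:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the courses link instead of sleeping unconditionally
wait = WebDriverWait(driver, 10)
courses_button = wait.until(
    EC.element_to_be_clickable((By.ID, "global_nav_courses_link"))
)
courses_button.click()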

+ 38 - 41
weighter/run_weighter.py

@@ -1,4 +1,3 @@
-
 # pandas and numpy for data manipulation
 # pandas and numpy for data manipulation
 import pandas as pd
 import pandas as pd
 import numpy as np
 import numpy as np
@@ -18,7 +17,7 @@ from oauth2client.service_account import ServiceAccountCredentials
 # os for deleting images
 # os for deleting images
 import os
 import os
 
 
-# matplotlib for plotting 
+# matplotlib for plotting
 import matplotlib.pyplot as plt
 import matplotlib.pyplot as plt
 import matplotlib.patches as mpatches
 import matplotlib.patches as mpatches
 import matplotlib
 import matplotlib
@@ -28,55 +27,53 @@ from weighter import Weighter
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
 
 
-	# google sheets access
-	scope = ['https://spreadsheets.google.com/feeds']
-
-	# Use local stored credentials in json file
-	# make sure to first share the sheet with the email in the json file
-	credentials = ServiceAccountCredentials.from_json_keyfile_name('C:/Users/Will Koehrsen/Desktop/weighter-2038ffb4e5a6.json', scope)
-
-	# Authorize access
-	gc = gspread.authorize(credentials);
-
-	# Slack api key is stored as text file
-	with open('C:/Users/Will Koehrsen/Desktop/slack_api.txt', 'r') as f:
-	    slack_api_key = f.read()
+    # google sheets access
+    scope = ["https://spreadsheets.google.com/feeds"]
 
 
-	slack = Slacker(slack_api_key)
+    # Use local stored credentials in json file
+    # make sure to first share the sheet with the email in the json file
+    credentials = ServiceAccountCredentials.from_json_keyfile_name(
+        "C:/Users/Will Koehrsen/Desktop/weighter-2038ffb4e5a6.json", scope
+    )
 
 
-	# Open the sheet, need to share the sheet with email specified in json file
-	gsheet = gc.open('Auto Weight Challenge').sheet1
+    # Authorize access
+    gc = gspread.authorize(credentials)
 
 
-	# List of lists with each row in the sheet as a list
-	weight_lists = gsheet.get_all_values()
+    # Slack api key is stored as text file
+    with open("C:/Users/Will Koehrsen/Desktop/slack_api.txt", "r") as f:
+        slack_api_key = f.read()
 
 
-	# Headers are the first list
-	# Pop returns the element (list in this case) and removes it from the list
-	headers = weight_lists.pop(0)
+    slack = Slacker(slack_api_key)
 
 
-	# Convert list of lists to a dataframe with specified column header
-	weights = pd.DataFrame(weight_lists, columns=headers)
+    # Open the sheet, need to share the sheet with email specified in json file
+    gsheet = gc.open("Auto Weight Challenge").sheet1
 
 
-	# Record column should be a boolean
-	weights['Record'] = weights['Record'].astype(bool)
+    # List of lists with each row in the sheet as a list
+    weight_lists = gsheet.get_all_values()
 
 
-	# Name column is a string
-	weights['Name'] = weights['Name'].astype(str)
+    # Headers are the first list
+    # Pop returns the element (list in this case) and removes it from the list
+    headers = weight_lists.pop(0)
 
 
-	# Convert dates to datetime, then set as index, then set the time zone
-	weights['Date'] = pd.to_datetime(weights['Date'], unit='s')
-	weights  = weights.set_index('Date', drop = True).tz_localize(tz='US/Eastern')
+    # Convert list of lists to a dataframe with specified column header
+    weights = pd.DataFrame(weight_lists, columns=headers)
 
 
-	# Drop any extra entries
-	weights = weights.drop('NaT')
+    # Record column should be a boolean
+    weights["Record"] = weights["Record"].astype(bool)
 
 
-	# If there are new entries create the weighter object
-	if len(weights) > np.count_nonzero(weights['Record']):
-		# Initialize with dataframe of weights, google sheet, and slack object
-    	 weighter = Weighter(weights, gsheet, slack)
-    	 weighter.process_entries()
-    	 print('Success')
+    # Name column is a string
+    weights["Name"] = weights["Name"].astype(str)
 
 
+    # Convert dates to datetime, then set as index, then set the time zone
+    weights["Date"] = pd.to_datetime(weights["Date"], unit="s")
+    weights = weights.set_index("Date", drop=True).tz_localize(tz="US/Eastern")
 
 
-	
+    # Drop any extra entries
+    weights = weights.drop("NaT")
 
 
+    # If there are new entries create the weighter object
+    if len(weights) > np.count_nonzero(weights["Record"]):
+        # Initialize with dataframe of weights, google sheet, and slack object
+        weighter = Weighter(weights, gsheet, slack)
+        weighter.process_entries()
+        print("Success")

+ 424 - 281
weighter/weighter.py

@@ -17,169 +17,181 @@ from oauth2client.service_account import ServiceAccountCredentials
 # os for deleting images
 # os for deleting images
 import os
 import os
 
 
-# matplotlib for plotting 
+# matplotlib for plotting
 import matplotlib.pyplot as plt
 import matplotlib.pyplot as plt
 import matplotlib.patches as mpatches
 import matplotlib.patches as mpatches
 import matplotlib
 import matplotlib
 
 
 
 
-class Weighter():
-    
+class Weighter:
+
     """
     """
     When weighter is initialized, we need to convert the usernames,
     When weighter is initialized, we need to convert the usernames,
     get a dictionary of the unrecorded entries, construct a dictionary
     get a dictionary of the unrecorded entries, construct a dictionary
     of the actions to take, and make sure all data is formatted correctly
     of the actions to take, and make sure all data is formatted correctly
     """
     """
-    
+
     def __init__(self, weights, gsheet, slack):
     def __init__(self, weights, gsheet, slack):
-        
+
         # Weights is a dataframe
         # Weights is a dataframe
         self.weights = weights.copy()
         self.weights = weights.copy()
 
 
         self.gsheet = gsheet
         self.gsheet = gsheet
         self.slack = slack
         self.slack = slack
 
 
-        
         # Users is a list of the unique users in the data
         # Users is a list of the unique users in the data
-        self.users = list(set(self.weights['Name']))
-        
+        self.users = list(set(self.weights["Name"]))
+
         correct_names = []
         correct_names = []
-        
+
         # Name Changes
         # Name Changes
-        for user in self.weights['Name']:
-            
+        for user in self.weights["Name"]:
+
             # Have to hardcode in name changes
             # Have to hardcode in name changes
-            if user == 'koehrcl':
-                correct_names.append('Craig')
-            elif user == 'willkoehrsen':
-                correct_names.append('Will')
-            elif user == 'fletcher':
-                correct_names.append('Fletcher')
-            
+            if user == "koehrcl":
+                correct_names.append("Craig")
+            elif user == "willkoehrsen":
+                correct_names.append("Will")
+            elif user == "fletcher":
+                correct_names.append("Fletcher")
+
             # Currently do not handle new users
             # Currently do not handle new users
             else:
             else:
-                print('New User Detected')
+                print("New User Detected")
                 return
                 return
-            
-        self.weights['Name'] = correct_names
-        
+
+        self.weights["Name"] = correct_names
+
         # Users is a list of the unique users in the data
         # Users is a list of the unique users in the data
-        self.users = list(set(self.weights['Name']))
-        
+        self.users = list(set(self.weights["Name"]))
+
         # Create a dataframe of the unrecorded entries
         # Create a dataframe of the unrecorded entries
-        self.unrecorded = self.weights[self.weights['Record'] != True]
-        
+        self.unrecorded = self.weights[self.weights["Record"] != True]
+
         # Process the unrecorded entries
         # Process the unrecorded entries
         self.process_unrecorded()
         self.process_unrecorded()
-        
+
         # The remaining entries will all be weights
         # The remaining entries will all be weights
-        self.weights['Entry'] = [float(weight) for weight in self.weights['Entry']]
-        
+        self.weights["Entry"] = [float(weight) for weight in self.weights["Entry"]]
+
         # Build the user dictionary
         # Build the user dictionary
         self.build_user_dict()
         self.build_user_dict()
-        
+
         # Calculate the change and percentage change columns
         # Calculate the change and percentage change columns
         self.calculate_columns()
         self.calculate_columns()
-        
+
     """ 
     """ 
     Constructs a dictionary for each user with critical information
     Constructs a dictionary for each user with critical information
     This forms the basis for the summarize function
     This forms the basis for the summarize function
     """
     """
-    
+
     def build_user_dict(self):
     def build_user_dict(self):
-        
+
         user_dict = {}
         user_dict = {}
-        
-        user_goals = {'Craig': 215.0, 'Fletcher': 200.0, 'Will': 155.0}
-        user_colors = {'Craig': 'forestgreen', 'Fletcher': 'navy', 'Will': 'darkred'}
-        
+
+        user_goals = {"Craig": 215.0, "Fletcher": 200.0, "Will": 155.0}
+        user_colors = {"Craig": "forestgreen", "Fletcher": "navy", "Will": "darkred"}
+
         for i, user in enumerate(self.users):
         for i, user in enumerate(self.users):
-            
-            user_weights = self.weights[self.weights['Name'] == user]
+
+            user_weights = self.weights[self.weights["Name"] == user]
             goal = user_goals.get(user)
             goal = user_goals.get(user)
 
 
-            start_weight = user_weights.ix[min(user_weights.index), 'Entry']   
+            start_weight = user_weights.ix[min(user_weights.index), "Entry"]
             start_date = min(user_weights.index)
             start_date = min(user_weights.index)
-            
+
             # Find minimum weight and date on which it occurs
             # Find minimum weight and date on which it occurs
-            min_weight =  min(user_weights['Entry'])
-            min_weight_date = ((user_weights[user_weights['Entry'] == min_weight].index)[0])
-            
+            min_weight = min(user_weights["Entry"])
+            min_weight_date = (user_weights[user_weights["Entry"] == min_weight].index)[
+                0
+            ]
+
             # Find maximum weight and date on which it occurs
             # Find maximum weight and date on which it occurs
-            max_weight = max(user_weights['Entry'])
-            max_weight_date = ((user_weights[user_weights['Entry'] == max_weight].index)[0])
-            
-            most_recent_weight = user_weights.ix[max(user_weights.index), 'Entry']
-            
+            max_weight = max(user_weights["Entry"])
+            max_weight_date = (user_weights[user_weights["Entry"] == max_weight].index)[
+                0
+            ]
+
+            most_recent_weight = user_weights.ix[max(user_weights.index), "Entry"]
+
             if goal < start_weight:
             if goal < start_weight:
                 change = start_weight - most_recent_weight
                 change = start_weight - most_recent_weight
-                obj = 'lose'
+                obj = "lose"
             elif goal > start_weight:
             elif goal > start_weight:
                 change = most_recent_weight - start_weight
                 change = most_recent_weight - start_weight
-                obj = 'gain'
-                
+                obj = "gain"
+
             pct_change = 100 * change / start_weight
             pct_change = 100 * change / start_weight
-            
-            pct_to_goal = 100 * (change / abs(start_weight - goal) )
-            
+
+            pct_to_goal = 100 * (change / abs(start_weight - goal))
+
             # Color for plotting
             # Color for plotting
             user_color = user_colors[user]
             user_color = user_colors[user]
-            
-            user_dict[user] = {'min_weight': min_weight, 'max_weight': max_weight,
-                               'min_date': min_weight_date, 'max_date': max_weight_date,
-                               'recent': most_recent_weight, 'abs_change': change,
-                               'pct_change': pct_change, 'pct_towards_goal': pct_to_goal,
-                               'start_weight': start_weight, 'start_date': start_date,
-                               'goal_weight': goal, 'objective': obj, 'color': user_color}
-       
+
+            user_dict[user] = {
+                "min_weight": min_weight,
+                "max_weight": max_weight,
+                "min_date": min_weight_date,
+                "max_date": max_weight_date,
+                "recent": most_recent_weight,
+                "abs_change": change,
+                "pct_change": pct_change,
+                "pct_towards_goal": pct_to_goal,
+                "start_weight": start_weight,
+                "start_date": start_date,
+                "goal_weight": goal,
+                "objective": obj,
+                "color": user_color,
+            }
+
         self.user_dict = user_dict
         self.user_dict = user_dict
-             
+
     """
     """
     Builds a dictionary of unrecorded entries where each key is the user
     Builds a dictionary of unrecorded entries where each key is the user
     and the value is a list of weights and methods called for by the user.
     and the value is a list of weights and methods called for by the user.
     This dictionary is saved as the entries attribute of the class.
     This dictionary is saved as the entries attribute of the class.
     Removes the non-weight entries from the data and from the google sheet.
     Removes the non-weight entries from the data and from the google sheet.
     """
     """
-    
+
     def process_unrecorded(self):
     def process_unrecorded(self):
-        
-        entries = {name:[] for name in self.users}
+
+        entries = {name: [] for name in self.users}
         drop = []
         drop = []
-        
+
         location = {}
         location = {}
-        
+
         for index in self.unrecorded.index:
         for index in self.unrecorded.index:
 
 
-            entry = self.unrecorded.ix[index, 'Entry']
-            user = str(self.unrecorded.ix[index, 'Name'])
-            
+            entry = self.unrecorded.ix[index, "Entry"]
+            user = str(self.unrecorded.ix[index, "Name"])
+
             # Try and except does not seem like the best way to handle this
             # Try and except does not seem like the best way to handle this
             try:
             try:
                 entry = float(entry)
                 entry = float(entry)
                 entries[user].append(entry)
                 entries[user].append(entry)
                 location[index] = True
                 location[index] = True
-                
-            except:  
+
+            except:
                 entry = str(entry)
                 entry = str(entry)
                 entries[user].append(entry.strip())
                 entries[user].append(entry.strip())
-                location[index] = 'remove'
-                
+                location[index] = "remove"
+
                 drop.append(index)
                 drop.append(index)
-                
-            self.weights.ix[index, 'Record'] = True
-           
+
+            self.weights.ix[index, "Record"] = True
+
         # Indexes of new entries
         # Indexes of new entries
         self.location = location
         self.location = location
-        
+
         # Update the Google Sheet before dropping
         # Update the Google Sheet before dropping
         self.update_sheet()
         self.update_sheet()
-        
+
         # Drop the rows which do not contain a weight
         # Drop the rows which do not contain a weight
         self.weights.drop(drop, axis=0, inplace=True)
         self.weights.drop(drop, axis=0, inplace=True)
 
 
         # Entries is all of the new entries
         # Entries is all of the new entries
         self.entries = entries
         self.entries = entries
-        
+
     """ 
     """ 
     Update the Google Spreadsheet. This involves removing the rows without weight
     Update the Google Spreadsheet. This involves removing the rows without weight
     entries and putting a True in the record column for all weights. 
     entries and putting a True in the record column for all weights. 
@@ -187,161 +199,198 @@ class Weighter():
 
 
     def update_sheet(self):
     def update_sheet(self):
         delete_count = 0
         delete_count = 0
-        
+
         # Iterate through the locations and update as appropriate
         # Iterate through the locations and update as appropriate
         for index, action in self.location.items():
         for index, action in self.location.items():
             cell_row = (np.where(self.weights.index == index))[0][0] + 2 - delete_count
             cell_row = (np.where(self.weights.index == index))[0][0] + 2 - delete_count
-            if action == 'remove':
-                self.gsheet.delete_row(index = cell_row)
+            if action == "remove":
+                self.gsheet.delete_row(index=cell_row)
                 delete_count += 1
                 delete_count += 1
             elif action:
             elif action:
-                self.gsheet.update_acell(label='D%d' % cell_row, val = 'True')
-           
+                self.gsheet.update_acell(label="D%d" % cell_row, val="True")
+
     """ 
     """ 
     Iterates through the unrecorded entries and delegates 
     Iterates through the unrecorded entries and delegates 
     each one to the appropriate method.
     each one to the appropriate method.
     Updates the record cell in the google sheet 
     Updates the record cell in the google sheet 
     """
     """
+
     def process_entries(self):
     def process_entries(self):
         for user, user_entries in self.entries.items():
         for user, user_entries in self.entries.items():
             for entry in user_entries:
             for entry in user_entries:
-                
+
                 # If a weight, display the basic message
                 # If a weight, display the basic message
                 if type(entry) == float:
                 if type(entry) == float:
                     self.basic_message(user)
                     self.basic_message(user)
-                    
+
                 # If the message is a string hand off to the appropriate function
                 # If the message is a string hand off to the appropriate function
                 else:
                 else:
-                    
+
                     # Require at least 8 days of data
                     # Require at least 8 days of data
-                    if len(self.weights[self.weights['Name'] == user]) < 8:
-                        message = "\nAt least 8 days of data required for detailed analysis."
-                        self.slack.chat.post_message(channel='#weight_tracker', text = message, username = "Data Analyst", icon_emoji=":calendar:")
-                
-                    elif entry.lower() == 'summary':
+                    if len(self.weights[self.weights["Name"] == user]) < 8:
+                        message = (
+                            "\nAt least 8 days of data required for detailed analysis."
+                        )
+                        self.slack.chat.post_message(
+                            channel="#weight_tracker",
+                            text=message,
+                            username="Data Analyst",
+                            icon_emoji=":calendar:",
+                        )
+
+                    elif entry.lower() == "summary":
                         self.summary(user)
                         self.summary(user)
 
 
-                    elif entry.lower() == 'percent':
+                    elif entry.lower() == "percent":
                         self.percentage_plot()
                         self.percentage_plot()
 
 
-                    elif entry.lower() == 'history':
+                    elif entry.lower() == "history":
                         self.history_plot(user)
                         self.history_plot(user)
 
 
-                    elif entry.lower() == 'future':
+                    elif entry.lower() == "future":
                         self.future_plot(user)
                         self.future_plot(user)
 
 
-                    elif entry.lower() == 'analysis':
+                    elif entry.lower() == "analysis":
                         self.analyze(user)
                         self.analyze(user)
-    
+
                     # Display a help message if the string is not valid
                     # Display a help message if the string is not valid
                     else:
                     else:
-                        message = ("\nPlease enter a valid message:\n\n"
-                                   "Your weight\n"
-                                   "'Summary' to see a personal summary\n"
-                                   "'Percent' to see a plot of all users percentage changes\n"
-                                   "'History' to see a plot of your personal history\n"
-                                   "'Future' to see your predictions for the next thirty days\n"
-                                   "'Analysis' to view personalized advice\n"
-                                   "For more help, contact @koehrsen_will on Twitter.\n")
-
-                        self.slack.chat.post_message(channel='#weight_tracker', text = message, username = "Help", 
-                        	icon_emoji=":interrobang:")
-                    
-            
+                        message = (
+                            "\nPlease enter a valid message:\n\n"
+                            "Your weight\n"
+                            "'Summary' to see a personal summary\n"
+                            "'Percent' to see a plot of all users' percentage changes\n"
+                            "'History' to see a plot of your personal history\n"
+                            "'Future' to see your predictions for the next thirty days\n"
+                            "'Analysis' to view personalized advice\n"
+                            "For more help, contact @koehrsen_will on Twitter.\n"
+                        )
+
+                        self.slack.chat.post_message(
+                            channel="#weight_tracker",
+                            text=message,
+                            username="Help",
+                            icon_emoji=":interrobang:",
+                        )
+
     """ 
     """ 
     Adds the change and percentage change columns to the self.weights df
     Adds the change and percentage change columns to the self.weights df
     """
     """
+
     def calculate_columns(self):
     def calculate_columns(self):
-        
-        self.weights = self.weights.sort_values('Name')
-        self.weights['change'] = 0
-        self.weights['pct_change'] = 0
-        self.weights.reset_index(level=0, inplace = True)
-        
+
+        self.weights = self.weights.sort_values("Name")
+        self.weights["change"] = 0
+        self.weights["pct_change"] = 0
+        self.weights.reset_index(level=0, inplace=True)
+
         for index in self.weights.index:
         for index in self.weights.index:
-            user = self.weights.ix[index, 'Name']
-            weight = self.weights.ix[index, 'Entry']
-            start_weight = self.user_dict[user]['start_weight']
-            objective = self.user_dict[user]['objective']
-            
-            if objective == 'lose':
-                
-                self.weights.ix[index, 'change'] = start_weight - weight
-                self.weights.ix[index, 'pct_change'] = 100 * (start_weight - weight) / start_weight
-                
-            elif objective == 'gain':
-                self.weights.ix[index, 'change'] = weight - start_weight
-                self.weights.ix[index, 'pct_change'] = 100 * (weight - start_weight) / start_weight
-
-        self.weights.set_index('Date', drop=True, inplace=True)
-        
-                
+            user = self.weights.ix[index, "Name"]
+            weight = self.weights.ix[index, "Entry"]
+            start_weight = self.user_dict[user]["start_weight"]
+            objective = self.user_dict[user]["objective"]
+
+            if objective == "lose":
+
+                self.weights.ix[index, "change"] = start_weight - weight
+                self.weights.ix[index, "pct_change"] = (
+                    100 * (start_weight - weight) / start_weight
+                )
+
+            elif objective == "gain":
+                self.weights.ix[index, "change"] = weight - start_weight
+                self.weights.ix[index, "pct_change"] = (
+                    100 * (weight - start_weight) / start_weight
+                )
+
+        self.weights.set_index("Date", drop=True, inplace=True)
+
     """ 
     """ 
     This method is automatically run for each new weight
     This method is automatically run for each new weight
     """
     """
+
     def basic_message(self, user):
     def basic_message(self, user):
-    
+
         # Find information for user, construct message, post message to Slack
         # Find information for user, construct message, post message to Slack
         user_info = self.user_dict.get(user)
         user_info = self.user_dict.get(user)
 
 
-        message = ("\n{}: Total Weight Change = {:.2f} lbs.\n\n"
-                    "Percentage Weight Change = {:.2f}%\n").format(user, user_info['abs_change'],
-                                                     user_info['pct_change'])
+        message = (
+            "\n{}: Total Weight Change = {:.2f} lbs.\n\n"
+            "Percentage Weight Change = {:.2f}%\n"
+        ).format(user, user_info["abs_change"], user_info["pct_change"])
+
+        self.slack.chat.post_message(
+            "#weight_tracker", text=message, username="Update", icon_emoji=":scales:"
+        )
 
 
-        self.slack.chat.post_message('#weight_tracker', text=message, username='Update', icon_emoji=':scales:')
-                        
     """ 
     """ 
     Displays comprehensive stats about the user
     Displays comprehensive stats about the user
     """
     """
-    
+
     def summary(self, user):
     def summary(self, user):
         user_info = self.user_dict.get(user)
         user_info = self.user_dict.get(user)
-        message = ("\n{}, your most recent weight was {:.2f} lbs.\n\n"
-                   "Absolute weight change = {:.2f} lbs, percentage weight change = {:.2f}%.\n\n"
-                   "Minimum weight = {:.2f} lbs on {} and maximum weight = {:.2f} lbs on {}.\n\n"
-                   "Your goal weight = {:.2f} lbs. and you are {:.2f}% of the way there.\n\n"
-                   "You started at {:.2f} lbs on {}. Congratulations on the progress!\n").format(user, 
-                     user_info['recent'], user_info['abs_change'], user_info['pct_change'], 
-                     user_info['min_weight'], str(user_info['min_date'].date()),
-                     user_info['max_weight'], str(user_info['max_date'].date()),
-                     user_info['goal_weight'], user_info['pct_towards_goal'],                                                       
-                     user_info['start_weight'], str(user_info['start_date'].date()))
-        
-        self.slack.chat.post_message('#weight_tracker', text=message, username='Summary', icon_emoji=":earth_africa:")
-   
+        message = (
+            "\n{}, your most recent weight was {:.2f} lbs.\n\n"
+            "Absolute weight change = {:.2f} lbs, percentage weight change = {:.2f}%.\n\n"
+            "Minimum weight = {:.2f} lbs on {} and maximum weight = {:.2f} lbs on {}.\n\n"
+            "Your goal weight = {:.2f} lbs. and you are {:.2f}% of the way there.\n\n"
+            "You started at {:.2f} lbs on {}. Congratulations on the progress!\n"
+        ).format(
+            user,
+            user_info["recent"],
+            user_info["abs_change"],
+            user_info["pct_change"],
+            user_info["min_weight"],
+            str(user_info["min_date"].date()),
+            user_info["max_weight"],
+            str(user_info["max_date"].date()),
+            user_info["goal_weight"],
+            user_info["pct_towards_goal"],
+            user_info["start_weight"],
+            str(user_info["start_date"].date()),
+        )
+
+        self.slack.chat.post_message(
+            "#weight_tracker",
+            text=message,
+            username="Summary",
+            icon_emoji=":earth_africa:",
+        )
+
     """
     """
     Reset the plot and institute basic parameters
     Reset the plot and institute basic parameters
     """
     """
+
     @staticmethod
     @staticmethod
     def reset_plot():
     def reset_plot():
         matplotlib.rcParams.update(matplotlib.rcParamsDefault)
         matplotlib.rcParams.update(matplotlib.rcParamsDefault)
-        matplotlib.rcParams['text.color'] = 'k'
-        
+        matplotlib.rcParams["text.color"] = "k"
+
     """
     """
     Plot of all users percentage changes.
     Plot of all users percentage changes.
     Includes polynomial fits (degree may need to be adjusted).
     Includes polynomial fits (degree may need to be adjusted).
     """
     """
-    
+
     def percentage_plot(self):
     def percentage_plot(self):
-        
+
         self.reset_plot()
         self.reset_plot()
-        
-        plt.style.use('fivethirtyeight')
-        plt.figure(figsize=(10,8))
+
+        plt.style.use("fivethirtyeight")
+        plt.figure(figsize=(10, 8))
 
 
         for i, user in enumerate(self.users):
         for i, user in enumerate(self.users):
-            
-            user_color = self.user_dict[user]['color']
+
+            user_color = self.user_dict[user]["color"]
 
 
             # Select the user and order dataframe by date
             # Select the user and order dataframe by date
-            df = self.weights[self.weights['Name'] == user]
+            df = self.weights[self.weights["Name"] == user]
             df.sort_index(inplace=True)
             df.sort_index(inplace=True)
-            
+
             # List is used for fitting polynomial
             # List is used for fitting polynomial
             xvalues = list(range(len(df)))
             xvalues = list(range(len(df)))
 
 
             # Create a polynomial fit
             # Create a polynomial fit
-            z = np.polyfit(xvalues, df['pct_change'], deg=6)
+            z = np.polyfit(xvalues, df["pct_change"], deg=6)
 
 
             # Create a function from the fit
             # Create a function from the fit
             p = np.poly1d(z)
             p = np.poly1d(z)
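np.polyfit returns coefficients (highest power first) and np.poly1d wraps them in a callable polynomial; a degree-6 fit needs at least 7 observations, which is plausibly why the class requires 8 days of data. A toy run on a noiseless quadratic:

import numpy as np

x = list(range(10))
y = [xi ** 2 + 1 for xi in x]
z = np.polyfit(x, y, deg=2)  # approximately [1, 0, 1]
p = np.poly1d(z)             # callable polynomial
p(10)                        # approximately 101.0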
@@ -350,40 +399,62 @@ class Weighter():
             fit_data = p(xvalues)
             fit_data = p(xvalues)
 
 
             # Plot the actual points and the fit
             # Plot the actual points and the fit
-            plt.plot(df.index, df['pct_change'], 'o', color = user_color, label = '%s Observations' % user)
-            plt.plot(df.index, fit_data, '-', color = user_color, linewidth = 5, label = '%s Smooth Fit' % user)
-
+            plt.plot(
+                df.index,
+                df["pct_change"],
+                "o",
+                color=user_color,
+                label="%s Observations" % user,
+            )
+            plt.plot(
+                df.index,
+                fit_data,
+                "-",
+                color=user_color,
+                linewidth=5,
+                label="%s Smooth Fit" % user,
+            )
 
 
         # Plot formatting
         # Plot formatting
-        plt.xlabel('Date'); plt.ylabel('% Change from Start')
-        plt.title('Percentage Changes')
-        plt.grid(color='k', alpha=0.4)
-        plt.legend(prop={'size':14})
-        plt.savefig('C:\\Users\\Will Koehrsen\\Documents\\Data-Analysis\\weighter\\images\\percentage_plot.png')
-        
-        self.slack.files.upload('C:\\Users\\Will Koehrsen\\Documents\\Data-Analysis\\weighter\\images\\percentage_plot.png', channels='#weight_tracker', title="Percent Plot")
-        
-        os.remove('C:\\Users\\Will Koehrsen\\Documents\\Data-Analysis\\weighter\\images\\percentage_plot.png')
-        
+        plt.xlabel("Date")
+        plt.ylabel("% Change from Start")
+        plt.title("Percentage Changes")
+        plt.grid(color="k", alpha=0.4)
+        plt.legend(prop={"size": 14})
+        plt.savefig(
+            "C:\\Users\\Will Koehrsen\\Documents\\Data-Analysis\\weighter\\images\\percentage_plot.png"
+        )
+
+        self.slack.files.upload(
+            "C:\\Users\\Will Koehrsen\\Documents\\Data-Analysis\\weighter\\images\\percentage_plot.png",
+            channels="#weight_tracker",
+            title="Percent Plot",
+        )
+
+        os.remove(
+            "C:\\Users\\Will Koehrsen\\Documents\\Data-Analysis\\weighter\\images\\percentage_plot.png"
+        )
+
     """ 
     """ 
     Plot of a single user's history.
     Plot of a single user's history.
     Also plot a polynomial fit on the observations.
     Also plot a polynomial fit on the observations.
     """
     """
+
     def history_plot(self, user):
     def history_plot(self, user):
-        
+
         self.reset_plot()
         self.reset_plot()
-        plt.style.use('fivethirtyeight')
+        plt.style.use("fivethirtyeight")
         plt.figure(figsize=(10, 8))
         plt.figure(figsize=(10, 8))
-        
-        df = self.weights[self.weights['Name'] == user]
-        df.sort_index(inplace=True) 
-        user_color = self.user_dict[user]['color']
-        
+
+        df = self.weights[self.weights["Name"] == user]
+        df.sort_index(inplace=True)
+        user_color = self.user_dict[user]["color"]
+
         # List is used for fitting polynomial
         # List is used for fitting polynomial
         xvalues = list(range(len(df)))
         xvalues = list(range(len(df)))
 
 
         # Create a polynomial fit
         # Create a polynomial fit
-        z = np.polyfit(xvalues, df['Entry'], deg=6)
+        z = np.polyfit(xvalues, df["Entry"], deg=6)
 
 
         # Create a function from the fit
         # Create a function from the fit
         p = np.poly1d(z)
         p = np.poly1d(z)
@@ -392,150 +463,222 @@ class Weighter():
         fit_data = p(xvalues)
         fit_data = p(xvalues)
 
 
         # Make a simple plot and upload to slack
         # Make a simple plot and upload to slack
-        plt.plot(df.index, df['Entry'], 'ko', ms = 8, label = 'Observed')
-        plt.plot(df.index, fit_data, '-', color = user_color, linewidth = 5, label = 'Smooth Fit')
-        plt.xlabel('Date'); plt.ylabel('Weight (lbs)'); plt.title('%s Weight History' % user)
-        plt.legend(prop={'size': 14});
-        
-        plt.savefig(fname='C:\\Users\\Will Koehrsen\\Documents\\Data-Analysis\\weighter\\images\\history_plot.png')
-        self.slack.files.upload('C:\\Users\\Will Koehrsen\\Documents\\Data-Analysis\\weighter\\images\\history_plot.png', channels='#weight_tracker', title="%s History" % user)
-        
+        plt.plot(df.index, df["Entry"], "ko", ms=8, label="Observed")
+        plt.plot(
+            df.index, fit_data, "-", color=user_color, linewidth=5, label="Smooth Fit"
+        )
+        plt.xlabel("Date")
+        plt.ylabel("Weight (lbs)")
+        plt.title("%s Weight History" % user)
+        plt.legend(prop={"size": 14})
+
+        plt.savefig(
+            fname="C:\\Users\\Will Koehrsen\\Documents\\Data-Analysis\\weighter\\images\\history_plot.png"
+        )
+        self.slack.files.upload(
+            "C:\\Users\\Will Koehrsen\\Documents\\Data-Analysis\\weighter\\images\\history_plot.png",
+            channels="#weight_tracker",
+            title="%s History" % user,
+        )
+
         # Remove the plot from local storage
         # Remove the plot from local storage
-        os.remove('C:\\Users\\Will Koehrsen\\Documents\\Data-Analysis\\weighter\\images\\history_plot.png')
-   
+        os.remove(
+            "C:\\Users\\Will Koehrsen\\Documents\\Data-Analysis\\weighter\\images\\history_plot.png"
+        )
+
     """ 
     """ 
     Create a prophet model for forecasting and trend analysis.
     Create a prophet model for forecasting and trend analysis.
     Might need to adjust model hyperparameters.
     Might need to adjust model hyperparameters.
     """
     """
-    
+
     def prophet_model(self):
     def prophet_model(self):
         model = fbprophet.Prophet(daily_seasonality=False, yearly_seasonality=False)
         model = fbprophet.Prophet(daily_seasonality=False, yearly_seasonality=False)
         return model
         return model
-        
+
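prophet_model disables the daily and yearly seasonal components and leaves the weekly one at its default. The forecast pattern used by future_plot below is, in outline (the toy df here is fabricated; Prophet expects 'ds' and 'y' columns):

import fbprophet
import pandas as pd

df = pd.DataFrame({"ds": pd.date_range("2018-01-01", periods=60), "y": range(60)})
model = fbprophet.Prophet(daily_seasonality=False, yearly_seasonality=False)
model.fit(df)
future = model.make_future_dataframe(periods=30, freq="D")
forecast = model.predict(future)
forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]].tail()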
     """ 
     """ 
     Plot the prophet forecast for the next thirty days
     Plot the prophet forecast for the next thirty days
     Print the expected weight at the end of the forecast
     Print the expected weight at the end of the forecast
     """
     """
+
     def future_plot(self, user):
     def future_plot(self, user):
         self.reset_plot()
         self.reset_plot()
-        
-        df = self.weights[self.weights['Name'] == user]
+
+        df = self.weights[self.weights["Name"] == user]
         dates = [date.date() for date in df.index]
         dates = [date.date() for date in df.index]
-        df['ds'] = dates
-        df['y'] = df['Entry']
-        
+        df["ds"] = dates
+        df["y"] = df["Entry"]
+
         df.sort_index(inplace=True)
         df.sort_index(inplace=True)
 
 
         # Prophet model
         # Prophet model
         model = self.prophet_model()
         model = self.prophet_model()
         model.fit(df)
         model.fit(df)
-        
+
         # Future dataframe for predictions
         # Future dataframe for predictions
-        future = model.make_future_dataframe(periods=30, freq='D')
+        future = model.make_future_dataframe(periods=30, freq="D")
         future = model.predict(future)
         future = model.predict(future)
-    
-        color = self.user_dict[user]['color']
-        
+
+        color = self.user_dict[user]["color"]
+
         # Write a message and post to slack
         # Write a message and post to slack
-        message = ('{} Your predicted weight on {} = {:.2f} lbs.'.format(
-            user, max(future['ds']).date(), future.ix[len(future) - 1, 'yhat']))
-        
-        self.slack.chat.post_message(channel="#weight_tracker", text=message, username = 'The Future', icon_emoji=":city_sunrise:")
-        
+        message = "{} Your predicted weight on {} = {:.2f} lbs.".format(
+            user, max(future["ds"]).date(), future.ix[len(future) - 1, "yhat"]
+        )
+
+        self.slack.chat.post_message(
+            channel="#weight_tracker",
+            text=message,
+            username="The Future",
+            icon_emoji=":city_sunrise:",
+        )
+
         # Create the plot and upload to slack
         # Create the plot and upload to slack
         fig, ax = plt.subplots(1, 1, figsize=(10, 8))
         fig, ax = plt.subplots(1, 1, figsize=(10, 8))
-        ax.plot(df['ds'], df['y'], 'o', color = 'k', ms = 8, label = 'observations')
-        ax.plot(future['ds'], future['yhat'], '-', color = color, label = 'modeled')
-        ax.fill_between(future['ds'].dt.to_pydatetime(), future['yhat_upper'], future['yhat_lower'], facecolor = color, 
-                alpha = 0.4, edgecolor = 'k', linewidth  = 1.8, label = 'confidence interval')
-        plt.xlabel('Date'); plt.ylabel('Weight (lbs)'); plt.title('%s 30 Day Prediction' % user)
+        ax.plot(df["ds"], df["y"], "o", color="k", ms=8, label="observations")
+        ax.plot(future["ds"], future["yhat"], "-", color=color, label="modeled")
+        ax.fill_between(
+            future["ds"].dt.to_pydatetime(),
+            future["yhat_upper"],
+            future["yhat_lower"],
+            facecolor=color,
+            alpha=0.4,
+            edgecolor="k",
+            linewidth=1.8,
+            label="confidence interval",
+        )
+        plt.xlabel("Date")
+        plt.ylabel("Weight (lbs)")
+        plt.title("%s 30 Day Prediction" % user)
         plt.legend()
         plt.legend()
-        plt.savefig('C:\\Users\\Will Koehrsen\\Documents\\Data-Analysis\\weighter\\images\\future_plot.png')
-        
-        self.slack.files.upload('C:\\Users\\Will Koehrsen\\Documents\\Data-Analysis\\weighter\\images\\future_plot.png', channels="#weight_tracker", title="%s Future Predictions" % user)
-        
-        os.remove('C:\\Users\\Will Koehrsen\\Documents\\Data-Analysis\\weighter\\images\\future_plot.png')
-        
+        plt.savefig(
+            "C:\\Users\\Will Koehrsen\\Documents\\Data-Analysis\\weighter\\images\\future_plot.png"
+        )
+
+        self.slack.files.upload(
+            "C:\\Users\\Will Koehrsen\\Documents\\Data-Analysis\\weighter\\images\\future_plot.png",
+            channels="#weight_tracker",
+            title="%s Future Predictions" % user,
+        )
+
+        os.remove(
+            "C:\\Users\\Will Koehrsen\\Documents\\Data-Analysis\\weighter\\images\\future_plot.png"
+        )
+
     """ 
     """ 
     Analyze user trends and provide recommendations. 
     Analyze user trends and provide recommendations. 
     Determine if the user is on track to meet their goal.
     Determine if the user is on track to meet their goal.
     """
     """
-    
+
     def analyze(self, user):
-        
+
         self.reset_plot()
-        
+
         # Get user info and sort dataframe by date
         info = self.user_dict.get(user)
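+        # user_dict entries carry the per-user stats used below: goal_weight, recent, abs_change, objective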
-        goal_weight = info['goal_weight']
-        df = self.weights[self.weights['Name'] == user]
+        goal_weight = info["goal_weight"]
+        df = self.weights[self.weights["Name"] == user]
         df = df.sort_index()
-        df['ds'] = [date.date() for date in df.index]
-        df['y'] = df['Entry']
-        
+        df["ds"] = [date.date() for date in df.index]
+        df["y"] = df["Entry"]
+
         model = self.prophet_model()
         model.fit(df)
-        
+
         prediction_days = 2 * len(df)
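+        # Forecast horizon: one day per recorded entry, twice over (freq="D" below)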
-        
-        future = model.make_future_dataframe(periods = prediction_days, freq = 'D')
+
+        future = model.make_future_dataframe(periods=prediction_days, freq="D")
         future = model.predict(future)
-        
-        # lbs change per day 
-        change_per_day = info['abs_change'] / (max(df['ds']) - min(df['ds'])).days
-        
-        days_to_goal = abs(int((info['recent'] - goal_weight) / change_per_day))
-        date_for_goal = max(df['ds']) + pd.DateOffset(days=days_to_goal)
-        
+
+        # lbs change per day
+        change_per_day = info["abs_change"] / (max(df["ds"]) - min(df["ds"])).days
+
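+        # Linear extrapolation: days to goal = remaining weight change / average daily change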
+        days_to_goal = abs(int((info["recent"] - goal_weight) / change_per_day))
+        date_for_goal = max(df["ds"]) + pd.DateOffset(days=days_to_goal)
+
         # Future dataframe rows where the predicted weight is below the goal
-        goal_future = future[future['yhat'] < goal_weight]
-        
+        goal_future = future[future["yhat"] < goal_weight]
+
         # The additive model predicts the user will meet their goal
         if len(goal_future) > 0:
-            model_goal_date = min(goal_future['ds'])
-            message = ("\n{} Your average weight change per day is {:.2f} lbs\n"
-                       "Extrapolating the average loss per day, you will reach your goal of {} lbs in {} days on {}.\n\n"
-                       "The additive model predicts you will reach your goal on {}\n".format(
-                       user, change_per_day, goal_weight, days_to_goal, date_for_goal.date(), model_goal_date.date()))
-        
+            model_goal_date = min(goal_future["ds"])
+            message = (
+                "\n{} Your average weight change per day is {:.2f} lbs\n"
+                "Extrapolating the average loss per day, you will reach your goal of {} lbs in {} days on {}.\n\n"
+                "The additive model predicts you will reach your goal on {}\n".format(
+                    user,
+                    change_per_day,
+                    goal_weight,
+                    days_to_goal,
+                    date_for_goal.date(),
+                    model_goal_date.date(),
+                )
+            )
+
         # The additive model does not predict the user will meet their goal
         else:
-            final_future_date = max(future['ds'])
-            message = ("\n{} Your average weight change per day is {:.2f} lbs\n\n"
-                       "Extrapolating the average loss per day, you will reach your goal of {} lbs in {} days on {}.\n\n"
-                       "The additive model does not forecast you reaching your goal by {}.\n".format(
-                           user, change_per_day, goal_weight, days_to_goal, date_for_goal.date(), final_future_date))
-        
-        
-        
-        self.slack.chat.post_message(channel="#weight_tracker", text=message, username="Analysis", icon_emoji=":bar_chart:")
+            final_future_date = max(future["ds"])
+            message = (
+                "\n{} Your average weight change per day is {:.2f} lbs\n\n"
+                "Extrapolating the average loss per day, you will reach your goal of {} lbs in {} days on {}.\n\n"
+                "The additive model does not forecast you reaching your goal by {}.\n".format(
+                    user,
+                    change_per_day,
+                    goal_weight,
+                    days_to_goal,
+                    date_for_goal.date(),
+                    final_future_date,
+                )
+            )
+
+        self.slack.chat.post_message(
+            channel="#weight_tracker",
+            text=message,
+            username="Analysis",
+            icon_emoji=":bar_chart:",
+        )
 
         # Identify Weekly Trends
-        future['weekday'] = [date.weekday() for date in future['ds']]
-        future_weekly = future.groupby('weekday').mean()
-        future_weekly.index = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']
-        
+        future["weekday"] = [date.weekday() for date in future["ds"]]
+        future_weekly = future.groupby("weekday").mean()
+        future_weekly.index = ["Mon", "Tues", "Wed", "Thurs", "Fri", "Sat", "Sun"]
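+        # Averaging by weekday isolates Prophet's "weekly" seasonality component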
+
         # Color labels based on the user's objective
-        colors = ['red' if ( ((weight > 0) & (info['objective'] == 'lose')) | ((weight < 0) & (info['objective'] == 'gain'))) else 'green' for weight in future_weekly['weekly']]
+        colors = [
+            "red"
+            if (
+                ((weight > 0) & (info["objective"] == "lose"))
+                | ((weight < 0) & (info["objective"] == "gain"))
+            )
+            else "green"
+            for weight in future_weekly["weekly"]
+        ]
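+        # Red flags weekdays trending against the user's objective; green marks days on track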
 
         self.reset_plot()
-        
+
         # Create a bar plot with labels for positive and negative changes
         plt.figure(figsize=(10, 8))
         xvalues = list(range(len(future_weekly)))
-        plt.bar(xvalues, future_weekly['weekly'], color = colors, edgecolor = 'k', linewidth = 2)
+        plt.bar(
+            xvalues, future_weekly["weekly"], color=colors, edgecolor="k", linewidth=2
+        )
         plt.xticks(xvalues, list(future_weekly.index))
-        red_patch = mpatches.Patch(color='red',  linewidth = 2, label='Needs Work')
-        green_patch = mpatches.Patch(color='green', linewidth = 2, label='Solid')
+        red_patch = mpatches.Patch(color="red", linewidth=2, label="Needs Work")
+        green_patch = mpatches.Patch(color="green", linewidth=2, label="Solid")
         plt.legend(handles=[red_patch, green_patch])
-        plt.xlabel('Day of Week')
-        plt.ylabel('Trend (lbs)')
-        plt.title('%s Weekly Trends' % user)
-        plt.savefig('C:\\Users\\Will Koehrsen\\Documents\\Data-Analysis\\weighter\\images\\weekly_plot.png')
-        
-        # Upload the image to slack and delete local file
-        self.slack.files.upload('C:\\Users\\Will Koehrsen\\Documents\\Data-Analysis\\weighter\\images\\weekly_plot.png', channels = '#weight_tracker', title="%s Weekly Trends" % user)
-
-        os.remove('C:\\Users\\Will Koehrsen\\Documents\\Data-Analysis\\weighter\\images\\weekly_plot.png')
+        plt.xlabel("Day of Week")
+        plt.ylabel("Trend (lbs)")
+        plt.title("%s Weekly Trends" % user)
+        plt.savefig(
+            "C:\\Users\\Will Koehrsen\\Documents\\Data-Analysis\\weighter\\images\\weekly_plot.png"
+        )
 
-        
+        # Upload the image to slack and delete local file
+        self.slack.files.upload(
+            "C:\\Users\\Will Koehrsen\\Documents\\Data-Analysis\\weighter\\images\\weekly_plot.png",
+            channels="#weight_tracker",
+            title="%s Weekly Trends" % user,
+        )
+
+        os.remove(
+            "C:\\Users\\Will Koehrsen\\Documents\\Data-Analysis\\weighter\\images\\weekly_plot.png"
+        )