il y a 6 ans · 0a739c7a8a
--- a/medium/Fitting.ipynb
+++ b/medium/Fitting.ipynb
--- a/Analysis.ipynb
+++ b/Analysis.ipynb
@@ -73,7 +73,16 @@
 
				     "import cufflinks\n",
			
 
				     "cufflinks.go_offline()\n",
			
 
				     "\n",
			
 
				-    "from timeit import default_timer as timer"
			
 
				+    "from timeit import default_timer as timer\n",
			
 
				+    "\n",
			
 
				+    "from collections import Counter, defaultdict\n",
			
 
				+    "from itertools import chain\n",
			
 
				+    "\n",
			
 
				+    "from bs4 import BeautifulSoup\n",
			
 
				+    "import re\n",
			
 
				+    "\n",
			
 
				+    "import requests\n",
			
 
				+    "from multiprocessing import Pool"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -107,7 +116,7 @@
 
				     }
			
 
				    ],
			
 
				    "source": [
			
 
				-    "from bs4 import BeautifulSoup\n",
			
 
				+    "\n",
			
 
				     "\n",
			
 
				     "soup = BeautifulSoup(open('data/published.html', 'r').read())\n",
			
 
				     "soup.text[:100]"
			
@@ -241,7 +250,6 @@
 
				     }
			
 
				    ],
			
 
				    "source": [
			
 
				-    "import re\n",
			
 
				     "pattern = re.compile('[0-9]{1,} min read')\n",
			
 
				     "pattern.findall(string = '15 min read')"
			
 
				    ]
			
@@ -673,7 +681,7 @@
 
				     }
			
 
				    ],
			
 
				    "source": [
			
 
				-    "import requests\n",
			
 
				+    "\n",
			
 
				     "\n",
			
 
				     "entries = []\n",
			
 
				     "for a in entry_links[:10]:\n",
			
@@ -888,7 +896,6 @@
 
				    },
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "from collections import defaultdict\n",
			
 
				     "entry_dict = defaultdict(dict)\n",
			
 
				     "entry_dict[title]['text'] = entry_text\n",
			
 
				     "entry_dict[title]['word_count'] = word_count\n",
			
@@ -1417,9 +1424,6 @@
 
				     }
			
 
				    ],
			
 
				    "source": [
			
 
				-    "from multiprocessing import Pool\n",
			
 
				-    "\n",
			
 
				-    "\n",
			
 
				     "pool = Pool(processes=20)\n",
			
 
				     "results = []\n",
			
 
				     "\n",
			
@@ -6631,9 +6635,6 @@
 
				     }
			
 
				    ],
			
 
				    "source": [
			
 
				-    "from collections import Counter\n",
			
 
				-    "from itertools import chain\n",
			
 
				-    "\n",
			
 
				     "all_tags = list(chain(*entry_data['tags'].tolist()))\n",
			
 
				     "tag_counts = Counter(all_tags)\n",
			
 
				     "tag_counts.most_common(10)"
			
--- a/medium/__pycache__/utils.cpython-36.pyc
+++ b/medium/__pycache__/utils.cpython-36.pyc
--- a/medium/utils.py
+++ b/medium/utils.py
@@ -0,0 +1,346 @@
 
				+# Data science imports
			
 
				+import pandas as pd
			
 
				+import numpy as np
			
 
				+
			
 
				+# Options for pandas
			
 
				+pd.options.display.max_columns = 20
			
 
				+
			
 
				+# Display all cell outputs
			
 
				+from IPython.core.interactiveshell import InteractiveShell
			
 
				+InteractiveShell.ast_node_interactivity = 'all'
			
 
				+
			
 
				+# Interactive plotting
			
 
				+import plotly.plotly as py
			
 
				+import plotly.graph_objs as go
			
 
				+from plotly.offline import iplot
			
 
				+import cufflinks
			
 
				+cufflinks.go_offline()
			
 
				+
			
 
				+from timeit import default_timer as timer
			
 
				+
			
 
				+from collections import Counter, defaultdict
			
 
				+from itertools import chain
			
 
				+
			
 
				+from bs4 import BeautifulSoup
			
 
				+import re
			
 
				+
			
 
				+import requests
			
 
				+from multiprocessing import Pool
			
 
				+
			
 
				+def get_links(soup):
			
 
				+    """
			
 
				+    Retrieve all links to entries on webpage
			
 
				+    
			
 
				+    :param soup: BeautifulSoup of HTML for page
			
 
				+    :return entry_links: list of links to entries
			
 
				+    
			
 
				+    """    
			
 
				+    titles = soup.find_all(attrs = {'class': 'bq y br af bs ag db dc dd c de df dg'})
			
 
				+    pattern = re.compile('[0-9]{1,} min read')
			
 
				+    read_times = soup.find_all(text = pattern)
			
 
				+    read_times = [int(x.split(' ')[0]) for x in read_times]
			
 
				+    total_read_time = sum(read_times)
			
 
				+    
			
 
				+    print(f'Found {len(titles)} entries.')
			
 
				+    print(f'Total Read Time of Entries: {total_read_time} minutes.')
			
 
				+    entry_links = [title.a.get_attribute_list('href')[0] for title in titles]
			
 
				+    
			
 
				+    return entry_links
			
 
				+
			
 
				+def process_entry(link):
			
 
				+    """
			
 
				+    Retrieve data of single entry.
			
 
				+    
			
 
				+    :param link: string for link to entry
			
 
				+    
			
 
				+    :return entry_dict: dictionary of data about entry
			
 
				+    """
			
 
				+    
			
 
				+    entry_dict = {}
			
 
				+     
			
 
				+    # Retrieve the article and create a soup
			
 
				+    entry = requests.get(link).content
			
 
				+    entry_soup = BeautifulSoup(entry, features="lxml")
			
 
				+    
			
 
				+    # Publication time
			
 
				+    t = entry_soup.find_all('time')[0]
			
 
				+    t = pd.to_datetime(t.get('datetime'), utc=True).tz_convert('America/New_York')
			
 
				+
			
 
				+    # Find the title header (determines if an article or a response)
			
 
				+    if entry_soup.h1 is not None:
			
 
				+        title = entry_soup.h1.text
			
 
				+    else:
			
 
				+        title = f'response-{t}'
			
 
				+
			
 
				+    # Text as single long string
			
 
				+    entry_text = [p.text for p in entry_soup.find_all('p')]
			
 
				+    entry_text = ' '.join(entry_text)
			
 
				+
			
 
				+    # Word count
			
 
				+    word_count = len(entry_text.split(' '))
			
 
				+
			
 
				+    # Reading time in minutes
			
 
				+    read_time = entry_soup.find_all(attrs={'class': 'readingTime'})
			
 
				+    read_mins = int(read_time[0].get('title').split(' ')[0])
			
 
				+
			
 
				+    # Number of claps
			
 
				+    clap_pattern = re.compile('^[0-9]{1,} claps|^[0-9]{1,}.[0-9]{1,}K claps|^[0-9]{1,}K claps')
			
 
				+    claps = entry_soup.find_all(text = clap_pattern)
			
 
				+
			
 
				+    if len(claps) > 0:
			
 
				+        if 'K' in claps[0]:
			
 
				+            clap_number = int(1e3 * float(claps[0].split('K')[0]))
			
 
				+        else:
			
 
				+            clap_number = int(claps[0].split(' ')[0])
			
 
				+    else:
			
 
				+        clap_number = 0
			
 
				+
			
 
				+    # Post tags
			
 
				+    tags = entry_soup.find_all(attrs={'class': 'tags tags--postTags tags--borderless'})
			
 
				+    tags = [li.text for li in tags[0].find_all('li')]
			
 
				+        
			
 
				+    # Store in dictionary with title as key
			
 
				+    entry_dict['title'] = title
			
 
				+    entry_dict['text'] = entry_text
			
 
				+    entry_dict['word_count'] = word_count
			
 
				+    entry_dict['read_time'] = read_mins
			
 
				+    entry_dict['claps'] = clap_number
			
 
				+    entry_dict['time_published'] = t
			
 
				+    entry_dict['tags'] = tags
			
 
				+        
			
 
				+    
			
 
				+    return entry_dict
			
 
				+    
			
 
				+def process_in_parallel(links, processes=20):
			
 
				+    """
			
 
				+    Process entries in parallel
			
 
				+    
			
 
				+    :param links: list of entry links
			
 
				+    :param processes: integer number of processes (threads) to use in parallel
			
 
				+    
			
 
				+    :return results: list of dictionaries of entry data
			
 
				+    """
			
 
				+    pool = Pool(processes=processes)
			
 
				+    results = []
			
 
				+
			
 
				+    start = timer()
			
 
				+    for i, result in enumerate(pool.imap_unordered(process_entry, links)):
			
 
				+        if (i + 1) % 5 == 0:
			
 
				+            print(f'{100 * i / len(links):.2f}% complete.', end='\r')
			
 
				+        results.append(result)
			
 
				+
			
 
				+    pool.close()
			
 
				+    pool.join()
			
 
				+    end = timer()
			
 
				+    
			
 
				+    print(f'Processed {len(results)} entries in {end-start:.0f} seconds.')
			
 
				+    
			
 
				+    # Add extra columns with more data
			
 
				+    df = pd.DataFrame.from_dict(results)
			
 
				+    df['response'] = ['response' if x == True else 'article' for x in df['title'].str.contains('response')]
			
 
				+    df['claps_per_word'] = df['claps'] / df['word_count']
			
 
				+    df['words_per_minute'] = df['word_count'] / df['read_time']
			
 
				+    
			
 
				+    # Add 10 most common tags with flag if data has it
			
 
				+    n = 10
			
 
				+    all_tags = list(chain(*df['tags'].tolist()))
			
 
				+    tag_counts = Counter(all_tags)
			
 
				+    tags = tag_counts.most_common(n)
			
 
				+
			
 
				+    for tag, count in tags:
			
 
				+        flag = [1 if tag in tags else 0 for tags in df['tags']]
			
 
				+        df.loc[:, f'<tag>{tag}'] = flag
			
 
				+        
			
 
				+    return df
			
 
				+
			
 
				+def make_update_menu(base_title, article_annotations=None, response_annotations=None):
			
 
				+    """
			
 
				+    Make an updatemenu for interative plot
			
 
				+    
			
 
				+    :param base_title: string for title of plot
			
 
				+    
			
 
				+    :return updatemenus: a updatemenus object for adding to a layout
			
 
				+    """
			
 
				+    updatemenus = list([
			
 
				+    dict(
			
 
				+        buttons=list([
			
 
				+            dict(
			
 
				+                label='both', method='update', 
			
 
				+                args=[dict(visible=[True, True]), dict(title = base_title,
			
 
				+                                                       annotations=[article_annotations,response_annotations])]),
			
 
				+            dict(
			
 
				+                label='articles',
			
 
				+                method='update',
			
 
				+                args=[dict(visible=[True, False]), dict(title = 'Article ' + base_title,
			
 
				+                                                        annotations = [article_annotations])]),
			
 
				+            dict(
			
 
				+                label='responses',
			
 
				+                method='update',
			
 
				+                args=[dict(visible=[False, True]), dict(title='Response ' + base_title,
			
 
				+                                                       annotations = [response_annotations])]),
			
 
				+        ]))
			
 
				+    ])
			
 
				+    return updatemenus
			
 
				+
			
 
				+
			
 
				+
			
 
				+def make_iplot(data, x, y, base_title, time=False, eq_pos=(0.75, 0.25)):
			
 
				+    """
			
 
				+    Make an interactive plot. Adds a dropdown to separate articles from responses
			
 
				+    if there are responses in the data. If there is only articles (or only responses)
			
 
				+    adds a linear regression line. 
			
 
				+    
			
 
				+    :param data: dataframe of entry data
			
 
				+    :param x: string for xaxis of plot
			
 
				+    :param y: sring for yaxis of plot
			
 
				+    :param base_title: string for title of plot
			
 
				+    :param time: boolean for whether the xaxis is a plot
			
 
				+    :param eq_pos: position of equation for linear regression
			
 
				+    
			
 
				+    :return figure: an interactive plotly object for display
			
 
				+    
			
 
				+    """
			
 
				+
			
 
				+    # Extract the relevant data
			
 
				+    responses = data[data['response'] == 'response'].copy()
			
 
				+    articles = data[data['response'] == 'article'].copy()
			
 
				+
			
 
				+    if not responses.empty:
			
 
				+        # Create scatterplot data, articles must be first for menu selection
			
 
				+        plot_data = [
			
 
				+            go.Scatter(
			
 
				+                x=articles[x],
			
 
				+                y=articles[y],
			
 
				+                mode='markers',
			
 
				+                name='articles',
			
 
				+                text=articles['title'],
			
 
				+                marker=dict(color='blue', size=12)),
			
 
				+            go.Scatter(
			
 
				+                x=responses[x],
			
 
				+                y=responses[y],
			
 
				+                mode='markers',
			
 
				+                name='responses',
			
 
				+                marker=dict(color='green', size=12))
			
 
				+        ]
			
 
				+        
			
 
				+        if not time:
			
 
				+            annotations = {}
			
 
				+            for df, name in zip([articles, responses], 
			
 
				+                                ['articles', 'responses']):
			
 
				+                
			
 
				+                regression = stats.linregress(x=df[x], y=df[y])
			
 
				+                slope = regression.slope
			
 
				+                intercept = regression.intercept
			
 
				+                rvalue = regression.rvalue
			
 
				+
			
 
				+                xi = np.array(range(int(df[x].min()), int(df[x].max())))
			
 
				+                
			
 
				+                line = xi*slope + intercept
			
 
				+                trace = go.Scatter(
			
 
				+                                  x=xi,
			
 
				+                                  y=line,
			
 
				+                                  mode='lines',
			
 
				+                                  marker=dict(color='blue' if name == 'articles' else 'green'), 
			
 
				+                                  line=dict(width=4, dash='longdash'),
			
 
				+                                  name=f'{name} linear fit'
			
 
				+                                  )
			
 
				+
			
 
				+                annotations[name] = dict(
			
 
				+                                  x=max(xi) * eq_pos[0],
			
 
				+                                  y=df[y].max() * eq_pos[1],
			
 
				+                                  showarrow=False,
			
 
				+                                  text=f'$R^2 = {rvalue:.2f}; Y = {slope:.2f}X + {intercept:.2f}$',
			
 
				+                          font=dict(size=16, color='blue' if name == 'articles' else 'green')
			
 
				+                          )
			
 
				+
			
 
				+                plot_data.append(trace)
			
 
				+        
			
 
				+        # Make a layout with update menus
			
 
				+        layout = go.Layout(annotations=list(annotations.values()),
			
 
				+            height=600,
			
 
				+            width=900,
			
 
				+            title=base_title,
			
 
				+            xaxis=dict(
			
 
				+                title=x.title(),
			
 
				+                tickfont=dict(size=14),
			
 
				+                titlefont=dict(size=16)),
			
 
				+            yaxis=dict(
			
 
				+                title=y.title(),
			
 
				+                tickfont=dict(size=14),
			
 
				+                titlefont=dict(size=16)),
			
 
				+            updatemenus=make_update_menu(base_title, annotations['articles'], annotations['responses']))
			
 
				+
			
 
				+    # If there are only articles
			
 
				+    else:
			
 
				+        plot_data = [
			
 
				+            go.Scatter(
			
 
				+                x=data[x],
			
 
				+                y=data[y],
			
 
				+                mode='markers',
			
 
				+                name = 'observations',
			
 
				+                text=data['title'],
			
 
				+                marker=dict(color='blue', size=12))
			
 
				+        ]
			
 
				+        
			
 
				+        regression = stats.linregress(x=data[x], y=data[y])
			
 
				+        slope = regression.slope
			
 
				+        intercept = regression.intercept
			
 
				+        rvalue = regression.rvalue
			
 
				+        
			
 
				+        xi = np.array(range(int(data[x].min()), int(data[x].max())))
			
 
				+        line = xi*slope + intercept
			
 
				+        trace = go.Scatter(
			
 
				+                          x=xi,
			
 
				+                          y=line,
			
 
				+                          mode='lines',
			
 
				+                          marker=dict(color='red'), 
			
 
				+                          line=dict(width=4, dash='longdash'),
			
 
				+                          name='linear fit'
			
 
				+                          )
			
 
				+        
			
 
				+        annotations = [dict(
			
 
				+                          x=max(xi) * eq_pos[0],
			
 
				+                          y=data[y].max() * eq_pos[1],
			
 
				+                          showarrow=False,
			
 
				+                          text=f'$R^2 = {rvalue:.2f}; Y = {slope:.2f}X + {intercept:.2f}$',
			
 
				+                  font=dict(size=16)
			
 
				+                  )]
			
 
				+        
			
 
				+        plot_data.append(trace)
			
 
				+
			
 
				+        layout = go.Layout(annotations=annotations,
			
 
				+            height=600,
			
 
				+            width=900,
			
 
				+            title=base_title,
			
 
				+            xaxis=dict(
			
 
				+                title=x.title(),
			
 
				+                tickfont=dict(size=14),
			
 
				+                titlefont=dict(size=16)),
			
 
				+            yaxis=dict(
			
 
				+                title=y.title(),
			
 
				+                tickfont=dict(size=14),
			
 
				+                titlefont=dict(size=16)))
			
 
				+
			
 
				+    # Add a rangeselector and rangeslider for a data xaxis
			
 
				+    if time:
			
 
				+        rangeselector = dict(
			
 
				+            buttons=list([
			
 
				+                dict(count=1, label='1m', step='month', stepmode='backward'),
			
 
				+                dict(count=6, label='6m', step='month', stepmode='backward'),
			
 
				+                dict(count=1, label='YTD', step='year', stepmode='todate'),
			
 
				+                dict(count=1, label='1y', step='year', stepmode='backward'),
			
 
				+                dict(step='all')
			
 
				+            ]))
			
 
				+        rangeslider = dict(visible=True)
			
 
				+        layout['xaxis']['rangeselector'] = rangeselector
			
 
				+        layout['xaxis']['rangeslider'] = rangeslider
			
 
				+        
			
 
				+        figure = go.Figure(data=plot_data, layout=layout)
			
 
				+           
			
 
				+        return figure
			
 
				+        
			
 
				+    
			
 
				+    # Return the figure
			
 
				+    figure = go.Figure(data=plot_data, layout=layout)
			
 
				+
			
 
				+    return figure