radu
/
Data-Analysis-Jupyter
mirror of https://github.com/WillKoehrsen/Data-Analysis.git


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348
							# Data science imports
import pandas as pd
import numpy as np

from scipy import stats

# Options for pandas
pd.options.display.max_columns = 20

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Interactive plotting
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot
import cufflinks
cufflinks.go_offline()

from timeit import default_timer as timer

from collections import Counter, defaultdict
from itertools import chain

from bs4 import BeautifulSoup
import re

import requests
from multiprocessing import Pool

def get_links(soup):
    """
    Retrieve all links to entries on webpage

    Also prints the number of title elements found and the summed read time
    of every "N min read" string on the page.

    :param soup: BeautifulSoup of HTML for page
    :return entry_links: list of links to entries; title elements that have
        no anchor, or whose anchor has no href, are skipped
    
    """    
    titles = soup.find_all(attrs = {'class': 'bq y br af bs ag db dc dd c de df dg'})

    # The pattern is not anchored, so a matched string may carry text before
    # the number. Capture the digits instead of taking the first token.
    pattern = re.compile(r'([0-9]+) min read')
    total_read_time = 0
    for read_time in soup.find_all(text = pattern):
        match = pattern.search(read_time)
        if match is not None:
            total_read_time += int(match.group(1))
    
    print(f'Found {len(titles)} entries.')
    print(f'Total Read Time of Entries: {total_read_time} minutes.')

    entry_links = []
    for title in titles:
        anchor = title.a
        if anchor is None:
            # Title element without a link: nothing to follow
            continue
        href = anchor.get_attribute_list('href')[0]
        if href is not None:
            entry_links.append(href)
    
    return entry_links

def process_entry(link):
    """
    Retrieve data of single entry.

    Downloads the page at `link`, parses it with BeautifulSoup and extracts
    the fields listed below.
    
    :param link: string for link to entry
    
    :return entry_dict: dictionary of data about entry with keys
        'title', 'text', 'word_count', 'read_time', 'claps',
        'time_published' and 'tags'

    :raises IndexError: if the page has no <time> element or no element
        with class 'readingTime'
    """
     
    # Retrieve the article and create a soup
    entry = requests.get(link).content
    entry_soup = BeautifulSoup(entry, features="lxml")
    
    # Publication time: parsed as UTC, then converted to US Eastern
    t = entry_soup.find_all('time')[0]
    t = pd.to_datetime(t.get('datetime'), utc=True).tz_convert('America/New_York')

    # Find the title header (determines if an article or a response)
    if entry_soup.h1 is not None:
        title = entry_soup.h1.text
    else:
        title = f'response-{t}'

    # Text as single long string
    entry_text = ' '.join(p.text for p in entry_soup.find_all('p'))

    # Word count. split() with no argument collapses runs of whitespace, so
    # empty text counts as 0 words and double spaces do not add phantom words.
    word_count = len(entry_text.split())

    # Reading time in minutes (first token of the element's title attribute)
    read_time = entry_soup.find_all(attrs={'class': 'readingTime'})
    read_mins = int(read_time[0].get('title').split(' ')[0])

    # Number of claps: "123 claps", "1.2K claps" or "12K claps".
    # The decimal point is escaped so it only matches a literal '.'.
    clap_pattern = re.compile(r'^[0-9]+ claps|^[0-9]+\.[0-9]+K claps|^[0-9]+K claps')
    claps = entry_soup.find_all(text = clap_pattern)

    if len(claps) > 0:
        if 'K' in claps[0]:
            # round() before int() so float error cannot truncate a thousand short
            clap_number = int(round(1e3 * float(claps[0].split('K')[0])))
        else:
            clap_number = int(claps[0].split(' ')[0])
    else:
        clap_number = 0

    # Post tags; an entry without a tag list gets an empty list
    tag_lists = entry_soup.find_all(attrs={'class': 'tags tags--postTags tags--borderless'})
    if tag_lists:
        tags = [li.text for li in tag_lists[0].find_all('li')]
    else:
        tags = []
        
    # Collect everything in a single dictionary
    entry_dict = {
        'title': title,
        'text': entry_text,
        'word_count': word_count,
        'read_time': read_mins,
        'claps': clap_number,
        'time_published': t,
        'tags': tags,
    }
    
    return entry_dict
    
def process_in_parallel(links, processes=20):
    """
    Process entries in parallel
    
    :param links: list of entry links
    :param processes: integer number of worker processes to use in parallel
    
    :return df: dataframe with one row per entry. On top of the fields
        returned by process_entry it has 'response' ('response' or 'article'),
        'claps_per_word', 'words_per_minute' and one 0/1 '<tag>NAME' column
        for each of the 10 most common tags. An empty dataframe is returned
        when `links` is empty.
    """
    total = len(links)
    if total == 0:
        # Nothing to fetch: skip starting the worker pool entirely
        print('Processed 0 entries in 0 seconds.')
        return pd.DataFrame()

    results = []

    start = timer()
    # The context manager terminates the workers if an entry raises, so a
    # failed run does not leave processes behind.
    with Pool(processes=processes) as pool:
        for done, result in enumerate(pool.imap_unordered(process_entry, links), start=1):
            results.append(result)
            if done % 5 == 0:
                print(f'{100 * done / total:.2f}% complete.', end='\r')

        pool.close()
        pool.join()
    end = timer()
    
    print(f'Processed {len(results)} entries in {end-start:.0f} seconds.')
    
    # Add extra columns with more data
    df = pd.DataFrame(results)

    # Responses are the entries titled 'response-<timestamp>'. Matching the
    # prefix keeps articles that merely mention "response" labelled as articles.
    df['response'] = [
        'response' if isinstance(title, str) and title.startswith('response-') else 'article'
        for title in df['title']
    ]
    df['claps_per_word'] = df['claps'] / df['word_count']
    df['words_per_minute'] = df['word_count'] / df['read_time']
    
    # Add 10 most common tags with flag if data has it
    n = 10
    tag_counts = Counter(chain.from_iterable(df['tags']))

    for tag, _ in tag_counts.most_common(n):
        df[f'<tag>{tag}'] = [1 if tag in entry_tags else 0 for entry_tags in df['tags']]
        
    return df

def make_update_menu(base_title, article_annotations=None, response_annotations=None):
    """
    Make an updatemenu for interactive plot

    The menu has three buttons ('both', 'articles', 'responses'). Each one
    sets the trace visibility, the plot title and the annotations to show.
    
    :param base_title: string for title of plot
    :param article_annotations: annotation dict for the articles, or None
    :param response_annotations: annotation dict for the responses, or None
    
    :return updatemenus: a updatemenus object for adding to a layout
    """
    def make_button(label, visible, title, *annotations):
        # Leave out annotations that were not supplied rather than passing None
        shown = [annotation for annotation in annotations if annotation is not None]
        return dict(
            label=label,
            method='update',
            args=[dict(visible=visible), dict(title=title, annotations=shown)])

    updatemenus = [
        dict(buttons=[
            make_button('both', [True, True], base_title,
                        article_annotations, response_annotations),
            make_button('articles', [True, False], 'Article ' + base_title,
                        article_annotations),
            make_button('responses', [False, True], 'Response ' + base_title,
                        response_annotations),
        ])
    ]
    return updatemenus


def _linear_fit(df, x, y, eq_pos, color, trace_name, font_color=None):
    """
    Fit a least-squares line of df[y] on df[x] and build its plot elements.

    :param df: dataframe holding the observations
    :param x: string for the column used as the independent variable
    :param y: string for the column used as the dependent variable
    :param eq_pos: (x, y) fractions of the maximum values at which to place the equation
    :param color: color of the fitted line
    :param trace_name: legend name of the fitted line
    :param font_color: color of the equation text, or None for the default

    :return: (trace, annotation) tuple, or None when df has fewer than two
        rows or a single x value, in which case no line can be fitted
    """
    if len(df) < 2:
        return None

    x_min, x_max = df[x].min(), df[x].max()
    if x_min == x_max:
        return None

    regression = stats.linregress(x=df[x], y=df[y])
    slope = regression.slope
    intercept = regression.intercept
    # rvalue is the correlation coefficient; square it to report R^2
    r_squared = regression.rvalue ** 2

    # Sample the line over the observed range. This is never empty, unlike an
    # integer range when the x values span less than one unit.
    xi = np.linspace(x_min, x_max, num=100)
    trace = go.Scatter(
        x=xi,
        y=xi * slope + intercept,
        mode='lines',
        marker=dict(color=color),
        line=dict(width=4, dash='longdash'),
        name=trace_name)

    font = dict(size=16)
    if font_color is not None:
        font['color'] = font_color

    annotation = dict(
        x=x_max * eq_pos[0],
        y=df[y].max() * eq_pos[1],
        showarrow=False,
        text=f'$R^2 = {r_squared:.2f}; Y = {slope:.2f}X + {intercept:.2f}$',
        font=font)

    return trace, annotation


def make_iplot(data, x, y, base_title, time=False, eq_pos=(0.75, 0.25)):
    """
    Make an interactive plot. Adds a dropdown to separate articles from responses
    if there are responses in the data. Unless the xaxis is a time axis, adds a
    linear regression line and its equation for each group that is plotted.
    
    :param data: dataframe of entry data; must have 'response' and 'title'
        columns in addition to the `x` and `y` columns
    :param x: string for xaxis of plot
    :param y: string for yaxis of plot
    :param base_title: string for title of plot
    :param time: boolean for whether the xaxis is a time axis; if True a
        rangeselector and rangeslider are added and no regression is fitted
    :param eq_pos: position of equation for linear regression
    
    :return figure: an interactive plotly object for display
    
    """

    # Extract the relevant data
    responses = data[data['response'] == 'response'].copy()
    articles = data[data['response'] == 'article'].copy()

    # Layout settings shared by every variant of the plot
    layout_args = dict(
        height=600,
        width=900,
        title=base_title,
        xaxis=dict(
            title=x.title(),
            tickfont=dict(size=14),
            titlefont=dict(size=16)),
        yaxis=dict(
            title=y.title(),
            tickfont=dict(size=14),
            titlefont=dict(size=16)))

    if not responses.empty:
        # Create scatterplot data, articles must be first for menu selection
        plot_data = [
            go.Scatter(
                x=articles[x],
                y=articles[y],
                mode='markers',
                name='articles',
                text=articles['title'],
                marker=dict(color='blue', size=12)),
            go.Scatter(
                x=responses[x],
                y=responses[y],
                mode='markers',
                name='responses',
                marker=dict(color='green', size=12))
        ]

        # Defined up front so the layout below also works for a time axis,
        # where no regression is fitted
        annotations = {}
        if not time:
            for df, name, color in [(articles, 'articles', 'blue'),
                                    (responses, 'responses', 'green')]:
                fit = _linear_fit(df, x, y, eq_pos, color,
                                  f'{name} linear fit', font_color=color)
                if fit is None:
                    # Empty placeholder keeps the fit lines in the same
                    # article/response order as the scatter traces
                    plot_data.append(go.Scatter(
                        x=[], y=[], mode='lines',
                        name=f'{name} linear fit', showlegend=False))
                    continue

                trace, annotations[name] = fit
                plot_data.append(trace)

        # Make a layout with update menus
        layout_args['annotations'] = list(annotations.values())
        layout_args['updatemenus'] = make_update_menu(
            base_title, annotations.get('articles'), annotations.get('responses'))

    # If there are only articles
    else:
        plot_data = [
            go.Scatter(
                x=data[x],
                y=data[y],
                mode='markers',
                name = 'observations',
                text=data['title'],
                marker=dict(color='blue', size=12))
        ]

        if not time:
            fit = _linear_fit(data, x, y, eq_pos, 'red', 'linear fit')
            if fit is not None:
                trace, annotation = fit
                plot_data.append(trace)
                layout_args['annotations'] = [annotation]

    # Add a rangeselector and rangeslider for a date xaxis
    if time:
        layout_args['xaxis']['rangeselector'] = dict(
            buttons=[
                dict(count=1, label='1m', step='month', stepmode='backward'),
                dict(count=6, label='6m', step='month', stepmode='backward'),
                dict(count=1, label='YTD', step='year', stepmode='todate'),
                dict(count=1, label='1y', step='year', stepmode='backward'),
                dict(step='all')
            ])
        layout_args['xaxis']['rangeslider'] = dict(visible=True)

    # Return the figure
    layout = go.Layout(**layout_args)
    figure = go.Figure(data=plot_data, layout=layout)

    return figure