from multiprocessing import Pool
import re
from collections import Counter
from datetime import datetime
from itertools import chain
from timeit import default_timer as timer

import pandas as pd
import requests
from bs4 import BeautifulSoup


def get_table_rows(fname='stats.html'):
    """
    Extract the table rows from the statistics page.

    :param fname: string name of the file stored in the `data` directory

    :return table_rows: list of BeautifulSoup objects to be passed to
                        `process_in_parallel`
    """
    soup = BeautifulSoup(
        open(f'data/{fname}', 'r', encoding='utf8'), features='lxml')
    table_rows = soup.find_all(
        attrs={'class': 'sortableTable-row js-statsTableRow'})
    print(f'Found {len(table_rows)} entries in table.')
    return table_rows


def convert_timestamp(ts: int, tz: str):
    """Convert a millisecond Unix timestamp to a naive datetime in the timezone `tz`."""
    return pd.to_datetime(ts, origin='unix', unit='ms').tz_localize(
        'UTC').tz_convert(tz).tz_localize(None)


def process_entry(entry, parallel=True, tz='America/Chicago'):
    """
    Extract data from one entry in the table.

    :param entry: BeautifulSoup tag (or its string representation when run in parallel)
    :param parallel: Boolean for whether the function is being run in parallel
    :param tz: string representing the timezone for started and published times

    :return entry_dict: dictionary with data about the entry
    """
    # Rows are passed as strings when running in parallel, so re-parse
    if parallel:
        entry = BeautifulSoup(entry, features='lxml').body.tr

    entry_dict = {}
    # Extract the statistics from the table cells; `ratio` is the only float
    for value, key in zip(
            entry.find_all(attrs={'class': 'sortableTable-value'}),
            ['published_date', 'views', 'reads', 'ratio', 'fans']):
        entry_dict[key] = float(value.text) if key == 'ratio' else int(value.text)

    entry_dict['read_time'] = int(entry.find_all(
        attrs={'class': 'readingTime'})[0].get('title').split(' ')[0])

    # Unlisted vs published
    entry_dict['type'] = 'unlisted' if len(
        entry.find_all(text=' Unlisted')) > 0 else 'published'

    # Publication
    publication = entry.find_all(attrs={'class': 'sortableTable-text'})
    if 'In' in publication[0].text:
        entry_dict['publication'] = publication[0].text.split(
            'In ')[1].split('View')[0]
    else:
        entry_dict['publication'] = 'None'

    # Convert datetimes
    entry_dict['published_date'] = convert_timestamp(
        entry_dict['published_date'], tz=tz)
    entry_dict['started_date'] = convert_timestamp(
        entry.get('data-timestamp'), tz=tz)

    # Get the link to the article
    link = entry.find_all(text='View story',
                          attrs={'class': 'sortableTable-link'})[0].get('href')
    entry_dict['link'] = link

    # Retrieve the article itself and create a soup
    entry = requests.get(link).content
    entry_soup = BeautifulSoup(entry, features='lxml')

    # Get the title; articles with no <h1> are responses
    try:
        title = entry_soup.h1.text
    except AttributeError:
        title = 'response'

    title_word_count = len(re.findall(r"[\w']+|[.,!?;]", title))

    # Main text elements
    entry_text = [p.text for p in entry_soup.find_all(
        ['h1', 'h2', 'h3', 'p', 'blockquote'])]

    # Make sure to catch list items, which are not in the main tags
    for after in ('li', 'p', 'blockquote', 'pullquote'):
        entry_text.extend(s.text for s in entry_soup.find_all(
            attrs={'class': f'graf graf--li graf-after--{after}'}))

    entry_text = ' '.join(entry_text)

    # Word count
    word_count = len(re.findall(r"[\w']+|[.,!?;]", entry_text))

    # Number of claps, e.g. "105 claps" or "3.5K claps"
    clap_pattern = re.compile(
        r'^[0-9]{1,} claps|^[0-9]{1,}\.[0-9]{1,}K claps|^[0-9]{1,}K claps')
    claps = entry_soup.find_all(text=clap_pattern)

    if len(claps) > 0:
        if 'K' in claps[0]:
            clap_number = int(1e3 * float(claps[0].split('K')[0]))
        else:
            clap_number = int(claps[0].split(' ')[0])
    else:
        clap_number = 0

    # Post tags
    tags = entry_soup.find_all(
        attrs={'class': 'tags tags--postTags tags--borderless'})
    tags = [li.text for li in tags[0].find_all('li')]

    # Responses to the entry
    responses = entry_soup.find_all(
        attrs={'class': ('button button--chromeless u-baseColor--buttonNormal '
                         'u-marginRight12'),
               'data-action': 'scroll-to-responses'})
    num_responses = int(responses[0].text) if len(responses) > 0 else 0

    # Store in dictionary
    entry_dict['title'] = title
    entry_dict['title_word_count'] = title_word_count
    entry_dict['text'] = entry_text
    entry_dict['word_count'] = word_count
    entry_dict['claps'] = clap_number
    entry_dict['tags'] = tags
    entry_dict['num_responses'] = num_responses

    # Time since publication, in days
    entry_dict['days_since_publication'] = (
        datetime.now() - entry_dict['published_date']
    ).total_seconds() / (3600 * 24)

    return entry_dict


def process_in_parallel(table_rows, processes=20):
    """
    Process all the entries in a stats table in parallel.

    :note: make sure to set the correct time zone in `process_entry`
    :note: running on Mac may first require setting
           `export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES`
           from the command line to enable parallel processing

    :param table_rows: BeautifulSoup table rows
    :param processes: integer number of processes to use in parallel

    :return df: dataframe of information about each post
    """
    # Convert rows to strings so they can be pickled for multiprocessing
    table_rows_str = [str(r) for r in table_rows]

    # Process each article in parallel
    pool = Pool(processes=processes)
    results = []
    start = timer()
    for i, r in enumerate(pool.imap_unordered(process_entry, table_rows_str)):
        # Report progress
        print(f'{100 * i / len(table_rows_str):.2f}% complete.', end='\r')
        results.append(r)
    pool.close()
    pool.join()
    end = timer()
    print(f'Processed {len(table_rows_str)} articles in {end - start:.2f} seconds.')

    # Convert to dataframe
    df = pd.DataFrame(results)
    # Rename ratio
    df.rename(columns={'ratio': 'read_ratio'}, inplace=True)

    # Add extra columns with more data
    df['claps_per_word'] = df['claps'] / df['word_count']
    df['editing_days'] = ((df['published_date'] - df['started_date']
                           ).dt.total_seconds() / (60 * 60 * 24)).astype(int)

    # Rounding
    df['published_date'] = df['published_date'].dt.round('min')
    df['started_date'] = df['started_date'].dt.round('min')
    df['read_ratio'] = df['read_ratio'].round(2)

    # 5 most common tags (might want to include more tags)
    n = 5
    all_tags = list(chain(*df['tags'].tolist()))
    tag_counts = Counter(all_tags)
    top_tags = tag_counts.most_common(n)

    # Add a binary indicator column for each of the most common tags
    for tag, _ in top_tags:
        df.loc[:, tag] = [1 if tag in post_tags else 0 for post_tags in df['tags']]

    df.sort_values('published_date', inplace=True)
    return df


def get_data(fname='stats.html', processes=20):
    """
    Retrieve Medium article statistics.

    :note: running on Mac may first require setting
           `export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES`
           from the command line to enable parallel processing

    :param fname: file name (should be 'stats.html') in the `data` directory
    :param processes: integer number of processes

    :return df: dataframe of article data
    """
    t = get_table_rows(fname=fname)
    return process_in_parallel(t, processes=processes)
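

# A minimal usage sketch, assuming the Medium stats page has already been
# saved as `data/stats.html` (the location `get_table_rows` expects). The
# `__main__` guard matters here: `process_in_parallel` spawns worker
# processes, which re-import this module, and the guard keeps the workers
# from re-running the whole script on import.
if __name__ == '__main__':
    df = get_data(fname='stats.html', processes=20)
    print(f'Retrieved {len(df)} articles.')
    print(df.head())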