123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214 |
- # Data science imports
- from multiprocessing import Pool
- import requests
- import re
- from bs4 import BeautifulSoup
- from itertools import chain
- from collections import Counter
- from timeit import default_timer as timer
- import pandas as pd
- from datetime import datetime
- import pytz
def get_table_rows(fname='stats.html'):
    """
    Extract the table rows from the statistics
    :param fname: string name of the file stored in `data` directory
    :return table_rows: list of BeautifulSoup objects to be passed to `process_in_parallel`
    """
    # Use a context manager so the file handle is closed once parsing is done
    with open(f'data/{fname}', 'r') as stats_file:
        soup = BeautifulSoup(stats_file, features='lxml')
    # Each post is one row carrying exactly this class string
    table_rows = soup.find_all(
        attrs={'class': "sortableTable-row js-statsTableRow"})
    print(f'Found {len(table_rows)} entries in table.')
    return table_rows
def convert_timestamp(ts: int, tz: str) -> pd.Timestamp:
    """
    Convert a unix timestamp to a date timestamp
    :param ts: milliseconds since the unix epoch; a numeric string (such as
        a value read from an HTML attribute) is accepted and converted first
    :param tz: string name of the timezone the result is expressed in
    :return: timezone-naive timestamp holding the wall-clock time in `tz`
    """
    # Combining `unit` with string input is deprecated in pandas, so make
    # string timestamps numeric before handing them to `to_datetime`
    if isinstance(ts, str):
        ts = pd.to_numeric(ts)
    utc_time = pd.to_datetime(ts, origin='unix', unit='ms').tz_localize('UTC')
    # Shift to the requested zone, then drop the tz info to keep the result naive
    return utc_time.tz_convert(tz).tz_localize(None)
def process_entry(entry, parallel=True, tz='America/Chicago'):
    """
    Extract data from one entry in table and from the article it links to
    :param entry: BeautifulSoup tag for the table row, or the row's HTML as a
        string when `parallel` is True
    :param parallel: Boolean for whether function is being run in parallel
    :param tz: string representing timezone for started and published time
    :return entry_dict: dictionary with data about entry
    """
    # Rows arrive as strings when running in parallel; rebuild the <tr> tag
    if parallel:
        entry = BeautifulSoup(entry, features='lxml').body.tr

    # Words (apostrophes included) and common punctuation each count as a token
    token_pattern = re.compile(r"[\w']+|[.,!?;]")
    entry_dict = {}

    # Numeric cells of the row, in the order they appear in the table
    stat_keys = ['published_date', 'views', 'reads', 'ratio', 'fans']
    stat_cells = entry.find_all(attrs={'class': 'sortableTable-value'})
    for cell, key in zip(stat_cells, stat_keys):
        entry_dict[key] = float(cell.text) if key == 'ratio' else int(cell.text)

    # Reading time is the leading number of the element's `title` attribute
    reading_time = entry.find_all(attrs={'class': 'readingTime'})[0]
    entry_dict['read_time'] = int(reading_time.get('title').split(' ')[0])

    # Unlisted vs published
    is_unlisted = bool(entry.find_all(text=' Unlisted'))
    entry_dict['type'] = 'unlisted' if is_unlisted else 'published'

    # Publication: the text between 'In ' and 'View' in the first text cell.
    # Test for the exact separator that is split on so a bare 'In' without a
    # trailing space cannot cause an IndexError
    publication_text = entry.find_all(
        attrs={'class': 'sortableTable-text'})[0].text
    if 'In ' in publication_text:
        entry_dict['publication'] = publication_text.split('In ')[
            1].split('View')[0]
    else:
        entry_dict['publication'] = 'None'

    # Convert datetimes
    entry_dict['published_date'] = convert_timestamp(
        entry_dict['published_date'], tz=tz)
    entry_dict['started_date'] = convert_timestamp(
        entry.get('data-timestamp'), tz=tz)

    # Get the link
    link = entry.find_all(text='View story',
                          attrs={'class': 'sortableTable-link'})[0].get('href')

    # Retrieve the article and create a soup
    # NOTE(review): no timeout is set, so a stalled connection blocks this worker
    page = requests.get(link).content
    entry_soup = BeautifulSoup(page, features='lxml')

    # Get the title; pages without an <h1> get the placeholder 'response'
    heading = entry_soup.h1
    title = heading.text if heading is not None else 'response'
    title_word_count = len(token_pattern.findall(title))

    # Main text entries
    text_parts = [tag.text for tag in entry_soup.find_all(
        ['h1', 'h2', 'h3', 'p', 'blockquote'])]
    # Make sure to catch everything: list items are matched by the full class
    # string, which depends on the kind of element that precedes them
    for preceding in ('li', 'p', 'blockquote', 'pullquote'):
        text_parts.extend(item.text for item in entry_soup.find_all(
            attrs={'class': f'graf graf--li graf-after--{preceding}'}))
    entry_text = ' '.join(text_parts)

    # Word count
    word_count = len(token_pattern.findall(entry_text))

    # Number of claps, written either as '123 claps' or as '1.2K claps' / '3K claps'.
    # The decimal point is escaped so only a real '.' is accepted there
    clap_pattern = re.compile(
        r'^[0-9]+ claps|^[0-9]+\.[0-9]+K claps|^[0-9]+K claps')
    claps = entry_soup.find_all(text=clap_pattern)
    if not claps:
        clap_number = 0
    elif 'K' in claps[0]:
        # Round before converting so float error cannot truncate the count
        clap_number = int(round(1e3 * float(claps[0].split('K')[0])))
    else:
        clap_number = int(claps[0].split(' ')[0])

    # Post tags; an article without a tag list gets an empty list
    tag_lists = entry_soup.find_all(
        attrs={'class': 'tags tags--postTags tags--borderless'})
    tags = [li.text for li in tag_lists[0].find_all('li')] if tag_lists else []

    # Responses to entry
    responses = entry_soup.find_all(attrs={'class': 'button button--chromeless u-baseColor--buttonNormal u-marginRight12',
                                           'data-action': 'scroll-to-responses'})
    num_responses = int(responses[0].text) if responses else 0

    # Store in dictionary (insertion order becomes the dataframe column order)
    entry_dict['title'] = title
    entry_dict['title_word_count'] = title_word_count
    entry_dict['text'] = entry_text
    entry_dict['word_count'] = word_count
    entry_dict['claps'] = clap_number
    entry_dict['tags'] = tags
    entry_dict['num_responses'] = num_responses

    # Time since publication. `published_date` is naive wall-clock time in
    # `tz`, so take "now" in that same zone, not the machine's local time
    now_in_tz = pd.Timestamp.now(tz=tz).tz_localize(None)
    entry_dict['days_since_publication'] = (
        now_in_tz - entry_dict['published_date']).total_seconds() / (3600 * 24)
    return entry_dict
def process_in_parallel(table_rows, processes=20, tz='America/Chicago'):
    """
    Process all the stats in a table in parallel
    :param table_rows: BeautifulSoup table rows
    :param processes: integer number of worker processes to use in parallel
    :param tz: string timezone forwarded to `process_entry` for every row
    :return df: dataframe of information about each post; empty when
        `table_rows` is empty
    """
    from functools import partial

    # Convert to strings for multiprocessing
    table_rows_str = [str(row) for row in table_rows]
    total = len(table_rows_str)
    if total == 0:
        # Nothing to scrape; the column operations below need at least one row
        return pd.DataFrame()

    # Process each article in parallel
    worker = partial(process_entry, tz=tz)
    results = []
    # The context manager terminates the pool if a worker raises
    with Pool(processes=processes) as pool:
        start = timer()
        # Count from 1 so the last report reads 100%
        for done, result in enumerate(
                pool.imap_unordered(worker, table_rows_str), start=1):
            # Report progress
            print(f'{100 * done / total:.2f}% complete.', end='\r')
            results.append(result)
        pool.close()
        pool.join()
    end = timer()
    print(
        f'Processed {len(table_rows_str)} articles in {end-start:.2f} seconds.')

    # Convert to dataframe
    df = pd.DataFrame(results)
    # Rename ratio
    df.rename(columns={'ratio': 'read_ratio'}, inplace=True)

    # Add extra columns with more data
    df['claps_per_word'] = df['claps'] / df['word_count']
    seconds_editing = (df['published_date'] -
                       df['started_date']).dt.total_seconds()
    df['editing_days'] = (seconds_editing / (60 * 60 * 24)).astype(int)

    # Rounding
    df['published_date'] = df['published_date'].dt.round('min')
    df['started_date'] = df['started_date'].dt.round('min')
    df['read_ratio'] = df['read_ratio'].round(2)

    # 5 most common tags (might want to include more tags)
    n = 5
    tag_counts = Counter(chain.from_iterable(df['tags']))

    # Adding columns with indication of tag: 1 when the post has it, else 0
    for tag, _count in tag_counts.most_common(n):
        df[f'<tag>{tag}'] = [
            1 if tag in post_tags else 0 for post_tags in df['tags']]

    df.sort_values('published_date', inplace=True)
    return df
def get_data(fname='stats.html', processes=20):
    """
    Read the saved stats page and process every row found in it
    :param fname: string file name handed to `get_table_rows`
    :param processes: integer number of processes handed to `process_in_parallel`
    :return df: dataframe produced by `process_in_parallel`
    """
    return process_in_parallel(get_table_rows(fname=fname),
                               processes=processes)
|