# pylint: disable=line-too-long
'''
scraper.py contains various methods for scraping
HTML content and packaging into dictionaries.
'''
import requests
from bs4 import BeautifulSoup
  8. def fetchHTML(url):
  9. '''
  10. Returns HTML retrived from a url.
  11. Parameters:
  12. url: String
  13. - The URL to fetch HTML from
  14. Returns:
  15. html: String
  16. - The HTML from a given URL
  17. '''
  18. req = requests.get(url)
  19. html = req.text
  20. return html
  21. def extractTag(content, tag, className=None):
  22. '''
  23. Returns data embed within a tag, along
  24. with an optional class for filtering.
  25. Parameters:
  26. content: String
  27. - The HTML to parse
  28. tag: String
  29. - The HTML tag to scan for
  30. class: String
  31. - Optional filter for tag
  32. Returns:
  33. filteredData: List
  34. - Content embed within searched tags
  35. '''
  36. soup = BeautifulSoup(content)
  37. data = soup.findAll(tag, {'class': className})
  38. filteredData = []
  39. for datum in data:
  40. filteredData.append(datum.text)
  41. return filteredData
  42. def steamDiscounts():
  43. '''Returns discounts from steam.com'''
  44. req = requests.get('http://store.steampowered.com/search/?specials=1#sort_by=_ASC&sort_order=ASC&specials=1&page=1')
  45. content = req.text
  46. soup = BeautifulSoup(content)
  47. allData = {id: {} for id in range(0, 25)}
  48. # Get all divs of a specific class
  49. releaseDate = soup.findAll('div', {'class': 'col search_released'})
  50. # Get all release dates
  51. releaseDates = []
  52. id = 0
  53. for date in releaseDate:
  54. allData[id]['releaseDates'] = date.text
  55. releaseDates.append(date.text)
  56. id += 1
  57. #print releaseDates
  58. id = 0
  59. gameName = soup.findAll('div', {'class': 'col search_name ellipsis'})
  60. # Get all game names
  61. gameNames = []
  62. for name in gameName:
  63. span = name.findAll('span', {'class': 'title'})
  64. for tag in span:
  65. allData[id]['name'] = tag.text
  66. gameNames.append(tag.text)
  67. id += 1
  68. # print gameNames
  69. discount = soup.findAll('div', {'class': 'col search_discount'})
  70. id = 0
  71. # Get all game discounts
  72. gameDiscounts = []
  73. for discountedGame in discount:
  74. span = discountedGame.findAll('span')
  75. for tag in span:
  76. allData[id]['discount'] = tag.text
  77. gameDiscounts.append(tag.text)
  78. id += 1
  79. # print gameDiscounts
  80. price = soup.findAll('div', {'class': 'col search_price discounted'})
  81. id = 0
  82. prices = []
  83. # Get all discounted prices
  84. for value in price:
  85. br = value.findAll('br')
  86. for tag in br:
  87. allData[id]['price'] = tag.text.strip('\t')
  88. prices.append(tag.text.strip('\t'))
  89. id += 1
  90. # Cleanup data
  91. newData = []
  92. for datum in allData:
  93. newData.append(allData[datum])
  94. # print prices
  95. return newData