radu
/
Django-Hackathon-Starter


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120
'''
scraper.py contains various methods for scraping
HTML content and packaging it into dictionaries.
'''

import requests
import itertools 
from bs4 import BeautifulSoup

def fetchHTML(url, timeout=10):
	'''
	Returns HTML retrieved from a url.

	The response body is returned as-is; the HTTP status
	code is not inspected.

	Parameters:
		url: String
			- The URL to fetch HTML from
		timeout: Number
			- Optional number of seconds to wait for the
			  server before giving up (defaults to 10)

	Returns:
		html: String
			- The HTML from a given URL

	Raises:
		requests.exceptions.Timeout
			- If the server does not respond within `timeout` seconds
	'''
	# Without an explicit timeout, requests waits indefinitely on an
	# unresponsive server and the caller never regains control.
	req = requests.get(url, timeout=timeout)
	return req.text

def extractTag(content, tag, className=None):
	'''
	Returns the text embedded within every occurrence of a
	tag, optionally restricted to tags of a given class.

	Parameters:
		content: String
			- The HTML to parse
		tag: String
			- The HTML tag to scan for
		className: String
			- Optional class filter for the tag; when omitted,
			  every occurrence of the tag is returned

	Returns:
		filteredData: List
			- Text content embedded within the matched tags
	'''
	# Name the parser explicitly so the parse result does not depend on
	# which optional parsers happen to be installed.
	soup = BeautifulSoup(content, 'html.parser')

	# Only filter on class when one was requested: passing None as the
	# attribute value makes BeautifulSoup match tags *without* a class
	# attribute, which would silently drop every classed tag.
	if className is None:
		data = soup.findAll(tag)
	else:
		data = soup.findAll(tag, {'class': className})

	return [datum.text for datum in data]


def steamDiscounts():
	'''
	Scrapes the first page of Steam's specials search and
	packages the listings into dictionaries.

	Each column (release date, name, discount, price) is
	scraped independently and assigned to a listing slot by
	its position in the page.

	Returns:
		newData: List
			- One dictionary per listing slot, holding whichever
			  of the keys 'releaseDates', 'name', 'discount' and
			  'price' were found. At least 25 dictionaries are
			  always returned; slots without data stay empty.
	'''
	# Minimum number of slots in the result, kept so callers that
	# expect 25 entries continue to work.
	pageSize = 25

	# A timeout keeps a stalled connection from blocking the caller forever.
	req = requests.get(
		'http://store.steampowered.com/search/?specials=1#sort_by=_ASC&sort_order=ASC&specials=1&page=1',
		timeout=10)
	# Name the parser explicitly so results do not vary with the
	# optional parsers installed on the machine.
	soup = BeautifulSoup(req.text, 'html.parser')

	newData = [{} for _ in range(pageSize)]

	def store(index, key, value):
		# Grow the result rather than failing when the page lists
		# more entries than the pre-allocated slots.
		while index >= len(newData):
			newData.append({})
		newData[index][key] = value

	# NOTE(review): every column keeps its own counter, so a listing that
	# lacks one of the elements shifts the later values of that column
	# onto the wrong slot - confirm against the live markup.

	# Get all release dates
	releaseDate = soup.findAll('div', {'class': 'col search_released'})
	for index, date in enumerate(releaseDate):
		store(index, 'releaseDates', date.text)

	# Get all game names
	index = 0
	gameName = soup.findAll('div', {'class': 'col search_name ellipsis'})
	for name in gameName:
		for tag in name.findAll('span', {'class': 'title'}):
			store(index, 'name', tag.text)
			index += 1

	# Get all game discounts
	index = 0
	discount = soup.findAll('div', {'class': 'col search_discount'})
	for discountedGame in discount:
		for tag in discountedGame.findAll('span'):
			store(index, 'discount', tag.text)
			index += 1

	# Get all discounted prices
	# NOTE(review): the text is read from the <br> tag itself, which
	# presumably has no content, so 'price' is likely always an empty
	# string - verify and read the text that follows the tag if so.
	index = 0
	price = soup.findAll('div', {'class': 'col search_price discounted'})
	for value in price:
		for tag in value.findAll('br'):
			store(index, 'price', tag.text.strip('\t'))
			index += 1

	return newData