123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122 |
- # pylint: disable=line-too-long
- '''
- scraper.py contains helper functions for fetching HTML, extracting
- the text of tags, and collecting Steam discount listings into dictionaries.
- '''
- import requests
- from bs4 import BeautifulSoup
def fetchHTML(url, timeout=10):
    '''
    Returns HTML retrieved from a url.

    The HTTP status code is not checked, so the body of an error
    page is returned like any other response.

    Parameters:
        url: String
            - The URL to fetch HTML from
        timeout: Number
            - Optional. Seconds to wait for the server before giving
              up (default 10). Without a timeout requests waits
              forever on a server that stops responding.
    Returns:
        html: String
            - The response body decoded to text
    Raises:
        requests.exceptions.RequestException
            - Propagated from requests.get, e.g. on connection
              failure or when the timeout expires
    '''
    req = requests.get(url, timeout=timeout)
    return req.text
def extractTag(content, tag, className=None):
    '''
    Returns the text embedded within every occurrence of a tag,
    optionally restricted to tags with a given class.

    Parameters:
        content: String
            - The HTML to parse
        tag: String
            - The HTML tag to scan for
        className: String
            - Optional class filter; when omitted (None) every
              occurrence of the tag is returned
    Returns:
        filteredData: List
            - Text content of each matching tag, in document order
    '''
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(content, 'html.parser')

    # Only filter on class when one was requested: handing
    # {'class': None} to the search is itself treated as a filter
    # instead of meaning "no filter".
    if className is None:
        matches = soup.find_all(tag)
    else:
        matches = soup.find_all(tag, {'class': className})

    return [match.text for match in matches]
# Steam store search page restricted to games on special offer.
_STEAM_SPECIALS_URL = 'http://store.steampowered.com/search/?specials=1#sort_by=_ASC&sort_order=ASC&specials=1&page=1'

# steamDiscounts() has always returned at least this many entries;
# kept so callers relying on that length keep working.
_STEAM_MIN_RESULTS = 25

# Seconds to wait for the store before giving up on the request.
_STEAM_TIMEOUT = 10


def _storeSteamField(games, divs, key, extractTexts):
    '''
    Copies one scraped field into the per-game dictionaries.

    Parameters:
        games: List
            - Dictionaries to fill, one per search result; grown in
              place when there are more divs than entries
        divs: List
            - The matching div tags, in page order
        key: String
            - Dictionary key to store the field under
        extractTexts: Function
            - Takes a div and returns the strings found inside it;
              the last one is kept, and an empty result leaves the
              key unset for that game
    '''
    for index, div in enumerate(divs):
        # Grow on demand instead of assuming a fixed page size, so a
        # page with more results than expected cannot raise.
        while len(games) <= index:
            games.append({})
        for text in extractTexts(div):
            games[index][key] = text


def steamDiscounts():
    '''
    Returns the discounted games listed on the Steam store
    "specials" search page.

    Each field is collected from its own set of divs and matched to
    a game purely by position: the n-th div of each kind is assumed
    to belong to the n-th game.
    NOTE(review): the position advances once per div; confirm this
    matches the intended pairing if a div can hold several matches.

    Returns:
        games: List
            - One dictionary per game with whichever of the keys
              'releaseDates', 'name', 'discount' and 'price' could
              be scraped. The list always holds at least 25
              entries; entries with nothing scraped are empty
              dictionaries.
    Raises:
        requests.exceptions.RequestException
            - Propagated from requests.get, e.g. on connection
              failure or when the timeout expires
    '''
    req = requests.get(_STEAM_SPECIALS_URL, timeout=_STEAM_TIMEOUT)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(req.text, 'html.parser')

    games = [{} for _ in range(_STEAM_MIN_RESULTS)]

    # Release dates: the div text itself.
    _storeSteamField(
        games,
        soup.find_all('div', {'class': 'col search_released'}),
        'releaseDates',
        lambda div: [div.text],
    )

    # Game names: the title span inside the name column.
    _storeSteamField(
        games,
        soup.find_all('div', {'class': 'col search_name ellipsis'}),
        'name',
        lambda div: [span.text for span in div.find_all('span', {'class': 'title'})],
    )

    # Discount percentages: any span inside the discount column.
    _storeSteamField(
        games,
        soup.find_all('div', {'class': 'col search_discount'}),
        'discount',
        lambda div: [span.text for span in div.find_all('span')],
    )

    # Discounted prices: text of the <br> inside the price column.
    # NOTE(review): <br> is a void element, so parsers that treat it as
    # empty will yield '' here; confirm the price is really captured.
    _storeSteamField(
        games,
        soup.find_all('div', {'class': 'col search_price discounted'}),
        'price',
        lambda div: [br.text.strip('\t') for br in div.find_all('br')],
    )

    return games
|