scraper.py 1.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596
  1. '''
  2. scraper.py contains various methods for scraping
  3. HTML content and packaging into dictionaries.
  4. '''
  5. import requests
  6. import itertools
  7. from bs4 import BeautifulSoup
  8. def fetchHTML(url):
  9. '''
  10. Returns HTML retrived from a url.
  11. Parameters:
  12. url: String
  13. - The URL to fetch HTML from
  14. Returns:
  15. html: String
  16. - The HTML from a given URL
  17. '''
  18. req = requests.get(url)
  19. html = req.text
  20. return html
  21. def steamDiscounts():
  22. req = requests.get('http://store.steampowered.com/search/?specials=1#sort_by=_ASC&sort_order=ASC&specials=1&page=1')
  23. content = req.text
  24. soup = BeautifulSoup(content)
  25. allData = {id: {} for id in range(0, 25)}
  26. # Get all divs of a specific class
  27. releaseDate = soup.findAll('div', {'class': 'col search_released'})
  28. # Get all release dates
  29. releaseDates = []
  30. id = 0
  31. for date in releaseDate:
  32. allData[id]['releaseDates'] = date.text
  33. releaseDates.append(date.text)
  34. id += 1
  35. #print releaseDates
  36. id = 0
  37. gameName = soup.findAll('div', {'class': 'col search_name ellipsis'})
  38. # Get all game names
  39. gameNames = []
  40. for name in gameName:
  41. span = name.findAll('span', {'class': 'title'})
  42. for tag in span:
  43. allData[id]['name'] = tag.text
  44. gameNames.append(tag.text)
  45. id += 1
  46. # print gameNames
  47. discount = soup.findAll('div', {'class': 'col search_discount'})
  48. id = 0
  49. # Get all game discounts
  50. gameDiscounts = []
  51. for discountedGame in discount:
  52. span = discountedGame.findAll('span')
  53. for tag in span:
  54. allData[id]['discount'] = tag.text
  55. gameDiscounts.append(tag.text)
  56. id += 1
  57. # print gameDiscounts
  58. price = soup.findAll('div', {'class': 'col search_price discounted'})
  59. id = 0
  60. prices = []
  61. # Get all discounted prices
  62. for value in price:
  63. br = value.findAll('br')
  64. for tag in br:
  65. allData[id]['price'] = tag.text.strip('\t')
  66. prices.append(tag.text.strip('\t'))
  67. id += 1
  68. # Cleanup data
  69. newData = []
  70. for datum in allData:
  71. newData.append(allData[datum])
  72. # print prices
  73. return newData