scraper.py 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120
'''
scraper.py contains various functions for scraping
HTML content and packaging it into dictionaries.
'''
  5. import requests
  6. import itertools
  7. from bs4 import BeautifulSoup
  8. def fetchHTML(url):
  9. '''
  10. Returns HTML retrived from a url.
  11. Parameters:
  12. url: String
  13. - The URL to fetch HTML from
  14. Returns:
  15. html: String
  16. - The HTML from a given URL
  17. '''
  18. req = requests.get(url)
  19. html = req.text
  20. return html
  21. def extractTag(content, tag, className=None):
  22. '''
  23. Returns data embed within a tag, along
  24. with an optional class for filtering.
  25. Parameters:
  26. content: String
  27. - The HTML to parse
  28. tag: String
  29. - The HTML tag to scan for
  30. class: String
  31. - Optional filter for tag
  32. Returns:
  33. filteredData: List
  34. - Content embed within searched tags
  35. '''
  36. soup = BeautifulSoup(content)
  37. data = soup.findAll(tag, { 'class': className })
  38. filteredData = []
  39. for datum in data:
  40. filteredData.append(datum.text)
  41. return filteredData
  42. def steamDiscounts():
  43. req = requests.get('http://store.steampowered.com/search/?specials=1#sort_by=_ASC&sort_order=ASC&specials=1&page=1')
  44. content = req.text
  45. soup = BeautifulSoup(content)
  46. allData = {id: {} for id in range(0, 25)}
  47. # Get all divs of a specific class
  48. releaseDate = soup.findAll('div', {'class': 'col search_released'})
  49. # Get all release dates
  50. releaseDates = []
  51. id = 0
  52. for date in releaseDate:
  53. allData[id]['releaseDates'] = date.text
  54. releaseDates.append(date.text)
  55. id += 1
  56. #print releaseDates
  57. id = 0
  58. gameName = soup.findAll('div', {'class': 'col search_name ellipsis'})
  59. # Get all game names
  60. gameNames = []
  61. for name in gameName:
  62. span = name.findAll('span', {'class': 'title'})
  63. for tag in span:
  64. allData[id]['name'] = tag.text
  65. gameNames.append(tag.text)
  66. id += 1
  67. # print gameNames
  68. discount = soup.findAll('div', {'class': 'col search_discount'})
  69. id = 0
  70. # Get all game discounts
  71. gameDiscounts = []
  72. for discountedGame in discount:
  73. span = discountedGame.findAll('span')
  74. for tag in span:
  75. allData[id]['discount'] = tag.text
  76. gameDiscounts.append(tag.text)
  77. id += 1
  78. # print gameDiscounts
  79. price = soup.findAll('div', {'class': 'col search_price discounted'})
  80. id = 0
  81. prices = []
  82. # Get all discounted prices
  83. for value in price:
  84. br = value.findAll('br')
  85. for tag in br:
  86. allData[id]['price'] = tag.text.strip('\t')
  87. prices.append(tag.text.strip('\t'))
  88. id += 1
  89. # Cleanup data
  90. newData = []
  91. for datum in allData:
  92. newData.append(allData[datum])
  93. # print prices
  94. return newData