# pylint: disable=line-too-long
'''
scraper.py contains various methods for scraping
HTML content and packaging into dictionaries.
'''
import requests
from bs4 import BeautifulSoup
  8. def fetchHTML(url):
  9. '''
  10. Returns HTML retrived from a url.
  11. Parameters:
  12. url: String
  13. - The URL to fetch HTML from
  14. Returns:
  15. html: String
  16. - The HTML from a given URL
  17. '''
  18. req = requests.get(url)
  19. html = req.text
  20. return html
  21. def extractTag(content, tag, className=None):
  22. '''
  23. Returns data embed within a tag, along
  24. with an optional class for filtering.
  25. Parameters:
  26. content: String
  27. - The HTML to parse
  28. tag: String
  29. - The HTML tag to scan for
  30. class: String
  31. - Optional filter for tag
  32. Returns:
  33. filteredData: List
  34. - Content embed within searched tags
  35. '''
  36. soup = BeautifulSoup(content)
  37. data = soup.findAll(tag, {'class': className})
  38. filteredData = []
  39. for datum in data:
  40. filteredData.append(datum.text)
  41. return filteredData
  42. def steamDiscounts():
  43. '''Returns discounts from steam.com'''
  44. req = requests.get('http://store.steampowered.com/search/?specials=1#sort_by=_ASC&sort_order=ASC&specials=1&page=1')
  45. content = req.text
  46. soup = BeautifulSoup(content)
  47. allData = {id: {} for id in range(0, 25)}
  48. # Get all divs of a specific class
  49. releaseDate = soup.findAll('div', {'class': 'col search_released'})
  50. # Get all release dates
  51. releaseDates = []
  52. id = 0
  53. for date in releaseDate:
  54. allData[id]['releaseDates'] = date.text
  55. releaseDates.append(date.text)
  56. id += 1
  57. #print releaseDates
  58. id = 0
  59. gameName = soup.findAll('div', {'class': 'col search_name ellipsis'})
  60. # Get all game names
  61. gameNames = []
  62. for name in gameName:
  63. span = name.findAll('span', {'class': 'title'})
  64. for tag in span:
  65. allData[id]['name'] = tag.text
  66. gameNames.append(tag.text)
  67. id += 1
  68. # print gameNames
  69. discount = soup.findAll('div', {'class': 'col search_discount'})
  70. id = 0
  71. # Get all game discounts
  72. gameDiscounts = []
  73. for discountedGame in discount:
  74. span = discountedGame.findAll('span')
  75. for tag in span:
  76. allData[id]['discount'] = tag.text
  77. gameDiscounts.append(tag.text)
  78. id += 1
  79. # print gameDiscounts
  80. price = soup.findAll('div', {'class': 'col search_price discounted'})
  81. id = 0
  82. prices = []
  83. # Get all discounted prices
  84. for value in price:
  85. br = value.findAll('br')
  86. for tag in br:
  87. allData[id]['price'] = tag.text.strip('\t')
  88. prices.append(tag.text.strip('\t'))
  89. id += 1
  90. # Cleanup data
  91. newData = []
  92. for datum in allData:
  93. newData.append(allData[datum])
  94. # print prices
  95. return newData