blacklist_urls.py

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import glob
import re
import sys
import time

import tldextract

# List of the domains to blacklist.
domain_blacklist = {
    '500px',
    'aapks',
    'akamaihd',
    'amazon',
    'apple',
    'artifactfire',
    'artstation',
    'awwni',
    'bandcamp',
    'battleforthenet',
    'coinscalendar',
    'dailymotion',
    'deviantart',
    'discord',
    'discordapp',
    'dlapkandroid',
    'dropbox',
    'e621',
    'ebay',
    'edealinfo',
    'erome',
    'eroshare',
    'explosm',
    'facebook',
    'fbcdn',
    'flickr',
    'furaffinity',
    'futhead',
    'gatopardo',
    'gfycat',
    'gifsound',
    'gifsoup',
    'giphy',
    'github',
    'google',
    'gunprime',
    'gyazo',
    'hotdealstar',
    'imagefap',
    'imageshack',
    'imgflip',
    'imgur',
    'instagram',
    'karmadecay',
    'kryptocal',
    'kym-cdn',
    'liveleak',
    'livememe',
    'lmgtfy',
    'magaimg',
    'memegenerator',
    'minorplanetcenter',
    'minus',
    'mobafire',
    'morejpeg',
    'nocookie',
    'pcpartpicker',
    'photobucket',
    'pinimg',
    'pinterest',
    'pixiv',
    'pornhub',
    'prntscr',
    'puu',
    'qkme',
    'quickmeme',
    'radd',
    'redd',
    'reddit',
    'reddit-stream',
    'redditlog',
    'redditmedia',
    'reddituploads',
    'redtube',
    'reupp',
    'reverb',
    'roanoke',
    'rollingstone',
    'sli',
    'soundcloud',
    'soundgasm',
    'spankbang',
    'spotify',
    'strawpoll',
    'streamable',
    'timeanddate',
    'tinypic',
    'touhouradio',
    'tumblr',
    'twimg',
    'twitch',
    'twitter',
    'vid',
    'vimeo',
    'vine',
    'vkaao',
    'vocaroo',
    'voyagefusion',
    'walmart',
    'wciu',
    'wikimedia',
    'wikipedia',
    'xhamster',
    'xkcd',
    'xvideos',
    'youtu',
    'youtube',
    'youtubedoubler',
    'ytimg',
    'zillexplorer',
}


def domain_is_in_blacklist(url):
    # Compare the registered domain (e.g. 'reddit' for 'www.reddit.com')
    # against the blacklist.
    domain = tldextract.extract(url).domain
    return domain in domain_blacklist
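
# A quick sanity check (illustrative URLs, not from the source): tldextract
# isolates the registered domain regardless of subdomains or country-code
# suffixes, which is why shortener stems such as 'youtu' and 'redd' appear
# in the blacklist above.
#
#   >>> tldextract.extract('https://www.reddit.com/r/python').domain
#   'reddit'
#   >>> domain_is_in_blacklist('https://youtu.be/some-video-id')
#   True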


# List of extensions to blacklist.
extensions_blacklist = (
    '.3gp',
    '.7z',
    '.ai',
    '.aif',
    '.apk',
    '.app',
    '.avi',
    '.bin',
    '.bmp',
    '.bz2',
    '.css',
    '.csv',
    '.dat',
    '.deb',
    '.dmg',
    '.doc',
    '.docx',
    '.exe',
    '.gif',
    '.gifv',
    '.gz',
    '.iso',
    '.jar',
    '.jpeg',
    '.jpg',
    '.js',
    '.log',
    '.mid',
    '.midi',
    '.mkv',
    '.mov',
    '.mp3',
    '.mp4',
    '.mpeg',
    '.mpg',
    '.ogg',
    '.ogv',
    '.otf',
    '.pdf',
    '.pkg',
    '.png',
    '.pps',
    '.ppt',
    '.pptx',
    '.psd',
    '.py',
    '.qt',
    '.ram',
    '.rar',
    '.sql',
    '.svg',
    '.swf',
    '.tar.gz',
    '.tar',
    '.tgz',
    '.tiff',
    '.ttf',
    '.txt',
    '.wav',
    '.webm',
    '.wma',
    '.wmv',
    '.xls',
    '.xlsx',
    '.xml',
    '.xz',
    '.zip',
)


def extension_is_in_blacklist(url):
    # Drop any query string before checking the suffix, so that e.g.
    # 'photo.jpg?width=200' is still caught.
    return url.split('?')[0].lower().endswith(extensions_blacklist)
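
# Illustrative check (the URL below is hypothetical): the query string is
# stripped before matching and the comparison is case-insensitive, so
# uppercase extensions are caught as well.
#
#   >>> extension_is_in_blacklist('https://example.com/photo.JPG?width=200')
#   True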


# Malformed urls.
# This regex is adapted from:
# https://stackoverflow.com/questions/7160737/python-how-to-validate-a-url-in-python-malformed-or-not
url_regex = re.compile(
    r'^(?:http)s?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)


def url_is_malformed(url):
    return url_regex.match(url) is None
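
# Illustrative behaviour (example URLs are hypothetical): the regex requires
# an explicit http/https scheme, so scheme-less URLs are treated as malformed.
#
#   >>> url_is_malformed('https://example.com/some/page')
#   False
#   >>> url_is_malformed('www.example.com/some/page')
#   True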


def print_progress(prefix, start_time, urls_counter,
                   domain_blacklist_counter,
                   extension_blacklist_counter,
                   short_url_counter, malformed_url_counter,
                   duplicate_url_counter):
    # Emit a one-line summary of all filtering counters.
    string = prefix + ' | '
    string += 'time elapsed (s): {:.2f} | '.format(time.time() - start_time)
    string += 'number of urls: {} | '.format(urls_counter)
    string += 'domain blacklisted: {} | '.format(domain_blacklist_counter)
    string += 'extension blacklisted: {} | '.format(extension_blacklist_counter)
    string += 'short urls (<=8): {} | '.format(short_url_counter)
    string += 'malformed urls: {} | '.format(malformed_url_counter)
    string += 'duplicate urls: {}'.format(duplicate_url_counter)
    print(string, flush=True)
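
# Example invocation (paths are illustrative): the script expects a directory
# of .txt files with one URL per line, and writes the surviving de-duplicated
# URLs to a single output file:
#
#   python blacklist_urls.py ./url_shards cleaned_urls.txt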


if __name__ == '__main__':

    print('remove blacklisted urls ..')

    # Path to the url files.
    path = sys.argv[1]
    # Output url file.
    output = sys.argv[2]

    # Get the list of url files.
    files = glob.glob(path + '/*.txt')
    print('> found {} files'.format(len(files)))

    urls = set()
    urls_counter = 0
    domain_blacklist_counter = 0
    extension_blacklist_counter = 0
    short_url_counter = 0
    malformed_url_counter = 0
    duplicate_url_counter = 0
    start_time = time.time()
    for filename in files:
        with open(filename, 'r') as f:
            for line in f:
                url = line.strip()
                urls_counter += 1
                if domain_is_in_blacklist(url):
                    print('[DOMAIN BLACKLIST]: {}'.format(url), flush=True)
                    domain_blacklist_counter += 1
                elif extension_is_in_blacklist(url):
                    print('[EXTENSION BLACKLIST]: {}'.format(url), flush=True)
                    extension_blacklist_counter += 1
                elif len(url) <= 8:
                    print('[SHORT URL]: {}'.format(url), flush=True)
                    short_url_counter += 1
                elif url_is_malformed(url):
                    print('[MALFORMED URL]: {}'.format(url), flush=True)
                    malformed_url_counter += 1
                elif url in urls:
                    print('[DUPLICATE URL]: {}'.format(url), flush=True)
                    duplicate_url_counter += 1
                else:
                    urls.add(url)
                if urls_counter % 100000 == 0:
                    print_progress('PROGRESS', start_time, urls_counter,
                                   domain_blacklist_counter,
                                   extension_blacklist_counter,
                                   short_url_counter, malformed_url_counter,
                                   duplicate_url_counter)

    print_progress('FINAL', start_time, urls_counter,
                   domain_blacklist_counter,
                   extension_blacklist_counter,
                   short_url_counter, malformed_url_counter,
                   duplicate_url_counter)

    # Write the final set of urls.
    print('> writing cleaned up url list to {}'.format(output))
    with open(output, 'w') as f:
        for url in urls:
            f.write(url + '\n')

    print('done :-)')