#!/usr/bin/env python
#
# Collects torrent and magnet links from The Pirate Bay for a given URL
# rly 2014-08-13
#
# What it does:
# -takes a TPB URL and collects the torrent-file and magnet links it contains.
# -torrent-file links are saved to torrentlinks.txt and magnet links are saved to magnetlinks.txt
#
# The URL can be either a search result or a browsing category.
# The script will walk through every page until it reaches the last page.
# The script only traverses pages upwards, so if there are 100 pages and you start from page 98, it will only do 98, 99, 100.
#
# It collects unique links and adds them to the existing files, so it can be used across
# different searches to accumulate results without duplicates.
#
# Example usage:
# Collect torrent and magnet links for the whole e-book category (limited to 100 pages by TPB):
#
# python tpb_linkscraper.py http://thepiratebay.se/browse/601
#
# Collect all matching the search 'tolkien':
#
# python tpb_linkscraper.py http://thepiratebay.se/search/tolkien/0/99/0
#
# Limitations:
# -Won't recognize a torrent file that belongs to the same torrent as a magnet link; use the magnet links to get everything (all torrents have magnet links).
#

import sys
import urllib3
from bs4 import BeautifulSoup
from bs4 import NavigableString


# Scraper class
class Scraper:
    def __init__(self, currentURL, downloadDir):
        self.currentURL = currentURL
        self.currentHTML = ''
        self.currentSoup = ''
        self.downloadDir = downloadDir
        self.http = urllib3.PoolManager()
        self.baseURL = None
        self.DOM = None
        self.links = []
        self.torrentLinks = set()
        self.magnetLinks = set()

    # Read the page for the current URL and set up internal objects
    def readURL(self):
        print('Current page: ' + self.currentURL)
        self.baseURL = urllib3.util.url.parse_url(self.currentURL).host
        try:
            req = self.http.request('GET', self.currentURL)
        except Exception:
            print('Error: unable to open the URL: ' + self.currentURL + ', saving current progress and bailing out.')
            self.writeFiles()
            sys.exit('')
        if req.status != 200:
            print('Error: ' + str(req.status))
        else:
            print('Got page!')
            self.currentHTML = req.data
            self.currentSoup = BeautifulSoup(self.currentHTML, 'html.parser')

    # Parse the links: TPB marks torrent-file and magnet anchors with
    # distinctive title attributes, so match on those
    def findLinks(self):
        for link in self.currentSoup.find_all('a'):
            if link.get('title') == 'Download this torrent':
                # Torrent-file hrefs are site-relative, so prepend the host
                href = 'http://' + self.baseURL + '/' + link.get('href')
                self.torrentLinks.add(href.strip())
            elif link.get('title') == 'Download this torrent using magnet':
                self.magnetLinks.add(link.get('href').strip())
        print(str(len(self.torrentLinks)) + ' torrents ' + str(len(self.magnetLinks)) + ' magnets')

    # Find the link to the next page and scrape that one too
    def findNextPage(self):
        pageTickerTag = None
        nextURL = None
        for td in self.currentSoup.find_all('td'):
            # Find the pages-ticker at the bottom of the page, it is contained in the