#!/usr/bin/env python3
#
# Collects torrent and magnet links from thepiratebay, starting from a given URL
# rly 2014-08-13
#
# What it does:
# - Takes a TPB URL and collects the links to the torrent files and magnet links it contains.
# - Torrent file links are saved to torrentlinks.txt and magnet links are saved to magnetlinks.txt.
#
# The URL can be either a search result or a browsing category.
# The script will walk through every page until it reaches the last page.
# The script only traverses the pages upwards, so if there are 100 pages and you start from page 98 it will only do 98, 99, 100.
#
# It collects unique links and adds them to the existing files, so it can be run for different
# searches to accumulate results without getting duplicate entries.
#
# Example usage:
# Collect torrent and magnet links for the whole e-book category (limited to 100 pages by TPB):
#
#   python tpb_linkscraper.py http://thepiratebay.se/browse/601
#
# Collect everything matching the search 'tolkien':
#
#   python tpb_linkscraper.py http://thepiratebay.se/search/tolkien/0/99/0
#
# Limitations:
# - Won't recognize that a torrent file link and a magnet link refer to the same torrent;
#   use the magnet links to get everything (all torrents have magnet links).
#

import sys

import urllib3
from bs4 import BeautifulSoup
from bs4 import NavigableString


# Scraper class
class Scraper:

    def __init__(self, currentURL, downloadDir):
        self.currentURL = currentURL
        self.currentHTML = ''
        self.currentSoup = ''
        self.downloadDir = downloadDir
        self.http = urllib3.PoolManager()
        self.baseURL = None
        self.DOM = None
        self.links = []
        self.torrentLinks = set()
        self.magnetLinks = set()

    # Read the page for the current URL and set up internal objects
    def readURL(self):
        print('Current page: ' + self.currentURL)
        self.baseURL = urllib3.util.url.parse_url(self.currentURL).host
        try:
            req = self.http.request('GET', self.currentURL)
        except Exception:
            print('Error: unable to open the URL: ' + self.currentURL + ', saving current progress and bailing out.')
            self.writeFiles()
            sys.exit('')
        if req.status != 200:
            print('Error: ' + str(req.status))
        else:
            print('Got page!')
            self.currentHTML = req.data
            self.currentSoup = BeautifulSoup(self.currentHTML, 'html.parser')

    # Parse the links on the current page
    def findLinks(self):
        for link in self.currentSoup.find_all('a'):
            if link.get('title') == 'Download this torrent':
                href = 'http://' + self.baseURL + '/' + link.get('href')
                self.torrentLinks.add(href.strip())
            elif link.get('title') == 'Download this torrent using magnet':
                self.magnetLinks.add(link.get('href').strip())
        print(str(len(self.torrentLinks)) + ' torrents ' + str(len(self.magnetLinks)) + ' magnets')

    # Find the link to the next page and scrape that one too
    def findNextPage(self):
        pageTickerTag = None
        nextURL = None
        # Find the pages-ticker at the bottom of the page, it is contained in the <td> with the following attributes:
        for td in self.currentSoup.find_all('td'):
            if td.get('colspan') == '9' and td.get('style') == 'text-align:center;':
                pageTickerTag = td
        # Not found? The search-result page is a little different:
        # there the ticker is contained in the <div> with the following attributes:
        if pageTickerTag is None:
            for div in self.currentSoup.find_all('div'):
                if div.get('align') == 'center':
                    pageTickerTag = div
        # Still nothing? Then there is no pagination to follow on this page.
        if pageTickerTag is None:
            print('No pagination found, stopping here')
            return
        # In this tag are the page links.
        # First, find where we are: loop through until we find the number that is
        # not a link (plain text signifies the current page).
        for child in pageTickerTag:
            if isinstance(child, NavigableString) and child.strip().isdigit():
                # Found it; the next thing after this number should be the link
                # to the next page (if we are not at the end).
                if child.next_sibling is not None:
                    if child.next_sibling.name == 'a':
                        nextURL = 'http://' + self.baseURL + '/' + child.next_sibling.get('href')
        if nextURL is None:
            print('Last page reached')
        else:
            # recursion
            self.currentURL = nextURL
            self.readURL()
            self.findLinks()
            self.findNextPage()

    # Read previously collected links from the text files (if they exist)
    def readFiles(self):
        ti = 0
        mi = 0
        try:
            with open("torrentlinks.txt", "r") as torrentfile:
                for line in torrentfile:
                    self.torrentLinks.add(line.strip())
                    ti += 1
        except IOError:
            pass
        try:
            with open("magnetlinks.txt", "r") as magnetfile:
                for line in magnetfile:
                    self.magnetLinks.add(line.strip())
                    mi += 1
        except IOError:
            pass
        print('Read ' + str(ti) + ' torrentlinks and ' + str(mi) + ' magnetlinks')

    # Write the data to text files, one link per line
    def writeFiles(self):
        ti = 0
        mi = 0
        try:
            with open("torrentlinks.txt", "w") as torrentfile:
                for link in self.torrentLinks:
                    torrentfile.write(str(link) + '\n')
                    ti += 1
        except IOError:
            print('Error writing torrentlinks.txt')
        try:
            with open("magnetlinks.txt", "w") as magnetfile:
                for link in self.magnetLinks:
                    magnetfile.write(str(link) + '\n')
                    mi += 1
        except IOError:
            print('Error writing magnetlinks.txt')
        print('Wrote ' + str(ti) + ' torrentlinks and ' + str(mi) + ' magnetlinks')

# End class Scraper


# Check for arguments
if len(sys.argv) < 2:
    sys.exit('Usage: %s URL' % sys.argv[0])

print('TPB linkscraper 0.1')
print('-----------------------')
print('')

url = sys.argv[1]

sc = Scraper(url, '.')
sc.readFiles()
sc.readURL()
sc.findLinks()
sc.findNextPage()
sc.writeFiles()

print('Done!')
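# Illustrative sketch of the output format after a run: each file contains one
# link per line. The values below are hypothetical placeholders for what a
# collected link might look like, not links scraped from a real page.
#
#   magnetlinks.txt:
#     magnet:?xt=urn:btih:<40-hex-digit info hash>&dn=<display name>
#
#   torrentlinks.txt:
#     http://<host>/<path of the .torrent download link found on the page>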