#!/usr/bin/env python3
#
# Collects torrent and magnet links from thepiratebay, starting from a given URL
# rly 2014-08-13
#
# What it does:
# - Takes a TPB URL and collects the links to the torrent files and magnet links it contains.
# - Torrent file links are saved to torrentlinks.txt and magnet links are saved to magnetlinks.txt.
#
# The URL can be either a search result or a browsing category.
# The script will walk through every page until it reaches the last page.
# The script only traverses the pages upwards, so if there are 100 pages and you start from page 98 it will only do 98, 99, 100.
#
# It collects unique links and adds them to the existing files, so it can be run for different
# searches to accumulate results without getting duplicate entries.
#
# Example usage:
# Collect torrent and magnet links for the whole e-book category (limited to 100 pages by TPB):
#
#   python tpb_linkscraper.py http://thepiratebay.se/browse/601
#
# Collect everything matching the search 'tolkien':
#
#   python tpb_linkscraper.py http://thepiratebay.se/search/tolkien/0/99/0
#
# Limitations:
# - Won't recognize that a torrent file link and a magnet link refer to the same torrent;
#   use the magnet links to get everything (all torrents have magnet links).
#

import sys

import urllib3
from bs4 import BeautifulSoup
from bs4 import NavigableString


# Scraper class
class Scraper:

    def __init__(self, currentURL, downloadDir):
        self.currentURL = currentURL
        self.currentHTML = ''
        self.currentSoup = ''
        self.downloadDir = downloadDir
        self.http = urllib3.PoolManager()
        self.baseURL = None
        self.DOM = None
        self.links = []
        self.torrentLinks = set()
        self.magnetLinks = set()

    # Read the page for the current URL and set up internal objects
    def readURL(self):
        print('Current page: ' + self.currentURL)
        self.baseURL = urllib3.util.url.parse_url(self.currentURL).host
        try:
            req = self.http.request('GET', self.currentURL)
        except Exception:
            print('Error: unable to open the URL: ' + self.currentURL + ', saving current progress and bailing out.')
            self.writeFiles()
            sys.exit('')
        if req.status != 200:
            print('Error: ' + str(req.status))
        else:
            print('Got page!')
            self.currentHTML = req.data
            self.currentSoup = BeautifulSoup(self.currentHTML, 'html.parser')

    # Parse the links on the current page
    def findLinks(self):
        for link in self.currentSoup.find_all('a'):
            if link.get('title') == 'Download this torrent':
                href = 'http://' + self.baseURL + '/' + link.get('href')
                self.torrentLinks.add(href.strip())
            elif link.get('title') == 'Download this torrent using magnet':
                self.magnetLinks.add(link.get('href').strip())
        print(str(len(self.torrentLinks)) + ' torrents ' + str(len(self.magnetLinks)) + ' magnets')

    # Find the link to the next page and scrape that one too
    def findNextPage(self):
        pageTickerTag = None
        nextURL = None
        # Find the pages-ticker at the bottom of the page, it is contained in the <td> with the following attributes:
        for td in self.currentSoup.find_all('td'):
            if td.get('colspan') == '9' and td.get('style') == 'text-align:center;':
                pageTickerTag = td
        # Not found? The search-result page is a little different:
        # there the ticker is contained in the <div> with the following attributes:
        if pageTickerTag is None:
            for div in self.currentSoup.find_all('div'):
                if div.get('align') == 'center':
                    pageTickerTag = div
        # Still nothing? Then there is no pagination to follow on this page.
        if pageTickerTag is None:
            print('No pagination found, stopping here')
            return
        # In this tag are the page links.
        # First, find where we are: loop through until we find the number that is
        # not a link (plain text signifies the current page).
        for child in pageTickerTag:
            if isinstance(child, NavigableString) and child.strip().isdigit():
                # Found it; the next thing after this number should be the link
                # to the next page (if we are not at the end).
                if child.next_sibling is not None:
                    if child.next_sibling.name == 'a':
                        nextURL = 'http://' + self.baseURL + '/' + child.next_sibling.get('href')
        if nextURL is None:
            print('Last page reached')
        else:
            # recursion
            self.currentURL = nextURL
            self.readURL()
            self.findLinks()
            self.findNextPage()

    # Read previously collected links from the text files (if they exist)
    def readFiles(self):
        ti = 0
        mi = 0
        try:
            with open("torrentlinks.txt", "r") as torrentfile:
                for line in torrentfile:
                    self.torrentLinks.add(line.strip())
                    ti += 1
        except IOError:
            pass
        try:
            with open("magnetlinks.txt", "r") as magnetfile:
                for line in magnetfile:
                    self.magnetLinks.add(line.strip())
                    mi += 1
        except IOError:
            pass
        print('Read ' + str(ti) + ' torrentlinks and ' + str(mi) + ' magnetlinks')

    # Write the data to text files, one link per line
    def writeFiles(self):
        ti = 0
        mi = 0
        try:
            with open("torrentlinks.txt", "w") as torrentfile:
                for link in self.torrentLinks:
                    torrentfile.write(str(link) + '\n')
                    ti += 1
        except IOError:
            print('Error writing torrentlinks.txt')
        try:
            with open("magnetlinks.txt", "w") as magnetfile:
                for link in self.magnetLinks:
                    magnetfile.write(str(link) + '\n')
                    mi += 1
        except IOError:
            print('Error writing magnetlinks.txt')
        print('Wrote ' + str(ti) + ' torrentlinks and ' + str(mi) + ' magnetlinks')

# End class Scraper


# Check for arguments
if len(sys.argv) < 2:
    sys.exit('Usage: %s URL' % sys.argv[0])

print('TPB linkscraper 0.1')
print('-----------------------')
print('')

url = sys.argv[1]

sc = Scraper(url, '.')
sc.readFiles()
sc.readURL()
sc.findLinks()
sc.findNextPage()
sc.writeFiles()

print('Done!')
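# Illustrative sketch of the output format after a run: each file contains one
# link per line. The values below are hypothetical placeholders for what a
# collected link might look like, not links scraped from a real page.
#
#   magnetlinks.txt:
#     magnet:?xt=urn:btih:<40-hex-digit info hash>&dn=<display name>
#
#   torrentlinks.txt:
#     http://<host>/<path of the .torrent download link found on the page>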