From 360bb867b22a2bbafd2260ceff1a206ffb643cfe Mon Sep 17 00:00:00 2001 From: Alex Duchesne Date: Thu, 12 Sep 2024 12:45:14 -0400 Subject: [PATCH] Fixed search failure due to unexpected parser state In many plugins the parser's state wasn't reset between pages. This meant that if a page ended in a weird state (truncated response, temporary error, or unexpected HTML), all following pages would fail to find results. The torrentproject plugin already worked around the issue by overriding feed() to reset some of its state between pages, but creating a new parser for each page is simpler. I have updated all plugins with this issue. --- nova3/engines/limetorrents.py | 19 ++++++++----------- nova3/engines/solidtorrents.py | 30 +++++++----------------------- nova3/engines/torlock.py | 27 ++++++++++----------------- nova3/engines/torrentproject.py | 16 ++++------------ nova3/engines/versions.txt | 8 ++++---- 5 files changed, 33 insertions(+), 67 deletions(-) diff --git a/nova3/engines/limetorrents.py b/nova3/engines/limetorrents.py index 37d8c5a..248aeda 100644 --- a/nova3/engines/limetorrents.py +++ b/nova3/engines/limetorrents.py @@ -1,4 +1,4 @@ -#VERSION: 4.7 +#VERSION: 4.8 # AUTHORS: Lima66 # CONTRIBUTORS: Diego de las Heras (ngosang@hotmail.es) @@ -38,7 +38,7 @@ class limetorrents(object): self.url = url self.current_item = {} # dict for found item self.item_name = None # key's name in current_item dict - self.page_empty = 22000 + self.page_items = 0 self.inside_tr = False self.findTable = False self.parser_class = {"tdnormal": "size", # class @@ -113,14 +113,11 @@ class limetorrents(object): query = query.replace("%20", "-") category = self.supported_categories[cat] - parser = self.MyHtmlParser(self.url) - page = 1 - while True: - page_url = "{0}/search/{1}/{2}/seeds/{3}/".format(self.url, category, query, page) + for page in range(1, 5): + page_url = f"{self.url}/search/{category}/{query}/seeds/{page}/" html = retrieve_url(page_url) - lunghezza_html = len(html) - if page > 6 or lunghezza_html <= 
parser.page_empty: - return + parser = self.MyHtmlParser(self.url) parser.feed(html) - page += 1 - parser.close() + parser.close() + if parser.page_items < 20: + break diff --git a/nova3/engines/solidtorrents.py b/nova3/engines/solidtorrents.py index 5dfccd6..3a46f6a 100644 --- a/nova3/engines/solidtorrents.py +++ b/nova3/engines/solidtorrents.py @@ -1,4 +1,4 @@ -# VERSION: 2.3 +# VERSION: 2.4 # AUTHORS: nKlido # LICENSING INFORMATION @@ -24,7 +24,6 @@ from helpers import retrieve_url from novaprinter import prettyPrinter from html.parser import HTMLParser from datetime import datetime -import math class solidtorrents(object): @@ -47,8 +46,6 @@ class solidtorrents(object): self.parseDate = False self.column = 0 self.torrentReady = False - self.foundSearchStats = False - self.parseTotalResults = False self.totalResults = 0 self.torrent_info = self.empty_torrent_info() @@ -68,13 +65,6 @@ class solidtorrents(object): def handle_starttag(self, tag, attrs): params = dict(attrs) - if 'search-stats' in params.get('class', ''): - self.foundSearchStats = True - - if (self.foundSearchStats and tag == 'b'): - self.parseTotalResults = True - self.foundSearchStats = False - if 'search-result' in params.get('class', ''): self.foundResult = True return @@ -115,13 +105,10 @@ class solidtorrents(object): prettyPrinter(self.torrent_info) self.torrentReady = False self.torrent_info = self.empty_torrent_info() + self.totalResults += 1 def handle_data(self, data): - if (self.parseTotalResults): - self.totalResults = int(data.strip()) - self.parseTotalResults = False - if (self.parseTitle): if (bool(data.strip()) and data != '\n'): self.torrent_info['name'] = data @@ -161,12 +148,9 @@ class solidtorrents(object): def search(self, what, cat='all'): category = self.supported_categories[cat] - parser = self.TorrentInfoParser(self.url) - parser.feed(self.request(what, category, 1)) - - totalPages = min(math.ceil(parser.totalResults / 20), 5) - - for page in range(2, totalPages + 1): + for 
page in range(1, 5): + parser = self.TorrentInfoParser(self.url) parser.feed(self.request(what, category, page)) - - parser.close() + parser.close() + if parser.totalResults < 15: + break diff --git a/nova3/engines/torlock.py b/nova3/engines/torlock.py index 7b60263..6aa6a9d 100644 --- a/nova3/engines/torlock.py +++ b/nova3/engines/torlock.py @@ -1,8 +1,7 @@ -#VERSION: 2.23 +#VERSION: 2.24 # AUTHORS: Douman (custparasite@gmx.se) # CONTRIBUTORS: Diego de las Heras (ngosang@hotmail.es) -from re import compile as re_compile from html.parser import HTMLParser from datetime import datetime, timedelta @@ -35,6 +34,7 @@ class torlock(object): self.item_bad = False # set to True for malicious links self.current_item = None # dict for found item self.item_name = None # key's name in current_item dict + self.page_items = 0 self.parser_class = {"td": "pub_date", "ts": "size", "tul": "seeds", @@ -91,26 +91,19 @@ class torlock(object): except Exception: self.current_item["pub_date"] = -1 prettyPrinter(self.current_item) + self.page_items += 1 self.current_item = {} def search(self, query, cat='all'): """ Performs search """ query = query.replace("%20", "-") + category = self.supported_categories[cat] - parser = self.MyHtmlParser(self.url) - page = "".join((self.url, "/", self.supported_categories[cat], - "/torrents/", query, ".html?sort=seeds&page=1")) - html = retrieve_url(page) - parser.feed(html) - - counter = 1 - additional_pages = re_compile(r"/{0}/torrents/{1}.html\?sort=seeds&page=[0-9]+" - .format(self.supported_categories[cat], query)) - list_searches = additional_pages.findall(html)[:-1] # last link is next(i.e. 
second) - for page in map(lambda link: "".join((self.url, link)), list_searches): - html = retrieve_url(page) + for page in range(1, 5): + parser = self.MyHtmlParser(self.url) + page_url = f"{self.url}/{category}/torrents/{query}.html?sort=seeds&page={page}" + html = retrieve_url(page_url) parser.feed(html) - counter += 1 - if counter > 3: + parser.close() + if parser.page_items < 20: break - parser.close() diff --git a/nova3/engines/torrentproject.py b/nova3/engines/torrentproject.py index e736871..2db3b8d 100644 --- a/nova3/engines/torrentproject.py +++ b/nova3/engines/torrentproject.py @@ -1,4 +1,4 @@ -#VERSION: 1.4 +#VERSION: 1.5 #AUTHORS: mauricci from helpers import retrieve_url @@ -102,26 +102,18 @@ class torrentproject(object): elif curr_key != 'name': self.singleResData[curr_key] += data.strip() - def feed(self, html): - HTMLParser.feed(self, html) - self.pageComplete = False - self.insideResults = False - self.insideDataDiv = False - self.spanCount = -1 - def search(self, what, cat='all'): # curr_cat = self.supported_categories[cat] - parser = self.MyHTMLParser(self.url) what = what.replace('%20', '+') # analyze first 5 pages of results for currPage in range(0, 5): url = self.url + '/browse?t={0}&p={1}'.format(what, currPage) html = retrieve_url(url) + parser = self.MyHTMLParser(self.url) parser.feed(html) - if len(parser.pageRes) <= 0: + parser.close() + if len(parser.pageRes) < 20: break - del parser.pageRes[:] - parser.close() def download_torrent(self, info): """ Downloader """ diff --git a/nova3/engines/versions.txt b/nova3/engines/versions.txt index 672def0..65fc148 100644 --- a/nova3/engines/versions.txt +++ b/nova3/engines/versions.txt @@ -1,8 +1,8 @@ eztv: 1.16 jackett: 4.0 -limetorrents: 4.7 +limetorrents: 4.8 piratebay: 3.3 -solidtorrents: 2.3 -torlock: 2.23 -torrentproject: 1.4 +solidtorrents: 2.4 +torlock: 2.24 +torrentproject: 1.5 torrentscsv: 1.4