From e5677ae6b6b23c943e64d8e2abcb64c13c0e8bbf Mon Sep 17 00:00:00 2001 From: Kirill Isakov Date: Fri, 25 Mar 2016 00:24:37 +0600 Subject: [PATCH 01/14] Add Nyaa.se search engine --- searx/engines/nyaa.py | 115 ++++++++++++++++++++++++++++++++ searx/settings.yml | 4 ++ tests/unit/engines/test_nyaa.py | 66 ++++++++++++++++++ 3 files changed, 185 insertions(+) create mode 100644 searx/engines/nyaa.py create mode 100644 tests/unit/engines/test_nyaa.py diff --git a/searx/engines/nyaa.py b/searx/engines/nyaa.py new file mode 100644 index 000000000..81caed76f --- /dev/null +++ b/searx/engines/nyaa.py @@ -0,0 +1,115 @@ +""" + Nyaa.se (Anime Bittorrent tracker) + + @website http://www.nyaa.se/ + @provide-api no + @using-api no + @results HTML + @stable no (HTML can change) + @parse url, title, content, seed, leech, torrentfile +""" + +from cgi import escape +from urllib import urlencode +from lxml import html +from searx.engines.xpath import extract_text + +# engine dependent config +categories = ['files', 'images', 'videos', 'music'] +paging = True + +# search-url +base_url = 'http://www.nyaa.se/' +search_url = base_url + '?page=search&{query}&offset={offset}' + +# xpath queries +xpath_results = '//table[@class="tlist"]//tr[contains(@class, "tlistrow")]' +xpath_category = './/td[@class="tlisticon"]/a' +xpath_title = './/td[@class="tlistname"]/a' +xpath_torrent_file = './/td[@class="tlistdownload"]/a' +xpath_filesize = './/td[@class="tlistsize"]/text()' +xpath_seeds = './/td[@class="tlistsn"]/text()' +xpath_leeches = './/td[@class="tlistln"]/text()' +xpath_downloads = './/td[@class="tlistdn"]/text()' + + +# convert a variable to integer or return 0 if it's not a number +def int_or_zero(num): + if isinstance(num, list): + if len(num) < 1: + return 0 + num = num[0] + if num.isdigit(): + return int(num) + return 0 + + +# do search-request +def request(query, params): + query = urlencode({'term': query}) + params['url'] = search_url.format(query=query, offset=params['pageno']) + return params + + +# get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + for result in dom.xpath(xpath_results): + # category in which our torrent belongs + category = result.xpath(xpath_category)[0].attrib.get('title') + + # torrent title + page_a = result.xpath(xpath_title)[0] + title = escape(extract_text(page_a)) + + # link to the page + href = page_a.attrib.get('href') + + # link to the torrent file + torrent_link = result.xpath(xpath_torrent_file)[0].attrib.get('href') + + # torrent size + try: + file_size, suffix = result.xpath(xpath_filesize)[0].split(' ') + + # convert torrent size to bytes. + # if there is no correct index in this dictionary, + # the try block fails as it should + multiplier = { + 'KIB': 1024, + 'MIB': 1024 ** 2, + 'GIB': 1024 ** 3, + 'TIB': 1024 ** 4 + }[suffix.upper()] + + file_size = int(float(file_size) * multiplier) + except Exception as e: + file_size = None + + # seed count + seed = int_or_zero(result.xpath(xpath_seeds)) + + # leech count + leech = int_or_zero(result.xpath(xpath_leeches)) + + # torrent downloads count + downloads = int_or_zero(result.xpath(xpath_downloads)) + + # content string contains all information not included into template + content = 'Category: "{category}". Downloaded {downloads} times.' 
+ content = content.format(category=category, downloads=downloads) + content = escape(content) + + results.append({'url': href, + 'title': title, + 'content': content, + 'seed': seed, + 'leech': leech, + 'filesize': file_size, + 'torrentfile': torrent_link, + 'template': 'torrent.html'}) + + return results diff --git a/searx/settings.yml b/searx/settings.yml index 5ef74d955..bd075b86c 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -175,6 +175,10 @@ engines: engine : mixcloud shortcut : mc + - name : nyaa + engine : nyaa + shortcut : nt + - name : openstreetmap engine : openstreetmap shortcut : osm diff --git a/tests/unit/engines/test_nyaa.py b/tests/unit/engines/test_nyaa.py new file mode 100644 index 000000000..db412e1cc --- /dev/null +++ b/tests/unit/engines/test_nyaa.py @@ -0,0 +1,66 @@ +from collections import defaultdict +import mock +from searx.engines import nyaa +from searx.testing import SearxTestCase + + +class TestNyaaEngine(SearxTestCase): + + def test_request(self): + query = 'test_query' + dic = defaultdict(dict) + dic['pageno'] = 1 + params = nyaa.request(query, dic) + self.assertTrue('url' in params) + self.assertTrue(query in params['url']) + self.assertTrue('nyaa.se' in params['url']) + + def test_response(self): + resp = mock.Mock(text='') + self.assertEqual(nyaa.response(resp), []) + + html = """ + + + + + + + + + + + + + +
+        <table class="tlist">
+          <tbody>
+            <tr class="trusted tlistrow">
+              <td class="tlisticon">
+                <a href="//www.nyaa.se" title="English-translated Anime">English-translated Anime</a>
+              </td>
+              <td class="tlistname">
+                <a href="//www.nyaa.se/?page3">Sample torrent title</a>
+              </td>
+              <td class="tlistdownload">
+                <a href="//www.nyaa.se/?page_dl" title="Download">DL</a>
+              </td>
+              <td class="tlistsize">10 MiB</td>
+              <td class="tlistsn">1</td>
+              <td class="tlistln">3</td>
+              <td class="tlistdn">666</td>
+              <td class="tlistmn">0</td>
+            </tr>
+          </tbody>
+        </table>
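+        <!-- td classes read by the engine: tlistsize = size, tlistsn = seeders,
+             tlistln = leechers, tlistdn = downloads -->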
+ """ + + resp = mock.Mock(text=html) + results = nyaa.response(resp) + + self.assertEqual(type(results), list) + self.assertEqual(len(results), 1) + + r = results[0] + self.assertTrue(r['url'].find('www.nyaa.se/?page3') >= 0) + self.assertTrue(r['torrentfile'].find('www.nyaa.se/?page_dl') >= 0) + self.assertTrue(r['content'].find('English-translated Anime') >= 0) + self.assertTrue(r['content'].find('Downloaded 666 times.') >= 0) + + self.assertEqual(r['title'], 'Sample torrent title') + self.assertEqual(r['seed'], 1) + self.assertEqual(r['leech'], 3) + self.assertEqual(r['filesize'], 10 * 1024 * 1024) From d026a97e42dce14bb187ea79682b9a303cd91e9e Mon Sep 17 00:00:00 2001 From: Kirill Isakov Date: Fri, 25 Mar 2016 19:30:32 +0600 Subject: [PATCH 02/14] Add Reddit search engine --- searx/engines/reddit.py | 74 +++++++++++++++++++++++++++++++ searx/settings.yml | 7 +++ tests/unit/engines/test_reddit.py | 67 ++++++++++++++++++++++++++++ 3 files changed, 148 insertions(+) create mode 100644 searx/engines/reddit.py create mode 100644 tests/unit/engines/test_reddit.py diff --git a/searx/engines/reddit.py b/searx/engines/reddit.py new file mode 100644 index 000000000..d2b185b40 --- /dev/null +++ b/searx/engines/reddit.py @@ -0,0 +1,74 @@ +""" + Reddit + + @website https://www.reddit.com/ + @provide-api yes (https://www.reddit.com/dev/api) + + @using-api yes + @results JSON + @stable yes + @parse url, title, content, thumbnail, publishedDate +""" + +import json +from cgi import escape +from urllib import urlencode +from urlparse import urlparse +from datetime import datetime + +# engine dependent config +categories = ['general', 'images', 'news', 'social media'] +page_size = 25 + +# search-url +search_url = 'https://www.reddit.com/search.json?{query}' + + +# do search-request +def request(query, params): + query = urlencode({'q': query, + 'limit': page_size}) + params['url'] = search_url.format(query=query) + + return params + + +# get response from search-request +def response(resp): + img_results = [] + text_results = [] + + search_results = json.loads(resp.text) + + # return empty array if there are no results + if 'data' not in search_results: + return [] + + posts = search_results.get('data', {}).get('children', []) + + # process results + for post in posts: + data = post['data'] + + # extract post information + params = { + 'url': data['url'], + 'title': data['title'] + } + + # if thumbnail field contains a valid URL, we need to change template + thumbnail = data['thumbnail'] + url_info = urlparse(thumbnail) + # netloc & path + if url_info[1] != '' and url_info[2] != '': + params['thumbnail_src'] = thumbnail + params['template'] = 'images.html' + img_results.append(params) + else: + created = datetime.fromtimestamp(data['created_utc']) + params['content'] = escape(data['selftext']) + params['publishedDate'] = created + text_results.append(params) + + # show images first and text results second + return img_results + text_results diff --git a/searx/settings.yml b/searx/settings.yml index bd075b86c..4942ae56f 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -213,6 +213,13 @@ engines: shortcut : qws categories : social media + - name : reddit + engine : reddit + shortcut : re + page_size : 25 + timeout : 10.0 + disabled : True + - name : kickass engine : kickass shortcut : ka diff --git a/tests/unit/engines/test_reddit.py b/tests/unit/engines/test_reddit.py new file mode 100644 index 000000000..51589e300 --- /dev/null +++ b/tests/unit/engines/test_reddit.py @@ -0,0 +1,67 @@ +from 
collections import defaultdict +import mock +from searx.engines import reddit +from searx.testing import SearxTestCase +from datetime import datetime + + +class TestRedditEngine(SearxTestCase): + + def test_request(self): + query = 'test_query' + dic = defaultdict(dict) + params = reddit.request(query, dic) + self.assertTrue('url' in params) + self.assertTrue(query in params['url']) + self.assertTrue('reddit.com' in params['url']) + + def test_response(self): + resp = mock.Mock(text='{}') + self.assertEqual(reddit.response(resp), []) + + json = """ + { + "kind": "Listing", + "data": { + "children": [{ + "data": { + "url": "http://google.com/", + "title": "Title number one", + "selftext": "Sample", + "created_utc": 1401219957.0, + "thumbnail": "http://image.com/picture.jpg" + } + }, { + "data": { + "url": "https://reddit.com/", + "title": "Title number two", + "selftext": "Dominus vobiscum", + "created_utc": 1438792533.0, + "thumbnail": "self" + } + }] + } + } + """ + + resp = mock.Mock(text=json) + results = reddit.response(resp) + + self.assertEqual(len(results), 2) + self.assertEqual(type(results), list) + + # testing first result (picture) + r = results[0] + self.assertEqual(r['url'], 'http://google.com/') + self.assertEqual(r['title'], 'Title number one') + self.assertEqual(r['template'], 'images.html') + self.assertEqual(r['thumbnail_src'], 'http://image.com/picture.jpg') + + # testing second result (self-post) + r = results[1] + self.assertEqual(r['url'], 'https://reddit.com/') + self.assertEqual(r['title'], 'Title number two') + self.assertEqual(r['content'], 'Dominus vobiscum') + created = datetime.fromtimestamp(1438792533.0) + self.assertEqual(r['publishedDate'], created) + self.assertTrue('thumbnail_src' not in r) From 7fbc12ee4e6aea8a8ad0098deb03054976056371 Mon Sep 17 00:00:00 2001 From: Kirill Isakov Date: Sat, 26 Mar 2016 05:28:58 +0600 Subject: [PATCH 03/14] Add Torrentz.eu search engine --- searx/engines/nyaa.py | 26 ++++---- searx/engines/torrentz.py | 93 +++++++++++++++++++++++++++++ searx/settings.yml | 5 ++ tests/unit/engines/test_torrentz.py | 91 ++++++++++++++++++++++++++++ 4 files changed, 203 insertions(+), 12 deletions(-) create mode 100644 searx/engines/torrentz.py create mode 100644 tests/unit/engines/test_torrentz.py diff --git a/searx/engines/nyaa.py b/searx/engines/nyaa.py index 81caed76f..b0f884b77 100644 --- a/searx/engines/nyaa.py +++ b/searx/engines/nyaa.py @@ -43,6 +43,19 @@ def int_or_zero(num): return int(num) return 0 +# get multiplier to convert torrent size to bytes +def get_filesize_mul(suffix): + return { + 'KB': 1024, + 'MB': 1024 ** 2, + 'GB': 1024 ** 3, + 'TB': 1024 ** 4, + + 'KIB': 1024, + 'MIB': 1024 ** 2, + 'GIB': 1024 ** 3, + 'TIB': 1024 ** 4 + }[str(suffix).upper()] # do search-request def request(query, params): @@ -74,18 +87,7 @@ def response(resp): # torrent size try: file_size, suffix = result.xpath(xpath_filesize)[0].split(' ') - - # convert torrent size to bytes. 
- # if there is no correct index in this dictionary, - # the try block fails as it should - multiplier = { - 'KIB': 1024, - 'MIB': 1024 ** 2, - 'GIB': 1024 ** 3, - 'TIB': 1024 ** 4 - }[suffix.upper()] - - file_size = int(float(file_size) * multiplier) + file_size = int(float(file_size) * get_filesize_mul(suffix)) except Exception as e: file_size = None diff --git a/searx/engines/torrentz.py b/searx/engines/torrentz.py new file mode 100644 index 000000000..92fbe7013 --- /dev/null +++ b/searx/engines/torrentz.py @@ -0,0 +1,93 @@ +""" + Torrentz.eu (BitTorrent meta-search engine) + + @website https://torrentz.eu/ + @provide-api no + + @using-api no + @results HTML + @stable no (HTML can change, although unlikely, + see https://torrentz.eu/torrentz.btsearch) + @parse url, title, publishedDate, seed, leech, filesize, magnetlink +""" + +import re +from cgi import escape +from urllib import urlencode +from lxml import html +from searx.engines.xpath import extract_text +from datetime import datetime +from searx.engines.nyaa import int_or_zero, get_filesize_mul + +# engine dependent config +categories = ['files', 'videos', 'music'] +paging = True + +# search-url +# https://torrentz.eu/search?f=EXAMPLE&p=6 +base_url = 'https://torrentz.eu/' +search_url = base_url + 'search?{query}' + + +# do search-request +def request(query, params): + page = params['pageno'] - 1 + query = urlencode({'q': query, 'p': page}) + params['url'] = search_url.format(query=query) + return params + + +# get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + for result in dom.xpath('//div[@class="results"]/dl'): + name_cell = result.xpath('./dt')[0] + title = extract_text(name_cell) + + # skip rows that do not contain a link to a torrent + links = name_cell.xpath('./a') + if len(links) != 1: + continue + + # extract url and remove a slash in the beginning + link = links[0].attrib.get('href').lstrip('/') + + seed = result.xpath('./dd/span[@class="u"]/text()')[0].replace(',', '') + leech = result.xpath('./dd/span[@class="d"]/text()')[0].replace(',', '') + + params = { + 'url': base_url + link, + 'title': title, + 'seed': int_or_zero(seed), + 'leech': int_or_zero(leech), + 'template': 'torrent.html' + } + + # let's try to calculate the torrent size + try: + size_str = result.xpath('./dd/span[@class="s"]/text()')[0] + size, suffix = size_str.split() + params['filesize'] = int(size) * get_filesize_mul(suffix) + except Exception as e: + pass + + # does our link contain a valid SHA1 sum? 
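+        # (on Torrentz the path of a result page is the torrent's info hash
+        #  itself, so it can be reused as the btih value of the magnet link)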
+ if re.compile('[0-9a-fA-F]{40}').match(link): + # add a magnet link to the result + params['magnetlink'] = 'magnet:?xt=urn:btih:' + link + + # extract and convert creation date + try: + date_str = result.xpath('./dd/span[@class="a"]/span')[0].attrib.get('title') + # Fri, 25 Mar 2016 16:29:01 + date = datetime.strptime(date_str, '%a, %d %b %Y %H:%M:%S') + params['publishedDate'] = date + except Exception as e: + pass + + results.append(params) + + return results diff --git a/searx/settings.yml b/searx/settings.yml index 4942ae56f..2dce06fd0 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -271,6 +271,11 @@ engines: shortcut : sw disabled : True + - name : torrentz + engine : torrentz + timeout : 5.0 + shortcut : to + - name : twitter engine : twitter shortcut : tw diff --git a/tests/unit/engines/test_torrentz.py b/tests/unit/engines/test_torrentz.py new file mode 100644 index 000000000..2f836f73e --- /dev/null +++ b/tests/unit/engines/test_torrentz.py @@ -0,0 +1,91 @@ +import mock +from collections import defaultdict +from searx.engines import torrentz +from searx.testing import SearxTestCase +from datetime import datetime + + +class TestTorrentzEngine(SearxTestCase): + + def test_request(self): + query = 'test_query' + dic = defaultdict(dict) + dic['pageno'] = 1 + params = torrentz.request(query, dic) + self.assertTrue('url' in params) + self.assertTrue(query in params['url']) + self.assertTrue('torrentz.eu' in params['url']) + + def test_response(self): + resp = mock.Mock(text='') + self.assertEqual(torrentz.response(resp), []) + + html = """ +
+        <div class="results">
+          <dl>
+            <dt>
+              <a href="/4362e08b1d80e1820fb2550b752f9f3126fe76d6">
+                Completely valid info
+              </a>
+              books ebooks
+            </dt>
+            <dd>
+              <span class="v">1</span>
+              <span class="a">
+                <span title="Sun, 22 Nov 2015 03:01:42">4 months</span>
+              </span>
+              <span class="s">30 MB</span>
+              <span class="u">14</span>
+              <span class="d">1</span>
+            </dd>
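+            <!-- span classes read by the engine: a = age (the nested span's
+                 title attribute holds the full date), s = size, u = seeders,
+                 d = leechers -->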
+          </dl>
+
+          <dl>
+            <dt>
+              <a href="/poaskdpokaspod">
+                Invalid hash and date and filesize
+              </a>
+              books ebooks
+            </dt>
+            <dd>
+              <span class="v">1</span>
+              <span class="a">
+                <span title="invalid date">4 months</span>
+              </span>
+              <span class="s">30MB</span>
+              <span class="u">5,555</span>
+              <span class="d">1,234,567</span>
+            </dd>
+          </dl>
+        </div>
+ """ + + resp = mock.Mock(text=html) + results = torrentz.response(resp) + + self.assertEqual(type(results), list) + self.assertEqual(len(results), 2) + + # testing against the first result + r = results[0] + self.assertEqual(r['url'], 'https://torrentz.eu/4362e08b1d80e1820fb2550b752f9f3126fe76d6') + self.assertEqual(r['title'], 'Completely valid info books ebooks') + # 22 Nov 2015 03:01:42 + self.assertEqual(r['publishedDate'], datetime(2015, 11, 22, 3, 1, 42)) + self.assertEqual(r['seed'], 14) + self.assertEqual(r['leech'], 1) + self.assertEqual(r['filesize'], 30 * 1024 * 1024) + self.assertEqual(r['magnetlink'], 'magnet:?xt=urn:btih:4362e08b1d80e1820fb2550b752f9f3126fe76d6') + + # testing against the second result + r = results[1] + self.assertEqual(r['url'], 'https://torrentz.eu/poaskdpokaspod') + self.assertEqual(r['title'], 'Invalid hash and date and filesize books ebooks') + self.assertEqual(r['seed'], 5555) + self.assertEqual(r['leech'], 1234567) + + # in the second result we have invalid hash, creation date & torrent size, + # so these tests should fail + self.assertFalse('magnetlink' in r) + self.assertFalse('filesize' in r) + self.assertFalse('publishedDate' in r) From 547b8a87653d87b8be85710275a66be1bec1e39c Mon Sep 17 00:00:00 2001 From: Kirill Isakov Date: Sun, 27 Mar 2016 00:49:57 +0600 Subject: [PATCH 04/14] Add Tokyo Toshokan search engine --- searx/engines/tokyotoshokan.py | 102 +++++++++++++++++++++ searx/settings.yml | 6 ++ tests/unit/engines/test_tokyotoshokan.py | 110 +++++++++++++++++++++++ 3 files changed, 218 insertions(+) create mode 100644 searx/engines/tokyotoshokan.py create mode 100644 tests/unit/engines/test_tokyotoshokan.py diff --git a/searx/engines/tokyotoshokan.py b/searx/engines/tokyotoshokan.py new file mode 100644 index 000000000..17e8e2191 --- /dev/null +++ b/searx/engines/tokyotoshokan.py @@ -0,0 +1,102 @@ +""" + Tokyo Toshokan (A BitTorrent Library for Japanese Media) + + @website https://www.tokyotosho.info/ + @provide-api no + @using-api no + @results HTML + @stable no (HTML can change) + @parse url, title, publishedDate, seed, leech, + filesize, magnetlink, content +""" + +import re +from cgi import escape +from urllib import urlencode +from lxml import html +from searx.engines.xpath import extract_text +from datetime import datetime +from searx.engines.nyaa import int_or_zero, get_filesize_mul + +# engine dependent config +categories = ['files', 'videos', 'music'] +paging = True + +# search-url +base_url = 'https://www.tokyotosho.info/' +search_url = base_url + 'search.php?{query}' + + +# do search-request +def request(query, params): + query = urlencode({'page': params['pageno'], + 'terms': query}) + params['url'] = search_url.format(query=query) + return params + + +# get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + rows = dom.xpath('//table[@class="listing"]//tr[contains(@class, "category_0")]') + + # check if there are no results or page layout was changed so we cannot parse it + # currently there are two rows for each result, so total count must be even + if len(rows) == 0 or len(rows) % 2 != 0: + return [] + + # regular expression for parsing torrent size strings + size_re = re.compile('Size:\s*([\d.]+)(TB|GB|MB|B)', re.IGNORECASE) + + # processing the results, two rows at a time + for i in xrange(0, len(rows), 2): + # parse the first row + name_row = rows[i] + + links = name_row.xpath('./td[@class="desc-top"]/a') + params = { + 'template': 'torrent.html', + 'url': 
links[-1].attrib.get('href'), + 'title': extract_text(links[-1]) + } + # I have not yet seen any torrents without magnet links, but + # it's better to be prepared to stumble upon one some day + if len(links) == 2: + magnet = links[0].attrib.get('href') + if magnet.startswith('magnet'): + # okay, we have a valid magnet link, let's add it to the result + params['magnetlink'] = magnet + + # no more info in the first row, start parsing the second one + info_row = rows[i + 1] + desc = extract_text(info_row.xpath('./td[@class="desc-bot"]')[0]) + for item in desc.split('|'): + item = item.strip() + if item.startswith('Size:'): + try: + # ('1.228', 'GB') + groups = size_re.match(item).groups() + multiplier = get_filesize_mul(groups[1]) + params['filesize'] = int(multiplier * float(groups[0])) + except Exception as e: + pass + elif item.startswith('Date:'): + try: + # Date: 2016-02-21 21:44 UTC + date = datetime.strptime(item, 'Date: %Y-%m-%d %H:%M UTC') + params['publishedDate'] = date + except Exception as e: + pass + elif item.startswith('Comment:'): + params['content'] = item + stats = info_row.xpath('./td[@class="stats"]/span') + # has the layout not changed yet? + if len(stats) == 3: + params['seed'] = int_or_zero(extract_text(stats[0])) + params['leech'] = int_or_zero(extract_text(stats[1])) + + results.append(params) + + return results diff --git a/searx/settings.yml b/searx/settings.yml index 2dce06fd0..7ecb477b7 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -271,6 +271,12 @@ engines: shortcut : sw disabled : True + - name : tokyotoshokan + engine : tokyotoshokan + shortcut : tt + timeout : 6.0 + disabled : True + - name : torrentz engine : torrentz timeout : 5.0 diff --git a/tests/unit/engines/test_tokyotoshokan.py b/tests/unit/engines/test_tokyotoshokan.py new file mode 100644 index 000000000..efe7dbfc2 --- /dev/null +++ b/tests/unit/engines/test_tokyotoshokan.py @@ -0,0 +1,110 @@ +import mock +from collections import defaultdict +from searx.engines import tokyotoshokan +from searx.testing import SearxTestCase +from datetime import datetime + + +class TestTokyotoshokanEngine(SearxTestCase): + + def test_request(self): + query = 'test_query' + dic = defaultdict(dict) + dic['pageno'] = 1 + params = tokyotoshokan.request(query, dic) + self.assertTrue('url' in params) + self.assertTrue(query in params['url']) + self.assertTrue('tokyotosho.info' in params['url']) + + def test_response(self): + resp = mock.Mock(text='') + self.assertEqual(tokyotoshokan.response(resp), []) + + html = """ + + + + + + + + + + + + + + + + + + + + + + +
+        <table class="listing">
+          <tbody>
+            <tr class="shade category_0">
+              <td class="desc-top">
+                <a href="magnet:?xt=urn:btih:4c19eb46b5113685fbd2288ed2531b0b"></a>
+                <a rel="nofollow" href="http://www.nyaa.se/f">Koyomimonogatari</a>
+              </td>
+              <td class="web"><a href="details.php?id=975700">Details</a></td>
+            </tr>
+            <tr class="shade category_0">
+              <td class="desc-bot">
+                Authorized: Yes
+                Submitter: Ohys |
+                Size: 10.5MB |
+                Date: 2016-03-26 16:41 UTC |
+                Comment: sample comment
+              </td>
+              <td class="stats">
+                S: <span>53</span>
+                L: <span>18</span>
+                C: <span>0</span>
+                ID: 975700
+              </td>
+            </tr>
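+            <!-- second sample below: no magnet link, a single stats span, and
+                 a size and date that are deliberately unparseable -->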
+            <tr class="category_0">
+              <td class="desc-top">
+                <a rel="nofollow" href="http://google.com/q">Owarimonogatari</a>
+              </td>
+              <td class="web"><a href="#">Details</a></td>
+            </tr>
+            <tr class="category_0">
+              <td class="desc-bot">
+                Submitter: Ohys |
+                Size: 932.84EB |
+                Date: QWERTY-03-26 16:41 UTC
+              </td>
+              <td class="stats">
+                S: <span>0</span>
+              </td>
+            </tr>
+          </tbody>
+        </table>
+ """ + + resp = mock.Mock(text=html) + results = tokyotoshokan.response(resp) + + self.assertEqual(type(results), list) + self.assertEqual(len(results), 2) + + # testing the first result, which has correct format + # and should have all information fields filled + r = results[0] + self.assertEqual(r['url'], 'http://www.nyaa.se/f') + self.assertEqual(r['title'], 'Koyomimonogatari') + self.assertEqual(r['magnetlink'], 'magnet:?xt=urn:btih:4c19eb46b5113685fbd2288ed2531b0b') + self.assertEqual(r['filesize'], int(1024 * 1024 * 10.5)) + self.assertEqual(r['publishedDate'], datetime(2016, 03, 26, 16, 41)) + self.assertEqual(r['content'], 'Comment: sample comment') + self.assertEqual(r['seed'], 53) + self.assertEqual(r['leech'], 18) + + # testing the second result, which does not include magnet link, + # seed & leech info, and has incorrect size & creation date + r = results[1] + self.assertEqual(r['url'], 'http://google.com/q') + self.assertEqual(r['title'], 'Owarimonogatari') + + self.assertFalse('magnetlink' in r) + self.assertFalse('filesize' in r) + self.assertFalse('content' in r) + self.assertFalse('publishedDate' in r) + self.assertFalse('seed' in r) + self.assertFalse('leech' in r) From c1d456b1366e339b09bd3744b45bf80da1e7d808 Mon Sep 17 00:00:00 2001 From: Kirill Isakov Date: Sun, 27 Mar 2016 03:50:44 +0600 Subject: [PATCH 05/14] Add F-Droid search engine --- searx/engines/fdroid.py | 53 +++++++++++++++++++++++++++++++ searx/settings.yml | 5 +++ tests/unit/engines/test_fdroid.py | 49 ++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+) create mode 100644 searx/engines/fdroid.py create mode 100644 tests/unit/engines/test_fdroid.py diff --git a/searx/engines/fdroid.py b/searx/engines/fdroid.py new file mode 100644 index 000000000..0b16773e3 --- /dev/null +++ b/searx/engines/fdroid.py @@ -0,0 +1,53 @@ +""" + F-Droid (a repository of FOSS applications for Android) + + @website https://f-droid.org/ + @provide-api no + @using-api no + @results HTML + @stable no (HTML can change) + @parse url, title, content +""" + +from cgi import escape +from urllib import urlencode +from searx.engines.xpath import extract_text +from lxml import html + +# engine dependent config +categories = ['files'] +paging = True + +# search-url +base_url = 'https://f-droid.org/' +search_url = base_url + 'repository/browse/?{query}' + + +# do search-request +def request(query, params): + query = urlencode({'fdfilter': query, + 'fdpage': params['pageno']}) + params['url'] = search_url.format(query=query) + return params + + +# get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + for app in dom.xpath('//div[@id="appheader"]'): + url = app.xpath('./ancestor::a/@href')[0] + title = app.xpath('./p/span/text()')[0] + img_src = app.xpath('.//img/@src')[0] + + content = extract_text(app.xpath('./p')[0]) + content = escape(content.replace(title, '', 1).strip()) + + results.append({'url': url, + 'title': title, + 'content': content, + 'img_src': img_src}) + + return results diff --git a/searx/settings.yml b/searx/settings.yml index 7ecb477b7..51e313205 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -105,6 +105,11 @@ engines: shortcut : 1x disabled : True + - name : fdroid + engine : fdroid + shortcut : fd + disabled : True + - name : flickr categories : images shortcut : fl diff --git a/tests/unit/engines/test_fdroid.py b/tests/unit/engines/test_fdroid.py new file mode 100644 index 000000000..d75f4f0b4 --- /dev/null +++ 
b/tests/unit/engines/test_fdroid.py @@ -0,0 +1,49 @@ +import mock +from collections import defaultdict +from searx.engines import fdroid +from searx.testing import SearxTestCase + + +class TestFdroidEngine(SearxTestCase): + + def test_request(self): + query = 'test_query' + dic = defaultdict(dict) + dic['pageno'] = 1 + params = fdroid.request(query, dic) + self.assertTrue('url' in params) + self.assertTrue(query in params['url']) + self.assertTrue('f-droid.org' in params['url']) + + def test_response(self): + resp = mock.Mock(text='') + self.assertEqual(fdroid.response(resp), []) + + html = """ + +
+        <a href="https://google.com/qwerty">
+          <div id="appheader">
+            <div style="float:left;padding-right:10px;">
+              <img src="http://example.com/image.png" width="48" height="48">
+            </div>
+            <div style="float:right;">
+              <p>Details...</p>
+            </div>
+            <p>
+              <span style="font-size:20px;">Sample title</span>
+              <br>
+              Sample content
+            </p>
+          </div>
+        </a>
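+        <!-- the engine takes the url from the ancestor a/@href, the title from
+             p/span, the thumbnail from img/@src, and the content from the
+             remaining text of the p element -->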
+ """ + + resp = mock.Mock(text=html) + results = fdroid.response(resp) + + self.assertEqual(type(results), list) + self.assertEqual(len(results), 1) + self.assertEqual(results[0]['url'], 'https://google.com/qwerty') + self.assertEqual(results[0]['title'], 'Sample title') + self.assertEqual(results[0]['content'], 'Sample content') + self.assertEqual(results[0]['img_src'], 'http://example.com/image.png') From 80813c3e056f0831c9e15cff2e94df9dede0a47a Mon Sep 17 00:00:00 2001 From: Kirill Isakov Date: Sun, 27 Mar 2016 04:29:23 +0600 Subject: [PATCH 06/14] Add Erowid search engine --- searx/settings.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/searx/settings.yml b/searx/settings.yml index 51e313205..c0324976c 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -82,6 +82,16 @@ engines: engine : digg shortcut : dg + - name : erowid + engine : xpath + search_url : https://www.erowid.org/search.php?q={query} + url_xpath : //dl[@class="results-list"]/dt[@class="result-title"]/a/@href + title_xpath : //dl[@class="results-list"]/dt[@class="result-title"]/a/text() + content_xpath : //dl[@class="results-list"]/dd[@class="result-details"] + categories : general + shortcut : ew + disabled : True + - name : wikidata engine : wikidata shortcut : wd From a8832574fa113e0673dcf9a51ece058b0a4cdcb5 Mon Sep 17 00:00:00 2001 From: Kirill Isakov Date: Sun, 27 Mar 2016 05:09:04 +0600 Subject: [PATCH 07/14] Shorten content field for very long Reddit search results --- searx/engines/reddit.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/searx/engines/reddit.py b/searx/engines/reddit.py index d2b185b40..9729898e5 100644 --- a/searx/engines/reddit.py +++ b/searx/engines/reddit.py @@ -66,7 +66,10 @@ def response(resp): img_results.append(params) else: created = datetime.fromtimestamp(data['created_utc']) - params['content'] = escape(data['selftext']) + content = escape(data['selftext']) + if len(content) > 500: + content = content[:500] + '...' 
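+            # the trailing ellipsis signals that the self-text was truncated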
+ params['content'] = content params['publishedDate'] = created text_results.append(params) From e202c6fbec7c9dff67acdfe387699010d61da32b Mon Sep 17 00:00:00 2001 From: Kirill Isakov Date: Sun, 27 Mar 2016 05:23:17 +0600 Subject: [PATCH 08/14] Fix PEP8 warnings for Nyaa.py --- searx/engines/nyaa.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/searx/engines/nyaa.py b/searx/engines/nyaa.py index b0f884b77..cda8231f7 100644 --- a/searx/engines/nyaa.py +++ b/searx/engines/nyaa.py @@ -43,6 +43,7 @@ def int_or_zero(num): return int(num) return 0 + # get multiplier to convert torrent size to bytes def get_filesize_mul(suffix): return { @@ -57,6 +58,7 @@ def get_filesize_mul(suffix): 'TIB': 1024 ** 4 }[str(suffix).upper()] + # do search-request def request(query, params): query = urlencode({'term': query}) From bacc9a3df156605c49fb93ede6e6a730110d15db Mon Sep 17 00:00:00 2001 From: Kirill Isakov Date: Mon, 28 Mar 2016 19:15:03 +0600 Subject: [PATCH 09/14] Add paging support to XPath & Erowid engines --- searx/engines/xpath.py | 16 +++++++++++++++- searx/settings.yml | 5 ++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py index f51634be0..e701c02bf 100644 --- a/searx/engines/xpath.py +++ b/searx/engines/xpath.py @@ -11,6 +11,14 @@ title_xpath = None suggestion_xpath = '' results_xpath = '' +# parameters for engines with paging support +# +# number of results on each page +# (only needed if the site requires not a page number, but an offset) +page_size = 1 +# number of the first page (usually 0 or 1) +first_page_num = 1 + ''' if xpath_results is list, extract the text from each result and concat the list @@ -76,8 +84,14 @@ def normalize_url(url): def request(query, params): query = urlencode({'q': query})[2:] - params['url'] = search_url.format(query=query) + + fp = {'query': query} + if paging and search_url.find('{pageno}') >= 0: + fp['pageno'] = (params['pageno'] + first_page_num - 1) * page_size + + params['url'] = search_url.format(**fp) params['query'] = query + return params diff --git a/searx/settings.yml b/searx/settings.yml index c0324976c..473b8612a 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -84,7 +84,10 @@ engines: - name : erowid engine : xpath - search_url : https://www.erowid.org/search.php?q={query} + paging : True + first_page_num : 0 + page_size : 30 + search_url : https://www.erowid.org/search.php?q={query}&s={pageno} url_xpath : //dl[@class="results-list"]/dt[@class="result-title"]/a/@href title_xpath : //dl[@class="results-list"]/dt[@class="result-title"]/a/text() content_xpath : //dl[@class="results-list"]/dd[@class="result-details"] From 5b3c9f06ebf16f8dc4387e5b5514256e26813d6b Mon Sep 17 00:00:00 2001 From: Kirill Isakov Date: Mon, 28 Mar 2016 20:08:34 +0600 Subject: [PATCH 10/14] Add BitBucket & GitLab search engines --- searx/settings.yml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/searx/settings.yml b/searx/settings.yml index 473b8612a..f76ca98b6 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -56,6 +56,18 @@ engines: engine : bing_news shortcut : bin + - name : bitbucket + engine : xpath + paging : True + search_url : https://bitbucket.org/repo/all/{pageno}?name={query} + url_xpath : //article[@class="repo-summary"]//a[@class="repo-link"]/@href + title_xpath : //article[@class="repo-summary"]//a[@class="repo-link"] + content_xpath : //article[@class="repo-summary"]/p + categories : it + timeout : 4.0 + disabled : True + shortcut : bb + - 
name : btdigg engine : btdigg shortcut : bt @@ -143,6 +155,18 @@ engines: shortcut : gb disabled: True + - name : gitlab + engine : xpath + paging : True + search_url : https://gitlab.com/search?page={pageno}&search={query} + url_xpath : //li[@class="project-row"]//a[@class="project"]/@href + title_xpath : //li[@class="project-row"]//span[contains(@class, "project-full-name")] + content_xpath : //li[@class="project-row"]//div[@class="description"]/p + categories : it + shortcut : gl + timeout : 5.0 + disabled : True + - name : github engine : github shortcut : gh From 0bfbdff234535f0a190f272d2f03d7a801b74e10 Mon Sep 17 00:00:00 2001 From: Kirill Isakov Date: Mon, 28 Mar 2016 22:33:56 +0600 Subject: [PATCH 11/14] Add Habrahabr & Geektimes search engines --- searx/settings.yml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/searx/settings.yml b/searx/settings.yml index f76ca98b6..580ce1ac6 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -213,6 +213,30 @@ engines: shortcut : gps disabled : True + - name : geektimes + engine : xpath + paging : True + search_url : https://geektimes.ru/search/page{pageno}/?q={query} + url_xpath : //div[@class="search_results"]//a[@class="post_title"]/@href + title_xpath : //div[@class="search_results"]//a[@class="post_title"] + content_xpath : //div[@class="search_results"]//div[contains(@class, "content")] + categories : it + timeout : 4.0 + disabled : True + shortcut : gt + + - name : habrahabr + engine : xpath + paging : True + search_url : https://habrahabr.ru/search/page{pageno}/?q={query} + url_xpath : //div[@class="search_results"]//a[@class="post_title"]/@href + title_xpath : //div[@class="search_results"]//a[@class="post_title"] + content_xpath : //div[@class="search_results"]//div[contains(@class, "content")] + categories : it + timeout : 4.0 + disabled : True + shortcut : habr + - name : mixcloud engine : mixcloud shortcut : mc From d0001f10e6c604f3094d728bed703a60baa9ae17 Mon Sep 17 00:00:00 2001 From: Kirill Isakov Date: Wed, 30 Mar 2016 17:30:46 +0600 Subject: [PATCH 12/14] Add searx user agent to Reddit engine request to comply with API usage terms --- searx/engines/reddit.py | 4 ++++ tests/unit/engines/test_reddit.py | 2 ++ 2 files changed, 6 insertions(+) diff --git a/searx/engines/reddit.py b/searx/engines/reddit.py index 9729898e5..08c4fde4b 100644 --- a/searx/engines/reddit.py +++ b/searx/engines/reddit.py @@ -15,6 +15,7 @@ from cgi import escape from urllib import urlencode from urlparse import urlparse from datetime import datetime +from searx.utils import searx_useragent # engine dependent config categories = ['general', 'images', 'news', 'social media'] @@ -30,6 +31,9 @@ def request(query, params): 'limit': page_size}) params['url'] = search_url.format(query=query) + # using searx User-Agent + params['headers']['User-Agent'] = searx_useragent() + return params diff --git a/tests/unit/engines/test_reddit.py b/tests/unit/engines/test_reddit.py index 51589e300..9bbf6301c 100644 --- a/tests/unit/engines/test_reddit.py +++ b/tests/unit/engines/test_reddit.py @@ -3,6 +3,7 @@ import mock from searx.engines import reddit from searx.testing import SearxTestCase from datetime import datetime +from searx.utils import searx_useragent class TestRedditEngine(SearxTestCase): @@ -14,6 +15,7 @@ class TestRedditEngine(SearxTestCase): self.assertTrue('url' in params) self.assertTrue(query in params['url']) self.assertTrue('reddit.com' in params['url']) + self.assertEqual(params['headers']['User-Agent'], 
searx_useragent()) def test_response(self): resp = mock.Mock(text='{}') From d54e82dfb93bf6e0beca31601382c0a23818ec82 Mon Sep 17 00:00:00 2001 From: Kirill Isakov Date: Wed, 13 Apr 2016 22:06:00 +0600 Subject: [PATCH 13/14] Revert to using random UA in Reddit search engine --- searx/engines/reddit.py | 4 ---- tests/unit/engines/test_reddit.py | 2 -- 2 files changed, 6 deletions(-) diff --git a/searx/engines/reddit.py b/searx/engines/reddit.py index 08c4fde4b..9729898e5 100644 --- a/searx/engines/reddit.py +++ b/searx/engines/reddit.py @@ -15,7 +15,6 @@ from cgi import escape from urllib import urlencode from urlparse import urlparse from datetime import datetime -from searx.utils import searx_useragent # engine dependent config categories = ['general', 'images', 'news', 'social media'] @@ -31,9 +30,6 @@ def request(query, params): 'limit': page_size}) params['url'] = search_url.format(query=query) - # using searx User-Agent - params['headers']['User-Agent'] = searx_useragent() - return params diff --git a/tests/unit/engines/test_reddit.py b/tests/unit/engines/test_reddit.py index 9bbf6301c..51589e300 100644 --- a/tests/unit/engines/test_reddit.py +++ b/tests/unit/engines/test_reddit.py @@ -3,7 +3,6 @@ import mock from searx.engines import reddit from searx.testing import SearxTestCase from datetime import datetime -from searx.utils import searx_useragent class TestRedditEngine(SearxTestCase): @@ -15,7 +14,6 @@ class TestRedditEngine(SearxTestCase): self.assertTrue('url' in params) self.assertTrue(query in params['url']) self.assertTrue('reddit.com' in params['url']) - self.assertEqual(params['headers']['User-Agent'], searx_useragent()) def test_response(self): resp = mock.Mock(text='{}') From 90c51cb4494c90353cc97794eece486bd8bf92dd Mon Sep 17 00:00:00 2001 From: Kirill Isakov Date: Wed, 13 Apr 2016 23:04:53 +0600 Subject: [PATCH 14/14] Fix a few typos in Google search engine --- searx/engines/google.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/searx/engines/google.py b/searx/engines/google.py index dbca205a1..8b06e9de6 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -46,11 +46,11 @@ country_to_hostname = { 'NZ': 'www.google.co.nz', # New Zealand 'PH': 'www.google.com.ph', # Philippines 'SG': 'www.google.com.sg', # Singapore - # 'US': 'www.google.us', # United State, redirect to .com + # 'US': 'www.google.us', # United States, redirect to .com 'ZA': 'www.google.co.za', # South Africa 'AR': 'www.google.com.ar', # Argentina 'CL': 'www.google.cl', # Chile - 'ES': 'www.google.es', # Span + 'ES': 'www.google.es', # Spain 'MX': 'www.google.com.mx', # Mexico 'EE': 'www.google.ee', # Estonia 'FI': 'www.google.fi', # Finland @@ -61,7 +61,7 @@ country_to_hostname = { 'HU': 'www.google.hu', # Hungary 'IT': 'www.google.it', # Italy 'JP': 'www.google.co.jp', # Japan - 'KR': 'www.google.co.kr', # South Korean + 'KR': 'www.google.co.kr', # South Korea 'LT': 'www.google.lt', # Lithuania 'LV': 'www.google.lv', # Latvia 'NO': 'www.google.no', # Norway @@ -76,9 +76,9 @@ country_to_hostname = { 'SE': 'www.google.se', # Sweden 'TH': 'www.google.co.th', # Thailand 'TR': 'www.google.com.tr', # Turkey - 'UA': 'www.google.com.ua', # Ikraine - # 'CN': 'www.google.cn', # China, only from china ? - 'HK': 'www.google.com.hk', # Hong kong + 'UA': 'www.google.com.ua', # Ukraine + # 'CN': 'www.google.cn', # China, only from China ? + 'HK': 'www.google.com.hk', # Hong Kong 'TW': 'www.google.com.tw' # Taiwan }
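
A note on the XPath paging parameters introduced in PATCH 09: the value
substituted for {pageno} in an engine's search_url is computed as
(params['pageno'] + first_page_num - 1) * page_size. A standalone sketch of
that formula (the helper name here is made up; the arithmetic is the one from
searx/engines/xpath.py):

    def xpath_pageno(searx_pageno, first_page_num=1, page_size=1):
        # value substituted for {pageno} in the engine's search_url
        return (searx_pageno + first_page_num - 1) * page_size

    assert xpath_pageno(1, first_page_num=0, page_size=30) == 0    # erowid: &s=0
    assert xpath_pageno(2, first_page_num=0, page_size=30) == 30   # erowid: &s=30
    assert xpath_pageno(1) == 1    # gitlab, bitbucket, habrahabr: page 1
    assert xpath_pageno(2) == 2    # gitlab, bitbucket, habrahabr: page 2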