[mod] improve implementation of presearch engine
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
parent 23582aac5c
commit 44392bd436
2 changed files with 119 additions and 40 deletions
@@ -1,6 +1,20 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # lint: pylint
 """Presearch (general, images, videos, news)
+
+.. hint::
+
+   The results in the video category are most often links to pages that contain
+   a video; for instance, many links from presearch's video category link to
+   content on facebook (aka Meta) or Twitter (aka X).  Since these are not
+   real links to video streams, SearXNG can't use the video template for them,
+   and if SearXNG can't use this template, the user doesn't want to see these
+   hits in the videos category.
+
+   TL;DR; by default presearch's video category is placed into categories::
+
+       categories: [general, web]
+
 """
 
 from urllib.parse import urlencode
@@ -19,12 +33,18 @@ paging = True
 time_range_support = True
 categories = ["general", "web"]  # general, images, videos, news
-search_type = "search"  # must be any of "search", "images", "videos", "news"
+search_type = "search"
+"""must be any of ``search``, ``images``, ``videos``, ``news``"""
 
 base_url = "https://presearch.com"
 safesearch_map = {0: 'false', 1: 'true', 2: 'true'}
 
 
 def init(_):
     if search_type not in ['search', 'images', 'videos', 'news']:
         raise ValueError(f'presearch search_type: {search_type}')
 
 
 def _get_request_id(query, page, time_range, safesearch):
     args = {
         "q": query,
@@ -38,7 +58,7 @@ def _get_request_id(query, page, time_range, safesearch):
         'User-Agent': gen_useragent(),
         'Cookie': f"b=1;presearch_session=;use_safe_search={safesearch_map[safesearch]}",
     }
-    resp_text = get(url, headers=headers).text
+    resp_text = get(url, headers=headers).text  # type: ignore
 
     for line in resp_text.split("\n"):
         if "window.searchId = " in line:
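The hunk above shows where _get_request_id() scans the HTML response for the inline ``window.searchId = ...`` assignment; the line that actually extracts the id lies outside the displayed context. A minimal standalone sketch of the idea, with a hypothetical sample line and an extraction expression that is an assumption rather than the commit's code:

# hypothetical inline assignment as it appears in presearch's HTML response
sample_line = 'window.searchId = "0a1b2c3d";'

if "window.searchId = " in sample_line:
    # one possible way to pull the id out of the assignment
    search_id = sample_line.split("= ", 1)[1].strip().rstrip(";").strip('"')
    print(search_id)  # -> 0a1b2c3d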
@@ -47,11 +67,6 @@
     return None
 
 
-def _is_valid_img_src(url):
-    # in some cases, the image url is a base64 encoded string, which has to be skipped
-    return "https://" in url
-
-
 def request(query, params):
     request_id = _get_request_id(query, params["pageno"], params["time_range"], params["safesearch"])
 
@@ -61,42 +76,105 @@ def request(query, params):
     return params
 
 
-def response(resp):
+def _strip_leading_strings(text):
+    for x in ['wikipedia', 'google']:
+        if text.lower().endswith(x):
+            text = text[: -len(x)]
+    return text.strip()
+
+
+def parse_search_query(json_results):
     results = []
 
-    json = resp.json()
-
-    json_results = []
-    if search_type == "search":
-        json_results = json['results'].get('standardResults', [])
-    else:
-        json_results = json.get(search_type, [])
-
-    for json_result in json_results:
+    for item in json_results.get('specialSections', {}).get('topStoriesCompact', {}).get('data', []):
         result = {
-            'url': json_result['link'],
-            'title': json_result['title'],
-            'content': html_to_text(json_result.get('description', '')),
+            'url': item['link'],
+            'title': item['title'],
+            'img_src': item['image'],
+            'content': '',
+            'metadata': item.get('source'),
         }
-        if search_type == "images":
-            result['template'] = 'images.html'
-
-            if not _is_valid_img_src(json_result['image']):
-                continue
-
-            result['img_src'] = json_result['image']
-            if _is_valid_img_src(json_result['thumbnail']):
-                result['thumbnail'] = json_result['thumbnail']
-
-        elif search_type == "videos":
-            result['template'] = 'videos.html'
-
-            if _is_valid_img_src(json_result['image']):
-                result['thumbnail'] = json_result['image']
-
-            result['duration'] = json_result['duration']
-            result['length'] = json_result['duration']
-
         results.append(result)
 
+    for item in json_results.get('standardResults', []):
+        result = {
+            'url': item['link'],
+            'title': item['title'],
+            'content': html_to_text(item['description']),
+        }
+        results.append(result)
+
+    info = json_results.get('infoSection', {}).get('data')
+    if info:
+        attributes = []
+        for item in info.get('about', []):
+            label, value = html_to_text(item).split(':', 1)
+            value = _strip_leading_strings(value)
+            attributes.append({'label': label, 'value': value})
+        content = []
+        for item in [info['subtitle'], info['description']]:
+            item = _strip_leading_strings(html_to_text(item))
+            if item:
+                content.append(item)
+
+        results.append(
+            {
+                'infobox': info['title'],
+                'id': info['title'],
+                'img_src': info.get('image'),
+                'content': ' | '.join(content),
+                'attributes': attributes,
+            }
+        )
+    return results
+
+
+def response(resp):
+    results = []
+    json_resp = resp.json()
+
+    if search_type == 'search':
+        results = parse_search_query(json_resp['results'])
+
+    elif search_type == 'images':
+        for item in json_resp['images']:
+            results.append(
+                {
+                    'template': 'images.html',
+                    'title': item['title'],
+                    'url': item['link'],
+                    'img_src': item['image'],
+                    'thumbnail_src': item['thumbnail'],
+                }
+            )
+
+    elif search_type == 'videos':
+        # The results in the video category are most often links to pages that contain
+        # a video and not to a video stream --> SearXNG can't use the video template.
+
+        for item in json_resp['videos']:
+            metadata = [x for x in [item.get('description'), item.get('duration')] if x]
+            results.append(
+                {
+                    'title': item['title'],
+                    'url': item['link'],
+                    'content': '',
+                    'metadata': ' / '.join(metadata),
+                    'img_src': item.get('image'),
+                }
+            )
+
+    elif search_type == 'news':
+        for item in json_resp['news']:
+            metadata = [x for x in [item.get('source'), item.get('time')] if x]
+            results.append(
+                {
+                    'title': item['title'],
+                    'url': item['link'],
+                    'content': item['description'],
+                    'metadata': ' / '.join(metadata),
+                    'img_src': item.get('image'),
+                }
+            )
+
     return results
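For readers skimming the new parse_search_query() above: each ``about`` entry of presearch's infoSection is split once on ``:`` into a label/value pair, and _strip_leading_strings() then drops a trailing source name such as ``wikipedia`` or ``google`` from the value (despite its name, it strips from the end). A minimal standalone sketch with hypothetical data; in the engine the entry first goes through html_to_text():

def _strip_leading_strings(text):
    # same logic as the helper in the diff: remove a trailing source name, then strip whitespace
    for x in ['wikipedia', 'google']:
        if text.lower().endswith(x):
            text = text[: -len(x)]
    return text.strip()

item = "Founded: 2017 wikipedia"  # hypothetical 'about' entry, already converted to plain text
label, value = item.split(':', 1)
value = _strip_leading_strings(value)
print({'label': label, 'value': value})  # -> {'label': 'Founded', 'value': '2017'}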
@@ -1295,6 +1295,7 @@ engines:
     search_type: search
+    categories: [general, web]
     shortcut: ps
     disabled: true
 
   - name: presearch images
     engine: presearch
@@ -1307,7 +1308,7 @@ engines:
   - name: presearch videos
     engine: presearch
     search_type: videos
-    categories: [videos, web]
+    categories: [general, web]
     timeout: 4.0
     shortcut: psvid
     disabled: true