From f97b4ff7b6607f4da66bc0f67b14b29317011cd2 Mon Sep 17 00:00:00 2001
From: Adam Tauber
Date: Mon, 15 Mar 2021 17:21:46 +0100
Subject: [PATCH] [fix] update youtube_noapi paging

The first result page is still scraped from the HTML results page. When
that page carries a continuation token, the token is returned to the
caller as engine_data under the key "next_page_token"; the following
page is then requested with a JSON POST to the youtubei/v1/search
endpoint and parsed by parse_next_page_response().
---
Reviewer note: compared to the originally recorded commit, the debug
leftovers (print of the continuation token, dump of the raw response to
/tmp/x) are dropped and the bare "except:" around the continuation-token
lookup is narrowed to lookup errors. The hunk headers and the diffstat
are adjusted accordingly; the post-image blob id on the index line is
kept from the original commit and therefore no longer matches exactly.

 searx/engines/youtube_noapi.py | 76 ++++++++++++++++++++++++++++++---
 1 file changed, 69 insertions(+), 7 deletions(-)

diff --git a/searx/engines/youtube_noapi.py b/searx/engines/youtube_noapi.py
index 90b93f0a4..5b9e3e3f4 100644
--- a/searx/engines/youtube_noapi.py
+++ b/searx/engines/youtube_noapi.py
@@ -4,7 +4,7 @@
 """
 
 from functools import reduce
-from json import loads
+from json import loads, dumps
 from urllib.parse import quote_plus
 
 # about
@@ -20,12 +20,15 @@ about = {
 # engine dependent config
 categories = ['videos', 'music']
 paging = True
+language_support = False
 time_range_support = True
 
 # search-url
 base_url = 'https://www.youtube.com/results'
 search_url = base_url + '?search_query={query}&page={page}'
 time_range_url = '&sp=EgII{time_range}%253D%253D'
+# the key seems to be constant
+next_page_url = 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'
 time_range_dict = {'day': 'Ag',
                    'week': 'Aw',
                    'month': 'BA',
@@ -40,21 +43,70 @@ base_youtube_url = 'https://www.youtube.com/watch?v='
 
 # do search-request
 def request(query, params):
-    params['url'] = search_url.format(query=quote_plus(query),
-                                      page=params['pageno'])
-    if params['time_range'] in time_range_dict:
-        params['url'] += time_range_url.format(time_range=time_range_dict[params['time_range']])
+    if not params['engine_data'].get('next_page_token'):
+        params['url'] = search_url.format(query=quote_plus(query), page=params['pageno'])
+        if params['time_range'] in time_range_dict:
+            params['url'] += time_range_url.format(time_range=time_range_dict[params['time_range']])
+    else:
+        params['url'] = next_page_url
+        params['method'] = 'POST'
+        params['data'] = dumps({
+            'context': {"client": {"clientName": "WEB", "clientVersion": "2.20210310.12.01"}},
+            'continuation': params['engine_data']['next_page_token'],
+        })
+        params['headers']['Content-Type'] = 'application/json'
 
     return params
 
 
 # get response from search-request
 def response(resp):
+    if resp.search_params.get('engine_data'):
+        return parse_next_page_response(resp.text)
+    return parse_first_page_response(resp.text)
+
+
+def parse_next_page_response(response_text):
     results = []
+    result_json = loads(response_text)
+    for section in (result_json['onResponseReceivedCommands'][0]
+                    .get('appendContinuationItemsAction')['continuationItems'][0]
+                    .get('itemSectionRenderer')['contents']):
+        if 'videoRenderer' not in section:
+            continue
+        section = section['videoRenderer']
+        content = "-"
+        if 'descriptionSnippet' in section:
+            content = ' '.join(x['text'] for x in section['descriptionSnippet']['runs'])
+        results.append({
+            'url': base_youtube_url + section['videoId'],
+            'title': ' '.join(x['text'] for x in section['title']['runs']),
+            'content': content,
+            'author': section['ownerText']['runs'][0]['text'],
+            'length': section['lengthText']['simpleText'],
+            'template': 'videos.html',
+            'embedded': embedded_url.format(videoid=section['videoId']),
+            'thumbnail': section['thumbnail']['thumbnails'][-1]['url'],
+        })
+    try:
+        token = result_json['onResponseReceivedCommands'][0]\
+            .get('appendContinuationItemsAction')['continuationItems'][1]\
+            .get('continuationItemRenderer')['continuationEndpoint']\
+            .get('continuationCommand')['token']
+        results.append({
+            "engine_data": token,
+            "key": "next_page_token",
+        })
+    except (KeyError, IndexError, TypeError, AttributeError):
+        pass
 
-    results_data = resp.text[resp.text.find('ytInitialData'):]
+    return results
+
+
+def parse_first_page_response(response_text):
+    results = []
+    results_data = response_text[response_text.find('ytInitialData'):]
     results_data = results_data[results_data.find('{'):results_data.find(';')]
-
     results_json = loads(results_data) if results_data else {}
     sections = results_json.get('contents', {})\
                            .get('twoColumnSearchResultsRenderer', {})\
@@ -63,6 +115,16 @@ def response(resp):
                            .get('contents', [])
 
     for section in sections:
+        if "continuationItemRenderer" in section:
+            next_page_token = section["continuationItemRenderer"]\
+                .get("continuationEndpoint", {})\
+                .get("continuationCommand", {})\
+                .get("token", "")
+            if next_page_token:
+                results.append({
+                    "engine_data": next_page_token,
+                    "key": "next_page_token",
+                })
         for video_container in section.get('itemSectionRenderer', {}).get('contents', []):
             video = video_container.get('videoRenderer', {})
             videoid = video.get('videoId')