Merge branch 'master' into nyaa

This commit is contained in:
misnyo 2017-09-04 17:48:25 +02:00 committed by GitHub
commit c3232b0e1a
5 changed files with 77 additions and 84 deletions

View file

@ -1,62 +0,0 @@
"""
General Files (Files)
@website http://www.general-files.org
@provide-api no (nothing found)
@using-api no (because nothing found)
@results HTML (using search portal)
@stable no (HTML can change)
@parse url, title, content
@todo detect torrents?
"""
from lxml import html
# engine dependent config
# NOTE(review): presumably read by the engine loader; nothing in this module
# uses these two names itself
categories = ['files']
paging = True
# search-url
# NOTE(review): this host differs from the @website given in the module
# docstring (general-files.org) - confirm which one is correct
# base_url is also prepended to the relative result links in response()
base_url = 'http://www.general-file.com'
# request() fills {letter} with the first character of the query,
# {query} with the query itself and {pageno} with params['pageno']
search_url = base_url + '/files-{letter}/{query}/{pageno}'
# specific xpath variables
# result_xpath selects one node per search result; the three expressions
# below are evaluated relative to each of those nodes in response()
result_xpath = '//table[@class="block-file"]'
title_xpath = './/h2/a//text()'
url_xpath = './/h2/a/@href'
content_xpath = './/p//text()'
# do search-request
def request(query, params):
    """Store the search URL for ``query`` in ``params`` and return ``params``.

    The URL is built from ``search_url`` using the first character of the
    query, the query itself and the page number found in
    ``params['pageno']``. The result is written to ``params['url']``.
    """
    first_letter = query[0]
    page_number = params['pageno']

    params['url'] = search_url.format(letter=first_letter,
                                      query=query,
                                      pageno=page_number)

    return params
# get response from search-request
def response(resp):
    """Parse the HTML of a search response into a list of result dicts.

    ``resp.text`` is parsed with lxml and every node matched by
    ``result_xpath`` is inspected. Each returned dict has the keys
    ``url`` (``base_url`` plus the relative link), ``title`` and
    ``content``.

    Result nodes without a title link, and nodes whose link does not start
    with ``/``, are skipped.
    """
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(result_xpath):
        links = result.xpath(url_xpath)

        # a result block without a title link would otherwise raise
        # IndexError and discard every result of the page
        if not links:
            continue

        url = links[0]

        # skip fast download links
        if not url.startswith('/'):
            continue

        # append result
        results.append({'url': base_url + url,
                        'title': ''.join(result.xpath(title_xpath)),
                        'content': ''.join(result.xpath(content_xpath))})

    # return results
    return results

View file

@ -10,6 +10,7 @@
@parse url, title, content @parse url, title, content
""" """
import random
from json import loads from json import loads
from time import time from time import time
from lxml.html import fromstring from lxml.html import fromstring
@ -32,7 +33,8 @@ search_string = 'search?{query}'\
'&qh=0'\ '&qh=0'\
'&qlang={lang}'\ '&qlang={lang}'\
'&ff={safesearch}'\ '&ff={safesearch}'\
'&rxikd={rxikd}' # random number - 9 digits '&rxieu={rxieu}'\
'&rand={rxikd}' # current unix timestamp
# specific xpath variables # specific xpath variables
results_xpath = '//response//result' results_xpath = '//response//result'
@ -59,10 +61,12 @@ def request(query, params):
else: else:
safesearch = 0 safesearch = 0
# rxieu is some kind of hash from the search query, but accepts random atm
search_path = search_string.format(query=urlencode({'q': query}), search_path = search_string.format(query=urlencode({'q': query}),
offset=offset, offset=offset,
number_of_results=number_of_results, number_of_results=number_of_results,
rxikd=str(time())[:9], rxikd=int(time() * 1000),
rxieu=random.randint(1000000000, 9999999999),
lang=language, lang=language,
safesearch=safesearch) safesearch=safesearch)

View file

@ -67,8 +67,8 @@ def response(resp):
for result in dom.xpath('//div[@class="g"]|//div[@class="g _cy"]'): for result in dom.xpath('//div[@class="g"]|//div[@class="g _cy"]'):
try: try:
r = { r = {
'url': result.xpath('.//div[@class="_cnc"]//a/@href')[0], 'url': result.xpath('.//a[@class="l _PMs"]')[0].attrib.get("href"),
'title': ''.join(result.xpath('.//div[@class="_cnc"]//h3//text()')), 'title': ''.join(result.xpath('.//a[@class="l _PMs"]//text()')),
'content': ''.join(result.xpath('.//div[@class="st"]//text()')), 'content': ''.join(result.xpath('.//div[@class="st"]//text()')),
} }
except: except:

View file

@ -242,15 +242,16 @@ engines:
disabled: True disabled: True
- name : gitlab - name : gitlab
engine : xpath engine : json_engine
paging : True paging : True
search_url : https://gitlab.com/search?page={pageno}&search={query} search_url : https://gitlab.com/api/v4/projects?search={query}&page={pageno}
url_xpath : //li[@class="project-row"]//a[@class="project"]/@href url_query : web_url
title_xpath : //li[@class="project-row"]//span[contains(@class, "project-full-name")] title_query : name_with_namespace
content_xpath : //li[@class="project-row"]//div[@class="description"]/p content_query : description
page_size : 20
categories : it categories : it
shortcut : gl shortcut : gl
timeout : 5.0 timeout : 10.0
disabled : True disabled : True
- name : github - name : github
@ -321,9 +322,9 @@ engines:
engine : xpath engine : xpath
paging : True paging : True
search_url : https://geektimes.ru/search/page{pageno}/?q={query} search_url : https://geektimes.ru/search/page{pageno}/?q={query}
url_xpath : //div[@class="search_results"]//a[@class="post__title_link"]/@href url_xpath : //article[contains(@class, "post")]//a[@class="post__title_link"]/@href
title_xpath : //div[@class="search_results"]//a[@class="post__title_link"] title_xpath : //article[contains(@class, "post")]//a[@class="post__title_link"]
content_xpath : //div[@class="search_results"]//div[contains(@class, "content")] content_xpath : //article[contains(@class, "post")]//div[contains(@class, "post__text")]
categories : it categories : it
timeout : 4.0 timeout : 4.0
disabled : True disabled : True
@ -333,9 +334,9 @@ engines:
engine : xpath engine : xpath
paging : True paging : True
search_url : https://habrahabr.ru/search/page{pageno}/?q={query} search_url : https://habrahabr.ru/search/page{pageno}/?q={query}
url_xpath : //div[@class="search_results"]//a[contains(@class, "post__title_link")]/@href url_xpath : //article[contains(@class, "post")]//a[@class="post__title_link"]/@href
title_xpath : //div[@class="search_results"]//a[contains(@class, "post__title_link")] title_xpath : //article[contains(@class, "post")]//a[@class="post__title_link"]
content_xpath : //div[@class="search_results"]//div[contains(@class, "content")] content_xpath : //article[contains(@class, "post")]//div[contains(@class, "post__text")]
categories : it categories : it
timeout : 4.0 timeout : 4.0
disabled : True disabled : True

File diff suppressed because one or more lines are too long