[feat] startpage: support for news and images

parent feb15e3878
commit 0f2fc5879d
searx/engines/startpage.py
@@ -74,24 +74,25 @@ Startpage's category (for Web-search, News, Videos, ..) is set by
 
 .. hint::
 
-   The default category is ``web`` .. and other categories than ``web`` are not
-   yet implemented.
+   Supported categories are ``web``, ``news`` and ``images``.
 
 """
 # pylint: disable=too-many-statements
+from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 from collections import OrderedDict
 import re
 from unicodedata import normalize, combining
 from time import time
 from datetime import datetime, timedelta
+from json import loads
 
 import dateutil.parser
 import lxml.html
 import babel.localedata
 
-from searx.utils import extract_text, eval_xpath, gen_useragent
+from searx.utils import extr, extract_text, eval_xpath, gen_useragent, html_to_text, humanize_bytes, remove_pua_from_str
 from searx.network import get  # see https://github.com/searxng/searxng/issues/762
 from searx.exceptions import SearxEngineCaptchaException
 from searx.locales import region_tag
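The widened searx.utils import is the heart of the change: extr plus json.loads replace DOM scraping, while html_to_text, humanize_bytes and the new remove_pua_from_str clean the fields of the embedded JSON payload. As a rough, hedged sketch of the byte formatting the image results below rely on (the real searx.utils.humanize_bytes may differ in rounding and unit labels):

def humanize_bytes_sketch(size: float, precision: int = 2) -> str:
    # Walk up the binary units until the value fits; the formatting details
    # here are an assumption, not the verbatim searx.utils implementation.
    for unit in ('B', 'KB', 'MB', 'GB'):
        if size < 1024:
            return f"{size:.{precision}f} {unit}"
        size /= 1024
    return f"{size:.{precision}f} TB"

print(humanize_bytes_sketch(2048))  # -> '2.00 KB'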
@@ -250,22 +251,13 @@ def request(query, params):
     Additionally the arguments form Startpage's search form needs to be set in
     HTML POST data / compare ``<input>`` elements: :py:obj:`search_form_xpath`.
     """
-    if startpage_categ == 'web':
-        return _request_cat_web(query, params)
-
-    logger.error("Startpages's category '%' is not yet implemented.", startpage_categ)
-    return params
-
-
-def _request_cat_web(query, params):
-
     engine_region = traits.get_region(params['searxng_locale'], 'en-US')
     engine_language = traits.get_language(params['searxng_locale'], 'en')
 
     # build arguments
     args = {
         'query': query,
-        'cat': 'web',
+        'cat': startpage_categ,
         't': 'device',
         'sc': get_sc_code(params['searxng_locale'], params),  # hint: this func needs HTTP headers,
         'with_date': time_range_dict.get(params['time_range'], ''),
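With cat taken from startpage_categ instead of the hard-wired 'web', the single request() implementation now serves all three verticals. A sketch of the resulting POST arguments for a news query; the sc value is a per-session code scraped by get_sc_code() and is shown as a placeholder, and the 'd' token is an assumed time_range_dict mapping:

args = {
    'query': 'alps glacier',
    'cat': 'news',           # startpage_categ; was always 'web' before this commit
    't': 'device',
    'sc': '<session-code>',  # placeholder for the value get_sc_code() scrapes
    'with_date': 'd',        # assumed mapping for time_range='day' in time_range_dict
}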
@@ -317,40 +309,7 @@ def _request_cat_web(query, params):
     return params
 
 
-# get response from search-request
-def response(resp):
-    dom = lxml.html.fromstring(resp.text)
-
-    if startpage_categ == 'web':
-        return _response_cat_web(dom)
-
-    logger.error("Startpages's category '%' is not yet implemented.", startpage_categ)
-    return []
-
-
-def _response_cat_web(dom):
-    results = []
-
-    # parse results
-    for result in eval_xpath(dom, '//div[@class="w-gl"]/div[contains(@class, "result")]'):
-        links = eval_xpath(result, './/a[contains(@class, "result-title result-link")]')
-        if not links:
-            continue
-        link = links[0]
-        url = link.attrib.get('href')
-
-        # block google-ad url's
-        if re.match(r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url):
-            continue
-
-        # block startpage search url's
-        if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
-            continue
-
-        title = extract_text(eval_xpath(link, 'h2'))
-        content = eval_xpath(result, './/p[contains(@class, "description")]')
-        content = extract_text(content, allow_none=True) or ''
-
-        published_date = None
+def _parse_published_date(content: str) -> tuple[str, datetime | None]:
+    published_date = None
 
    # check if search result starts with something like: "2 Sep 2014 ... "
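The date-sniffing body between this hunk and the next is unchanged and therefore not shown; it only moves from the result loop into _parse_published_date(). A self-contained sketch of that retained logic, reconstructed from the visible context lines (details such as the ValueError handling are assumptions), run on an invented snippet:

import re
from datetime import datetime, timedelta

import dateutil.parser

def parse_prefix(content: str):
    # A leading "2 Sep 2014 ... " or "5 days ago ... " is parsed into a
    # datetime and stripped from the snippet text.
    published = None
    if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
        pos = content.find('...') + 4
        try:
            published = dateutil.parser.parse(content[: pos - 5], dayfirst=True)
        except ValueError:
            pass
        content = content[pos:]
    elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
        pos = content.find('...') + 4
        published = datetime.now() - timedelta(days=int(re.match(r'\d+', content).group()))
        content = content[pos:]
    return content, published

print(parse_prefix('2 Sep 2014 ... Glacier melt in the Alps'))
# -> ('Glacier melt in the Alps', datetime.datetime(2014, 9, 2, 0, 0))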
@@ -376,14 +335,92 @@ def _response_cat_web(dom):
             # fix content string
             content = content[date_pos:]
 
-        if published_date:
-            # append result
-            results.append({'url': url, 'title': title, 'content': content, 'publishedDate': published_date})
-        else:
-            # append result
-            results.append({'url': url, 'title': title, 'content': content})
+    return content, published_date
+
+
+def _get_web_result(result):
+    content = html_to_text(result.get('description'))
+    content, publishedDate = _parse_published_date(content)
+
+    return {
+        'url': result['clickUrl'],
+        'title': html_to_text(result['title']),
+        'content': content,
+        'publishedDate': publishedDate,
+    }
+
+
+def _get_news_result(result):
+
+    title = remove_pua_from_str(html_to_text(result['title']))
+    content = remove_pua_from_str(html_to_text(result.get('description')))
+
+    publishedDate = None
+    if result.get('date'):
+        publishedDate = datetime.fromtimestamp(result['date'] / 1000)
+
+    thumbnailUrl = None
+    if result.get('thumbnailUrl'):
+        thumbnailUrl = base_url + result['thumbnailUrl']
+
+    return {
+        'url': result['clickUrl'],
+        'title': title,
+        'content': content,
+        'publishedDate': publishedDate,
+        'thumbnail': thumbnailUrl,
+    }
+
+
+def _get_image_result(result) -> dict[str, Any] | None:
+    url = result.get('altClickUrl')
+    if not url:
+        return None
+
+    thumbnailUrl = None
+    if result.get('thumbnailUrl'):
+        thumbnailUrl = base_url + result['thumbnailUrl']
+
+    resolution = None
+    if result.get('width') and result.get('height'):
+        resolution = f"{result['width']}x{result['height']}"
+
+    filesize = None
+    if result.get('filesize'):
+        size_str = ''.join(filter(str.isdigit, result['filesize']))
+        filesize = humanize_bytes(int(size_str))
+
+    return {
+        'template': 'images.html',
+        'url': url,
+        'title': html_to_text(result['title']),
+        'content': '',
+        'img_src': result.get('rawImageUrl'),
+        'thumbnail_src': thumbnailUrl,
+        'resolution': resolution,
+        'img_format': result.get('format'),
+        'filesize': filesize,
+    }
+
+
+def response(resp):
+    categ = startpage_categ.capitalize()
+    results_raw = '{' + extr(resp.text, f"React.createElement(UIStartpage.AppSerp{categ}, {{", '}})') + '}}'
+    results_json = loads(results_raw)
+    results_obj = results_json.get('render', {}).get('presenter', {}).get('regions', {})
+
+    results = []
+    for results_categ in results_obj.get('mainline', []):
+        for item in results_categ.get('results', []):
+            if results_categ['display_type'] == 'web-google':
+                results.append(_get_web_result(item))
+            elif results_categ['display_type'] == 'news-bing':
+                results.append(_get_news_result(item))
+            elif 'images' in results_categ['display_type']:
+                item = _get_image_result(item)
+                if item:
+                    results.append(item)
 
-    # return results
     return results
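The new response() stops walking the DOM entirely: Startpage server-renders its SERP state into a React.createElement(UIStartpage.AppSerp<Categ>, {...}}) call, and the code slices that props object out of the raw HTML, re-balancing the braces the markers consume. A minimal sketch of the same slicing on a synthetic payload; the marker strings follow the diff, the JSON content is invented:

from json import loads

# Synthetic stand-in for resp.text; the real page embeds the SERP state like this.
page = (
    'React.createElement(UIStartpage.AppSerpNews, '
    '{"render": {"presenter": {"regions": {"mainline": []}}}})'
)

begin = 'React.createElement(UIStartpage.AppSerpNews, {'
end = '}})'

start = page.find(begin) + len(begin)
stop = page.find(end, start)
# Re-balance the braces the markers consumed, as '{' + extr(...) + '}}' does.
raw = '{' + page[start:stop] + '}}'
data = loads(raw)
print(data['render']['presenter']['regions'])  # -> {'mainline': []}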
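Two field conversions above are easy to miss: the news 'date' arrives as epoch milliseconds (hence the division by 1000), and the image 'filesize' arrives as a display string that is reduced to its digits before being treated as a byte count. A hedged sketch with invented sample values:

from datetime import datetime

item = {'date': 1700000000000, 'filesize': '2048 B'}  # invented sample payload

published = datetime.fromtimestamp(item['date'] / 1000)       # ms -> seconds
size_digits = ''.join(filter(str.isdigit, item['filesize']))  # -> '2048'
print(published.isoformat(), size_digits)

Note that the digit filter drops units and punctuation alike, so a reported '1.5 MB' would be read as 15 bytes; the code effectively assumes Startpage reports plain byte counts.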
searx/settings.yml
@@ -1792,6 +1792,20 @@ engines:
     additional_tests:
       rosebud: *test_rosebud
 
+  - name: startpage news
+    engine: startpage
+    startpage_categ: news
+    shortcut: spn
+    timeout: 6.0
+    disabled: true
+
+  - name: startpage images
+    engine: startpage
+    startpage_categ: images
+    shortcut: spi
+    timeout: 6.0
+    disabled: true
+
   - name: tokyotoshokan
     engine: tokyotoshokan
     shortcut: tt
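Both new entries reuse the existing startpage module and differ only in startpage_categ. They ship with disabled: true, so instances opt in by overriding disabled: false for the spn / spi shortcuts in their own settings.yml.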
searx/utils.py
@@ -470,6 +470,21 @@ def ecma_unescape(string: str) -> str:
     return string
 
 
+def remove_pua_from_str(string):
+    """Removes unicode's "PRIVATE USE CHARACTER"s (PUA_) from a string.
+
+    .. _PUA: https://en.wikipedia.org/wiki/Private_Use_Areas
+    """
+    pua_ranges = ((0xE000, 0xF8FF), (0xF0000, 0xFFFFD), (0x100000, 0x10FFFD))
+    s = []
+    for c in string:
+        i = ord(c)
+        if any(a <= i <= b for (a, b) in pua_ranges):
+            continue
+        s.append(c)
+    return "".join(s)
+
+
 def get_string_replaces_function(replaces: Dict[str, str]) -> Callable[[str], str]:
     rep = {re.escape(k): v for k, v in replaces.items()}
     pattern = re.compile("|".join(rep.keys()))
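remove_pua_from_str is applied to the Bing-backed news titles and descriptions above, presumably because they can embed icon-font glyphs that live in the Private Use Areas. A quick self-contained demonstration of the stripping, with an invented sample string:

pua_ranges = ((0xE000, 0xF8FF), (0xF0000, 0xFFFFD), (0x100000, 0x10FFFD))

def strip_pua(text: str) -> str:
    # Same ranges as remove_pua_from_str above, written as one comprehension.
    return ''.join(c for c in text if not any(a <= ord(c) <= b for a, b in pua_ranges))

print(strip_pua('Breaking\ue000 news'))  # -> 'Breaking news'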