[feat] startpage: support for news and images

This commit is contained in:
Bnyro 2025-02-13 11:43:45 +01:00 committed by Markus Heiser
parent feb15e3878
commit 0f2fc5879d
3 changed files with 143 additions and 77 deletions

View File

@ -74,24 +74,25 @@ Startpage's category (for Web-search, News, Videos, ..) is set by
.. hint::
The default category is ``web``.  Supported categories are ``web``,
``news`` and ``images``.
"""
# pylint: disable=too-many-statements
from __future__ import annotations
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any
from collections import OrderedDict
import re
from unicodedata import normalize, combining
from time import time
from datetime import datetime, timedelta
from json import loads
import dateutil.parser
import lxml.html
import babel.localedata
from searx.utils import extract_text, eval_xpath, gen_useragent
from searx.utils import extr, extract_text, eval_xpath, gen_useragent, html_to_text, humanize_bytes, remove_pua_from_str
from searx.network import get # see https://github.com/searxng/searxng/issues/762
from searx.exceptions import SearxEngineCaptchaException
from searx.locales import region_tag
@ -250,22 +251,13 @@ def request(query, params):
Additionally the arguments form Startpage's search form needs to be set in
HTML POST data / compare ``<input>`` elements: :py:obj:`search_form_xpath`.
"""
if startpage_categ == 'web':
return _request_cat_web(query, params)
logger.error("Startpages's category '%' is not yet implemented.", startpage_categ)
return params
def _request_cat_web(query, params):
engine_region = traits.get_region(params['searxng_locale'], 'en-US')
engine_language = traits.get_language(params['searxng_locale'], 'en')
# build arguments
args = {
'query': query,
'cat': 'web',
'cat': startpage_categ,
't': 'device',
'sc': get_sc_code(params['searxng_locale'], params), # hint: this func needs HTTP headers,
'with_date': time_range_dict.get(params['time_range'], ''),
@ -317,73 +309,118 @@ def _request_cat_web(query, params):
return params
# get response from search-request
def _parse_published_date(content: str) -> tuple[str, datetime | None]:
    """Split a leading date marker off a result snippet.

    Two kinds of prefixes are recognized at the very start of ``content``:

    - an absolute date such as ``"2 Sep 2014 ... "``
    - a relative date such as ``"5 days ago ... "``

    When a prefix is found it is removed from the text and converted to a
    :py:class:`datetime.datetime`.

    :param content: plain-text snippet of a search result.
    :return: tuple ``(content, published_date)``.  ``content`` is the snippet
        without the date prefix; ``published_date`` is ``None`` when there is
        no prefix or the prefix can't be converted into a valid date.
    """
    published_date = None

    # absolute date, e.g. "2 Sep 2014 ... "
    absolute = re.match(r"^(([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4}) \.\.\. ", content)
    if absolute:
        # the prefix is dropped even if it turns out not to be a real date
        content = content[absolute.end() :]
        try:
            published_date = dateutil.parser.parse(absolute.group(1), dayfirst=True)
        except (ValueError, OverflowError):
            # matched the pattern but is not a parsable date
            pass
        return content, published_date

    # relative date, e.g. "5 days ago ... "
    relative = re.match(r"^([0-9]+) days? ago \.\.\. ", content)
    if relative:
        content = content[relative.end() :]
        try:
            published_date = datetime.now() - timedelta(days=int(relative.group(1)))
        except (ValueError, OverflowError):
            # day count too large for int() / timedelta / datetime: the
            # snippet is still usable, it just has no date
            pass

    return content, published_date
def _get_web_result(result):
    """Build a result dict from one item of a web result list.

    The item's ``description`` is converted to plain text and passed through
    :py:func:`_parse_published_date`, which moves a leading date marker (if
    any) into ``publishedDate``.
    """
    snippet, published = _parse_published_date(html_to_text(result.get('description')))

    web_result = {
        'url': result['clickUrl'],
        'title': html_to_text(result['title']),
        'content': snippet,
        'publishedDate': published,
    }
    return web_result
def _get_news_result(result):
    """Build a result dict from one item of a news result list.

    :param result: mapping describing one news item.  ``title`` and
        ``clickUrl`` are read unconditionally; ``description``, ``date`` and
        ``thumbnailUrl`` are optional.
    :return: dict with the keys ``url``, ``title``, ``content``,
        ``publishedDate`` and ``thumbnail``.
    """
    # private-use characters are stripped from the visible text
    title = remove_pua_from_str(html_to_text(result['title']))
    content = remove_pua_from_str(html_to_text(result.get('description')))

    published_date = None
    if result.get('date'):
        try:
            # the value is divided by 1000 before conversion, i.e. it is
            # treated as a timestamp in milliseconds
            published_date = datetime.fromtimestamp(result['date'] / 1000)
        except (TypeError, ValueError, OverflowError, OSError):
            # a malformed or out-of-range timestamp must not discard the
            # result; it is returned without a date instead
            published_date = None

    thumbnail_url = None
    if result.get('thumbnailUrl'):
        # the thumbnail URL is appended to the engine's base_url
        thumbnail_url = base_url + result['thumbnailUrl']

    return {
        'url': result['clickUrl'],
        'title': title,
        'content': content,
        'publishedDate': published_date,
        'thumbnail': thumbnail_url,
    }
def _get_image_result(result) -> dict[str, Any] | None:
    """Build an ``images.html`` result dict from one item of an image result list.

    :param result: mapping describing one image.  ``altClickUrl`` is the link
        of the result; ``title`` is read unconditionally once a link exists;
        ``thumbnailUrl``, ``width``, ``height``, ``filesize``, ``rawImageUrl``
        and ``format`` are optional.
    :return: the result dict, or ``None`` when the item has no ``altClickUrl``.
    """
    url = result.get('altClickUrl')
    if not url:
        # without a link there is nothing to show
        return None

    thumbnail_url = None
    if result.get('thumbnailUrl'):
        # the thumbnail URL is appended to the engine's base_url
        thumbnail_url = base_url + result['thumbnailUrl']

    resolution = None
    if result.get('width') and result.get('height'):
        resolution = f"{result['width']}x{result['height']}"

    filesize = None
    if result.get('filesize'):
        # Keep only the decimal digits of the size string.  str.isdecimal is
        # used (not str.isdigit) because int() rejects digit-like characters
        # such as superscripts, and the empty case is skipped because int('')
        # raises ValueError.
        digits = ''.join(filter(str.isdecimal, result['filesize']))
        if digits:
            filesize = humanize_bytes(int(digits))

    return {
        'template': 'images.html',
        'url': url,
        'title': html_to_text(result['title']),
        'content': '',
        'img_src': result.get('rawImageUrl'),
        'thumbnail_src': thumbnail_url,
        'resolution': resolution,
        'img_format': result.get('format'),
        'filesize': filesize,
    }
def response(resp):
    """Parse a result page into a list of result dicts.

    The results are not scraped from the HTML markup.  They are read from the
    JSON properties passed to
    ``React.createElement(UIStartpage.AppSerp<Categ>, {...})`` in the page,
    where ``<Categ>`` is the capitalized :py:obj:`startpage_categ`.

    :param resp: HTTP response; only ``resp.text`` is used.
    :return: list of result dicts built by :py:func:`_get_web_result`,
        :py:func:`_get_news_result` or :py:func:`_get_image_result`,
        depending on the ``display_type`` of each result group.
    :raises json.JSONDecodeError: if the extracted properties are not valid
        JSON (e.g. the expected ``React.createElement`` call is missing).
    """
    categ = startpage_categ.capitalize()
    # NOTE(review): extr() presumably returns the text *between* the two
    # markers, so the braces consumed by the markers are re-added here.
    results_raw = '{' + extr(resp.text, f"React.createElement(UIStartpage.AppSerp{categ}, {{", '}})') + '}}'
    results_json = loads(results_raw)
    results_obj = results_json.get('render', {}).get('presenter', {}).get('regions', {})

    results = []
    for results_categ in results_obj.get('mainline', []):
        # a group without display_type is skipped instead of raising KeyError
        display_type = results_categ.get('display_type') or ''

        if display_type == 'web-google':
            build_result = _get_web_result
        elif display_type == 'news-bing':
            build_result = _get_news_result
        elif 'images' in display_type:
            build_result = _get_image_result
        else:
            continue

        for item in results_categ.get('results', []):
            parsed = build_result(item)
            # _get_image_result returns None for items without a link
            if parsed:
                results.append(parsed)

    return results

View File

@ -1792,6 +1792,20 @@ engines:
additional_tests:
rosebud: *test_rosebud
- name: startpage news
engine: startpage
startpage_categ: news
shortcut: spn
timeout: 6.0
disabled: true
- name: startpage images
engine: startpage
startpage_categ: images
shortcut: spi
timeout: 6.0
disabled: true
- name: tokyotoshokan
engine: tokyotoshokan
shortcut: tt

View File

@ -470,6 +470,21 @@ def ecma_unescape(string: str) -> str:
return string
def remove_pua_from_str(string):
    """Return ``string`` with all Unicode private-use characters (PUA_) dropped.

    Code points in the ranges U+E000..U+F8FF, U+F0000..U+FFFFD and
    U+100000..U+10FFFD are removed; every other character is kept, in order.

    .. _PUA: https://en.wikipedia.org/wiki/Private_Use_Areas
    """

    def _is_private_use(codepoint):
        return (
            0xE000 <= codepoint <= 0xF8FF
            or 0xF0000 <= codepoint <= 0xFFFFD
            or 0x100000 <= codepoint <= 0x10FFFD
        )

    return "".join(char for char in string if not _is_private_use(ord(char)))
def get_string_replaces_function(replaces: Dict[str, str]) -> Callable[[str], str]:
rep = {re.escape(k): v for k, v in replaces.items()}
pattern = re.compile("|".join(rep.keys()))