searxng/searx/engines/yahoo.py
Markus Heiser c20038e7c3 [fix] engine yahoo: replace fetch_traits by a list of languages
The Yahoo engine's fetch_traits function has been encountering an error in CI
jobs for several months [1], thus aborting the process for all other engines as
well.

The language selection dialog (which fetch_traits calls) requires an `EuConsent`
cookie. Strangely, the cookie is not needed for searching, which is why the
engine itself still works.

Since Yahoo won't be conquering any new marketplaces in the foreseeable future,
it should be sufficient to hard-implement the list of currently available
languages (`yahoo_languages`).

[1] https://github.com/searxng/searxng/actions/runs/14720458830/job/41313149268

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2025-04-29 08:48:56 +02:00

188 lines
4.4 KiB
Python

# SPDX-License-Identifier: AGPL-3.0-or-later
"""Yahoo Search (Web)
Languages are supported by mapping the language to a domain. If domain is not
found in :py:obj:`lang2domain` URL ``<lang>.search.yahoo.com`` is used.
"""
from urllib.parse import (
unquote,
urlencode,
)
from lxml import html
from searx.utils import (
eval_xpath_getindex,
eval_xpath_list,
extract_text,
html_to_text,
)
from searx.enginelib.traits import EngineTraits
traits: EngineTraits
# about
# Engine metadata displayed by SearXNG (preferences page / docs).
about = {
    "website": 'https://search.yahoo.com/',
    "wikidata_id": None,
    "official_api_documentation": 'https://developer.yahoo.com/api/',
    "use_official_api": False,  # results are scraped from the HTML pages
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['general', 'web']
paging = True  # pagination via the ``b`` (offset) URL parameter, see request()
time_range_support = True
# send_accept_language_header = True
# Map a SearXNG time-range name to Yahoo's (``age``, ``btf``) URL parameter pair.
time_range_dict = {
    'day': ('1d', 'd'),
    'week': ('1w', 'w'),
    'month': ('1m', 'm'),
}

lang2domain = {
    'zh_chs': 'hk.search.yahoo.com',
    'zh_cht': 'tw.search.yahoo.com',

    # Languages that do not get a dedicated ``<lang>.search.yahoo.com`` domain
    # and fall back to the generic one.
    'any': 'search.yahoo.com',
    'en': 'search.yahoo.com',
    'bg': 'search.yahoo.com',
    'cs': 'search.yahoo.com',
    'da': 'search.yahoo.com',
    'el': 'search.yahoo.com',
    'et': 'search.yahoo.com',
    'he': 'search.yahoo.com',
    'hr': 'search.yahoo.com',
    'ja': 'search.yahoo.com',
    'ko': 'search.yahoo.com',
    'sk': 'search.yahoo.com',
    'sl': 'search.yahoo.com',
}
"""Map language to domain"""

# Map a SearXNG language/locale tag to the Yahoo language name used in the
# ``vl=lang_<lang>`` URL parameter (and as key into ``lang2domain``).
yahoo_languages = {
    "all": "any",
    "ar": "ar",
    "bg": "bg",
    "cs": "cs",
    "da": "da",
    "de": "de",
    "el": "el",
    "en": "en",
    "es": "es",
    "et": "et",
    "fi": "fi",
    "fr": "fr",
    "he": "he",
    "hr": "hr",
    "hu": "hu",
    "it": "it",
    "ja": "ja",
    "ko": "ko",
    "lt": "lt",
    "lv": "lv",
    "nl": "nl",
    "no": "no",
    "pl": "pl",
    "pt": "pt",
    "ro": "ro",
    "ru": "ru",
    "sk": "sk",
    "sl": "sl",
    "sv": "sv",
    "th": "th",
    "tr": "tr",
    "zh": "zh_chs",
    "zh_Hans": "zh_chs",
    'zh-CN': "zh_chs",
    "zh_Hant": "zh_cht",
    "zh-HK": "zh_cht",
    'zh-TW': "zh_cht",
}


def request(query, params):
    """Build the Yahoo search request URL into ``params['url']``.

    ``params`` must provide ``language``, ``pageno`` and ``time_range``;
    the (mutated) ``params`` dict is returned.
    """
    # Look up the full tag first so region-specific entries such as 'zh-TW'
    # and 'zh-HK' are honored; previously the tag was split on '-' before the
    # lookup, which made those keys unreachable (zh-TW wrongly became zh_chs).
    lang = yahoo_languages.get(params["language"])
    if lang is None:
        # fall back to the bare language code ("de-AT" -> "de"), else "any"
        lang = yahoo_languages.get(params["language"].split("-")[0], "any")

    # Yahoo paginates in steps of 7 results; ``b`` is the 1-based offset.
    offset = (params['pageno'] - 1) * 7 + 1
    age, btf = time_range_dict.get(params['time_range'], ('', ''))

    args = urlencode(
        {
            'p': query,
            'ei': 'UTF-8',
            'fl': 1,
            'vl': 'lang_' + lang,
            'btf': btf,
            'fr2': 'time',
            'age': age,
            'b': offset,
            'xargs': 0,
        }
    )

    domain = lang2domain.get(lang, '%s.search.yahoo.com' % lang)
    params['url'] = 'https://%s/search?%s' % (domain, args)
    return params
def parse_url(url_string):
    """Strip Yahoo's click-tracking wrapper from a result URL.

    The real target is URL-encoded after a ``/RU=`` marker and terminated by
    a ``/RS`` or ``/RK`` segment; return it decoded.  If the wrapper is not
    recognized, the URL is returned unchanged.
    """
    marker = url_string.find('/RU=')
    start = url_string.find('http', marker + 1)

    # collect the positions of the trailing tracking segments, if any
    cut_points = [
        pos for pos in (url_string.rfind(tail) for tail in ('/RS', '/RK')) if pos > -1
    ]

    # start == 0 means no embedded URL was found (the string itself begins
    # with "http"); without a terminator we cannot cut either.
    if start == 0 or not cut_points:
        return url_string

    return unquote(url_string[start:min(cut_points)])
def response(resp):
    """Parse the HTML response into SearXNG result dicts (url/title/content
    entries plus trailing ``suggestion`` entries)."""
    dom = html.fromstring(resp.text)
    results = []

    # parse results
    for item in eval_xpath_list(dom, '//div[contains(@class,"algo-sr")]'):
        link = eval_xpath_getindex(item, './/h3/a/@href', 0, default=None)
        if link is None:
            continue

        raw_title = eval_xpath_getindex(item, './/h3//a/@aria-label', 0, default='')
        raw_title: str = extract_text(raw_title)
        raw_content = eval_xpath_getindex(item, './/div[contains(@class, "compText")]', 0, default='')
        raw_content: str = extract_text(raw_content, allow_none=True)

        # append result
        results.append(
            {
                'url': parse_url(link),
                # title sometimes contains HTML tags / see
                # https://github.com/searxng/searxng/issues/3790
                'title': " ".join(html_to_text(raw_title).strip().split()),
                'content': " ".join(html_to_text(raw_content).strip().split()),
            }
        )

    # append suggestions
    for hint in eval_xpath_list(dom, '//div[contains(@class, "AlsoTry")]//table//a'):
        results.append({'suggestion': extract_text(hint)})

    return results