[enh] engine: mojeek - add language support
Improve region and language detection / all locale Testing has shown the following behaviour for the different default and empty values of Mojeeks parameters: | param | idx | value | behaviour | | -------- | --- | ------ | ------------------------- | | region | 0 | '' | detect region based on IP | | region | 1 | 'none' | all regions | | language | 0 | '' | all languages |
This commit is contained in:
		
							parent
							
								
									5b6f40414a
								
							
						
					
					
						commit
						8b6a3f3e11
					
				| @ -1,12 +1,15 @@ | |||||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
| """Mojeek (general, images, news)""" | """Mojeek (general, images, news)""" | ||||||
| 
 | 
 | ||||||
|  | from typing import TYPE_CHECKING | ||||||
|  | 
 | ||||||
| from datetime import datetime | from datetime import datetime | ||||||
| from urllib.parse import urlencode | from urllib.parse import urlencode | ||||||
| from lxml import html | from lxml import html | ||||||
| 
 | 
 | ||||||
| from dateutil.relativedelta import relativedelta | from dateutil.relativedelta import relativedelta | ||||||
| from searx.utils import eval_xpath, eval_xpath_list, extract_text | from searx.utils import eval_xpath, eval_xpath_list, extract_text | ||||||
|  | from searx.enginelib.traits import EngineTraits | ||||||
| 
 | 
 | ||||||
| about = { | about = { | ||||||
|     'website': 'https://mojeek.com', |     'website': 'https://mojeek.com', | ||||||
| @ -42,6 +45,18 @@ news_url_xpath = './/h2/a/@href' | |||||||
| news_title_xpath = './/h2/a' | news_title_xpath = './/h2/a' | ||||||
| news_content_xpath = './/p[@class="s"]' | news_content_xpath = './/p[@class="s"]' | ||||||
| 
 | 
 | ||||||
|  | language_param = 'lb' | ||||||
|  | region_param = 'arc' | ||||||
|  | 
 | ||||||
|  | _delta_kwargs = {'day': 'days', 'week': 'weeks', 'month': 'months', 'year': 'years'} | ||||||
|  | 
 | ||||||
|  | if TYPE_CHECKING: | ||||||
|  |     import logging | ||||||
|  | 
 | ||||||
|  |     logger = logging.getLogger() | ||||||
|  | 
 | ||||||
|  | traits: EngineTraits | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| def init(_): | def init(_): | ||||||
|     if search_type not in ('', 'images', 'news'): |     if search_type not in ('', 'images', 'news'): | ||||||
| @ -53,13 +68,16 @@ def request(query, params): | |||||||
|         'q': query, |         'q': query, | ||||||
|         'safe': min(params['safesearch'], 1), |         'safe': min(params['safesearch'], 1), | ||||||
|         'fmt': search_type, |         'fmt': search_type, | ||||||
|  |         language_param: traits.get_language(params['searxng_locale'], traits.custom['language_all']), | ||||||
|  |         region_param: traits.get_region(params['searxng_locale'], traits.custom['region_all']), | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     if search_type == '': |     if search_type == '': | ||||||
|         args['s'] = 10 * (params['pageno'] - 1) |         args['s'] = 10 * (params['pageno'] - 1) | ||||||
| 
 | 
 | ||||||
|     if params['time_range'] and search_type != 'images': |     if params['time_range'] and search_type != 'images': | ||||||
|         args["since"] = (datetime.now() - relativedelta(**{f"{params['time_range']}s": 1})).strftime("%Y%m%d") |         kwargs = {_delta_kwargs[params['time_range']]: 1} | ||||||
|  |         args["since"] = (datetime.now() - relativedelta(**kwargs)).strftime("%Y%m%d")  # type: ignore | ||||||
|         logger.debug(args["since"]) |         logger.debug(args["since"]) | ||||||
| 
 | 
 | ||||||
|     params['url'] = f"{base_url}/search?{urlencode(args)}" |     params['url'] = f"{base_url}/search?{urlencode(args)}" | ||||||
| @ -94,7 +112,7 @@ def _image_results(dom): | |||||||
|                 'template': 'images.html', |                 'template': 'images.html', | ||||||
|                 'url': extract_text(eval_xpath(result, image_url_xpath)), |                 'url': extract_text(eval_xpath(result, image_url_xpath)), | ||||||
|                 'title': extract_text(eval_xpath(result, image_title_xpath)), |                 'title': extract_text(eval_xpath(result, image_title_xpath)), | ||||||
|                 'img_src': base_url + extract_text(eval_xpath(result, image_img_src_xpath)), |                 'img_src': base_url + extract_text(eval_xpath(result, image_img_src_xpath)),  # type: ignore | ||||||
|                 'content': '', |                 'content': '', | ||||||
|             } |             } | ||||||
|         ) |         ) | ||||||
| @ -130,3 +148,31 @@ def response(resp): | |||||||
|         return _news_results(dom) |         return _news_results(dom) | ||||||
| 
 | 
 | ||||||
|     raise ValueError(f"Invalid search type {search_type}") |     raise ValueError(f"Invalid search type {search_type}") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def fetch_traits(engine_traits: EngineTraits): | ||||||
|  |     # pylint: disable=import-outside-toplevel | ||||||
|  |     from searx import network | ||||||
|  |     from searx.locales import get_official_locales, region_tag | ||||||
|  |     from babel import Locale, UnknownLocaleError | ||||||
|  |     import contextlib | ||||||
|  | 
 | ||||||
|  |     resp = network.get(base_url + "/preferences", headers={'Accept-Language': 'en-US,en;q=0.5'}) | ||||||
|  |     dom = html.fromstring(resp.text)  # type: ignore | ||||||
|  | 
 | ||||||
|  |     languages = eval_xpath_list(dom, f'//select[@name="{language_param}"]/option/@value') | ||||||
|  | 
 | ||||||
|  |     engine_traits.custom['language_all'] = languages[0] | ||||||
|  | 
 | ||||||
|  |     for code in languages[1:]: | ||||||
|  |         with contextlib.suppress(UnknownLocaleError): | ||||||
|  |             locale = Locale(code) | ||||||
|  |             engine_traits.languages[locale.language] = code | ||||||
|  | 
 | ||||||
|  |     regions = eval_xpath_list(dom, f'//select[@name="{region_param}"]/option/@value') | ||||||
|  | 
 | ||||||
|  |     engine_traits.custom['region_all'] = regions[1] | ||||||
|  | 
 | ||||||
|  |     for code in regions[2:]: | ||||||
|  |         for locale in get_official_locales(code, engine_traits.languages): | ||||||
|  |             engine_traits.regions[region_tag(locale)] = code | ||||||
|  | |||||||
| @ -101,7 +101,7 @@ def fetch_traits_map(): | |||||||
| def filter_locales(traits_map: EngineTraitsMap): | def filter_locales(traits_map: EngineTraitsMap): | ||||||
|     """Filter language & region tags by a threshold.""" |     """Filter language & region tags by a threshold.""" | ||||||
| 
 | 
 | ||||||
|     min_eng_per_region = 15 |     min_eng_per_region = 18 | ||||||
|     min_eng_per_lang = 20 |     min_eng_per_lang = 20 | ||||||
| 
 | 
 | ||||||
|     _ = {} |     _ = {} | ||||||
|  | |||||||
| @ -32,12 +32,10 @@ class TestLocales(SearxTestCase): | |||||||
| 
 | 
 | ||||||
|     @parameterized.expand( |     @parameterized.expand( | ||||||
|         [ |         [ | ||||||
|             ('ca-es', 'ca-ES'), |  | ||||||
|             ('de-at', 'de-AT'), |             ('de-at', 'de-AT'), | ||||||
|             ('de-de', 'de-DE'), |             ('de-de', 'de-DE'), | ||||||
|             ('en-UK', 'en-GB'), |             ('en-UK', 'en-GB'), | ||||||
|             ('fr-be', 'fr-BE'), |             ('fr-be', 'fr-BE'), | ||||||
|             ('fr-be', 'fr-BE'), |  | ||||||
|             ('fr-ca', 'fr-CA'), |             ('fr-ca', 'fr-CA'), | ||||||
|             ('fr-ch', 'fr-CH'), |             ('fr-ch', 'fr-CH'), | ||||||
|             ('zh-cn', 'zh-CN'), |             ('zh-cn', 'zh-CN'), | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 0xhtml
						0xhtml