From 8baa7c8e714391d18a45507fb3841e57d4a9841c Mon Sep 17 00:00:00 2001 From: 0xhtml <34682885+0xhtml@users.noreply.github.com> Date: Wed, 26 Mar 2025 14:44:36 +0100 Subject: [PATCH] [feat] bing: raise error upon receiving wrong results page --- searx/engines/bing.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/searx/engines/bing.py b/searx/engines/bing.py index 47e8bb66d..c1f152ea3 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -38,6 +38,7 @@ import babel.languages from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex from searx.locales import language_tag, region_tag from searx.enginelib.traits import EngineTraits +from searx.exceptions import SearxEngineAPIException if TYPE_CHECKING: import logging @@ -161,27 +162,33 @@ def response(resp): results.append({'url': url, 'title': title, 'content': content}) # get number_of_results - try: + if results: result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()')) if "-" in result_len_container: - - # Remove the part "from-to" for paginated request ... - result_len_container = result_len_container[result_len_container.find("-") * 2 + 2 :] + start_str, result_len_container = re.split(r'-\d+', result_len_container) + start = int(start_str) + else: + start = 1 result_len_container = re.sub('[^0-9]', '', result_len_container) - if len(result_len_container) > 0: result_len = int(result_len_container) - except Exception as e: # pylint: disable=broad-except - logger.debug('result error :\n%s', e) + expected_start = _page_offset(resp.search_params.get("pageno", 1)) - if result_len and _page_offset(resp.search_params.get("pageno", 0)) > result_len: - # Avoid reading more results than available. - # For example, if there is 100 results from some search and we try to get results from 120 to 130, - # Bing will send back the results from 0 to 10 and no error. 
- # If we compare results count with the first parameter of the request we can avoid this "invalid" results. - return [] + + if expected_start != start: + if expected_start > result_len: + # Avoid reading more results than available. + # For example, if there are 100 results from some search and we try to get results from 120 to 130, + # Bing will send back the results from 0 to 10 and no error. + # If we compare results count with the first parameter of the request we can avoid these "invalid" + # results. + return [] + + # Sometimes Bing will send back the first result page instead of the requested page as a rate-limiting + # measure. + msg = f"Expected results to start at {expected_start}, but got results starting at {start}" + raise SearxEngineAPIException(msg) results.append({'number_of_results': result_len}) return results