From feb15e3878920ee7bf6e3d726fac0fcd1f89a896 Mon Sep 17 00:00:00 2001
From: Markus Heiser
Date: Thu, 20 Feb 2025 09:51:16 +0100
Subject: [PATCH] [fix] brave.news engine: response is HTML and no longer JSON

The response from brave.com for news is no longer a JSON string.

Closes: https://github.com/searxng/searxng/issues/4352
Signed-off-by: Markus Heiser
---
 searx/engines/brave.py | 36 ++++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/searx/engines/brave.py b/searx/engines/brave.py
index 828f6154e..90cce4045 100644
--- a/searx/engines/brave.py
+++ b/searx/engines/brave.py
@@ -254,14 +254,14 @@ def response(resp) -> EngineResults:
     if brave_category in ('search', 'goggles'):
         return _parse_search(resp)
 
+    if brave_category == 'news':
+        return _parse_news(resp)
+
     datastr = extr(resp.text, "const data = ", ";\n").strip()
 
     json_data = js_variable_to_python(datastr)
     json_resp = json_data[1]['data']['body']['response']
 
-    if brave_category == 'news':
-        return _parse_news(json_resp['news'])
-
     if brave_category == 'images':
         return _parse_images(json_resp)
     if brave_category == 'videos':
@@ -339,18 +339,30 @@ def _parse_search(resp) -> EngineResults:
     return result_list
 
 
-def _parse_news(json_resp) -> EngineResults:
-    result_list = EngineResults()
+def _parse_news(resp) -> EngineResults:
+    """Build the result list from the HTML page brave.com returns for news."""
+
+    result_list = EngineResults()
+    dom = html.fromstring(resp.text)
+
+    for result in eval_xpath_list(dom, '//div[contains(@class, "results")]//div[@data-type="news"]'):
+
+        # a news entry without a link cannot be turned into a result, skip it
+        url = eval_xpath_getindex(result, './/a[contains(@class, "result-header")]/@href', 0, default=None)
+        if url is None:
+            continue
+
+        title = extract_text(eval_xpath_list(result, './/span[contains(@class, "snippet-title")]'))
+        content = extract_text(eval_xpath_list(result, './/p[contains(@class, "desc")]'))
+        thumbnail = eval_xpath_getindex(result, './/div[contains(@class, "image-wrapper")]//img/@src', 0, default='')
 
-    for result in json_resp["results"]:
         item = {
-            'url': result['url'],
-            'title': result['title'],
-            'content': result['description'],
-            'publishedDate': _extract_published_date(result['age']),
+            "url": url,
+            "title": title,
+            "content": content,
+            "thumbnail": thumbnail,
         }
-        if result['thumbnail'] is not None:
-            item['thumbnail'] = result['thumbnail']['src']
+
         result_list.append(item)
 
     return result_list