From e581921c9229f11a9ab23de2963b020546f2be0d Mon Sep 17 00:00:00 2001
From: Markus Heiser
Date: Mon, 20 Jan 2025 11:45:00 +0100
Subject: [PATCH] [fix] engine brave: remove date from the content string

Related: https://github.com/searxng/searxng/issues/4211#issuecomment-2601941440
Closes: https://github.com/searxng/searxng/issues/4006
Signed-off-by: Markus Heiser
---
 searx/engines/brave.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/searx/engines/brave.py b/searx/engines/brave.py
index 648aee562..db1fc7976 100644
--- a/searx/engines/brave.py
+++ b/searx/engines/brave.py
@@ -291,15 +291,21 @@ def _parse_search(resp):
         if url is None or title_tag is None or not urlparse(url).netloc:  # partial url likely means it's an ad
             continue
 
-        content_tag = eval_xpath_getindex(result, './/div[contains(@class, "snippet-description")]', 0, default='')
+        content: str = extract_text(
+            eval_xpath_getindex(result, './/div[contains(@class, "snippet-description")]', 0, default='')
+        )  # type: ignore
         pub_date_raw = eval_xpath(result, 'substring-before(.//div[contains(@class, "snippet-description")], "-")')
+        pub_date = _extract_published_date(pub_date_raw)
+        if pub_date and content.startswith(pub_date_raw):
+            content = content.lstrip(pub_date_raw).strip("- \n\t")
+
         thumbnail = eval_xpath_getindex(result, './/img[contains(@class, "thumb")]/@src', 0, default='')
 
         item = {
             'url': url,
             'title': extract_text(title_tag),
-            'content': extract_text(content_tag),
-            'publishedDate': _extract_published_date(pub_date_raw),
+            'content': content,
+            'publishedDate': pub_date,
             'thumbnail': thumbnail,
         }
 