[fix] engine brave: remove date from the content string

Related: https://github.com/searxng/searxng/issues/4211#issuecomment-2601941440
Closes: https://github.com/searxng/searxng/issues/4006

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2025-01-20 11:45:00 +01:00 committed by Markus Heiser
parent 073d9549a0
commit e581921c92

View File

@@ -291,15 +291,21 @@ def _parse_search(resp):
if url is None or title_tag is None or not urlparse(url).netloc: # partial url likely means it's an ad if url is None or title_tag is None or not urlparse(url).netloc: # partial url likely means it's an ad
continue continue
content_tag = eval_xpath_getindex(result, './/div[contains(@class, "snippet-description")]', 0, default='') content: str = extract_text(
eval_xpath_getindex(result, './/div[contains(@class, "snippet-description")]', 0, default='')
) # type: ignore
pub_date_raw = eval_xpath(result, 'substring-before(.//div[contains(@class, "snippet-description")], "-")') pub_date_raw = eval_xpath(result, 'substring-before(.//div[contains(@class, "snippet-description")], "-")')
pub_date = _extract_published_date(pub_date_raw)
if pub_date and content.startswith(pub_date_raw):
content = content.lstrip(pub_date_raw).strip("- \n\t")
thumbnail = eval_xpath_getindex(result, './/img[contains(@class, "thumb")]/@src', 0, default='') thumbnail = eval_xpath_getindex(result, './/img[contains(@class, "thumb")]/@src', 0, default='')
item = { item = {
'url': url, 'url': url,
'title': extract_text(title_tag), 'title': extract_text(title_tag),
'content': extract_text(content_tag), 'content': content,
'publishedDate': _extract_published_date(pub_date_raw), 'publishedDate': pub_date,
'thumbnail': thumbnail, 'thumbnail': thumbnail,
} }