From 02f5002a5f1c5d05a5876af66870d818eb37286e Mon Sep 17 00:00:00 2001 From: Aadniz <8147434+Aadniz@users.noreply.github.com> Date: Wed, 26 Mar 2025 19:56:58 +0100 Subject: [PATCH] [fix] baidu engine: properly decoding HTML escape codes --- searx/engines/baidu.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/searx/engines/baidu.py b/searx/engines/baidu.py index 1c9d86733..29c9c0e4d 100644 --- a/searx/engines/baidu.py +++ b/searx/engines/baidu.py @@ -9,6 +9,7 @@ from urllib.parse import urlencode from datetime import datetime +from html import unescape import time import json @@ -119,11 +120,15 @@ def parse_general(data): except (ValueError, TypeError): published_date = None + # title and content sometimes containing characters such as &amp; &#39; &quot; etc... + title = unescape(entry["title"]) + content = unescape(entry.get("abs", "")) + results.append( { - "title": entry["title"], + "title": title, "url": entry["url"], - "content": entry.get("abs", ""), + "content": content, "publishedDate": published_date, } )