[mod] json_engine: add content_html_to_text and title_html_to_text
Some JSON APIs return HTML in either the title or the content. This commit adds two new parameters to the json_engine: content_html_to_text and title_html_to_text, both False by default. If True, searx.utils.html_to_text is used to remove the HTML tags. Update the crossref, openairedatasets and openairepublications engines.
This commit is contained in:
		
							parent
							
								
									436d366448
								
							
						
					
					
						commit
						ff84a1af35
					
				| @ -3,13 +3,15 @@ | ||||
| from collections.abc import Iterable | ||||
| from json import loads | ||||
| from urllib.parse import urlencode | ||||
| from searx.utils import to_string | ||||
| from searx.utils import to_string, html_to_text | ||||
| 
 | ||||
| 
 | ||||
| search_url = None | ||||
| url_query = None | ||||
| content_query = None | ||||
| title_query = None | ||||
| content_html_to_text = False | ||||
| title_html_to_text = False | ||||
| paging = False | ||||
| suggestion_query = '' | ||||
| results_query = '' | ||||
| @ -92,9 +94,17 @@ def request(query, params): | ||||
|     return params | ||||
| 
 | ||||
| 
 | ||||
| def identity(arg): | ||||
|     return arg | ||||
| 
 | ||||
| 
 | ||||
| def response(resp): | ||||
|     results = [] | ||||
|     json = loads(resp.text) | ||||
| 
 | ||||
|     title_filter = html_to_text if title_html_to_text else identity | ||||
|     content_filter = html_to_text if content_html_to_text else identity | ||||
| 
 | ||||
|     if results_query: | ||||
|         rs = query(json, results_query) | ||||
|         if not len(rs): | ||||
| @ -111,8 +121,8 @@ def response(resp): | ||||
|                 content = "" | ||||
|             results.append({ | ||||
|                 'url': to_string(url), | ||||
|                 'title': to_string(title), | ||||
|                 'content': to_string(content), | ||||
|                 'title': title_filter(to_string(title)), | ||||
|                 'content': content_filter(to_string(content)), | ||||
|             }) | ||||
|     else: | ||||
|         for url, title, content in zip( | ||||
| @ -122,8 +132,8 @@ def response(resp): | ||||
|         ): | ||||
|             results.append({ | ||||
|                 'url': to_string(url), | ||||
|                 'title': to_string(title), | ||||
|                 'content': to_string(content), | ||||
|                 'title': title_filter(to_string(title)), | ||||
|                 'content': content_filter(to_string(content)), | ||||
|             }) | ||||
| 
 | ||||
|     if not suggestion_query: | ||||
|  | ||||
| @ -267,7 +267,9 @@ engines: | ||||
|     search_url : https://search.crossref.org/dois?q={query}&page={pageno} | ||||
|     url_query : doi | ||||
|     title_query : title | ||||
|     title_html_to_text: True | ||||
|     content_query : fullCitation | ||||
|     content_html_to_text: True | ||||
|     categories : science | ||||
|     shortcut : cr | ||||
|     about: | ||||
| @ -757,6 +759,7 @@ engines: | ||||
|     url_query : metadata/oaf:entity/oaf:result/children/instance/webresource/url/$ | ||||
|     title_query : metadata/oaf:entity/oaf:result/title/$ | ||||
|     content_query : metadata/oaf:entity/oaf:result/description/$ | ||||
|     content_html_to_text: True | ||||
|     categories : science | ||||
|     shortcut : oad | ||||
|     timeout: 5.0 | ||||
| @ -776,6 +779,7 @@ engines: | ||||
|     url_query : metadata/oaf:entity/oaf:result/children/instance/webresource/url/$ | ||||
|     title_query : metadata/oaf:entity/oaf:result/title/$ | ||||
|     content_query : metadata/oaf:entity/oaf:result/description/$ | ||||
|     content_html_to_text: True | ||||
|     categories : science | ||||
|     shortcut : oap | ||||
|     timeout: 5.0 | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Alexandre Flament
						Alexandre Flament