[json_engine] mirror xpath functionality
This commit is contained in:
		
							parent
							
								
									591d9c2505
								
							
						
					
					
						commit
						64d954b350
					
				| @ -8,6 +8,8 @@ Configuration | |||||||
| Request: | Request: | ||||||
| 
 | 
 | ||||||
| - :py:obj:`search_url` | - :py:obj:`search_url` | ||||||
|  | - :py:obj:`lang_all` | ||||||
|  | - :py:obj:`soft_max_redirects` | ||||||
| - :py:obj:`method` | - :py:obj:`method` | ||||||
| - :py:obj:`request_body` | - :py:obj:`request_body` | ||||||
| - :py:obj:`cookies` | - :py:obj:`cookies` | ||||||
| @ -19,10 +21,22 @@ Paging: | |||||||
| - :py:obj:`page_size` | - :py:obj:`page_size` | ||||||
| - :py:obj:`first_page_num` | - :py:obj:`first_page_num` | ||||||
| 
 | 
 | ||||||
|  | Time Range: | ||||||
|  | 
 | ||||||
|  | - :py:obj:`time_range_support` | ||||||
|  | - :py:obj:`time_range_url` | ||||||
|  | - :py:obj:`time_range_map` | ||||||
|  | 
 | ||||||
|  | Safe-Search: | ||||||
|  | 
 | ||||||
|  | - :py:obj:`safe_search_support` | ||||||
|  | - :py:obj:`safe_search_map` | ||||||
|  | 
 | ||||||
| Response: | Response: | ||||||
| 
 | 
 | ||||||
| - :py:obj:`title_html_to_text` | - :py:obj:`title_html_to_text` | ||||||
| - :py:obj:`content_html_to_text` | - :py:obj:`content_html_to_text` | ||||||
|  | - :py:obj:`no_result_for_http_status` | ||||||
| 
 | 
 | ||||||
| JSON query: | JSON query: | ||||||
| 
 | 
 | ||||||
| @ -31,6 +45,8 @@ JSON query: | |||||||
| - :py:obj:`url_prefix` | - :py:obj:`url_prefix` | ||||||
| - :py:obj:`title_query` | - :py:obj:`title_query` | ||||||
| - :py:obj:`content_query` | - :py:obj:`content_query` | ||||||
|  | - :py:obj:`thumbnail_query` | ||||||
|  | - :py:obj:`thumbnail_prefix` | ||||||
| - :py:obj:`suggestion_query` | - :py:obj:`suggestion_query` | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @ -61,12 +77,13 @@ from collections.abc import Iterable | |||||||
| from json import loads | from json import loads | ||||||
| from urllib.parse import urlencode | from urllib.parse import urlencode | ||||||
| from searx.utils import to_string, html_to_text | from searx.utils import to_string, html_to_text | ||||||
|  | from searx.network import raise_for_httperror | ||||||
| 
 | 
 | ||||||
| search_url = None | search_url = None | ||||||
| """ | """ | ||||||
| Search URL of the engine.  Example:: | Search URL of the engine.  Example:: | ||||||
| 
 | 
 | ||||||
|     https://example.org/?search={query}&page={pageno} |     https://example.org/?search={query}&page={pageno}{time_range}{safe_search} | ||||||
| 
 | 
 | ||||||
| Replacements are: | Replacements are: | ||||||
| 
 | 
 | ||||||
| @ -76,8 +93,41 @@ Replacements are: | |||||||
| ``{pageno}``: | ``{pageno}``: | ||||||
|   Page number if engine supports paging :py:obj:`paging` |   Page number if engine supports paging :py:obj:`paging` | ||||||
| 
 | 
 | ||||||
|  | ``{lang}``: | ||||||
|  |   ISO 639-1 language code (en, de, fr ..) | ||||||
|  | 
 | ||||||
|  | ``{time_range}``: | ||||||
|  |   :py:obj:`URL parameter <time_range_url>` if engine :py:obj:`supports time | ||||||
|  |   range <time_range_support>`.  The value for the parameter is taken from | ||||||
|  |   :py:obj:`time_range_map`. | ||||||
|  | 
 | ||||||
|  | ``{safe_search}``: | ||||||
|  |   Safe-search :py:obj:`URL parameter <safe_search_map>` if engine | ||||||
|  |   :py:obj:`supports safe-search <safe_search_support>`.  The ``{safe_search}`` | ||||||
|  |   replacement is taken from the :py:obj:`safe_search_map`.  Filter results:: | ||||||
|  | 
 | ||||||
|  |       0: none, 1: moderate, 2: strict | ||||||
|  | 
 | ||||||
|  |   If not supported, the URL parameter is an empty string. | ||||||
|  | 
 | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
|  | lang_all = 'en' | ||||||
|  | '''Replacement ``{lang}`` in :py:obj:`search_url` if language ``all`` is | ||||||
|  | selected. | ||||||
|  | ''' | ||||||
|  | 
 | ||||||
|  | no_result_for_http_status = [] | ||||||
|  | '''Return empty result for these HTTP status codes instead of throwing an error. | ||||||
|  | 
 | ||||||
|  | .. code:: yaml | ||||||
|  | 
 | ||||||
|  |     no_result_for_http_status: [] | ||||||
|  | ''' | ||||||
|  | 
 | ||||||
|  | soft_max_redirects = 0 | ||||||
|  | '''Maximum redirects, soft limit. Record an error but don't stop the engine''' | ||||||
|  | 
 | ||||||
| method = 'GET' | method = 'GET' | ||||||
| '''Some engines might require to do POST requests for search.''' | '''Some engines might require to do POST requests for search.''' | ||||||
| 
 | 
 | ||||||
| @ -140,6 +190,12 @@ title_query = None | |||||||
| content_query = None | content_query = None | ||||||
| '''JSON query of result's ``content``. For the query string documentation see :py:obj:`results_query`''' | '''JSON query of result's ``content``. For the query string documentation see :py:obj:`results_query`''' | ||||||
| 
 | 
 | ||||||
|  | thumbnail_query = False | ||||||
|  | '''JSON query of result's ``thumbnail``. For the query string documentation see :py:obj:`results_query`''' | ||||||
|  | 
 | ||||||
|  | thumbnail_prefix = '' | ||||||
|  | '''String to prepend to the result's ``thumbnail``.''' | ||||||
|  | 
 | ||||||
| suggestion_query = '' | suggestion_query = '' | ||||||
| '''JSON query of result's ``suggestion``. For the query string documentation see :py:obj:`results_query`''' | '''JSON query of result's ``suggestion``. For the query string documentation see :py:obj:`results_query`''' | ||||||
| 
 | 
 | ||||||
| @ -149,6 +205,53 @@ title_html_to_text = False | |||||||
| content_html_to_text = False | content_html_to_text = False | ||||||
| '''Extract text from a HTML content string''' | '''Extract text from a HTML content string''' | ||||||
| 
 | 
 | ||||||
|  | time_range_support = False | ||||||
|  | '''Engine supports search time range.''' | ||||||
|  | 
 | ||||||
|  | time_range_url = '&hours={time_range_val}' | ||||||
|  | '''Time range URL parameter in the :py:obj:`search_url`.  If no time range is | ||||||
|  | requested by the user, the URL parameter is an empty string.  The | ||||||
|  | ``{time_range_val}`` replacement is taken from the :py:obj:`time_range_map`. | ||||||
|  | 
 | ||||||
|  | .. code:: yaml | ||||||
|  | 
 | ||||||
|  |     time_range_url : '&days={time_range_val}' | ||||||
|  | ''' | ||||||
|  | 
 | ||||||
|  | time_range_map = { | ||||||
|  |     'day': 24, | ||||||
|  |     'week': 24 * 7, | ||||||
|  |     'month': 24 * 30, | ||||||
|  |     'year': 24 * 365, | ||||||
|  | } | ||||||
|  | '''Maps time range value from user to ``{time_range_val}`` in | ||||||
|  | :py:obj:`time_range_url`. | ||||||
|  | 
 | ||||||
|  | .. code:: yaml | ||||||
|  | 
 | ||||||
|  |     time_range_map: | ||||||
|  |       day: 1 | ||||||
|  |       week: 7 | ||||||
|  |       month: 30 | ||||||
|  |       year: 365 | ||||||
|  | ''' | ||||||
|  | 
 | ||||||
|  | safe_search_support = False | ||||||
|  | '''Engine supports safe-search.''' | ||||||
|  | 
 | ||||||
|  | safe_search_map = {0: '&filter=none', 1: '&filter=moderate', 2: '&filter=strict'} | ||||||
|  | '''Maps safe-search value to ``{safe_search}`` in :py:obj:`search_url`. | ||||||
|  | 
 | ||||||
|  | .. code:: yaml | ||||||
|  | 
 | ||||||
|  |     safesearch: true | ||||||
|  |     safe_search_map: | ||||||
|  |       0: '&filter=none' | ||||||
|  |       1: '&filter=moderate' | ||||||
|  |       2: '&filter=strict' | ||||||
|  | 
 | ||||||
|  | ''' | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| def iterate(iterable): | def iterate(iterable): | ||||||
|     if isinstance(iterable, dict): |     if isinstance(iterable, dict): | ||||||
| @ -207,10 +310,26 @@ def query(data, query_string): | |||||||
| 
 | 
 | ||||||
| def request(query, params):  # pylint: disable=redefined-outer-name | def request(query, params):  # pylint: disable=redefined-outer-name | ||||||
|     '''Build request parameters (see :ref:`engine request`).''' |     '''Build request parameters (see :ref:`engine request`).''' | ||||||
|     fp = {'query': urlencode({'q': query})[2:]}  # pylint: disable=invalid-name |     lang = lang_all | ||||||
|  |     if params['language'] != 'all': | ||||||
|  |         lang = params['language'][:2] | ||||||
| 
 | 
 | ||||||
|     if paging and search_url.find('{pageno}') >= 0: |     time_range = '' | ||||||
|         fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num |     if params.get('time_range'): | ||||||
|  |         time_range_val = time_range_map.get(params.get('time_range')) | ||||||
|  |         time_range = time_range_url.format(time_range_val=time_range_val) | ||||||
|  | 
 | ||||||
|  |     safe_search = '' | ||||||
|  |     if params['safesearch']: | ||||||
|  |         safe_search = safe_search_map[params['safesearch']] | ||||||
|  | 
 | ||||||
|  |     fp = {  # pylint: disable=invalid-name | ||||||
|  |         'query': urlencode({'q': query})[2:], | ||||||
|  |         'lang': lang, | ||||||
|  |         'pageno': (params['pageno'] - 1) * page_size + first_page_num, | ||||||
|  |         'time_range': time_range, | ||||||
|  |         'safe_search': safe_search, | ||||||
|  |     } | ||||||
| 
 | 
 | ||||||
|     params['cookies'].update(cookies) |     params['cookies'].update(cookies) | ||||||
|     params['headers'].update(headers) |     params['headers'].update(headers) | ||||||
| @ -223,6 +342,9 @@ def request(query, params):  # pylint: disable=redefined-outer-name | |||||||
|         fp['query'] = query |         fp['query'] = query | ||||||
|         params['data'] = request_body.format(**fp) |         params['data'] = request_body.format(**fp) | ||||||
| 
 | 
 | ||||||
|  |     params['soft_max_redirects'] = soft_max_redirects | ||||||
|  |     params['raise_for_httperror'] = False | ||||||
|  | 
 | ||||||
|     return params |     return params | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @ -234,10 +356,16 @@ def response(resp): | |||||||
|     '''Scrape *results* from the response (see :ref:`engine results`).''' |     '''Scrape *results* from the response (see :ref:`engine results`).''' | ||||||
|     results = [] |     results = [] | ||||||
| 
 | 
 | ||||||
|  |     if no_result_for_http_status and resp.status_code in no_result_for_http_status: | ||||||
|  |         return results | ||||||
|  | 
 | ||||||
|  |     raise_for_httperror(resp) | ||||||
|  | 
 | ||||||
|     if not resp.text: |     if not resp.text: | ||||||
|         return results |         return results | ||||||
| 
 | 
 | ||||||
|     json = loads(resp.text) |     json = loads(resp.text) | ||||||
|  |     is_onion = 'onions' in categories | ||||||
| 
 | 
 | ||||||
|     title_filter = html_to_text if title_html_to_text else identity |     title_filter = html_to_text if title_html_to_text else identity | ||||||
|     content_filter = html_to_text if content_html_to_text else identity |     content_filter = html_to_text if content_html_to_text else identity | ||||||
| @ -256,13 +384,24 @@ def response(resp): | |||||||
|                 content = query(result, content_query)[0] |                 content = query(result, content_query)[0] | ||||||
|             except:  # pylint: disable=bare-except |             except:  # pylint: disable=bare-except | ||||||
|                 content = "" |                 content = "" | ||||||
|             results.append( | 
 | ||||||
|                 { |             tmp_result = { | ||||||
|                 'url': url_prefix + to_string(url), |                 'url': url_prefix + to_string(url), | ||||||
|                 'title': title_filter(to_string(title)), |                 'title': title_filter(to_string(title)), | ||||||
|                 'content': content_filter(to_string(content)), |                 'content': content_filter(to_string(content)), | ||||||
|             } |             } | ||||||
|             ) | 
 | ||||||
|  |             if thumbnail_query: | ||||||
|  |                 try: | ||||||
|  |                     thumbnail_query_result = query(result, thumbnail_query)[0] | ||||||
|  |                     tmp_result['thumbnail'] = thumbnail_prefix + to_string(thumbnail_query_result) | ||||||
|  |                 except:  # pylint: disable=bare-except | ||||||
|  |                     continue | ||||||
|  | 
 | ||||||
|  |             if is_onion: | ||||||
|  |                 tmp_result['is_onion'] = True | ||||||
|  | 
 | ||||||
|  |             results.append(tmp_result) | ||||||
|     else: |     else: | ||||||
|         for result in json: |         for result in json: | ||||||
|             url = query(result, url_query)[0] |             url = query(result, url_query)[0] | ||||||
| @ -274,6 +413,7 @@ def response(resp): | |||||||
|                     'url': url_prefix + to_string(url), |                     'url': url_prefix + to_string(url), | ||||||
|                     'title': title_filter(to_string(title)), |                     'title': title_filter(to_string(title)), | ||||||
|                     'content': content_filter(to_string(content)), |                     'content': content_filter(to_string(content)), | ||||||
|  |                     'is_onion': is_onion, | ||||||
|                 } |                 } | ||||||
|             ) |             ) | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Lucki
						Lucki