[mod] engine ask.com - parse JS result to JSON
Parse the result list from ask.com, which is embedded in the page's JavaScript
variable window.MESON.initialState::
    <script nonce="..">
        window.MESON = window.MESON || {};
        window.MESON.initialState = {"siteConfig": ...
          ...}};
        window.MESON.loadedLang = "en";
    </script>
The result list is in the field::
    json_resp['search']['webResults']['results']
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
			
			
This commit is contained in:
		
							parent
							
								
									3df53d6e50
								
							
						
					
					
						commit
						76845ea42c
					
				| @ -3,8 +3,9 @@ | |||||||
| """Ask.com""" | """Ask.com""" | ||||||
| 
 | 
 | ||||||
| from urllib.parse import urlencode | from urllib.parse import urlencode | ||||||
| import re | import dateutil | ||||||
| from lxml import html | from lxml import html | ||||||
|  | from searx import utils | ||||||
| 
 | 
 | ||||||
| # Metadata | # Metadata | ||||||
| about = { | about = { | ||||||
| @ -37,20 +38,37 @@ def request(query, params): | |||||||
| 
 | 
 | ||||||
| def response(resp): | def response(resp): | ||||||
| 
 | 
 | ||||||
|     text = html.fromstring(resp.text).text_content() |     start_tag = 'window.MESON.initialState = {' | ||||||
|     urls_match = re.findall(r'"url":"(.*?)"', text) |     end_tag = '}};' | ||||||
|     titles_match = re.findall(r'"title":"(.*?)"', text)[3:] |  | ||||||
|     content_match = re.findall(r'"abstract":"(.*?)"', text) |  | ||||||
| 
 | 
 | ||||||
|     results = [ |     dom = html.fromstring(resp.text) | ||||||
|  |     script = utils.eval_xpath_getindex(dom, '//script', 0, default=None).text | ||||||
|  | 
 | ||||||
|  |     pos = script.index(start_tag) + len(start_tag) - 1 | ||||||
|  |     script = script[pos:] | ||||||
|  |     pos = script.index(end_tag) + len(end_tag) - 1 | ||||||
|  |     script = script[:pos] | ||||||
|  | 
 | ||||||
|  |     json_resp = utils.js_variable_to_python(script) | ||||||
|  | 
 | ||||||
|  |     results = [] | ||||||
|  | 
 | ||||||
|  |     for item in json_resp['search']['webResults']['results']: | ||||||
|  | 
 | ||||||
|  |         pubdate_original = item.get('pubdate_original') | ||||||
|  |         if pubdate_original: | ||||||
|  |             pubdate_original = dateutil.parser.parse(pubdate_original) | ||||||
|  |         metadata = [item.get(field) for field in ['category_l1', 'catsy'] if item.get(field)] | ||||||
|  | 
 | ||||||
|  |         results.append( | ||||||
|             { |             { | ||||||
|             "url": url, |                 "url": item['url'], | ||||||
|             "title": title, |                 "title": item['title'], | ||||||
|             "content": content, |                 "content": item['abstract'], | ||||||
|  |                 "publishedDate": pubdate_original, | ||||||
|  |                 # "img_src": item.get('image_url') or None, # these are not thumbs / to large | ||||||
|  |                 "metadata": ' | '.join(metadata), | ||||||
|             } |             } | ||||||
|         for url, title, content in zip(urls_match, titles_match, content_match) |         ) | ||||||
|         if "&qo=relatedSearchNarrow" not in url |  | ||||||
|         # Related searches shouldn't be in the search results: www.ask.com/web&q=related |  | ||||||
|     ] |  | ||||||
| 
 | 
 | ||||||
|     return results |     return results | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Markus Heiser
						Markus Heiser