Merge pull request #2306 from return42/fix-1959
[fix] engine google-News: fix decoding of URLs
This commit is contained in:
		
						commit
						a5155a32c0
					
				| @ -27,10 +27,8 @@ The google news API ignores some parameters from the common :ref:`google API`: | |||||||
| 
 | 
 | ||||||
| from typing import TYPE_CHECKING | from typing import TYPE_CHECKING | ||||||
| 
 | 
 | ||||||
| import binascii |  | ||||||
| import re |  | ||||||
| from urllib.parse import urlencode | from urllib.parse import urlencode | ||||||
| from base64 import b64decode | import base64 | ||||||
| from lxml import html | from lxml import html | ||||||
| import babel | import babel | ||||||
| 
 | 
 | ||||||
| @ -144,34 +142,17 @@ def response(resp): | |||||||
| 
 | 
 | ||||||
|     for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'): |     for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'): | ||||||
| 
 | 
 | ||||||
|         # The first <a> tag in the <article> contains the link to the |         # The first <a> tag in the <article> contains the link to the article | ||||||
|         # article The href attribute of the <a> is a google internal link, |         # The href attribute of the <a> tag is a google internal link, we have | ||||||
|         # we can't use.  The real link is hidden in the jslog attribute: |         # to decode | ||||||
|         # |  | ||||||
|         #   <a ... |  | ||||||
|         #      jslog="95014; 4:https://www.cnn.com/.../index.html; track:click" |  | ||||||
|         #      href="./articles/CAIiENu3nGS...?hl=en-US&gl=US&ceid=US%3Aen" |  | ||||||
|         #      ... /> |  | ||||||
| 
 | 
 | ||||||
|         jslog = eval_xpath_getindex(result, './article/a/@jslog', 0) |         href = eval_xpath_getindex(result, './article/a/@href', 0) | ||||||
|         url = re.findall('http[^;]*', jslog) |         href = href.split('?')[0] | ||||||
|         if url: |         href = href.split('/')[-1] | ||||||
|             url = url[0] |         href = base64.urlsafe_b64decode(href + '====') | ||||||
|         else: |         href = href[4:].split(b'\xd2')[0] | ||||||
|             # The real URL is base64 encoded in the json attribute: |         href = href.decode() | ||||||
|             # jslog="95014; 5:W251bGwsbnVsbCxudW...giXQ==; track:click" |  | ||||||
|             jslog = jslog.split(";")[1].split(':')[1].strip() |  | ||||||
|             try: |  | ||||||
|                 padding = (4 - (len(jslog) % 4)) * "=" |  | ||||||
|                 jslog = b64decode(jslog + padding) |  | ||||||
|             except binascii.Error: |  | ||||||
|                 # URL can't be read, skip this result |  | ||||||
|                 continue |  | ||||||
| 
 | 
 | ||||||
|             # now we have : b'[null, ... null,"https://www.cnn.com/.../index.html"]' |  | ||||||
|             url = re.findall('http[^;"]*', str(jslog))[0] |  | ||||||
| 
 |  | ||||||
|         # the first <h3> tag in the <article> contains the title of the link |  | ||||||
|         title = extract_text(eval_xpath(result, './article/h3[1]')) |         title = extract_text(eval_xpath(result, './article/h3[1]')) | ||||||
| 
 | 
 | ||||||
|         # The pub_date is mostly a string like 'yesterday', not a real |         # The pub_date is mostly a string like 'yesterday', not a real | ||||||
| @ -189,7 +170,7 @@ def response(resp): | |||||||
| 
 | 
 | ||||||
|         results.append( |         results.append( | ||||||
|             { |             { | ||||||
|                 'url': url, |                 'url': href, | ||||||
|                 'title': title, |                 'title': title, | ||||||
|                 'content': content, |                 'content': content, | ||||||
|                 'img_src': img_src, |                 'img_src': img_src, | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Markus Heiser
						Markus Heiser