[fix] yahoo_news engine
This commit is contained in:
		
							parent
							
								
									bbd83f5a51
								
							
						
					
					
						commit
						44ed4424f6
					
				| @ -23,15 +23,15 @@ paging = True | |||||||
| language_support = True | language_support = True | ||||||
| 
 | 
 | ||||||
| # search-url | # search-url | ||||||
| search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'  # noqa | search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&{lang}=uh3_news_web_gs_1&pz=10&xargs=0&vl=lang_{lang}'  # noqa | ||||||
| 
 | 
 | ||||||
| # specific xpath variables | # specific xpath variables | ||||||
| results_xpath = '//div[@class="res"]' | results_xpath = '//ol[contains(@class,"searchCenterMiddle")]//li' | ||||||
| url_xpath = './/h3/a/@href' | url_xpath = './/h3/a/@href' | ||||||
| title_xpath = './/h3/a' | title_xpath = './/h3/a' | ||||||
| content_xpath = './/div[@class="abstr"]' | content_xpath = './/div[@class="compText"]' | ||||||
| publishedDate_xpath = './/span[@class="timestamp"]' | publishedDate_xpath = './/span[contains(@class,"tri")]' | ||||||
| suggestion_xpath = '//div[@id="satat"]//a' | suggestion_xpath = '//div[contains(@class,"VerALSOTRY")]//a' | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # do search-request | # do search-request | ||||||
| @ -48,11 +48,18 @@ def request(query, params): | |||||||
|                                       lang=language) |                                       lang=language) | ||||||
| 
 | 
 | ||||||
|     # TODO required? |     # TODO required? | ||||||
|     params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\ |     params['cookies']['sB'] = '"v=1&vm=p&fl=1&vl=lang_{lang}&sh=1&pn=10&rw=new'\ | ||||||
|         .format(lang=language) |         .format(lang=language) | ||||||
|     return params |     return params | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def sanitize_url(url): | ||||||
|  |     if ".yahoo.com/" in url: | ||||||
|  |         return re.sub(u"\;\_ylt\=.+$", "", url) | ||||||
|  |     else: | ||||||
|  |         return url | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| # get response from search-request | # get response from search-request | ||||||
| def response(resp): | def response(resp): | ||||||
|     results = [] |     results = [] | ||||||
| @ -61,13 +68,17 @@ def response(resp): | |||||||
| 
 | 
 | ||||||
|     # parse results |     # parse results | ||||||
|     for result in dom.xpath(results_xpath): |     for result in dom.xpath(results_xpath): | ||||||
|         url = parse_url(extract_url(result.xpath(url_xpath), search_url)) |         urls = result.xpath(url_xpath) | ||||||
|  |         if len(urls) != 1: | ||||||
|  |             continue | ||||||
|  |         url = sanitize_url(parse_url(extract_url(urls, search_url))) | ||||||
|         title = extract_text(result.xpath(title_xpath)[0]) |         title = extract_text(result.xpath(title_xpath)[0]) | ||||||
|         content = extract_text(result.xpath(content_xpath)[0]) |         content = extract_text(result.xpath(content_xpath)[0]) | ||||||
| 
 | 
 | ||||||
|         # parse publishedDate |         # parse publishedDate | ||||||
|         publishedDate = extract_text(result.xpath(publishedDate_xpath)[0]) |         publishedDate = extract_text(result.xpath(publishedDate_xpath)[0]) | ||||||
| 
 | 
 | ||||||
|  |         # still useful ? | ||||||
|         if re.match("^[0-9]+ minute(s|) ago$", publishedDate): |         if re.match("^[0-9]+ minute(s|) ago$", publishedDate): | ||||||
|             publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group()))  # noqa |             publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group()))  # noqa | ||||||
|         else: |         else: | ||||||
|  | |||||||
| @ -39,19 +39,24 @@ class TestYahooNewsEngine(SearxTestCase): | |||||||
|         self.assertEqual(yahoo_news.response(response), []) |         self.assertEqual(yahoo_news.response(response), []) | ||||||
| 
 | 
 | ||||||
|         html = """ |         html = """ | ||||||
|         <div class="res"> |         <ol class=" reg searchCenterMiddle"> | ||||||
|             <div> |             <li class="first"> | ||||||
|                 <h3> |                 <div class="compTitle"> | ||||||
|                     <a class="yschttl spt" href="http://this.is.the.url" target="_blank"> |                     <h3> | ||||||
|                         This is |                         <a class="yschttl spt" href="http://this.is.the.url" target="_blank"> | ||||||
|                         the <b>title</b>... |                            This is | ||||||
|                     </a> |                            the <b>title</b>... | ||||||
|                 </h3> |                         </a> | ||||||
|             </div> |                     </h3> | ||||||
|             <span class="url">Business via Yahoo! Finance</span>   <span class="timestamp">Feb 03 09:45am</span> |                 </div> | ||||||
|             <div class="abstr"> |                 <div> | ||||||
|                 This is the content |                     <span class="cite">Business via Yahoo!</span> | ||||||
|             </div> |                     <span class="tri fc-2nd ml-10">May 01 10:00 AM</span> | ||||||
|  |                 </div> | ||||||
|  |                 <div class="compText"> | ||||||
|  |                    This is the content | ||||||
|  |                </div> | ||||||
|  |             </li> | ||||||
|         </div> |         </div> | ||||||
|         """ |         """ | ||||||
|         response = mock.Mock(text=html) |         response = mock.Mock(text=html) | ||||||
| @ -63,48 +68,59 @@ class TestYahooNewsEngine(SearxTestCase): | |||||||
|         self.assertEqual(results[0]['content'], 'This is the content') |         self.assertEqual(results[0]['content'], 'This is the content') | ||||||
| 
 | 
 | ||||||
|         html = """ |         html = """ | ||||||
|         <div class="res"> |         <ol class=" reg searchCenterMiddle"> | ||||||
|             <div> |             <li class="first"> | ||||||
|                 <h3> |                 <div class="compTitle"> | ||||||
|                     <a class="yschttl spt" href="http://this.is.the.url" target="_blank"> |                     <h3> | ||||||
|                         This is |                         <a class="yschttl spt" href="http://this.is.the.url" target="_blank"> | ||||||
|                         the <b>title</b>... |                             This is | ||||||
|                     </a> |                             the <b>title</b>... | ||||||
|                 </h3> |                         </a> | ||||||
|             </div> |                     </h3> | ||||||
|             <span class="url">Business via Yahoo!</span>   <span class="timestamp">2 hours, 22 minutes ago</span> |                 </div> | ||||||
|             <div class="abstr"> |                 <div> | ||||||
|                 This is the content |                     <span class="cite">Business via Yahoo!</span> | ||||||
|             </div> |                     <span class="tri fc-2nd ml-10">2 hours, 22 minutes ago</span> | ||||||
|         </div> |                 </div> | ||||||
|         <div class="res"> |                 <div class="compText"> | ||||||
|             <div> |                     This is the content | ||||||
|                 <h3> |                 </div> | ||||||
|                     <a class="yschttl spt" href="http://this.is.the.url" target="_blank"> |             </li> | ||||||
|                         This is |             <li> | ||||||
|                         the <b>title</b>... |                 <div class="compTitle"> | ||||||
|                     </a> |                     <h3> | ||||||
|                 </h3> |                         <a class="yschttl spt" href="http://this.is.the.url" target="_blank"> | ||||||
|             </div> |                             This is | ||||||
|             <span class="url">Business via Yahoo!</span>   <span class="timestamp">22 minutes ago</span> |                             the <b>title</b>... | ||||||
|             <div class="abstr"> |                         </a> | ||||||
|                 This is the content |                     </h3> | ||||||
|             </div> |                 </div> | ||||||
|         </div> |                 <div> | ||||||
|         <div class="res"> |                     <span class="cite">Business via Yahoo!</span> | ||||||
|             <div> |                     <span class="tri fc-2nd ml-10">22 minutes ago</span> | ||||||
|                 <h3> |                 </div> | ||||||
|                     <a class="yschttl spt" href="http://this.is.the.url" target="_blank"> |                 <div class="compText"> | ||||||
|                         This is |                     This is the content | ||||||
|                         the <b>title</b>... |                 </div> | ||||||
|                     </a> |             </li> | ||||||
|                 </h3> |             <li> | ||||||
|             </div> |                 <div class="compTitle"> | ||||||
|             <span class="url">Business via Yahoo!</span>   <span class="timestamp">Feb 03 09:45am 1900</span> |                     <h3> | ||||||
|             <div class="abstr"> |                         <a class="yschttl spt" href="http://this.is.the.url" target="_blank"> | ||||||
|                 This is the content |                             This is | ||||||
|             </div> |                             the <b>title</b>... | ||||||
|         </div> |                         </a> | ||||||
|  |                     </h3> | ||||||
|  |                 </div> | ||||||
|  |                 <div> | ||||||
|  |                     <span class="cite">Business via Yahoo!</span> | ||||||
|  |                     <span class="tri fc-2nd ml-10">Feb 03 09:45AM 1900</span> | ||||||
|  |                 </div> | ||||||
|  |                 <div class="compText"> | ||||||
|  |                     This is the content | ||||||
|  |                 </div> | ||||||
|  |             </li> | ||||||
|  |         </ol> | ||||||
|         """ |         """ | ||||||
|         response = mock.Mock(text=html) |         response = mock.Mock(text=html) | ||||||
|         results = yahoo_news.response(response) |         results = yahoo_news.response(response) | ||||||
| @ -114,30 +130,3 @@ class TestYahooNewsEngine(SearxTestCase): | |||||||
|         self.assertEqual(results[0]['url'], 'http://this.is.the.url/') |         self.assertEqual(results[0]['url'], 'http://this.is.the.url/') | ||||||
|         self.assertEqual(results[0]['content'], 'This is the content') |         self.assertEqual(results[0]['content'], 'This is the content') | ||||||
|         self.assertEqual(results[2]['publishedDate'].year, datetime.now().year) |         self.assertEqual(results[2]['publishedDate'].year, datetime.now().year) | ||||||
| 
 |  | ||||||
|         html = """ |  | ||||||
|         <li class="b_algo" u="0|5109|4755453613245655|UAGjXgIrPH5yh-o5oNHRx_3Zta87f_QO"> |  | ||||||
|             <div Class="sa_mc"> |  | ||||||
|                 <div class="sb_tlst"> |  | ||||||
|                     <h2> |  | ||||||
|                         <a href="http://this.should.be.the.link/" h="ID=SERP,5124.1"> |  | ||||||
|                         <strong>This</strong> should be the title</a> |  | ||||||
|                     </h2> |  | ||||||
|                 </div> |  | ||||||
|                 <div class="sb_meta"> |  | ||||||
|                 <cite> |  | ||||||
|                 <strong>this</strong>.meta.com</cite> |  | ||||||
|                     <span class="c_tlbxTrg"> |  | ||||||
|                         <span class="c_tlbxH" H="BASE:CACHEDPAGEDEFAULT" K="SERP,5125.1"> |  | ||||||
|                         </span> |  | ||||||
|                     </span> |  | ||||||
|                 </div> |  | ||||||
|                 <p> |  | ||||||
|                 <strong>This</strong> should be the content.</p> |  | ||||||
|             </div> |  | ||||||
|         </li> |  | ||||||
|         """ |  | ||||||
|         response = mock.Mock(text=html) |  | ||||||
|         results = yahoo_news.response(response) |  | ||||||
|         self.assertEqual(type(results), list) |  | ||||||
|         self.assertEqual(len(results), 0) |  | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Alexandre Flament
						Alexandre Flament