Merge pull request #1374 from dadosch/master
[WIP] [engine] Duden.de (German dictionary)
This commit is contained in:
		
						commit
						3126660be5
					
				
							
								
								
									
										76
									
								
								searx/engines/duden.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										76
									
								
								searx/engines/duden.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,76 @@ | |||||||
|  | """ | ||||||
|  |  Duden | ||||||
|  |  @website     https://www.duden.de | ||||||
|  |  @provide-api no | ||||||
|  |  @using-api   no | ||||||
|  |  @results     HTML (using search portal) | ||||||
|  |  @stable      no (HTML can change) | ||||||
|  |  @parse       url, title, content | ||||||
|  | """ | ||||||
|  | 
 | ||||||
|  | from lxml import html, etree | ||||||
|  | import re | ||||||
|  | from searx.engines.xpath import extract_text | ||||||
|  | from searx.url_utils import quote | ||||||
|  | from searx import logger | ||||||
|  | 
 | ||||||
# engine configuration
categories = ['general']  # categories this engine is listed under
paging = True  # request() builds the page offset from params['pageno']
language_support = False  # request() does not use any language setting

# search-url
# {query} is the URL-quoted search term, {offset} the page index
base_url = 'https://www.duden.de/'
search_url = base_url + 'suchen/dudenonline/{query}?page={offset}'
|  | 
 | ||||||
|  | 
 | ||||||
def request(query, params):
    """Pre-request callback: set the URL of the Duden search page.

    params<dict>:
      method  : POST/GET
      headers : {}
      data    : {} # if method == POST
      url     : ''
      category: 'search category'
      pageno  : 1 # number of the requested page

    Returns the same ``params`` dict with ``params['url']`` filled in.
    """
    # the ``page`` value sent in the URL is ``pageno - 1``
    page_index = params['pageno'] - 1

    params['url'] = search_url.format(query=quote(query), offset=page_index)
    return params
|  | 
 | ||||||
|  | 
 | ||||||
def response(resp):
    """Post-response callback: parse a Duden search page into results.

    resp: requests response object; only ``resp.text`` is read.

    Returns a list of dicts. If the result counter can be read from the
    page, the first entry is ``{'number_of_results': <int>}``. Every other
    entry describes one hit and has the keys ``url``, ``title`` and
    ``content``. Sections that cannot be parsed are logged and skipped.
    """
    results = []

    dom = html.fromstring(resp.text)

    # The counter is optional: a page without it still yields its hits.
    try:
        counter_text = dom.xpath(
            '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()')[0]
        # keep only the digits of the counter text before converting
        results.append({'number_of_results': int(re.sub(r'[^0-9]', '', counter_text))})
    except (IndexError, ValueError):
        # IndexError: counter element not present
        # ValueError: counter text contains no digits
        logger.debug("Couldn't read number of results.")

    for result in dom.xpath('//section[@class="wide" and not(contains(@style,"overflow:hidden"))]'):
        try:
            logger.debug("running for %s", result)
            link = result.xpath('.//h2/a')[0]
            url = link.attrib.get('href')
            if not url:
                # a hit without a target URL is of no use to the caller
                logger.debug('result without href in:\n%s', etree.tostring(result, pretty_print=True))
                continue
            title = result.xpath('string(.//h2/a)')
            content = extract_text(result.xpath('.//p'))
            # append result
            results.append({'url': url,
                            'title': title,
                            'content': content})
        except Exception:
            # Best effort: one malformed section must not drop the other
            # hits. Unlike a bare ``except``, this lets KeyboardInterrupt
            # and SystemExit propagate.
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))

    return results
| @ -714,6 +714,11 @@ engines: | |||||||
|     shortcut : 1337x |     shortcut : 1337x | ||||||
|     disabled : True |     disabled : True | ||||||
| 
 | 
 | ||||||
|  |   - name : Duden | ||||||
|  |     engine : duden | ||||||
|  |     shortcut : du | ||||||
|  |     disabled : True | ||||||
|  | 
 | ||||||
| #  - name : yacy | #  - name : yacy | ||||||
| #    engine : yacy | #    engine : yacy | ||||||
| #    shortcut : ya | #    shortcut : ya | ||||||
|  | |||||||
							
								
								
									
										41
									
								
								tests/unit/engines/test_duden.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										41
									
								
								tests/unit/engines/test_duden.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,41 @@ | |||||||
|  | from collections import defaultdict | ||||||
|  | import mock | ||||||
|  | from searx.engines import duden | ||||||
|  | from searx.testing import SearxTestCase | ||||||
|  | from datetime import datetime | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class TestDudenEngine(SearxTestCase):
    """Unit tests for the request/response callbacks of the Duden engine."""

    def test_request(self):
        """request() builds a duden.de URL that contains the query."""
        query = 'Haus'
        dic = defaultdict(dict)
        dic['pageno'] = 1
        params = duden.request(query, dic)
        # assertIn reports both operands on failure, unlike assertTrue(x in y)
        self.assertIn('url', params)
        self.assertIn(query, params['url'])
        self.assertIn('duden.de', params['url'])

    def test_response(self):
        """response() returns [] for an empty page and one dict per hit."""
        resp = mock.Mock(text='<html></html>')
        self.assertEqual(duden.response(resp), [])

        html = """
        <section class="wide">
        <h2><a href="https://this.is.the.url/" class="hidden-link"><strong>This is the title</strong> also here</a></h2>
        <p>This is the <strong>content</strong></p>
        <a href="https://this.is.the.url/">Zum vollständigen Artikel</a>
        </section>
        """

        resp = mock.Mock(text=html)
        results = duden.response(resp)

        self.assertIsInstance(results, list)
        self.assertEqual(len(results), 1)

        # testing result (dictionary entry)
        r = results[0]
        self.assertEqual(r['url'], 'https://this.is.the.url/')
        self.assertEqual(r['title'], 'This is the title also here')
        self.assertEqual(r['content'], 'This is the content')
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Adam Tauber
						Adam Tauber