Merge pull request #1374 from dadosch/master
[WIP] [engine] Duden.de (German dictionary)
This commit is contained in:
		
						commit
						3126660be5
					
				
							
								
								
									
										76
									
								
								searx/engines/duden.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										76
									
								
								searx/engines/duden.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,76 @@ | ||||
| """ | ||||
|  Duden | ||||
|  @website     https://www.duden.de | ||||
|  @provide-api no | ||||
|  @using-api   no | ||||
|  @results     HTML (using search portal) | ||||
|  @stable      no (HTML can change) | ||||
|  @parse       url, title, content | ||||
| """ | ||||
| 
 | ||||
| from lxml import html, etree | ||||
| import re | ||||
| from searx.engines.xpath import extract_text | ||||
| from searx.url_utils import quote | ||||
| from searx import logger | ||||
| 
 | ||||
| categories = ['general'] | ||||
| paging = True | ||||
| language_support = False | ||||
| 
 | ||||
| # search-url | ||||
| base_url = 'https://www.duden.de/' | ||||
| search_url = base_url + 'suchen/dudenonline/{query}?page={offset}' | ||||
| 
 | ||||
| 
 | ||||
def request(query, params):
    '''pre-request callback: fill in the search URL for this query

    params<dict>:
      pageno  : 1 # number of the requested page (read here)
      url     : '' # overwritten with the formatted search URL

    Returns the same params dict that was passed in.
    '''
    # the URL's `page` parameter receives pageno - 1
    page_index = params['pageno'] - 1
    encoded_query = quote(query)

    params['url'] = search_url.format(query=encoded_query, offset=page_index)
    return params
| 
 | ||||
| 
 | ||||
def response(resp):
    '''post-response callback

    Parse a search result page into a list of result dicts.

    resp: requests response object (only ``resp.text`` is read)

    Returns a list that contains, when the counter could be read, one
    ``{'number_of_results': int}`` entry, followed by one
    ``{'url': ..., 'title': ..., 'content': ...}`` entry for every result
    section that has a heading link.
    '''
    results = []

    dom = html.fromstring(resp.text)

    # the total is read from the <span> inside the active "suchen/dudenonline" link
    counter_texts = dom.xpath(
        '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()')
    try:
        number_of_results_string = re.sub('[^0-9]', '', counter_texts[0])
        results.append({'number_of_results': int(number_of_results_string)})
    except (IndexError, ValueError):
        # IndexError: the counter element is not in the page
        # ValueError: its text contains no digits
        logger.debug("Couldn't read number of results.")

    for result in dom.xpath('//section[@class="wide" and not(contains(@style,"overflow:hidden"))]'):
        logger.debug("running for %s", result)

        links = result.xpath('.//h2/a')
        if not links:
            # a section without a heading link cannot be turned into a result
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
            continue

        url = links[0].attrib.get('href')
        # string() flattens nested markup (e.g. <strong>) into plain text
        title = result.xpath('string(.//h2/a)')
        content = extract_text(result.xpath('.//p'))

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content})

    return results
| @ -714,6 +714,11 @@ engines: | ||||
|     shortcut : 1337x | ||||
|     disabled : True | ||||
| 
 | ||||
|   - name : Duden | ||||
|     engine : duden | ||||
|     shortcut : du | ||||
|     disabled : True | ||||
| 
 | ||||
| #  - name : yacy | ||||
| #    engine : yacy | ||||
| #    shortcut : ya | ||||
|  | ||||
							
								
								
									
										41
									
								
								tests/unit/engines/test_duden.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										41
									
								
								tests/unit/engines/test_duden.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,41 @@ | ||||
| from collections import defaultdict | ||||
| import mock | ||||
| from searx.engines import duden | ||||
| from searx.testing import SearxTestCase | ||||
| from datetime import datetime | ||||
| 
 | ||||
| 
 | ||||
class TestDudenEngine(SearxTestCase):
    """Unit tests for the request/response callbacks of the duden engine."""

    def test_request(self):
        """request() must store a duden.de URL containing the query in params."""
        query = 'Haus'
        dic = defaultdict(dict)
        dic['pageno'] = 1
        params = duden.request(query, dic)
        # assertIn reports both operands on failure, unlike assertTrue(x in y)
        self.assertIn('url', params)
        self.assertIn(query, params['url'])
        self.assertIn('duden.de', params['url'])

    def test_response(self):
        """response() must return [] for an empty page and parse one result section."""
        resp = mock.Mock(text='<html></html>')
        self.assertEqual(duden.response(resp), [])

        html = """
        <section class="wide">
        <h2><a href="https://this.is.the.url/" class="hidden-link"><strong>This is the title</strong> also here</a></h2>
        <p>This is the <strong>content</strong></p>
        <a href="https://this.is.the.url/">Zum vollständigen Artikel</a>
        </section>
        """

        resp = mock.Mock(text=html)
        results = duden.response(resp)

        # check the container type first so a wrong type is reported as such
        self.assertIsInstance(results, list)
        self.assertEqual(len(results), 1)

        # testing result (dictionary entry)
        r = results[0]
        self.assertEqual(r['url'], 'https://this.is.the.url/')
        self.assertEqual(r['title'], 'This is the title also here')
        self.assertEqual(r['content'], 'This is the content')
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Adam Tauber
						Adam Tauber