Merge pull request #348 from pointhi/swisscows
implement swisscows engine
This commit is contained in:
		
						commit
						9c7578bab6
					
				
							
								
								
									
										108
									
								
								searx/engines/swisscows.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										108
									
								
								searx/engines/swisscows.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,108 @@ | |||||||
|  | """ | ||||||
|  |  Swisscows (Web, Images) | ||||||
|  | 
 | ||||||
|  |  @website     https://swisscows.ch | ||||||
|  |  @provide-api no | ||||||
|  | 
 | ||||||
|  |  @using-api   no | ||||||
|  |  @results     HTML (using search portal) | ||||||
|  |  @stable      no (HTML can change) | ||||||
|  |  @parse       url, title, content | ||||||
|  | """ | ||||||
|  | 
 | ||||||
|  | from json import loads | ||||||
|  | from urllib import urlencode, unquote | ||||||
|  | import re | ||||||
|  | 
 | ||||||
# engine dependent config
categories = ['general', 'images']
paging = True
language_support = True

# search-url
base_url = 'https://swisscows.ch/'
search_string = '?{query}&page={page}'

# regex
# Raw string literals keep regex escapes such as `\s` and `\.` out of the
# hands of the Python string parser (they are not valid string escapes).
#
# regex_json locates the inline JSON blob: everything from `initialData:`
# up to the `environment` key that follows it.  `[\s\S]*` matches any
# character including newlines, without the per-character alternation of
# `(.|\n)*`.
regex_json = re.compile(r'initialData: {"Request":[\s\S]*},\s*environment')
# the two patterns below trim the match of regex_json down to the bare JSON
regex_json_remove_start = re.compile(r'^initialData:\s*')
regex_json_remove_end = re.compile(r',\s*environment$')
# prefix put in front of the (percent-encoded) image location
regex_img_url_remove_start = re.compile(r'^https?://i\.swisscows\.ch/\?link=')
|  | 
 | ||||||
|  | 
 | ||||||
|  | # do search-request | ||||||
def request(query, params):
    """Fill ``params['url']`` with the Swisscows search URL for *query*.

    Reads ``params['language']``, ``params['pageno']`` and
    ``params['category']``.  A language of ``'all'`` is sent as ``'browser'``
    for both the UI language and the region; otherwise the region is the
    language code with ``_`` replaced by ``-`` and the UI language is the
    part before the first ``_``.

    Returns the same ``params`` object that was passed in.
    """
    language = params['language']
    if language != 'all':
        ui_language = language.split('_')[0]
        region = language.replace('_', '-')
    else:
        ui_language = region = 'browser'

    encoded_args = urlencode({'query': query,
                              'uiLanguage': ui_language,
                              'region': region})
    search_path = search_string.format(query=encoded_args,
                                       page=params['pageno'])

    # image search query is something like 'image?{query}&page={page}'
    path_prefix = 'image' if params['category'] == 'images' else ''

    params['url'] = base_url + path_prefix + search_path
    return params
|  | 
 | ||||||
|  | 
 | ||||||
|  | # get response from search-request | ||||||
def _strip_markers(text):
    """Return *text* without the private-use characters U+E000 and U+E001.

    NOTE(review): these look like begin/end highlight markers around the
    matched query terms -- confirm against live responses.
    """
    return text.replace(u'\uE000', '').replace(u'\uE001', '')


def _image_result(item):
    """Build an ``images.html`` result dict from one image item."""
    # drop the `i.swisscows.ch/?link=` prefix and percent-decode the rest
    img_url = unquote(regex_img_url_remove_start.sub('', item['Url']))

    return {'url': item['SourceUrl'],
            'title': _strip_markers(item['Title']),
            'content': '',
            'img_src': img_url,
            'template': 'images.html'}


def response(resp):
    """Parse a Swisscows result page into a list of result dicts.

    The results are read from the JSON object assigned to ``initialData``
    inside the page source (``resp.content``).  Items of ``Results.items``
    whose ``ContentType`` starts with ``image`` and all entries of ``Images``
    become image results; every other item becomes a general result with
    ``url``, ``title`` and ``content``.

    Returns an empty list when the page contains no ``initialData`` blob.
    Raises ``AttributeError`` if *resp* has no ``content`` attribute.
    """
    results = []

    json_match = regex_json.search(resp.content)

    # check if results are returned
    if not json_match:
        return []

    json_raw = regex_json_remove_end.sub(
        '', regex_json_remove_start.sub('', json_match.group()))
    data = loads(json_raw)

    # parse results (tolerate a missing or null 'Results' section)
    for item in (data.get('Results') or {}).get('items', []):
        if item.get('ContentType', '').startswith('image'):
            # image results share the marker-stripped title handling below
            results.append(_image_result(item))
        else:
            results.append({'url': _strip_markers(item['Url']),
                            'title': _strip_markers(item['Title']),
                            'content': _strip_markers(item['Description'])})

    # parse images
    for item in data.get('Images') or []:
        results.append(_image_result(item))

    # return results
    return results
| @ -213,6 +213,10 @@ engines: | |||||||
|     timeout : 6.0 |     timeout : 6.0 | ||||||
|     disabled : True |     disabled : True | ||||||
| 
 | 
 | ||||||
|  |   - name : swisscows | ||||||
|  |     engine : swisscows | ||||||
|  |     shortcut : sw | ||||||
|  | 
 | ||||||
|   - name : twitter |   - name : twitter | ||||||
|     engine : twitter |     engine : twitter | ||||||
|     shortcut : tw |     shortcut : tw | ||||||
|  | |||||||
							
								
								
									
										124
									
								
								searx/tests/engines/test_swisscows.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										124
									
								
								searx/tests/engines/test_swisscows.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,124 @@ | |||||||
|  | from collections import defaultdict | ||||||
|  | import mock | ||||||
|  | from searx.engines import swisscows | ||||||
|  | from searx.testing import SearxTestCase | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class TestSwisscowsEngine(SearxTestCase):
    """Unit tests for the ``request`` and ``response`` hooks of the engine."""

    def test_request(self):
        """The built URL carries the query, language and category."""
        query = 'test_query'
        dicto = defaultdict(dict)
        dicto['pageno'] = 1
        dicto['language'] = 'de_DE'
        params = swisscows.request(query, dicto)
        self.assertIn('url', params)
        self.assertIn(query, params['url'])
        self.assertIn('swisscows.ch', params['url'])
        self.assertIn('uiLanguage=de', params['url'])
        self.assertIn('region=de-DE', params['url'])

        dicto['language'] = 'all'
        params = swisscows.request(query, dicto)
        self.assertIn('uiLanguage=browser', params['url'])
        self.assertIn('region=browser', params['url'])

        # the images category is served from the 'image' path
        dicto['category'] = 'images'
        params = swisscows.request(query, dicto)
        self.assertIn('swisscows.ch/image?', params['url'])

    def test_response(self):
        """Invalid input raises, empty pages yield [], results are parsed."""
        # objects without a `content` attribute are rejected
        self.assertRaises(AttributeError, swisscows.response, None)
        self.assertRaises(AttributeError, swisscows.response, [])
        self.assertRaises(AttributeError, swisscows.response, '')
        self.assertRaises(AttributeError, swisscows.response, '[]')

        # a page without the initialData blob gives no results
        response = mock.Mock(content='<html></html>')
        self.assertEqual(swisscows.response(response), [])

        html = u"""
        <script>
            App.Dispatcher.dispatch("initialize", {
                html5history: true,
                initialData: {"Request":
                    {"Page":1,
                    "ItemsCount":1,
                    "Query":"This should ",
                    "NormalizedQuery":"This should ",
                    "Region":"de-AT",
                    "UILanguage":"de"},
                    "Results":{"items":[
                            {"Title":"\uE000This should\uE001 be the title",
                            "Description":"\uE000This should\uE001 be the content.",
                            "Url":"http://this.should.be.the.link/",
                            "DisplayUrl":"www.\uE000this.should.be.the\uE001.link",
                            "Id":"782ef287-e439-451c-b380-6ebc14ba033d"},
                            {"Title":"Datei:This should1.svg",
                            "Url":"https://i.swisscows.ch/?link=http%3a%2f%2fts2.mm.This/should1.png",
                            "SourceUrl":"http://de.wikipedia.org/wiki/Datei:This should1.svg",
                            "DisplayUrl":"de.wikipedia.org/wiki/Datei:This should1.svg",
                            "Width":950,
                            "Height":534,
                            "FileSize":92100,
                            "ContentType":"image/jpeg",
                            "Thumbnail":{
                                "Url":"https://i.swisscows.ch/?link=http%3a%2f%2fts2.mm.This/should1.png",
                                "ContentType":"image/jpeg",
                                "Width":300,
                                "Height":168,
                                "FileSize":9134},
                                "Id":"6a97a542-8f65-425f-b7f6-1178c3aba7be"
                            }
                        ],"TotalCount":55300,
                        "Query":"This should "
                    },
                    "Images":[{"Title":"Datei:This should.svg",
                        "Url":"https://i.swisscows.ch/?link=http%3a%2f%2fts2.mm.This/should.png",
                        "SourceUrl":"http://de.wikipedia.org/wiki/Datei:This should.svg",
                        "DisplayUrl":"de.wikipedia.org/wiki/Datei:This should.svg",
                        "Width":1280,
                        "Height":677,
                        "FileSize":50053,
                        "ContentType":"image/png",
                        "Thumbnail":{"Url":"https://i.swisscows.ch/?link=http%3a%2f%2fts2.mm.This/should.png",
                            "ContentType":"image/png",
                            "Width":300,
                            "Height":158,
                            "FileSize":8023},
                        "Id":"ae230fd8-a06a-47d6-99d5-e74766d8143a"}]},
                environment: "production"
            }).then(function (options) {
                $('#Search_Form').on('submit', function (e) {
                    if (!Modernizr.history) return;
                    e.preventDefault();

                    var $form = $(this),
                        $query = $('#Query'),
                        query = $.trim($query.val()),
                        path = App.Router.makePath($form.attr('action'), null, $form.serializeObject())

                    if (query.length) {
                        options.html5history ?
                            ReactRouter.HistoryLocation.push(path) :
                            ReactRouter.RefreshLocation.push(path);
                    }
                    else $('#Query').trigger('blur');
                });

            });
        </script>
        """
        response = mock.Mock(content=html)
        results = swisscows.response(response)
        self.assertIsInstance(results, list)
        self.assertEqual(len(results), 3)
        # general result: the U+E000/U+E001 markers are stripped
        self.assertEqual(results[0]['title'], 'This should be the title')
        self.assertEqual(results[0]['url'], 'http://this.should.be.the.link/')
        self.assertEqual(results[0]['content'], 'This should be the content.')
        # image item found inside Results.items
        self.assertEqual(results[1]['title'], 'Datei:This should1.svg')
        self.assertEqual(results[1]['url'], 'http://de.wikipedia.org/wiki/Datei:This should1.svg')
        self.assertEqual(results[1]['img_src'], 'http://ts2.mm.This/should1.png')
        self.assertEqual(results[1]['template'], 'images.html')
        # image item from the separate Images list
        self.assertEqual(results[2]['title'], 'Datei:This should.svg')
        self.assertEqual(results[2]['url'], 'http://de.wikipedia.org/wiki/Datei:This should.svg')
        self.assertEqual(results[2]['img_src'], 'http://ts2.mm.This/should.png')
        self.assertEqual(results[2]['template'], 'images.html')
| @ -32,6 +32,7 @@ from searx.tests.engines.test_spotify import *  # noqa | |||||||
| from searx.tests.engines.test_stackoverflow import *  # noqa | from searx.tests.engines.test_stackoverflow import *  # noqa | ||||||
| from searx.tests.engines.test_startpage import *  # noqa | from searx.tests.engines.test_startpage import *  # noqa | ||||||
| from searx.tests.engines.test_subtitleseeker import *  # noqa | from searx.tests.engines.test_subtitleseeker import *  # noqa | ||||||
|  | from searx.tests.engines.test_swisscows import *  # noqa | ||||||
| from searx.tests.engines.test_twitter import *  # noqa | from searx.tests.engines.test_twitter import *  # noqa | ||||||
| from searx.tests.engines.test_vimeo import *  # noqa | from searx.tests.engines.test_vimeo import *  # noqa | ||||||
| from searx.tests.engines.test_www1x import *  # noqa | from searx.tests.engines.test_www1x import *  # noqa | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Adam Tauber
						Adam Tauber