Merge pull request #522 from jibe-b/master
add BASE engine in category "Science"
This commit is contained in:
		
						commit
						fff9460238
					
				
							
								
								
									
										122
									
								
								searx/engines/base.py
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										122
									
								
								searx/engines/base.py
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,122 @@ | ||||
| #!/usr/bin/env python | ||||
| 
 | ||||
| """ | ||||
|  BASE (Scholar publications) | ||||
| 
 | ||||
|  @website     https://base-search.net | ||||
|  @provide-api yes with authorization (https://api.base-search.net/) | ||||
| 
 | ||||
|  @using-api   yes | ||||
|  @results     XML | ||||
|  @stable      ? | ||||
|  @parse       url, title, publishedDate, content | ||||
|  More info on api: http://base-search.net/about/download/base_interface.pdf | ||||
| """ | ||||
| 
 | ||||
| from lxml import etree | ||||
| from urllib import urlencode | ||||
| from searx.utils import searx_useragent | ||||
| from cgi import escape | ||||
| from datetime import datetime | ||||
| import re | ||||
| 
 | ||||
| 
 | ||||
# searx category this engine appears under
categories = ['science']

# HTTP search endpoint of the BASE API; the {query}, {hits} and {offset}
# placeholders are filled in by request() below.
base_url = 'https://api.base-search.net/cgi-bin/BaseHttpSearchInterface.fcgi'\
           + '?func=PerformSearch&{query}&boost=oa&hits={hits}&offset={offset}'

# engine dependent config
paging = True
# results requested per page (used for both "hits" and offset arithmetic)
number_of_results = 10

# shortcuts for advanced search
# NOTE(review): the name is misspelled ("shorcut") but is kept as-is because
# request() refers to it by this exact name.
shorcut_dict = {
    # user-friendly keywords mapped to the API's Dublin-Core field prefixes
    'format:': 'dcformat:',
    'author:': 'dccreator:',
    'collection:': 'dccollection:',
    'hdate:': 'dchdate:',
    'contributor:': 'dccontributor:',
    'coverage:': 'dccoverage:',
    'date:': 'dcdate:',
    'abstract:': 'dcdescription:',
    'urls:': 'dcidentifier:',
    'language:': 'dclanguage:',
    'publisher:': 'dcpublisher:',
    'relation:': 'dcrelation:',
    'rights:': 'dcrights:',
    'source:': 'dcsource:',
    'subject:': 'dcsubject:',
    'title:': 'dctitle:',
    'type:': 'dcdctype:'
}
| 
 | ||||
| 
 | ||||
def request(query, params):
    """Build the BASE API request for *query*.

    Replaces the user-friendly search shortcuts (see ``shorcut_dict``)
    with the API's advanced-search keywords, computes the paging offset
    from ``params['pageno']`` and stores the final URL in ``params['url']``.

    :param query:  raw search string entered by the user
    :param params: searx request params dict (read: 'pageno';
                   written: 'url', 'headers')
    :returns: the mutated ``params`` dict
    """
    # replace shortcuts with API advanced search keywords.
    # str.replace, not re.sub: the shortcuts are literal strings, and
    # feeding them to the regex engine would misbehave if one ever
    # contained a metacharacter.
    for shortcut, api_keyword in shorcut_dict.items():
        query = query.replace(shortcut, api_keyword)

    # basic search: offset is zero-based, pageno is one-based
    offset = (params['pageno'] - 1) * number_of_results

    string_args = dict(query=urlencode({'query': query}),
                       offset=offset,
                       hits=number_of_results)

    params['url'] = base_url.format(**string_args)

    # identify searx to the BASE API via a descriptive User-Agent
    params['headers']['User-Agent'] = searx_useragent()
    return params
| 
 | ||||
| 
 | ||||
def response(resp):
    """Parse the BASE API XML response.

    :param resp: HTTP response object whose ``content`` is the API's XML
    :returns: list of result dicts with 'url', 'title', 'content' and,
              when a date could be parsed, 'publishedDate'
    """
    results = []

    search_results = etree.XML(resp.content)

    # findall is equivalent to xpath('./result/doc') for this simple path
    for entry in search_results.findall('./result/doc'):
        content = "No description available"
        # defaults so that a <doc> missing dclink/dctitle neither raises a
        # NameError nor reuses values from the previous entry
        url = ""
        title = ""
        date = None

        for item in entry:
            field = item.attrib["name"]
            if field == "dcdate":
                date = item.text

            elif field == "dctitle":
                title = item.text

            elif field == "dclink":
                url = item.text

            elif field == "dcdescription" and item.text:
                content = escape(item.text[:300])
                if len(item.text) > 300:
                    content += "..."

        # dates returned by the BASE API come in several formats
        publishedDate = None
        if date:
            for date_format in ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d', '%Y-%m', '%Y']:
                try:
                    publishedDate = datetime.strptime(date, date_format)
                    break
                except ValueError:
                    # try the next known format
                    pass

        res_dict = {'url': url,
                    'title': title,
                    'content': content}
        if publishedDate is not None:
            res_dict['publishedDate'] = publishedDate

        results.append(res_dict)

    return results
| @ -38,6 +38,10 @@ engines: | ||||
|     engine : archlinux | ||||
|     shortcut : al | ||||
| 
 | ||||
|   - name : base | ||||
|     engine : base | ||||
|     shortcut : bs | ||||
| 
 | ||||
|   - name : wikipedia | ||||
|     engine : mediawiki | ||||
|     shortcut : wp | ||||
|  | ||||
| @ -408,17 +408,21 @@ def index(): | ||||
| 
 | ||||
|         # TODO, check if timezone is calculated right | ||||
|         if 'publishedDate' in result: | ||||
|             result['pubdate'] = result['publishedDate'].strftime('%Y-%m-%d %H:%M:%S%z') | ||||
|             if result['publishedDate'].replace(tzinfo=None) >= datetime.now() - timedelta(days=1): | ||||
|                 timedifference = datetime.now() - result['publishedDate'].replace(tzinfo=None) | ||||
|                 minutes = int((timedifference.seconds / 60) % 60) | ||||
|                 hours = int(timedifference.seconds / 60 / 60) | ||||
|                 if hours == 0: | ||||
|                     result['publishedDate'] = gettext(u'{minutes} minute(s) ago').format(minutes=minutes) | ||||
|                 else: | ||||
|                     result['publishedDate'] = gettext(u'{hours} hour(s), {minutes} minute(s) ago').format(hours=hours, minutes=minutes)  # noqa | ||||
|             try:  # test if publishedDate >= 1900 (datetime module bug) | ||||
|                 result['pubdate'] = result['publishedDate'].strftime('%Y-%m-%d %H:%M:%S%z') | ||||
|             except ValueError: | ||||
|                 result['publishedDate'] = None | ||||
|             else: | ||||
|                 result['publishedDate'] = format_date(result['publishedDate']) | ||||
|                 if result['publishedDate'].replace(tzinfo=None) >= datetime.now() - timedelta(days=1): | ||||
|                     timedifference = datetime.now() - result['publishedDate'].replace(tzinfo=None) | ||||
|                     minutes = int((timedifference.seconds / 60) % 60) | ||||
|                     hours = int(timedifference.seconds / 60 / 60) | ||||
|                     if hours == 0: | ||||
|                         result['publishedDate'] = gettext(u'{minutes} minute(s) ago').format(minutes=minutes) | ||||
|                     else: | ||||
|                         result['publishedDate'] = gettext(u'{hours} hour(s), {minutes} minute(s) ago').format(hours=hours, minutes=minutes)  # noqa | ||||
|                 else: | ||||
|                     result['publishedDate'] = format_date(result['publishedDate']) | ||||
| 
 | ||||
|     if search.request_data.get('format') == 'json': | ||||
|         return Response(json.dumps({'query': search.query, | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Adam Tauber
						Adam Tauber