[fix] semantic scholar: method not allowed / engine doesn't work
Fixes the Semantic Scholar engine by extracting a UI version token. BTW: remove HTML tags from the content. Author's checklist: - They rate-limit very quickly: if you make more than approximately 2 requests per minute, you have to wait some time before you can send requests again... - They also have an official API at api.semanticscholar.org, but its rate limits are even stricter. Closes: https://github.com/searxng/searxng/issues/4685
This commit is contained in:
		
							parent
							
								
									60e31eacfc
								
							
						
					
					
						commit
						2757f8ec33
					
				@ -1,11 +1,14 @@
 | 
				
			|||||||
# SPDX-License-Identifier: AGPL-3.0-or-later
 | 
					# SPDX-License-Identifier: AGPL-3.0-or-later
 | 
				
			||||||
"""Semantic Scholar (Science)
 | 
					"""Semantic Scholar (Science)"""
 | 
				
			||||||
"""
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
from json import dumps, loads
 | 
					from json import dumps
 | 
				
			||||||
from datetime import datetime
 | 
					from datetime import datetime
 | 
				
			||||||
 | 
					from lxml import html
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from flask_babel import gettext
 | 
					from flask_babel import gettext
 | 
				
			||||||
 | 
					from searx.network import get
 | 
				
			||||||
 | 
					from searx.utils import eval_xpath_getindex, gen_useragent, html_to_text
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
about = {
 | 
					about = {
 | 
				
			||||||
    "website": 'https://www.semanticscholar.org/',
 | 
					    "website": 'https://www.semanticscholar.org/',
 | 
				
			||||||
@ -19,13 +22,31 @@ about = {
 | 
				
			|||||||
categories = ['science', 'scientific publications']
 | 
					categories = ['science', 'scientific publications']
 | 
				
			||||||
paging = True
 | 
					paging = True
 | 
				
			||||||
search_url = 'https://www.semanticscholar.org/api/1/search'
 | 
					search_url = 'https://www.semanticscholar.org/api/1/search'
 | 
				
			||||||
paper_url = 'https://www.semanticscholar.org/paper'
 | 
					base_url = 'https://www.semanticscholar.org'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _get_ui_version():
 | 
				
			||||||
 | 
					    resp = get(base_url)
 | 
				
			||||||
 | 
					    if not resp.ok:
 | 
				
			||||||
 | 
					        raise RuntimeError("Can't determine Semantic Scholar UI version")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    doc = html.fromstring(resp.text)
 | 
				
			||||||
 | 
					    ui_version = eval_xpath_getindex(doc, "//meta[@name='s2-ui-version']/@content", 0)
 | 
				
			||||||
 | 
					    if not ui_version:
 | 
				
			||||||
 | 
					        raise RuntimeError("Can't determine Semantic Scholar UI version")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return ui_version
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def request(query, params):
 | 
					def request(query, params):
 | 
				
			||||||
    params['url'] = search_url
 | 
					    params['url'] = search_url
 | 
				
			||||||
    params['method'] = 'POST'
 | 
					    params['method'] = 'POST'
 | 
				
			||||||
    params['headers']['content-type'] = 'application/json'
 | 
					    params['headers'] = {
 | 
				
			||||||
 | 
					        'Content-Type': 'application/json',
 | 
				
			||||||
 | 
					        'X-S2-UI-Version': _get_ui_version(),
 | 
				
			||||||
 | 
					        'X-S2-Client': "webapp-browser",
 | 
				
			||||||
 | 
					        'User-Agent': gen_useragent(),
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
    params['data'] = dumps(
 | 
					    params['data'] = dumps(
 | 
				
			||||||
        {
 | 
					        {
 | 
				
			||||||
            "queryString": query,
 | 
					            "queryString": query,
 | 
				
			||||||
@ -43,7 +64,8 @@ def request(query, params):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def response(resp):
 | 
					def response(resp):
 | 
				
			||||||
    res = loads(resp.text)
 | 
					    res = resp.json()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    results = []
 | 
					    results = []
 | 
				
			||||||
    for result in res['results']:
 | 
					    for result in res['results']:
 | 
				
			||||||
        url = result.get('primaryPaperLink', {}).get('url')
 | 
					        url = result.get('primaryPaperLink', {}).get('url')
 | 
				
			||||||
@ -54,7 +76,7 @@ def response(resp):
 | 
				
			|||||||
            if alternatePaperLinks:
 | 
					            if alternatePaperLinks:
 | 
				
			||||||
                url = alternatePaperLinks[0].get('url')
 | 
					                url = alternatePaperLinks[0].get('url')
 | 
				
			||||||
        if not url:
 | 
					        if not url:
 | 
				
			||||||
            url = paper_url + '/%s' % result['id']
 | 
					            url = base_url + '/paper/%s' % result['id']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # publishedDate
 | 
					        # publishedDate
 | 
				
			||||||
        if 'pubDate' in result:
 | 
					        if 'pubDate' in result:
 | 
				
			||||||
@ -88,7 +110,7 @@ def response(resp):
 | 
				
			|||||||
                'template': 'paper.html',
 | 
					                'template': 'paper.html',
 | 
				
			||||||
                'url': url,
 | 
					                'url': url,
 | 
				
			||||||
                'title': result['title']['text'],
 | 
					                'title': result['title']['text'],
 | 
				
			||||||
                'content': result['paperAbstract']['text'],
 | 
					                'content': html_to_text(result['paperAbstract']['text']),
 | 
				
			||||||
                'journal': result.get('venue', {}).get('text') or result.get('journal', {}).get('name'),
 | 
					                'journal': result.get('venue', {}).get('text') or result.get('journal', {}).get('name'),
 | 
				
			||||||
                'doi': result.get('doiInfo', {}).get('doi'),
 | 
					                'doi': result.get('doiInfo', {}).get('doi'),
 | 
				
			||||||
                'tags': result.get('fieldsOfStudy'),
 | 
					                'tags': result.get('fieldsOfStudy'),
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user