[fix] semantic scholar: method not allowed / engine doesn't work

Fixes the semantic scholar engine by extracting a ui version token.

BTW: also strip HTML tags from the result content.

Author's checklist:

- they rate-limit very quickly: if you send more than roughly 2 requests per
  minute, you have to wait a while before you can query again.

- they also have an official API at api.semanticscholar.org, but its rate
  limits are even stricter

Closes: https://github.com/searxng/searxng/issues/4685
This commit is contained in:
Bnyro 2025-04-29 22:34:44 +02:00 committed by Markus Heiser
parent 41e3a0baa7
commit 590b211652

View File

@ -1,11 +1,14 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
"""Semantic Scholar (Science) """Semantic Scholar (Science)"""
"""
from json import dumps, loads from json import dumps
from datetime import datetime from datetime import datetime
from lxml import html
from flask_babel import gettext from flask_babel import gettext
from searx.network import get
from searx.utils import eval_xpath_getindex, gen_useragent, html_to_text
about = { about = {
"website": 'https://www.semanticscholar.org/', "website": 'https://www.semanticscholar.org/',
@ -19,13 +22,31 @@ about = {
categories = ['science', 'scientific publications'] categories = ['science', 'scientific publications']
paging = True paging = True
search_url = 'https://www.semanticscholar.org/api/1/search' search_url = 'https://www.semanticscholar.org/api/1/search'
paper_url = 'https://www.semanticscholar.org/paper' base_url = 'https://www.semanticscholar.org'
def _get_ui_version():
    """Return the UI version token published on the Semantic Scholar site.

    Fetches ``base_url`` and reads the ``content`` attribute of the
    ``<meta name="s2-ui-version">`` element from the returned HTML.

    Raises:
        RuntimeError: if the HTTP response is not OK, or if the extracted
            token is empty.
    """
    failure = "Can't determine Semantic Scholar UI version"

    landing_page = get(base_url)
    if not landing_page.ok:
        raise RuntimeError(failure)

    tree = html.fromstring(landing_page.text)
    token = eval_xpath_getindex(tree, "//meta[@name='s2-ui-version']/@content", 0)
    if token:
        return token

    # The meta tag was matched but carried no usable value.
    raise RuntimeError(failure)
def request(query, params): def request(query, params):
params['url'] = search_url params['url'] = search_url
params['method'] = 'POST' params['method'] = 'POST'
params['headers']['content-type'] = 'application/json' params['headers'] = {
'Content-Type': 'application/json',
'X-S2-UI-Version': _get_ui_version(),
'X-S2-Client': "webapp-browser",
'User-Agent': gen_useragent(),
}
params['data'] = dumps( params['data'] = dumps(
{ {
"queryString": query, "queryString": query,
@ -43,7 +64,8 @@ def request(query, params):
def response(resp): def response(resp):
res = loads(resp.text) res = resp.json()
results = [] results = []
for result in res['results']: for result in res['results']:
url = result.get('primaryPaperLink', {}).get('url') url = result.get('primaryPaperLink', {}).get('url')
@ -54,7 +76,7 @@ def response(resp):
if alternatePaperLinks: if alternatePaperLinks:
url = alternatePaperLinks[0].get('url') url = alternatePaperLinks[0].get('url')
if not url: if not url:
url = paper_url + '/%s' % result['id'] url = base_url + '/paper/%s' % result['id']
# publishedDate # publishedDate
if 'pubDate' in result: if 'pubDate' in result:
@ -88,7 +110,7 @@ def response(resp):
'template': 'paper.html', 'template': 'paper.html',
'url': url, 'url': url,
'title': result['title']['text'], 'title': result['title']['text'],
'content': result['paperAbstract']['text'], 'content': html_to_text(result['paperAbstract']['text']),
'journal': result.get('venue', {}).get('text') or result.get('journal', {}).get('name'), 'journal': result.get('venue', {}).get('text') or result.get('journal', {}).get('name'),
'doi': result.get('doiInfo', {}).get('doi'), 'doi': result.get('doiInfo', {}).get('doi'),
'tags': result.get('fieldsOfStudy'), 'tags': result.get('fieldsOfStudy'),