From 40feede51e19d17128e685e14f70d0257b0457b5 Mon Sep 17 00:00:00 2001
From: Tan Yong Sheng <64836390+tan-yong-sheng@users.noreply.github.com>
Date: Sun, 16 Mar 2025 07:00:47 +0000
Subject: [PATCH] [fix] engine: core.ac.uk implement API v3 / v2 is no longer supported

---
Review notes (ignored by git-am, not part of the commit message):

* response(): the DOI is the preferred result URL; it is no longer
  overwritten by the CORE landing page, and a result that only has a DOI is
  no longer dropped.
* response(): the date that is parsed is the one that was selected
  (publishedDate, else depositedDate); a missing 'publishedDate' key no
  longer raises KeyError.
* response(): null values for publisher / journals / authors / contributors
  are tolerated; authors keep their order; url and pdf_url are single URLs,
  never the whole sourceFulltextUrls list.

 docs/dev/engines/online/core.rst |  13 +++
 searx/engines/core.py            | 160 +++++++++++++++++++------------
 searx/engines/mariadb_server.py  |   2 +-
 3 files changed, 109 insertions(+), 66 deletions(-)
 create mode 100644 docs/dev/engines/online/core.rst

diff --git a/docs/dev/engines/online/core.rst b/docs/dev/engines/online/core.rst
new file mode 100644
index 000000000..944d97876
--- /dev/null
+++ b/docs/dev/engines/online/core.rst
@@ -0,0 +1,13 @@
+.. _core engine:
+
+====
+CORE
+====
+
+.. contents::
+   :depth: 2
+   :local:
+   :backlinks: entry
+
+.. automodule:: searx.engines.core
+   :members:
diff --git a/searx/engines/core.py b/searx/engines/core.py
index 5305cb224..489b6252b 100644
--- a/searx/engines/core.py
+++ b/searx/engines/core.py
@@ -1,7 +1,33 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""CORE (science)
+"""CORE_ (COnnecting REpositories) provides a comprehensive bibliographic
+database of the world’s scholarly literature, collecting and indexing
+research from repositories and journals.
+
+.. _CORE: https://core.ac.uk/about
+
+.. _core engine config:
+
+Configuration
+=============
+
+The engine has the following additional settings:
+
+- :py:obj:`api_key`
+
+.. code:: yaml
+
+  - name: core.ac.uk
+    engine: core
+    categories: science
+    shortcut: cor
+    api_key: "..."
+    timeout: 5
+
+Implementations
+===============
 
 """
+# pylint: disable=too-many-branches
 
 from datetime import datetime
 from urllib.parse import urlencode
@@ -11,34 +37,36 @@ from searx.exceptions import SearxEngineAPIException
 about = {
     "website": 'https://core.ac.uk',
     "wikidata_id": 'Q22661180',
-    "official_api_documentation": 'https://core.ac.uk/documentation/api/',
+    "official_api_documentation": 'https://api.core.ac.uk/docs/v3',
     "use_official_api": True,
     "require_api_key": True,
     "results": 'JSON',
 }
 
+api_key = 'unset'
+"""For an API key register at https://core.ac.uk/services/api and insert
+the API key in the engine :ref:`core engine config`."""
+
 categories = ['science', 'scientific publications']
 paging = True
 nb_per_page = 10
-
-api_key = 'unset'
-
-base_url = 'https://core.ac.uk:443/api-v2/search/'
-search_string = '{query}?page={page}&pageSize={nb_per_page}&apiKey={apikey}'
+base_url = 'https://api.core.ac.uk/v3/search/works/'
 
 
 def request(query, params):
-
     if api_key == 'unset':
         raise SearxEngineAPIException('missing CORE API key')
 
-    search_path = search_string.format(
-        query=urlencode({'q': query}),
-        nb_per_page=nb_per_page,
-        page=params['pageno'],
-        apikey=api_key,
-    )
-    params['url'] = base_url + search_path
+    # API v3 uses different parameters
+    search_params = {
+        'q': query,
+        'offset': (params['pageno'] - 1) * nb_per_page,
+        'limit': nb_per_page,
+        'sort': 'relevance',
+    }
+
+    params['url'] = base_url + '?' + urlencode(search_params)
+    params['headers'] = {'Authorization': f'Bearer {api_key}'}
 
     return params
 
@@ -47,68 +75,70 @@ def response(resp):
     results = []
     json_data = resp.json()
 
-    for result in json_data['data']:
-        source = result['_source']
-        url = None
-        if source.get('urls'):
-            url = source['urls'][0].replace('http://', 'https://', 1)
-
-        if url is None and source.get('doi'):
-            # use the DOI reference
-            url = 'https://doi.org/' + source['doi']
-
-        if url is None and source.get('downloadUrl'):
-            # use the downloadUrl
-            url = source['downloadUrl']
-
-        if url is None and source.get('identifiers'):
-            # try to find an ark id, see
-            # https://www.wikidata.org/wiki/Property:P8091
-            # and https://en.wikipedia.org/wiki/Archival_Resource_Key
-            arkids = [
-                identifier[5:]  # 5 is the length of "ark:/"
-                for identifier in source.get('identifiers')
-                if isinstance(identifier, str) and identifier.startswith('ark:/')
-            ]
-            if len(arkids) > 0:
-                url = 'https://n2t.net/' + arkids[0]
-
-        if url is None:
+    for result in json_data.get('results', []):
+        # a result without a title can't be rendered
+        if not result.get('title'):
             continue
 
-        publishedDate = None
-        time = source['publishedDate'] or source['depositedDate']
-        if time:
-            publishedDate = datetime.fromtimestamp(time / 1000)
+        # sourceFulltextUrls is normalized to a list, tolerate a bare string
+        fulltext_urls = result.get('sourceFulltextUrls') or []
+        if isinstance(fulltext_urls, str):
+            fulltext_urls = [fulltext_urls]
+
+        # URL: first match wins -- DOI, CORE landing page, download URL,
+        # first full text URL; a result without any URL is dropped
+        doi = result.get('doi')
+        if doi:
+            url = f'https://doi.org/{doi}'
+        elif result.get('id'):
+            url = 'https://core.ac.uk/works/' + str(result['id'])
+        elif result.get('downloadUrl'):
+            url = result['downloadUrl']
+        elif fulltext_urls:
+            url = fulltext_urls[0]
+        else:
+            continue
 
-        # sometimes the 'title' is None / filter None values
-        journals = [j['title'] for j in (source.get('journals') or []) if j['title']]
+        # published date: parse the same value that was selected, the
+        # deposited date is the fallback when there is no published date
+        published_date = None
+        raw_date = result.get('publishedDate') or result.get('depositedDate')
+        if raw_date:
+            try:
+                published_date = datetime.fromisoformat(raw_date.replace('Z', '+00:00'))
+            except (ValueError, AttributeError):
+                # unparsable or non-string date: keep the result, omit the date
+                pass
 
-        publisher = source['publisher']
-        if publisher:
-            publisher = source['publisher'].strip("'")
+        # tolerate null values for journals, publisher, authors, contributors
+        journals = [j['title'] for j in (result.get('journals') or []) if j.get('title')]
+        publisher = (result.get('publisher') or '').strip("'")
+
+        # keep the order of the authors and drop duplicates
+        authors = []
+        for author in result.get('authors') or []:
+            name = author.get('name')
+            if name and name not in authors:
+                authors.append(name)
 
         results.append(
             {
                 'template': 'paper.html',
-                'title': source['title'],
+                'title': result.get('title'),
                 'url': url,
-                'content': source['description'] or '',
+                'content': result.get('fullText', '') or '',
                 # 'comments': '',
-                'tags': source['topics'],
-                'publishedDate': publishedDate,
-                'type': (source['types'] or [None])[0],
-                'authors': source['authors'],
-                'editor': ', '.join(source['contributors'] or []),
+                'tags': result.get('fieldOfStudy', []),
+                'publishedDate': published_date,
+                'type': result.get('documentType', '') or '',
+                'authors': authors,
+                'editor': ', '.join(result.get('contributors') or []),
                 'publisher': publisher,
                 'journal': ', '.join(journals),
-                # 'volume': '',
-                # 'pages' : '',
-                # 'number': '',
-                'doi': source['doi'],
-                'issn': [x for x in [source.get('issn')] if x],
-                'isbn': [x for x in [source.get('isbn')] if x],  # exists in the rawRecordXml
-                'pdf_url': source.get('repositoryDocument', {}).get('pdfOrigin'),
+                'doi': doi,
+                # 'issn' : ''
+                # 'isbn' : ''
+                'pdf_url': result.get('downloadUrl') or (fulltext_urls[0] if fulltext_urls else None),
             }
         )
 
diff --git a/searx/engines/mariadb_server.py b/searx/engines/mariadb_server.py
index 26f537373..4c1ccd363 100644
--- a/searx/engines/mariadb_server.py
+++ b/searx/engines/mariadb_server.py
@@ -29,7 +29,7 @@ Implementations
 from typing import TYPE_CHECKING
 
 try:
-    import mariadb
+    import mariadb  # pyright: ignore [reportMissingImports]
 except ImportError:
     # import error is ignored because the admin has to install mysql manually to use
     # the engine