From 7a49e956877e443ca64e4142b1cccd7a7f208a06 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 4 May 2025 20:37:06 +0200 Subject: [PATCH] [mod] implement searx.wikidata_units for unit converters --- searx/plugins/unit_converter.py | 128 +--------- searx/wikidata_units.py | 231 ++++++++++++++++++ searxng_extra/update/update_wikidata_units.py | 65 +---- 3 files changed, 234 insertions(+), 190 deletions(-) create mode 100644 searx/wikidata_units.py diff --git a/searx/plugins/unit_converter.py b/searx/plugins/unit_converter.py index 2bab598f2..0072afe55 100644 --- a/searx/plugins/unit_converter.py +++ b/searx/plugins/unit_converter.py @@ -15,7 +15,7 @@ import babel.numbers from flask_babel import gettext, get_locale -from searx import data +from searx.units import symbol_to_si from searx.plugins import Plugin, PluginInfo from searx.result_types import EngineResults @@ -86,132 +86,6 @@ RE_MEASURE = r''' ''' -ADDITIONAL_UNITS = [ - { - "si_name": "Q11579", - "symbol": "°C", - "to_si": lambda val: val + 273.15, - "from_si": lambda val: val - 273.15, - }, - { - "si_name": "Q11579", - "symbol": "°F", - "to_si": lambda val: (val + 459.67) * 5 / 9, - "from_si": lambda val: (val * 9 / 5) - 459.67, - }, -] -"""Additional items to convert from a measure unit to a SI unit (vice versa). - -.. code:: python - - { - "si_name": "Q11579", # Wikidata item ID of the SI unit (Kelvin) - "symbol": "°C", # symbol of the measure unit - "to_si": lambda val: val + 273.15, # convert measure value (val) to SI unit - "from_si": lambda val: val - 273.15, # convert SI value (val) measure unit - }, - { - "si_name": "Q11573", - "symbol": "mi", - "to_si": 1609.344, # convert measure value (val) to SI unit - "from_si": 1 / 1609.344 # convert SI value (val) measure unit - }, - -The values of ``to_si`` and ``from_si`` can be of :py:obj:`float` (a multiplier) -or a callable_ (val in / converted value returned). - -.. 
_callable: https://docs.python.org/3/glossary.html#term-callable -""" - - -ALIAS_SYMBOLS = { - '°C': ('C',), - '°F': ('F',), - 'mi': ('L',), -} -"""Alias symbols for known unit of measure symbols / by example:: - - '°C': ('C', ...), # list of alias symbols for °C (Q69362731) - '°F': ('F', ...), # list of alias symbols for °F (Q99490479) - 'mi': ('L',), # list of alias symbols for mi (Q253276) -""" - - -SYMBOL_TO_SI = [] - - -def symbol_to_si(): - """Generates a list of tuples, each tuple is a measure unit and the fields - in the tuple are: - - 0. Symbol of the measure unit (e.g. 'mi' for measure unit 'miles' Q253276) - - 1. SI name of the measure unit (e.g. Q11573 for SI unit 'metre') - - 2. Factor to get SI value from measure unit (e.g. 1mi is equal to SI 1m - multiplied by 1609.344) - - 3. Factor to get measure value from from SI value (e.g. SI 100m is equal to - 100mi divided by 1609.344) - - The returned list is sorted, the first items are created from - ``WIKIDATA_UNITS``, the second group of items is build from - :py:obj:`ADDITIONAL_UNITS` and items created from :py:obj:`ALIAS_SYMBOLS`. - - If you search this list for a symbol, then a match with a symbol from - Wikidata has the highest weighting (first hit in the list), followed by the - symbols from the :py:obj:`ADDITIONAL_UNITS` and the lowest weighting is - given to the symbols resulting from the aliases :py:obj:`ALIAS_SYMBOLS`. 
- - """ - - global SYMBOL_TO_SI # pylint: disable=global-statement - if SYMBOL_TO_SI: - return SYMBOL_TO_SI - - # filter out units which can't be normalized to a SI unit and filter out - # units without a symbol / arcsecond does not have a symbol - # https://www.wikidata.org/wiki/Q829073 - - for item in data.WIKIDATA_UNITS.values(): - if item['to_si_factor'] and item['symbol']: - SYMBOL_TO_SI.append( - ( - item['symbol'], - item['si_name'], - 1 / item['to_si_factor'], # from_si - item['to_si_factor'], # to_si - item['symbol'], - ) - ) - - for item in ADDITIONAL_UNITS: - SYMBOL_TO_SI.append( - ( - item['symbol'], - item['si_name'], - item['from_si'], - item['to_si'], - item['symbol'], - ) - ) - - alias_items = [] - for item in SYMBOL_TO_SI: - for alias in ALIAS_SYMBOLS.get(item[0], ()): - alias_items.append( - ( - alias, - item[1], - item[2], # from_si - item[3], # to_si - item[0], # origin unit - ) - ) - SYMBOL_TO_SI = SYMBOL_TO_SI + alias_items - return SYMBOL_TO_SI - - def _parse_text_and_convert(from_query, to_query) -> str | None: # pylint: disable=too-many-branches, too-many-locals diff --git a/searx/wikidata_units.py b/searx/wikidata_units.py new file mode 100644 index 000000000..9fc94585f --- /dev/null +++ b/searx/wikidata_units.py @@ -0,0 +1,231 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Unit conversion on the basis of `SPARQL/WIKIDATA Precision, Units and +Coordinates`_ + +.. 
_SPARQL/WIKIDATA Precision, Units and Coordinates: + https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates#Quantities +""" + +__all__ = ["convert_from_si", "convert_to_si", "symbol_to_si"] + +import collections + +from searx import data +from searx.engines import wikidata + +ADDITIONAL_UNITS = [ + { + "si_name": "Q11579", + "symbol": "°C", + "to_si": lambda val: val + 273.15, + "from_si": lambda val: val - 273.15, + }, + { + "si_name": "Q11579", + "symbol": "°F", + "to_si": lambda val: (val + 459.67) * 5 / 9, + "from_si": lambda val: (val * 9 / 5) - 459.67, + }, +] +"""Additional items to convert from a measure unit to a SI unit (vice versa). + +.. code:: python + + { + "si_name": "Q11579", # Wikidata item ID of the SI unit (Kelvin) + "symbol": "°C", # symbol of the measure unit + "to_si": lambda val: val + 273.15, # convert measure value (val) to SI unit + "from_si": lambda val: val - 273.15, # convert SI value (val) measure unit + }, + { + "si_name": "Q11573", + "symbol": "mi", + "to_si": 1609.344, # convert measure value (val) to SI unit + "from_si": 1 / 1609.344 # convert SI value (val) measure unit + }, + +The values of ``to_si`` and ``from_si`` can be of :py:obj:`float` (a multiplier) +or a callable_ (val in / converted value returned). + +.. 
_callable: https://docs.python.org/3/glossary.html#term-callable +""" + + +ALIAS_SYMBOLS = { + '°C': ('C',), + '°F': ('F',), + 'mi': ('L',), +} +"""Alias symbols for known unit of measure symbols / by example:: + + '°C': ('C', ...), # list of alias symbols for °C (Q69362731) + '°F': ('F', ...), # list of alias symbols for °F (Q99490479) + 'mi': ('L',), # list of alias symbols for mi (Q253276) +""" + + +SYMBOL_TO_SI = [] +UNITS_BY_SI_NAME: dict | None = None + + +def convert_from_si(si_name: str, symbol: str, value: float | int) -> float: + from_si = units_by_si_name(si_name)[symbol][symbol]["from_si"] + if isinstance(from_si, (float, int)): + value = float(value) * from_si + else: + value = from_si(float(value)) + return value + + +def convert_to_si(si_name: str, symbol: str, value: float | int) -> float: + to_si = units_by_si_name(si_name)[symbol][symbol]["to_si"] + if isinstance(to_si, (float, int)): + value = float(value) * to_si + else: + value = to_si(float(value)) + return value + + +def units_by_si_name(si_name): + + global UNITS_BY_SI_NAME + if UNITS_BY_SI_NAME is not None: + return UNITS_BY_SI_NAME[si_name] + + UNITS_BY_SI_NAME = {} + for item in symbol_to_si(): + by_symbol = UNITS_BY_SI_NAME.get(si_name) + if by_symbol is None: + by_symbol = {} + UNITS_BY_SI_NAME[si_name] = by_symbol + by_symbol[item["symbol"]] = item + return UNITS_BY_SI_NAME[si_name] + + +def symbol_to_si(): + """Generates a list of tuples, each tuple is a measure unit and the fields + in the tuple are: + + 0. Symbol of the measure unit (e.g. 'mi' for measure unit 'miles' Q253276) + + 1. SI name of the measure unit (e.g. Q11573 for SI unit 'metre') + + 2. Factor to get SI value from measure unit (e.g. 1mi is equal to SI 1m + multiplied by 1609.344) + + 3. Factor to get measure value from from SI value (e.g. 
SI 100m is equal to + 100mi divided by 1609.344) + + The returned list is sorted, the first items are created from + ``WIKIDATA_UNITS``, the second group of items is build from + :py:obj:`ADDITIONAL_UNITS` and items created from :py:obj:`ALIAS_SYMBOLS`. + + If you search this list for a symbol, then a match with a symbol from + Wikidata has the highest weighting (first hit in the list), followed by the + symbols from the :py:obj:`ADDITIONAL_UNITS` and the lowest weighting is + given to the symbols resulting from the aliases :py:obj:`ALIAS_SYMBOLS`. + + """ + + global SYMBOL_TO_SI # pylint: disable=global-statement + if SYMBOL_TO_SI: + return SYMBOL_TO_SI + + # filter out units which can't be normalized to a SI unit and filter out + # units without a symbol / arcsecond does not have a symbol + # https://www.wikidata.org/wiki/Q829073 + + for item in data.WIKIDATA_UNITS.values(): + if item['to_si_factor'] and item['symbol']: + SYMBOL_TO_SI.append( + ( + item['symbol'], + item['si_name'], + 1 / item['to_si_factor'], # from_si + item['to_si_factor'], # to_si + item['symbol'], + ) + ) + + for item in ADDITIONAL_UNITS: + SYMBOL_TO_SI.append( + ( + item['symbol'], + item['si_name'], + item['from_si'], + item['to_si'], + item['symbol'], + ) + ) + + alias_items = [] + for item in SYMBOL_TO_SI: + for alias in ALIAS_SYMBOLS.get(item[0], ()): + alias_items.append( + ( + alias, + item[1], + item[2], # from_si + item[3], # to_si + item[0], # origin unit + ) + ) + SYMBOL_TO_SI = SYMBOL_TO_SI + alias_items + return SYMBOL_TO_SI + + +# the response contains duplicate ?item with the different ?symbol +# "ORDER BY ?item DESC(?rank) ?symbol" provides a deterministic result +# even if a ?item has different ?symbol of the same rank. 
# A deterministic result
# see:
# * https://www.wikidata.org/wiki/Help:Ranking
# * https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format ("Statement representation" section)
# * https://w.wiki/32BT
# * https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates#Quantities
#   see the result for https://www.wikidata.org/wiki/Q11582
#   there are multiple symbols the same rank

SARQL_REQUEST = """
SELECT DISTINCT ?item ?symbol ?tosi ?tosiUnit
WHERE
{
  ?item wdt:P31/wdt:P279 wd:Q47574 .
  ?item p:P5061 ?symbolP .
  ?symbolP ps:P5061 ?symbol ;
           wikibase:rank ?rank .
  OPTIONAL {
    ?item p:P2370 ?tosistmt .
    ?tosistmt psv:P2370 ?tosinode .
    ?tosinode wikibase:quantityAmount ?tosi .
    ?tosinode wikibase:quantityUnit ?tosiUnit .
  }
  FILTER(LANG(?symbol) = "en").
}
ORDER BY ?item DESC(?rank) ?symbol
"""


def fetch_units():
    """Query Wikidata for units of measure and return them keyed by item ID.

    Sends ``SARQL_REQUEST`` through ``wikidata.send_wikidata_query`` and maps
    the last path segment of each ``?item`` URI to a dict with the keys
    ``symbol``, ``si_name`` and ``to_si_factor``; the latter two are ``None``
    when the binding has no value for them.  Only the first binding of an item
    is kept.  Function is used to update persistence of
    :py:obj:`searx.data.WIKIDATA_UNITS`.
    """

    def last_segment(uri):
        return uri.rsplit('/', 1)[1]

    units = collections.OrderedDict()
    bindings = wikidata.send_wikidata_query(SARQL_REQUEST)['results']['bindings']

    for row in bindings:
        symbol = row['symbol']['value']
        name = last_segment(row['item']['value'])

        si_uri = row.get('tosiUnit', {}).get('value', '')
        si_name = last_segment(si_uri) if si_uri else ''
        factor = row.get('tosi', {}).get('value', '')

        if name in units:
            # ignore duplicate: always use the first one
            continue

        units[name] = {
            'symbol': symbol,
            'si_name': si_name or None,
            'to_si_factor': float(factor) if factor else None,
        }

    return units
collections -# set path -from os.path import join - -from searx import searx_dir from searx.engines import wikidata, set_loggers from searx.data import data_dir +from searx.wikidata_units import fetch_units DATA_FILE = data_dir / 'wikidata_units.json' - set_loggers(wikidata, 'wikidata') -# the response contains duplicate ?item with the different ?symbol -# "ORDER BY ?item DESC(?rank) ?symbol" provides a deterministic result -# even if a ?item has different ?symbol of the same rank. -# A deterministic result -# see: -# * https://www.wikidata.org/wiki/Help:Ranking -# * https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format ("Statement representation" section) -# * https://w.wiki/32BT -# * https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates#Quantities -# see the result for https://www.wikidata.org/wiki/Q11582 -# there are multiple symbols the same rank -SARQL_REQUEST = """ -SELECT DISTINCT ?item ?symbol ?tosi ?tosiUnit -WHERE -{ - ?item wdt:P31/wdt:P279 wd:Q47574 . - ?item p:P5061 ?symbolP . - ?symbolP ps:P5061 ?symbol ; - wikibase:rank ?rank . - OPTIONAL { - ?item p:P2370 ?tosistmt . - ?tosistmt psv:P2370 ?tosinode . - ?tosinode wikibase:quantityAmount ?tosi . - ?tosinode wikibase:quantityUnit ?tosiUnit . - } - FILTER(LANG(?symbol) = "en"). 
-} -ORDER BY ?item DESC(?rank) ?symbol -""" - - -def get_data(): - results = collections.OrderedDict() - response = wikidata.send_wikidata_query(SARQL_REQUEST) - for unit in response['results']['bindings']: - - symbol = unit['symbol']['value'] - name = unit['item']['value'].rsplit('/', 1)[1] - si_name = unit.get('tosiUnit', {}).get('value', '') - if si_name: - si_name = si_name.rsplit('/', 1)[1] - - to_si_factor = unit.get('tosi', {}).get('value', '') - if name not in results: - # ignore duplicate: always use the first one - results[name] = { - 'symbol': symbol, - 'si_name': si_name if si_name else None, - 'to_si_factor': float(to_si_factor) if to_si_factor else None, - } - return results - - -def get_wikidata_units_filename(): - return join(join(searx_dir, "data"), "") - if __name__ == '__main__': with DATA_FILE.open('w', encoding="utf8") as f: - json.dump(get_data(), f, indent=4, sort_keys=True, ensure_ascii=False) + json.dump(fetch_units(), f, indent=4, sort_keys=True, ensure_ascii=False)