[mod] implement searx.wikidata_units for unit converters

This commit is contained in:
Markus Heiser 2025-05-04 20:37:06 +02:00
parent 8bee676935
commit 7a49e95687
3 changed files with 234 additions and 190 deletions

View File

@ -15,7 +15,7 @@ import babel.numbers
from flask_babel import gettext, get_locale
from searx import data
from searx.units import symbol_to_si
from searx.plugins import Plugin, PluginInfo
from searx.result_types import EngineResults
@ -86,132 +86,6 @@ RE_MEASURE = r'''
'''
ADDITIONAL_UNITS = [
{
"si_name": "Q11579",
"symbol": "°C",
"to_si": lambda val: val + 273.15,
"from_si": lambda val: val - 273.15,
},
{
"si_name": "Q11579",
"symbol": "°F",
"to_si": lambda val: (val + 459.67) * 5 / 9,
"from_si": lambda val: (val * 9 / 5) - 459.67,
},
]
"""Additional items to convert from a measure unit to a SI unit (vice versa).
.. code:: python
{
"si_name": "Q11579", # Wikidata item ID of the SI unit (Kelvin)
"symbol": "°C", # symbol of the measure unit
"to_si": lambda val: val + 273.15, # convert measure value (val) to SI unit
"from_si": lambda val: val - 273.15, # convert SI value (val) measure unit
},
{
"si_name": "Q11573",
"symbol": "mi",
"to_si": 1609.344, # convert measure value (val) to SI unit
"from_si": 1 / 1609.344 # convert SI value (val) measure unit
},
The values of ``to_si`` and ``from_si`` can be of :py:obj:`float` (a multiplier)
or a callable_ (val in / converted value returned).
.. _callable: https://docs.python.org/3/glossary.html#term-callable
"""
ALIAS_SYMBOLS = {
'°C': ('C',),
'°F': ('F',),
'mi': ('L',),
}
"""Alias symbols for known unit of measure symbols / by example::
'°C': ('C', ...), # list of alias symbols for °C (Q69362731)
'°F': ('F', ...), # list of alias symbols for °F (Q99490479)
'mi': ('L',), # list of alias symbols for mi (Q253276)
"""
SYMBOL_TO_SI = []
def symbol_to_si():
"""Generates a list of tuples, each tuple is a measure unit and the fields
in the tuple are:
0. Symbol of the measure unit (e.g. 'mi' for measure unit 'miles' Q253276)
1. SI name of the measure unit (e.g. Q11573 for SI unit 'metre')
2. Factor to get SI value from measure unit (e.g. 1mi is equal to SI 1m
multiplied by 1609.344)
3. Factor to get measure value from from SI value (e.g. SI 100m is equal to
100mi divided by 1609.344)
The returned list is sorted, the first items are created from
``WIKIDATA_UNITS``, the second group of items is build from
:py:obj:`ADDITIONAL_UNITS` and items created from :py:obj:`ALIAS_SYMBOLS`.
If you search this list for a symbol, then a match with a symbol from
Wikidata has the highest weighting (first hit in the list), followed by the
symbols from the :py:obj:`ADDITIONAL_UNITS` and the lowest weighting is
given to the symbols resulting from the aliases :py:obj:`ALIAS_SYMBOLS`.
"""
global SYMBOL_TO_SI # pylint: disable=global-statement
if SYMBOL_TO_SI:
return SYMBOL_TO_SI
# filter out units which can't be normalized to a SI unit and filter out
# units without a symbol / arcsecond does not have a symbol
# https://www.wikidata.org/wiki/Q829073
for item in data.WIKIDATA_UNITS.values():
if item['to_si_factor'] and item['symbol']:
SYMBOL_TO_SI.append(
(
item['symbol'],
item['si_name'],
1 / item['to_si_factor'], # from_si
item['to_si_factor'], # to_si
item['symbol'],
)
)
for item in ADDITIONAL_UNITS:
SYMBOL_TO_SI.append(
(
item['symbol'],
item['si_name'],
item['from_si'],
item['to_si'],
item['symbol'],
)
)
alias_items = []
for item in SYMBOL_TO_SI:
for alias in ALIAS_SYMBOLS.get(item[0], ()):
alias_items.append(
(
alias,
item[1],
item[2], # from_si
item[3], # to_si
item[0], # origin unit
)
)
SYMBOL_TO_SI = SYMBOL_TO_SI + alias_items
return SYMBOL_TO_SI
def _parse_text_and_convert(from_query, to_query) -> str | None:
# pylint: disable=too-many-branches, too-many-locals

231
searx/wikidata_units.py Normal file
View File

@ -0,0 +1,231 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Unit conversion on the basis of `SPARQL/WIKIDATA Precision, Units and
Coordinates`_
.. _SPARQL/WIKIDATA Precision, Units and Coordinates:
https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates#Quantities
"""
__all__ = ["convert_from_si", "convert_to_si", "symbol_to_si"]
import collections
from searx import data
from searx.engines import wikidata
ADDITIONAL_UNITS = [
{
"si_name": "Q11579",
"symbol": "°C",
"to_si": lambda val: val + 273.15,
"from_si": lambda val: val - 273.15,
},
{
"si_name": "Q11579",
"symbol": "°F",
"to_si": lambda val: (val + 459.67) * 5 / 9,
"from_si": lambda val: (val * 9 / 5) - 459.67,
},
]
"""Additional items to convert from a measure unit to a SI unit (vice versa).
.. code:: python
{
"si_name": "Q11579", # Wikidata item ID of the SI unit (Kelvin)
"symbol": "°C", # symbol of the measure unit
"to_si": lambda val: val + 273.15, # convert measure value (val) to SI unit
"from_si": lambda val: val - 273.15, # convert SI value (val) measure unit
},
{
"si_name": "Q11573",
"symbol": "mi",
"to_si": 1609.344, # convert measure value (val) to SI unit
"from_si": 1 / 1609.344 # convert SI value (val) measure unit
},
The values of ``to_si`` and ``from_si`` can be of :py:obj:`float` (a multiplier)
or a callable_ (val in / converted value returned).
.. _callable: https://docs.python.org/3/glossary.html#term-callable
"""
ALIAS_SYMBOLS = {
'°C': ('C',),
'°F': ('F',),
'mi': ('L',),
}
"""Alias symbols for known unit of measure symbols / by example::
'°C': ('C', ...), # list of alias symbols for °C (Q69362731)
'°F': ('F', ...), # list of alias symbols for °F (Q99490479)
'mi': ('L',), # list of alias symbols for mi (Q253276)
"""
SYMBOL_TO_SI = []
UNITS_BY_SI_NAME: dict | None = None
def convert_from_si(si_name: str, symbol: str, value: float | int) -> float:
from_si = units_by_si_name(si_name)[symbol][symbol]["from_si"]
if isinstance(from_si, (float, int)):
value = float(value) * from_si
else:
value = from_si(float(value))
return value
def convert_to_si(si_name: str, symbol: str, value: float | int) -> float:
to_si = units_by_si_name(si_name)[symbol][symbol]["to_si"]
if isinstance(to_si, (float, int)):
value = float(value) * to_si
else:
value = to_si(float(value))
return value
def units_by_si_name(si_name):
global UNITS_BY_SI_NAME
if UNITS_BY_SI_NAME is not None:
return UNITS_BY_SI_NAME[si_name]
UNITS_BY_SI_NAME = {}
for item in symbol_to_si():
by_symbol = UNITS_BY_SI_NAME.get(si_name)
if by_symbol is None:
by_symbol = {}
UNITS_BY_SI_NAME[si_name] = by_symbol
by_symbol[item["symbol"]] = item
return UNITS_BY_SI_NAME[si_name]
def symbol_to_si():
"""Generates a list of tuples, each tuple is a measure unit and the fields
in the tuple are:
0. Symbol of the measure unit (e.g. 'mi' for measure unit 'miles' Q253276)
1. SI name of the measure unit (e.g. Q11573 for SI unit 'metre')
2. Factor to get SI value from measure unit (e.g. 1mi is equal to SI 1m
multiplied by 1609.344)
3. Factor to get measure value from from SI value (e.g. SI 100m is equal to
100mi divided by 1609.344)
The returned list is sorted, the first items are created from
``WIKIDATA_UNITS``, the second group of items is build from
:py:obj:`ADDITIONAL_UNITS` and items created from :py:obj:`ALIAS_SYMBOLS`.
If you search this list for a symbol, then a match with a symbol from
Wikidata has the highest weighting (first hit in the list), followed by the
symbols from the :py:obj:`ADDITIONAL_UNITS` and the lowest weighting is
given to the symbols resulting from the aliases :py:obj:`ALIAS_SYMBOLS`.
"""
global SYMBOL_TO_SI # pylint: disable=global-statement
if SYMBOL_TO_SI:
return SYMBOL_TO_SI
# filter out units which can't be normalized to a SI unit and filter out
# units without a symbol / arcsecond does not have a symbol
# https://www.wikidata.org/wiki/Q829073
for item in data.WIKIDATA_UNITS.values():
if item['to_si_factor'] and item['symbol']:
SYMBOL_TO_SI.append(
(
item['symbol'],
item['si_name'],
1 / item['to_si_factor'], # from_si
item['to_si_factor'], # to_si
item['symbol'],
)
)
for item in ADDITIONAL_UNITS:
SYMBOL_TO_SI.append(
(
item['symbol'],
item['si_name'],
item['from_si'],
item['to_si'],
item['symbol'],
)
)
alias_items = []
for item in SYMBOL_TO_SI:
for alias in ALIAS_SYMBOLS.get(item[0], ()):
alias_items.append(
(
alias,
item[1],
item[2], # from_si
item[3], # to_si
item[0], # origin unit
)
)
SYMBOL_TO_SI = SYMBOL_TO_SI + alias_items
return SYMBOL_TO_SI
# the response contains duplicate ?item with the different ?symbol
# "ORDER BY ?item DESC(?rank) ?symbol" provides a deterministic result
# even if a ?item has different ?symbol of the same rank.
# A deterministic result
# see:
# * https://www.wikidata.org/wiki/Help:Ranking
# * https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format ("Statement representation" section)
# * https://w.wiki/32BT
# * https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates#Quantities
# see the result for https://www.wikidata.org/wiki/Q11582
# there are multiple symbols the same rank
SARQL_REQUEST = """
SELECT DISTINCT ?item ?symbol ?tosi ?tosiUnit
WHERE
{
?item wdt:P31/wdt:P279 wd:Q47574 .
?item p:P5061 ?symbolP .
?symbolP ps:P5061 ?symbol ;
wikibase:rank ?rank .
OPTIONAL {
?item p:P2370 ?tosistmt .
?tosistmt psv:P2370 ?tosinode .
?tosinode wikibase:quantityAmount ?tosi .
?tosinode wikibase:quantityUnit ?tosiUnit .
}
FILTER(LANG(?symbol) = "en").
}
ORDER BY ?item DESC(?rank) ?symbol
"""
def fetch_units():
"""Fetch units from Wikidata. Function is used to update persistence of
:py:obj:`searx.data.WIKIDATA_UNITS`."""
results = collections.OrderedDict()
response = wikidata.send_wikidata_query(SARQL_REQUEST)
for unit in response['results']['bindings']:
symbol = unit['symbol']['value']
name = unit['item']['value'].rsplit('/', 1)[1]
si_name = unit.get('tosiUnit', {}).get('value', '')
if si_name:
si_name = si_name.rsplit('/', 1)[1]
to_si_factor = unit.get('tosi', {}).get('value', '')
if name not in results:
# ignore duplicate: always use the first one
results[name] = {
'symbol': symbol,
'si_name': si_name if si_name else None,
'to_si_factor': float(to_si_factor) if to_si_factor else None,
}
return results

View File

@ -8,76 +8,15 @@ Output file: :origin:`searx/data/wikidata_units.json` (:origin:`CI Update data
"""
import json
import collections
# set path
from os.path import join
from searx import searx_dir
from searx.engines import wikidata, set_loggers
from searx.data import data_dir
from searx.wikidata_units import fetch_units
DATA_FILE = data_dir / 'wikidata_units.json'
set_loggers(wikidata, 'wikidata')
# the response contains duplicate ?item with the different ?symbol
# "ORDER BY ?item DESC(?rank) ?symbol" provides a deterministic result
# even if a ?item has different ?symbol of the same rank.
# A deterministic result
# see:
# * https://www.wikidata.org/wiki/Help:Ranking
# * https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format ("Statement representation" section)
# * https://w.wiki/32BT
# * https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates#Quantities
# see the result for https://www.wikidata.org/wiki/Q11582
# there are multiple symbols the same rank
SARQL_REQUEST = """
SELECT DISTINCT ?item ?symbol ?tosi ?tosiUnit
WHERE
{
?item wdt:P31/wdt:P279 wd:Q47574 .
?item p:P5061 ?symbolP .
?symbolP ps:P5061 ?symbol ;
wikibase:rank ?rank .
OPTIONAL {
?item p:P2370 ?tosistmt .
?tosistmt psv:P2370 ?tosinode .
?tosinode wikibase:quantityAmount ?tosi .
?tosinode wikibase:quantityUnit ?tosiUnit .
}
FILTER(LANG(?symbol) = "en").
}
ORDER BY ?item DESC(?rank) ?symbol
"""
def get_data():
results = collections.OrderedDict()
response = wikidata.send_wikidata_query(SARQL_REQUEST)
for unit in response['results']['bindings']:
symbol = unit['symbol']['value']
name = unit['item']['value'].rsplit('/', 1)[1]
si_name = unit.get('tosiUnit', {}).get('value', '')
if si_name:
si_name = si_name.rsplit('/', 1)[1]
to_si_factor = unit.get('tosi', {}).get('value', '')
if name not in results:
# ignore duplicate: always use the first one
results[name] = {
'symbol': symbol,
'si_name': si_name if si_name else None,
'to_si_factor': float(to_si_factor) if to_si_factor else None,
}
return results
def get_wikidata_units_filename():
return join(join(searx_dir, "data"), "")
if __name__ == '__main__':
with DATA_FILE.open('w', encoding="utf8") as f:
json.dump(get_data(), f, indent=4, sort_keys=True, ensure_ascii=False)
json.dump(fetch_units(), f, indent=4, sort_keys=True, ensure_ascii=False)