[mod] implement searx.wikidata_units for unit converters
This commit is contained in:
parent
8bee676935
commit
7a49e95687
@ -15,7 +15,7 @@ import babel.numbers
|
||||
|
||||
from flask_babel import gettext, get_locale
|
||||
|
||||
from searx import data
|
||||
from searx.units import symbol_to_si
|
||||
from searx.plugins import Plugin, PluginInfo
|
||||
from searx.result_types import EngineResults
|
||||
|
||||
@ -86,132 +86,6 @@ RE_MEASURE = r'''
|
||||
'''
|
||||
|
||||
|
||||
ADDITIONAL_UNITS = [
    {
        "si_name": "Q11579",
        "symbol": "°C",
        "to_si": lambda val: val + 273.15,
        "from_si": lambda val: val - 273.15,
    },
    {
        "si_name": "Q11579",
        "symbol": "°F",
        "to_si": lambda val: (val + 459.67) * 5 / 9,
        "from_si": lambda val: (val * 9 / 5) - 459.67,
    },
]
"""Additional items to convert from a measure unit to a SI unit (and vice
versa).

.. code:: python

    [
        {
            "si_name": "Q11579",                  # Wikidata item ID of the SI unit (Kelvin)
            "symbol": "°C",                       # symbol of the measure unit
            "to_si": lambda val: val + 273.15,    # convert measure value (val) to SI unit
            "from_si": lambda val: val - 273.15,  # convert SI value (val) to measure unit
        },
        {
            "si_name": "Q11573",
            "symbol": "mi",
            "to_si": 1609.344,                    # convert measure value (val) to SI unit
            "from_si": 1 / 1609.344,              # convert SI value (val) to measure unit
        },
    ]

The values of ``to_si`` and ``from_si`` can be a :py:obj:`float` (a multiplier)
or a callable_ (val in / converted value returned).

.. _callable: https://docs.python.org/3/glossary.html#term-callable
"""

# NOTE(review): 'L' as an alias for 'mi' looks unusual -- confirm it is intended.
ALIAS_SYMBOLS = {
    '°C': ('C',),
    '°F': ('F',),
    'mi': ('L',),
}
"""Alias symbols for known unit of measure symbols / by example::

    '°C': ('C', ...),  # list of alias symbols for °C (Q69362731)
    '°F': ('F', ...),  # list of alias symbols for °F (Q99490479)
    'mi': ('L',),      # list of alias symbols for mi (Q253276)
"""

# Cache of the unit list; symbol_to_si() builds it on its first call.
SYMBOL_TO_SI = []
|
||||
|
||||
|
||||
def symbol_to_si():
    """Return the list of known measure units, each one as a 5-tuple.

    The fields of a tuple are:

    0. Symbol of the measure unit (e.g. ``'mi'`` for 'miles', Q253276).  For
       entries generated from :py:obj:`ALIAS_SYMBOLS` this is the alias.

    1. Wikidata item ID of the SI unit (e.g. Q11573 for SI unit 'metre').

    2. ``from_si``: multiplier or callable_ to get the measure value from a SI
       value (e.g. SI 100m is equal to 100mi divided by 1609.344).

    3. ``to_si``: multiplier or callable_ to get the SI value from a measure
       value (e.g. 1mi is equal to SI 1m multiplied by 1609.344).

    4. Symbol of the origin unit: identical to field 0, except for alias
       entries where it names the unit the alias stands for.

    The list is ordered by origin: first the items created from
    ``WIKIDATA_UNITS``, then the items from :py:obj:`ADDITIONAL_UNITS` and
    finally the items created from :py:obj:`ALIAS_SYMBOLS`.

    If you search this list for a symbol, then a match with a symbol from
    Wikidata has the highest weighting (first hit in the list), followed by the
    symbols from :py:obj:`ADDITIONAL_UNITS`; the lowest weighting is given to
    the symbols resulting from the aliases.

    The result is cached in ``SYMBOL_TO_SI``; later calls return the cached
    list.

    .. _callable: https://docs.python.org/3/glossary.html#term-callable
    """

    global SYMBOL_TO_SI  # pylint: disable=global-statement
    if SYMBOL_TO_SI:
        return SYMBOL_TO_SI

    # Build into a local list and publish it with one assignment at the end:
    # if building fails midway, no half-filled cache is left behind that later
    # calls would take for the complete list.
    units = []

    # filter out units which can't be normalized to a SI unit and filter out
    # units without a symbol / arcsecond does not have a symbol
    # https://www.wikidata.org/wiki/Q829073
    for item in data.WIKIDATA_UNITS.values():
        if item['to_si_factor'] and item['symbol']:
            units.append(
                (
                    item['symbol'],
                    item['si_name'],
                    1 / item['to_si_factor'],  # from_si
                    item['to_si_factor'],  # to_si
                    item['symbol'],
                )
            )

    units.extend(
        (item['symbol'], item['si_name'], item['from_si'], item['to_si'], item['symbol'])
        for item in ADDITIONAL_UNITS
    )

    # one extra entry per alias; field 4 keeps the symbol of the origin unit
    aliases = [
        (alias, si_name, from_si, to_si, symbol)
        for symbol, si_name, from_si, to_si, _ in units
        for alias in ALIAS_SYMBOLS.get(symbol, ())
    ]

    SYMBOL_TO_SI = units + aliases
    return SYMBOL_TO_SI
|
||||
|
||||
|
||||
def _parse_text_and_convert(from_query, to_query) -> str | None:
|
||||
|
||||
# pylint: disable=too-many-branches, too-many-locals
|
||||
|
231
searx/wikidata_units.py
Normal file
231
searx/wikidata_units.py
Normal file
@ -0,0 +1,231 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Unit conversion on the basis of `SPARQL/WIKIDATA Precision, Units and
|
||||
Coordinates`_
|
||||
|
||||
.. _SPARQL/WIKIDATA Precision, Units and Coordinates:
|
||||
https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates#Quantities
|
||||
"""
|
||||
|
||||
__all__ = ["convert_from_si", "convert_to_si", "symbol_to_si"]
|
||||
|
||||
import collections
|
||||
|
||||
from searx import data
|
||||
from searx.engines import wikidata
|
||||
|
||||
ADDITIONAL_UNITS = [
|
||||
{
|
||||
"si_name": "Q11579",
|
||||
"symbol": "°C",
|
||||
"to_si": lambda val: val + 273.15,
|
||||
"from_si": lambda val: val - 273.15,
|
||||
},
|
||||
{
|
||||
"si_name": "Q11579",
|
||||
"symbol": "°F",
|
||||
"to_si": lambda val: (val + 459.67) * 5 / 9,
|
||||
"from_si": lambda val: (val * 9 / 5) - 459.67,
|
||||
},
|
||||
]
|
||||
"""Additional items to convert from a measure unit to a SI unit (vice versa).
|
||||
|
||||
.. code:: python
|
||||
|
||||
{
|
||||
"si_name": "Q11579", # Wikidata item ID of the SI unit (Kelvin)
|
||||
"symbol": "°C", # symbol of the measure unit
|
||||
"to_si": lambda val: val + 273.15, # convert measure value (val) to SI unit
|
||||
"from_si": lambda val: val - 273.15, # convert SI value (val) measure unit
|
||||
},
|
||||
{
|
||||
"si_name": "Q11573",
|
||||
"symbol": "mi",
|
||||
"to_si": 1609.344, # convert measure value (val) to SI unit
|
||||
"from_si": 1 / 1609.344 # convert SI value (val) measure unit
|
||||
},
|
||||
|
||||
The values of ``to_si`` and ``from_si`` can be of :py:obj:`float` (a multiplier)
|
||||
or a callable_ (val in / converted value returned).
|
||||
|
||||
.. _callable: https://docs.python.org/3/glossary.html#term-callable
|
||||
"""
|
||||
|
||||
|
||||
ALIAS_SYMBOLS = {
|
||||
'°C': ('C',),
|
||||
'°F': ('F',),
|
||||
'mi': ('L',),
|
||||
}
|
||||
"""Alias symbols for known unit of measure symbols / by example::
|
||||
|
||||
'°C': ('C', ...), # list of alias symbols for °C (Q69362731)
|
||||
'°F': ('F', ...), # list of alias symbols for °F (Q99490479)
|
||||
'mi': ('L',), # list of alias symbols for mi (Q253276)
|
||||
"""
|
||||
|
||||
|
||||
SYMBOL_TO_SI = []
|
||||
UNITS_BY_SI_NAME: dict | None = None
|
||||
|
||||
|
||||
def _apply_conversion(conversion, value: float | int) -> float:
    """Apply ``conversion`` (a numeric multiplier or a callable) to ``value``."""
    value = float(value)
    if isinstance(conversion, (float, int)):
        return value * conversion
    return conversion(value)


def convert_from_si(si_name: str, symbol: str, value: float | int) -> float:
    """Convert ``value`` from the SI unit ``si_name`` into the unit ``symbol``.

    :param si_name: Wikidata item ID of the SI unit (e.g. ``Q11573``).
    :param symbol: symbol (or alias symbol) of the target measure unit.
    :param value: value given in the SI unit.
    :raises KeyError: if ``si_name`` or ``symbol`` is unknown.
    """
    # field 2 of the tuples built by symbol_to_si() is ``from_si``
    return _apply_conversion(units_by_si_name(si_name)[symbol][2], value)


def convert_to_si(si_name: str, symbol: str, value: float | int) -> float:
    """Convert ``value`` from the unit ``symbol`` into the SI unit ``si_name``.

    :param si_name: Wikidata item ID of the SI unit (e.g. ``Q11573``).
    :param symbol: symbol (or alias symbol) of the source measure unit.
    :param value: value given in the measure unit.
    :raises KeyError: if ``si_name`` or ``symbol`` is unknown.
    """
    # field 3 of the tuples built by symbol_to_si() is ``to_si``
    return _apply_conversion(units_by_si_name(si_name)[symbol][3], value)


def units_by_si_name(si_name):
    """Return the units that normalize to the SI unit ``si_name``.

    The result is a dict that maps a unit symbol to its 5-tuple from
    :py:func:`symbol_to_si`.  The lookup table for all SI units is built on
    the first call and cached in ``UNITS_BY_SI_NAME``.

    :raises KeyError: if no unit is known for ``si_name``.
    """

    global UNITS_BY_SI_NAME  # pylint: disable=global-statement
    if UNITS_BY_SI_NAME is None:
        index = {}
        for item in symbol_to_si():
            # Items are tuples: field 0 is the symbol, field 1 the SI name of
            # *this* unit.  The list is ordered by weighting, so the first
            # entry for a symbol wins (setdefault never overwrites).
            index.setdefault(item[1], {}).setdefault(item[0], item)
        # publish only the fully built table
        UNITS_BY_SI_NAME = index
    return UNITS_BY_SI_NAME[si_name]
|
||||
|
||||
|
||||
def symbol_to_si():
    """Return the list of known measure units, each one as a 5-tuple.

    The fields of a tuple are:

    0. Symbol of the measure unit (e.g. ``'mi'`` for 'miles', Q253276).  For
       entries generated from :py:obj:`ALIAS_SYMBOLS` this is the alias.

    1. Wikidata item ID of the SI unit (e.g. Q11573 for SI unit 'metre').

    2. ``from_si``: multiplier or callable_ to get the measure value from a SI
       value (e.g. SI 100m is equal to 100mi divided by 1609.344).

    3. ``to_si``: multiplier or callable_ to get the SI value from a measure
       value (e.g. 1mi is equal to SI 1m multiplied by 1609.344).

    4. Symbol of the origin unit: identical to field 0, except for alias
       entries where it names the unit the alias stands for.

    The list is ordered by origin: first the items created from
    ``WIKIDATA_UNITS``, then the items from :py:obj:`ADDITIONAL_UNITS` and
    finally the items created from :py:obj:`ALIAS_SYMBOLS`.

    If you search this list for a symbol, then a match with a symbol from
    Wikidata has the highest weighting (first hit in the list), followed by the
    symbols from :py:obj:`ADDITIONAL_UNITS`; the lowest weighting is given to
    the symbols resulting from the aliases.

    The result is cached in ``SYMBOL_TO_SI``; later calls return the cached
    list.

    .. _callable: https://docs.python.org/3/glossary.html#term-callable
    """

    global SYMBOL_TO_SI  # pylint: disable=global-statement
    if SYMBOL_TO_SI:
        return SYMBOL_TO_SI

    # Build into a local list and publish it with one assignment at the end:
    # if building fails midway, no half-filled cache is left behind that later
    # calls would take for the complete list.
    units = []

    # filter out units which can't be normalized to a SI unit and filter out
    # units without a symbol / arcsecond does not have a symbol
    # https://www.wikidata.org/wiki/Q829073
    for item in data.WIKIDATA_UNITS.values():
        if item['to_si_factor'] and item['symbol']:
            units.append(
                (
                    item['symbol'],
                    item['si_name'],
                    1 / item['to_si_factor'],  # from_si
                    item['to_si_factor'],  # to_si
                    item['symbol'],
                )
            )

    units.extend(
        (item['symbol'], item['si_name'], item['from_si'], item['to_si'], item['symbol'])
        for item in ADDITIONAL_UNITS
    )

    # one extra entry per alias; field 4 keeps the symbol of the origin unit
    aliases = [
        (alias, si_name, from_si, to_si, symbol)
        for symbol, si_name, from_si, to_si, _ in units
        for alias in ALIAS_SYMBOLS.get(symbol, ())
    ]

    SYMBOL_TO_SI = units + aliases
    return SYMBOL_TO_SI
|
||||
|
||||
|
||||
# The response contains duplicate ?item rows with different ?symbol values.
# "ORDER BY ?item DESC(?rank) ?symbol" provides a deterministic result even if
# an ?item has several ?symbol values of the same rank.
#
# see:
# * https://www.wikidata.org/wiki/Help:Ranking
# * https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format ("Statement representation" section)
# * https://w.wiki/32BT
# * https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates#Quantities
#
# See the result for https://www.wikidata.org/wiki/Q11582: there are multiple
# symbols of the same rank.

SARQL_REQUEST = """
SELECT DISTINCT ?item ?symbol ?tosi ?tosiUnit
WHERE
{
  ?item wdt:P31/wdt:P279 wd:Q47574 .
  ?item p:P5061 ?symbolP .
  ?symbolP ps:P5061 ?symbol ;
           wikibase:rank ?rank .
  OPTIONAL {
    ?item p:P2370 ?tosistmt .
    ?tosistmt psv:P2370 ?tosinode .
    ?tosinode wikibase:quantityAmount ?tosi .
    ?tosinode wikibase:quantityUnit ?tosiUnit .
  }
  FILTER(LANG(?symbol) = "en").
}
ORDER BY ?item DESC(?rank) ?symbol
"""
|
||||
|
||||
|
||||
def fetch_units():
    """Fetch units from Wikidata.  Function is used to update persistence of
    :py:obj:`searx.data.WIKIDATA_UNITS`.

    Sends ``SARQL_REQUEST`` and returns an ordered dict that maps the last path
    segment of each ``item`` URI to a dict with the keys ``symbol``,
    ``si_name`` and ``to_si_factor``; missing values are stored as ``None``.
    """

    units = collections.OrderedDict()
    bindings = wikidata.send_wikidata_query(SARQL_REQUEST)['results']['bindings']

    for row in bindings:
        symbol = row['symbol']['value']
        # keep only the last path segment of the item URI
        name = row['item']['value'].rsplit('/', 1)[1]

        si_name = row.get('tosiUnit', {}).get('value', '')
        if si_name:
            si_name = si_name.rsplit('/', 1)[1]

        factor = row.get('tosi', {}).get('value', '')

        if name in units:
            # ignore duplicate: always use the first one
            continue

        units[name] = {
            'symbol': symbol,
            'si_name': si_name or None,
            'to_si_factor': float(factor) if factor else None,
        }

    return units
|
@ -8,76 +8,15 @@ Output file: :origin:`searx/data/wikidata_units.json` (:origin:`CI Update data
|
||||
"""
|
||||
|
||||
import json
|
||||
import collections
|
||||
|
||||
# set path
|
||||
from os.path import join
|
||||
|
||||
from searx import searx_dir
|
||||
from searx.engines import wikidata, set_loggers
|
||||
from searx.data import data_dir
|
||||
from searx.wikidata_units import fetch_units
|
||||
|
||||
DATA_FILE = data_dir / 'wikidata_units.json'
|
||||
|
||||
set_loggers(wikidata, 'wikidata')
|
||||
|
||||
# The response contains duplicate ?item rows with different ?symbol values.
# "ORDER BY ?item DESC(?rank) ?symbol" provides a deterministic result even if
# an ?item has several ?symbol values of the same rank.
#
# see:
# * https://www.wikidata.org/wiki/Help:Ranking
# * https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format ("Statement representation" section)
# * https://w.wiki/32BT
# * https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates#Quantities
#
# See the result for https://www.wikidata.org/wiki/Q11582: there are multiple
# symbols of the same rank.
SARQL_REQUEST = """
SELECT DISTINCT ?item ?symbol ?tosi ?tosiUnit
WHERE
{
  ?item wdt:P31/wdt:P279 wd:Q47574 .
  ?item p:P5061 ?symbolP .
  ?symbolP ps:P5061 ?symbol ;
           wikibase:rank ?rank .
  OPTIONAL {
    ?item p:P2370 ?tosistmt .
    ?tosistmt psv:P2370 ?tosinode .
    ?tosinode wikibase:quantityAmount ?tosi .
    ?tosinode wikibase:quantityUnit ?tosiUnit .
  }
  FILTER(LANG(?symbol) = "en").
}
ORDER BY ?item DESC(?rank) ?symbol
"""
|
||||
|
||||
|
||||
def get_data():
    """Query Wikidata with ``SARQL_REQUEST`` and return the units.

    The result is an ordered dict that maps the last path segment of each
    ``item`` URI to a dict with the keys ``symbol``, ``si_name`` and
    ``to_si_factor``; missing values are stored as ``None``.
    """
    units = collections.OrderedDict()
    bindings = wikidata.send_wikidata_query(SARQL_REQUEST)['results']['bindings']

    for row in bindings:
        symbol = row['symbol']['value']
        # keep only the last path segment of the item URI
        name = row['item']['value'].rsplit('/', 1)[1]

        si_name = row.get('tosiUnit', {}).get('value', '')
        if si_name:
            si_name = si_name.rsplit('/', 1)[1]

        factor = row.get('tosi', {}).get('value', '')

        if name in units:
            # ignore duplicate: always use the first one
            continue

        units[name] = {
            'symbol': symbol,
            'si_name': si_name or None,
            'to_si_factor': float(factor) if factor else None,
        }

    return units
|
||||
|
||||
|
||||
def get_wikidata_units_filename():
    """Return ``<searx_dir>/data/`` (joined with an empty last component, so
    the path ends with a separator).

    NOTE(review): despite the function name no file name is appended here --
    confirm whether that is intended.
    """
    return join(searx_dir, "data", "")
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Write exactly one JSON document: dumping a second one into the same
    # handle would produce a file that can't be parsed back.  The units come
    # from searx.wikidata_units.fetch_units(), imported at the top of the file.
    with DATA_FILE.open('w', encoding="utf8") as f:
        json.dump(fetch_units(), f, indent=4, sort_keys=True, ensure_ascii=False)
|
||||
|
Loading…
x
Reference in New Issue
Block a user